1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <immintrin.h>
9
10 #include <xnnpack/avgpool.h>
11 #include <xnnpack/common.h>
12 #include <xnnpack/conv.h>
13 #include <xnnpack/dwconv.h>
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/gemm.h>
16 #include <xnnpack/ibilinear.h>
17 #include <xnnpack/igemm.h>
18 #include <xnnpack/intrinsics-polyfill.h>
19 #include <xnnpack/math.h>
20 #include <xnnpack/maxpool.h>
21 #include <xnnpack/packx.h>
22 #include <xnnpack/pavgpool.h>
23 #include <xnnpack/rmax.h>
24 #include <xnnpack/spmm.h>
25 #include <xnnpack/transpose.h>
26 #include <xnnpack/vbinary.h>
27 #include <xnnpack/vmulcaddc.h>
28 #include <xnnpack/vunary.h>
29
30
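// 9p8x multipass average pooling: used when the pooling window has more than 9
// elements (see the asserts below). The first pass sums 9 input rows into the
// scratch `buffer`, each intermediate pass accumulates 8 more rows, and the final
// pass adds the remaining 1-8 rows, multiplies by `scale` and clamps to [min, max]:
//   output[c] = clamp(scale * sum_k input[k][c], min, max)
// The `c4` suffix indicates that 4 channels are processed per SIMD iteration.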
31 void xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(
32 size_t output_pixels,
33 size_t kernel_elements,
34 size_t channels,
35 const float** input,
36 size_t input_offset,
37 const float* zero,
38 float* buffer,
39 float* output,
40 size_t input_increment,
41 size_t output_increment,
42 const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
43 {
44 assert(output_pixels != 0);
45 assert(kernel_elements > 9);
46 assert(channels != 0);
47
48 const __m128 vscale = _mm_load_ps(params->sse.scale);
49 const __m128 vmin = _mm_load_ps(params->sse.min);
50 const __m128 vmax = _mm_load_ps(params->sse.max);
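// Note (assumption): params->sse.scale/min/max each hold one value replicated across
// the four SSE lanes; the scale is typically the reciprocal of the pooling window
// size, prepared by the caller.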
51
52 do {
53 {
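// Pointers equal to `zero` refer to a shared all-zero row (used for window
// positions outside the input image), so `input_offset` is only applied to
// real input rows - hence the `!= zero` checks below.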
54 const float* i0 = *input++;
55 assert(i0 != NULL);
56 if XNN_UNPREDICTABLE(i0 != zero) {
57 i0 = (const float*) ((uintptr_t) i0 + input_offset);
58 }
59 const float* i1 = *input++;
60 assert(i1 != NULL);
61 if XNN_UNPREDICTABLE(i1 != zero) {
62 i1 = (const float*) ((uintptr_t) i1 + input_offset);
63 }
64 const float* i2 = *input++;
65 assert(i2 != NULL);
66 if XNN_UNPREDICTABLE(i2 != zero) {
67 i2 = (const float*) ((uintptr_t) i2 + input_offset);
68 }
69 const float* i3 = *input++;
70 assert(i3 != NULL);
71 if XNN_UNPREDICTABLE(i3 != zero) {
72 i3 = (const float*) ((uintptr_t) i3 + input_offset);
73 }
74 const float* i4 = *input++;
75 assert(i4 != NULL);
76 if XNN_UNPREDICTABLE(i4 != zero) {
77 i4 = (const float*) ((uintptr_t) i4 + input_offset);
78 }
79 const float* i5 = *input++;
80 assert(i5 != NULL);
81 if XNN_UNPREDICTABLE(i5 != zero) {
82 i5 = (const float*) ((uintptr_t) i5 + input_offset);
83 }
84 const float* i6 = *input++;
85 assert(i6 != NULL);
86 if XNN_UNPREDICTABLE(i6 != zero) {
87 i6 = (const float*) ((uintptr_t) i6 + input_offset);
88 }
89 const float* i7 = *input++;
90 assert(i7 != NULL);
91 if XNN_UNPREDICTABLE(i7 != zero) {
92 i7 = (const float*) ((uintptr_t) i7 + input_offset);
93 }
94 const float* i8 = *input++;
95 assert(i8 != NULL);
96 if XNN_UNPREDICTABLE(i8 != zero) {
97 i8 = (const float*) ((uintptr_t) i8 + input_offset);
98 }
99
100 float* b = buffer;
101 for (size_t c = 0; c < channels; c += 4) {
102 const __m128 vi0 = _mm_loadu_ps(i0);
103 i0 += 4;
104 const __m128 vi1 = _mm_loadu_ps(i1);
105 i1 += 4;
106 const __m128 vi2 = _mm_loadu_ps(i2);
107 i2 += 4;
108 const __m128 vi3 = _mm_loadu_ps(i3);
109 i3 += 4;
110 const __m128 vi4 = _mm_loadu_ps(i4);
111 i4 += 4;
112 const __m128 vi5 = _mm_loadu_ps(i5);
113 i5 += 4;
114 const __m128 vi6 = _mm_loadu_ps(i6);
115 i6 += 4;
116 const __m128 vi7 = _mm_loadu_ps(i7);
117 i7 += 4;
118 const __m128 vi8 = _mm_loadu_ps(i8);
119 i8 += 4;
120
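// Sum the 9 rows with a balanced tree of additions to keep the dependency chains short.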
121 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
122 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
123 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
124 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
125 const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
126 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
127 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
128 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
129
130 _mm_store_ps(b, vsum); b += 4;
131 }
132 }
133
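// Intermediate passes: each consumes 8 more input rows and accumulates them into
// the partial sums kept in `buffer`; `k` counts the kernel elements still to process.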
134 size_t k = kernel_elements;
135 for (k -= 9; k > 8; k -= 8) {
136 const float* i0 = *input++;
137 assert(i0 != NULL);
138 if XNN_UNPREDICTABLE(i0 != zero) {
139 i0 = (const float*) ((uintptr_t) i0 + input_offset);
140 }
141 const float* i1 = *input++;
142 assert(i1 != NULL);
143 if XNN_UNPREDICTABLE(i1 != zero) {
144 i1 = (const float*) ((uintptr_t) i1 + input_offset);
145 }
146 const float* i2 = *input++;
147 assert(i2 != NULL);
148 if XNN_UNPREDICTABLE(i2 != zero) {
149 i2 = (const float*) ((uintptr_t) i2 + input_offset);
150 }
151 const float* i3 = *input++;
152 assert(i3 != NULL);
153 if XNN_UNPREDICTABLE(i3 != zero) {
154 i3 = (const float*) ((uintptr_t) i3 + input_offset);
155 }
156 const float* i4 = *input++;
157 assert(i4 != NULL);
158 if XNN_UNPREDICTABLE(i4 != zero) {
159 i4 = (const float*) ((uintptr_t) i4 + input_offset);
160 }
161 const float* i5 = *input++;
162 assert(i5 != NULL);
163 if XNN_UNPREDICTABLE(i5 != zero) {
164 i5 = (const float*) ((uintptr_t) i5 + input_offset);
165 }
166 const float* i6 = *input++;
167 assert(i6 != NULL);
168 if XNN_UNPREDICTABLE(i6 != zero) {
169 i6 = (const float*) ((uintptr_t) i6 + input_offset);
170 }
171 const float* i7 = *input++;
172 assert(i7 != NULL);
173 if XNN_UNPREDICTABLE(i7 != zero) {
174 i7 = (const float*) ((uintptr_t) i7 + input_offset);
175 }
176
177 float* b = buffer;
178 for (size_t c = 0; c < channels; c += 4) {
179 const __m128 vi0 = _mm_loadu_ps(i0);
180 i0 += 4;
181 const __m128 vi1 = _mm_loadu_ps(i1);
182 i1 += 4;
183 const __m128 vi2 = _mm_loadu_ps(i2);
184 i2 += 4;
185 const __m128 vi3 = _mm_loadu_ps(i3);
186 i3 += 4;
187 const __m128 vi4 = _mm_loadu_ps(i4);
188 i4 += 4;
189 const __m128 vi5 = _mm_loadu_ps(i5);
190 i5 += 4;
191 const __m128 vi6 = _mm_loadu_ps(i6);
192 i6 += 4;
193 const __m128 vi7 = _mm_loadu_ps(i7);
194 i7 += 4;
195 const __m128 vacc = _mm_load_ps(b);
196
197 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
198 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
199 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
200 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
201 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
202 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
203 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
204 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
205
206 _mm_store_ps(b, vsum); b += 4;
207 }
208 }
209
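// Final pass: between 1 and 8 kernel elements remain (k <= 8 here); pointers past
// the end of the window are redirected to `zero`. The accumulated sums are scaled,
// clamped to [min, max] and stored, with a 2-/1-element tail for channel counts
// that are not multiples of 4.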
210 {
211 const float* i0 = input[0];
212 assert(i0 != NULL);
213 const float* i1 = input[1];
214 const float* i2 = input[2];
215 const float* i3 = input[3];
216 const float* i4 = input[4];
217 const float* i5 = input[5];
218 const float* i6 = input[6];
219 const float* i7 = input[7];
220 input = (const float**) ((uintptr_t) input + input_increment);
221 if (k < 2) {
222 i1 = zero;
223 }
224 assert(i1 != NULL);
225 if (k <= 2) {
226 i2 = zero;
227 }
228 assert(i2 != NULL);
229 if (k < 4) {
230 i3 = zero;
231 }
232 assert(i3 != NULL);
233 if (k <= 4) {
234 i4 = zero;
235 }
236 assert(i4 != NULL);
237 if (k < 6) {
238 i5 = zero;
239 }
240 assert(i5 != NULL);
241 if (k <= 6) {
242 i6 = zero;
243 }
244 assert(i6 != NULL);
245 if (k < 8) {
246 i7 = zero;
247 }
248 assert(i7 != NULL);
249 if XNN_UNPREDICTABLE(i0 != zero) {
250 i0 = (const float*) ((uintptr_t) i0 + input_offset);
251 }
252 if XNN_UNPREDICTABLE(i1 != zero) {
253 i1 = (const float*) ((uintptr_t) i1 + input_offset);
254 }
255 if XNN_UNPREDICTABLE(i2 != zero) {
256 i2 = (const float*) ((uintptr_t) i2 + input_offset);
257 }
258 if XNN_UNPREDICTABLE(i3 != zero) {
259 i3 = (const float*) ((uintptr_t) i3 + input_offset);
260 }
261 if XNN_UNPREDICTABLE(i4 != zero) {
262 i4 = (const float*) ((uintptr_t) i4 + input_offset);
263 }
264 if XNN_UNPREDICTABLE(i5 != zero) {
265 i5 = (const float*) ((uintptr_t) i5 + input_offset);
266 }
267 if XNN_UNPREDICTABLE(i6 != zero) {
268 i6 = (const float*) ((uintptr_t) i6 + input_offset);
269 }
270 if XNN_UNPREDICTABLE(i7 != zero) {
271 i7 = (const float*) ((uintptr_t) i7 + input_offset);
272 }
273
274 size_t c = channels;
275 float* b = buffer;
276 while (c >= 4) {
277 const __m128 vi0 = _mm_loadu_ps(i0);
278 i0 += 4;
279 const __m128 vi1 = _mm_loadu_ps(i1);
280 i1 += 4;
281 const __m128 vi2 = _mm_loadu_ps(i2);
282 i2 += 4;
283 const __m128 vi3 = _mm_loadu_ps(i3);
284 i3 += 4;
285 const __m128 vi4 = _mm_loadu_ps(i4);
286 i4 += 4;
287 const __m128 vi5 = _mm_loadu_ps(i5);
288 i5 += 4;
289 const __m128 vi6 = _mm_loadu_ps(i6);
290 i6 += 4;
291 const __m128 vi7 = _mm_loadu_ps(i7);
292 i7 += 4;
293 const __m128 vacc = _mm_load_ps(b);
294 b += 4;
295
296 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
297 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
298 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
299 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
300 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
301 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
302 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
303 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
304
305 __m128 vout = _mm_mul_ps(vsum, vscale);
306 vout = _mm_max_ps(vout, vmin);
307 vout = _mm_min_ps(vout, vmax);
308
309 _mm_storeu_ps(output, vout);
310 output += 4;
311
312 c -= 4;
313 }
314 if (c != 0) {
315 const __m128 vi0 = _mm_loadu_ps(i0);
316 const __m128 vi1 = _mm_loadu_ps(i1);
317 const __m128 vi2 = _mm_loadu_ps(i2);
318 const __m128 vi3 = _mm_loadu_ps(i3);
319 const __m128 vi4 = _mm_loadu_ps(i4);
320 const __m128 vi5 = _mm_loadu_ps(i5);
321 const __m128 vi6 = _mm_loadu_ps(i6);
322 const __m128 vi7 = _mm_loadu_ps(i7);
323 const __m128 vacc = _mm_load_ps(b);
324
325 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
326 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
327 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
328 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
329 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
330 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
331 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
332 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
333
334 __m128 vout = _mm_mul_ps(vsum, vscale);
335 vout = _mm_max_ps(vout, vmin);
336 vout = _mm_min_ps(vout, vmax);
337
338 if (c & 2) {
339 _mm_storel_pi((__m64*) output, vout);
340 vout = _mm_movehl_ps(vout, vout);
341 output += 2;
342 }
343 if (c & 1) {
344 _mm_store_ss(output, vout);
345 output += 1;
346 }
347 }
348 }
349 output = (float*) ((uintptr_t) output + output_increment);
350 } while (--output_pixels != 0);
351 }
352
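// 9x single-pass average pooling: handles pooling windows of up to 9 elements
// without a scratch buffer; unused window positions are redirected to `zero`.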
353 void xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(
354 size_t output_pixels,
355 size_t kernel_elements,
356 size_t channels,
357 const float** input,
358 size_t input_offset,
359 const float* zero,
360 float* output,
361 size_t input_increment,
362 size_t output_increment,
363 const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
364 {
365 assert(output_pixels != 0);
366 assert(kernel_elements != 0);
367 assert(kernel_elements <= 9);
368 assert(channels != 0);
369
370 const __m128 vscale = _mm_load_ps(params->sse.scale);
371 const __m128 vmin = _mm_load_ps(params->sse.min);
372 const __m128 vmax = _mm_load_ps(params->sse.max);
373
374 do {
375 const float* i0 = input[0];
376 assert(i0 != NULL);
377 const float* i1 = input[1];
378 const float* i2 = input[2];
379 const float* i3 = input[3];
380 const float* i4 = input[4];
381 const float* i5 = input[5];
382 const float* i6 = input[6];
383 const float* i7 = input[7];
384 const float* i8 = input[8];
385 input = (const float**) ((uintptr_t) input + input_increment);
386 if (kernel_elements < 2) {
387 i1 = zero;
388 }
389 assert(i1 != NULL);
390 if (kernel_elements <= 2) {
391 i2 = zero;
392 }
393 assert(i2 != NULL);
394 if (kernel_elements < 4) {
395 i3 = zero;
396 }
397 assert(i3 != NULL);
398 if (kernel_elements <= 4) {
399 i4 = zero;
400 }
401 assert(i4 != NULL);
402 if (kernel_elements < 6) {
403 i5 = zero;
404 }
405 assert(i5 != NULL);
406 if (kernel_elements <= 6) {
407 i6 = zero;
408 }
409 assert(i6 != NULL);
410 if (kernel_elements < 8) {
411 i7 = zero;
412 }
413 assert(i7 != NULL);
414 if (kernel_elements <= 8) {
415 i8 = zero;
416 }
417 assert(i8 != NULL);
418 if XNN_UNPREDICTABLE(i0 != zero) {
419 i0 = (const float*) ((uintptr_t) i0 + input_offset);
420 }
421 if XNN_UNPREDICTABLE(i1 != zero) {
422 i1 = (const float*) ((uintptr_t) i1 + input_offset);
423 }
424 if XNN_UNPREDICTABLE(i2 != zero) {
425 i2 = (const float*) ((uintptr_t) i2 + input_offset);
426 }
427 if XNN_UNPREDICTABLE(i3 != zero) {
428 i3 = (const float*) ((uintptr_t) i3 + input_offset);
429 }
430 if XNN_UNPREDICTABLE(i4 != zero) {
431 i4 = (const float*) ((uintptr_t) i4 + input_offset);
432 }
433 if XNN_UNPREDICTABLE(i5 != zero) {
434 i5 = (const float*) ((uintptr_t) i5 + input_offset);
435 }
436 if XNN_UNPREDICTABLE(i6 != zero) {
437 i6 = (const float*) ((uintptr_t) i6 + input_offset);
438 }
439 if XNN_UNPREDICTABLE(i7 != zero) {
440 i7 = (const float*) ((uintptr_t) i7 + input_offset);
441 }
442 if XNN_UNPREDICTABLE(i8 != zero) {
443 i8 = (const float*) ((uintptr_t) i8 + input_offset);
444 }
445
446 size_t c = channels;
447 while (c >= 4) {
448 const __m128 vi0 = _mm_loadu_ps(i0);
449 i0 += 4;
450 const __m128 vi1 = _mm_loadu_ps(i1);
451 i1 += 4;
452 const __m128 vi2 = _mm_loadu_ps(i2);
453 i2 += 4;
454 const __m128 vi3 = _mm_loadu_ps(i3);
455 i3 += 4;
456 const __m128 vi4 = _mm_loadu_ps(i4);
457 i4 += 4;
458 const __m128 vi5 = _mm_loadu_ps(i5);
459 i5 += 4;
460 const __m128 vi6 = _mm_loadu_ps(i6);
461 i6 += 4;
462 const __m128 vi7 = _mm_loadu_ps(i7);
463 i7 += 4;
464 const __m128 vi8 = _mm_loadu_ps(i8);
465 i8 += 4;
466
467 const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
468 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
469 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
470 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
471
472 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
473 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
474 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
475
476 __m128 vout = _mm_mul_ps(vsum, vscale);
477 vout = _mm_max_ps(vout, vmin);
478 vout = _mm_min_ps(vout, vmax);
479
480 _mm_storeu_ps(output, vout); output += 4;
481
482 c -= 4;
483 }
484 if (c != 0) {
485 const __m128 vi0 = _mm_loadu_ps(i0);
486 const __m128 vi1 = _mm_loadu_ps(i1);
487 const __m128 vi2 = _mm_loadu_ps(i2);
488 const __m128 vi3 = _mm_loadu_ps(i3);
489 const __m128 vi4 = _mm_loadu_ps(i4);
490 const __m128 vi5 = _mm_loadu_ps(i5);
491 const __m128 vi6 = _mm_loadu_ps(i6);
492 const __m128 vi7 = _mm_loadu_ps(i7);
493 const __m128 vi8 = _mm_loadu_ps(i8);
494
495 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
496 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
497 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
498 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
499 const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
500 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
501 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
502 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
503
504 __m128 vout = _mm_mul_ps(vsum, vscale);
505 vout = _mm_max_ps(vout, vmin);
506 vout = _mm_min_ps(vout, vmax);
507
508 if (c & 2) {
509 _mm_storel_pi((__m64*) output, vout);
510 vout = _mm_movehl_ps(vout, vout);
511 output += 2;
512 }
513 if (c & 1) {
514 _mm_store_ss(output, vout);
515 output += 1;
516 }
517 }
518 output = (float*) ((uintptr_t) output + output_increment);
519 } while (--output_pixels != 0);
520 }
521
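// 3x3 stride-2 convolution with 1 pixel of padding, 3 input channels (HWC layout)
// and CHW output. Each iteration of the inner width loop produces a 2x2 block of
// output pixels (2 rows x 2 columns) for a group of 4 output channels, matching
// the 3x3s2p1c3x4__sse_2x2 name.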
522 void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
523 size_t input_height,
524 size_t input_width,
525 size_t output_y_start,
526 size_t output_y_end,
527 const float* input,
528 const float* zero,
529 const float* weights,
530 float* output,
531 size_t input_padding_top,
532 size_t output_channels,
533 size_t output_height_stride,
534 size_t output_channel_stride,
535 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
536 {
537 assert(input_width != 0);
538 assert(output_y_end > output_y_start);
539 assert(input_padding_top <= 1);
540 assert(output_channels != 0);
541
542 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
543 const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
544 const size_t output_width = (input_width + 1) / 2;
545 const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
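// The main width loop consumes 4 input pixels (12 floats) per iteration, so the input
// pointers advance by input_width rounded down to a multiple of 4 per row pass;
// input_width_increment rewinds them to the row start so the same rows can be reused
// for the next group of 4 output channels.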
546
547 // Adjust the starting rows for the top padding; when output_y_start falls within the padding, i0 is redirected to the zero row below.
548 const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
549 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
550 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
551 const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
552 const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
553 float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
554 float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);
555
556 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
557 i0 = zero;
558 }
559
560 const __m128 vmin = _mm_load_ps(params->sse.min);
561 const __m128 vmax = _mm_load_ps(params->sse.max);
562
563 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
564 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
565 const size_t input_y4 = input_y2 + 2;
566 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
567 i2 = zero;
568 }
569 if XNN_UNPREDICTABLE(input_y4 > input_height) {
570 i3 = zero;
571 }
572 if XNN_UNPREDICTABLE(input_y4 >= input_height) {
573 i4 = zero;
574 }
575 if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
576 output1 = output0;
577 }
578
579 const float* w = weights;
580 size_t c = output_channels;
581 float* o0c0 = output0;
582 float* o1c0 = output1;
583 float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
584 float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
585 float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
586 float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
587 float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
588 float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
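// Weight layout per group of 4 output channels: 4 bias values followed by the
// 27 kernel taps (3x3 spatial x 3 input channels), each stored as 4 consecutive
// per-output-channel floats - 112 floats in total, matching the `w += 112`
// advance at the end of this loop.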
589 do {
590 if XNN_UNPREDICTABLE(c < 2) {
591 o0c1 = o0c0;
592 o1c1 = o1c0;
593 }
594 if XNN_UNPREDICTABLE(c <= 2) {
595 o0c2 = o0c1;
596 o1c2 = o1c1;
597 }
598 if XNN_UNPREDICTABLE(c < 4) {
599 o0c3 = o0c2;
600 o1c3 = o1c2;
601 }
602
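// The viMx0 vectors start as zero: their upper three lanes stand in for the
// left padding pixel consumed by the first output column.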
603 // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
604 __m128 vi0x0 = _mm_setzero_ps();
605 __m128 vi1x0 = _mm_setzero_ps();
606 __m128 vi2x0 = _mm_setzero_ps();
607 __m128 vi3x0 = _mm_setzero_ps();
608 __m128 vi4x0 = _mm_setzero_ps();
609
610 size_t iw = input_width;
611 for (; iw >= 4; iw -= 4) {
612 __m128 vo0x0 = _mm_load_ps(w);
613 __m128 vo1x0 = vo0x0;
614 __m128 vo0x1 = vo0x0;
615 __m128 vo1x1 = vo0x0;
616
617 const __m128 vk00c0 = _mm_load_ps(w + 4);
618
619 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
620 const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
621 const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
622 const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
623 const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
624 const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;
625
626 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
627 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
628 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
629 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
630
631 const __m128 vk10c0 = _mm_load_ps(w + 8);
632
633 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
634 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
635 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
636 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
637
638 const __m128 vk20c0 = _mm_load_ps(w + 12);
639
640 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
641 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
642 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
643 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
644
645 const __m128 vk00c1 = _mm_load_ps(w + 16);
646
647 // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
648 const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
649 const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
650 const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
651 const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
652 const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;
653
654 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
655 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
656 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
657 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
658
659 const __m128 vk10c1 = _mm_load_ps(w + 20);
660
661 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
662 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
663 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
664 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
665
666 const __m128 vk20c1 = _mm_load_ps(w + 24);
667
668 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
669 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
670 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
671 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
672
673 const __m128 vk00c2 = _mm_load_ps(w + 28);
674
675 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
676 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
677 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
678 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
679
680 const __m128 vk10c2 = _mm_load_ps(w + 32);
681
682 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
683 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
684 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
685 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
686
687 const __m128 vk20c2 = _mm_load_ps(w + 36);
688
689 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
690 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
691 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
692 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
693
694 const __m128 vk01c0 = _mm_load_ps(w + 40);
695
696 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
697 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
698 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
699 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
700
701 const __m128 vk11c0 = _mm_load_ps(w + 44);
702
703 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
704 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
705 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
706 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
707
708 const __m128 vk21c0 = _mm_load_ps(w + 48);
709
710 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
711 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
712 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
713 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
714
715 const __m128 vk01c1 = _mm_load_ps(w + 52);
716
717 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
718 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
719 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
720 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
721
722 const __m128 vk11c1 = _mm_load_ps(w + 56);
723
724 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
725 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
726 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
727 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
728
729 const __m128 vk21c1 = _mm_load_ps(w + 60);
730
731 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
732 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
733 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
734 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
735
736 const __m128 vk01c2 = _mm_load_ps(w + 64);
737
738 // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
739 const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
740 const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
741 const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
742 const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
743 const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;
744
745 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
746 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
747 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
748 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
749
750 const __m128 vk11c2 = _mm_load_ps(w + 68);
751
752 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
753 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
754 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
755 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
756
757 const __m128 vk21c2 = _mm_load_ps(w + 72);
758
759 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
760 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
761 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
762 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
763
764 const __m128 vk02c0 = _mm_load_ps(w + 76);
765
766 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
767 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
768 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
769 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
770
771 const __m128 vk12c0 = _mm_load_ps(w + 80);
772
773 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
774 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
775 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
776 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));
777
778 const __m128 vk22c0 = _mm_load_ps(w + 84);
779
780 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
781 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
782 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
783 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));
784
785 const __m128 vk02c1 = _mm_load_ps(w + 88);
786
787 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
788 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
789 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
790 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
791
792 const __m128 vk12c1 = _mm_load_ps(w + 92);
793
794 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
795 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
796 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
797 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));
798
799 const __m128 vk22c1 = _mm_load_ps(w + 96);
800
801 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
802 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
803 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
804 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));
805
806 const __m128 vk02c2 = _mm_load_ps(w + 100);
807
808 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
809 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
810 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
811 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
812
813 const __m128 vk12c2 = _mm_load_ps(w + 104);
814
815 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
816 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
817 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
818 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));
819
820 const __m128 vk22c2 = _mm_load_ps(w + 108);
821
822 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
823 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
824 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
825 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));
826
827 vi0x0 = vi0x3;
828 vi1x0 = vi1x3;
829 vi2x0 = vi2x3;
830 vi3x0 = vi3x3;
831 vi4x0 = vi4x3;
832
833 vo0x0 = _mm_max_ps(vo0x0, vmin);
834 vo1x0 = _mm_max_ps(vo1x0, vmin);
835 vo0x1 = _mm_max_ps(vo0x1, vmin);
836 vo1x1 = _mm_max_ps(vo1x1, vmin);
837
838 vo0x0 = _mm_min_ps(vo0x0, vmax);
839 vo1x0 = _mm_min_ps(vo1x0, vmax);
840 vo0x1 = _mm_min_ps(vo0x1, vmax);
841 vo1x1 = _mm_min_ps(vo1x1, vmax);
842
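// vo{0,1}x{0,1} each hold 4 output channels for one output pixel; the unpack pairs
// below interleave the two columns so that each channel's two adjacent pixels can be
// written to its CHW plane with a single 64-bit store.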
843 const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
844 const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
845 const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
846 const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
847
848 // Always 2+ output width elements remaining
849 _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
850 _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
851 _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
852 _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
853
854 _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
855 _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
856 _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
857 _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
858 }
859 assert(iw < 4);
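// Remainder path: 1-3 input pixels are left, yielding 2 output columns when iw == 3
// and a single column otherwise; work for the missing positions is skipped via the
// iw checks below.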
860 if XNN_UNLIKELY(iw != 0) {
861 __m128 vo0x0 = _mm_load_ps(w);
862 __m128 vo1x0 = vo0x0;
863 __m128 vo0x1 = vo0x0;
864 __m128 vo1x1 = vo0x0;
865
866 const __m128 vk00c0 = _mm_load_ps(w + 4);
867
868 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
869 __m128 vi0x1 = _mm_loadu_ps(i0);
870 __m128 vi1x1 = _mm_loadu_ps(i1);
871 __m128 vi2x1 = _mm_loadu_ps(i2);
872 __m128 vi3x1 = _mm_loadu_ps(i3);
873 __m128 vi4x1 = _mm_loadu_ps(i4);
874
875 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
876 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
877 if (iw > 2) {
878 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
879 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
880 }
881
882 const __m128 vk10c0 = _mm_load_ps(w + 8);
883
884 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
885 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
886 if (iw > 2) {
887 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
888 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
889 }
890
891 const __m128 vk20c0 = _mm_load_ps(w + 12);
892
893 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
894 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
895 if (iw > 2) {
896 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
897 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
898 }
899
900 const __m128 vk00c1 = _mm_load_ps(w + 16);
901
902 __m128 vi0x2 = _mm_setzero_ps();
903 __m128 vi1x2 = _mm_setzero_ps();
904 __m128 vi2x2 = _mm_setzero_ps();
905 __m128 vi3x2 = _mm_setzero_ps();
906 __m128 vi4x2 = _mm_setzero_ps();
907 if (iw >= 2) {
908 // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
909 vi0x2 = _mm_loadu_ps(i0 + 4);
910 vi1x2 = _mm_loadu_ps(i1 + 4);
911 vi2x2 = _mm_loadu_ps(i2 + 4);
912 vi3x2 = _mm_loadu_ps(i3 + 4);
913 vi4x2 = _mm_loadu_ps(i4 + 4);
914 }
915
916 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
917 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
918 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
919 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
920
921 const __m128 vk10c1 = _mm_load_ps(w + 20);
922
923 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
924 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
925 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
926 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
927
928 const __m128 vk20c1 = _mm_load_ps(w + 24);
929
930 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
931 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
932 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
933 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
934
935 const __m128 vk00c2 = _mm_load_ps(w + 28);
936
937 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
938 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
939 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
940 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
941
942 const __m128 vk10c2 = _mm_load_ps(w + 32);
943
944 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
945 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
946 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
947 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
948
949 const __m128 vk20c2 = _mm_load_ps(w + 36);
950
951 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
952 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
953 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
954 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
955
956 const __m128 vk01c0 = _mm_load_ps(w + 40);
957
958 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
959 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
960 if (iw > 2) {
961 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
962 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
963 }
964
965 const __m128 vk11c0 = _mm_load_ps(w + 44);
966
967 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
968 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
969 if (iw > 2) {
970 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
971 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
972 }
973
974 const __m128 vk21c0 = _mm_load_ps(w + 48);
975
976 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
977 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
978 if (iw > 2) {
979 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
980 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
981 }
982
983 const __m128 vk01c1 = _mm_load_ps(w + 52);
984
985 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
986 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
987 if (iw > 2) {
988 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
989 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
990 }
991
992 const __m128 vk11c1 = _mm_load_ps(w + 56);
993
994 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
995 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
996 if (iw > 2) {
997 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
998 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
999 }
1000
1001 const __m128 vk21c1 = _mm_load_ps(w + 60);
1002
1003 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
1004 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
1005 if (iw > 2) {
1006 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
1007 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
1008 }
1009
1010 const __m128 vk01c2 = _mm_load_ps(w + 64);
1011
1012 __m128 vi0x3 = _mm_setzero_ps();
1013 __m128 vi1x3 = _mm_setzero_ps();
1014 __m128 vi2x3 = _mm_setzero_ps();
1015 __m128 vi3x3 = _mm_setzero_ps();
1016 __m128 vi4x3 = _mm_setzero_ps();
1017 if (iw > 2) {
1018 // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
1019 vi0x3 = _mm_load_ss(i0 + 8);
1020 vi1x3 = _mm_load_ss(i1 + 8);
1021 vi2x3 = _mm_load_ss(i2 + 8);
1022 vi3x3 = _mm_load_ss(i3 + 8);
1023 vi4x3 = _mm_load_ss(i4 + 8);
1024 }
1025
1026 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
1027 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1028 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
1029 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1030
1031 const __m128 vk11c2 = _mm_load_ps(w + 68);
1032
1033 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
1034 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
1035 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
1036 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
1037
1038 const __m128 vk21c2 = _mm_load_ps(w + 72);
1039
1040 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1041 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
1042 vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1043 vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
1044
1045 if (iw >= 2) {
1046 const __m128 vk02c0 = _mm_load_ps(w + 76);
1047
1048 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
1049 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1050
1051 const __m128 vk12c0 = _mm_load_ps(w + 80);
1052
1053 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
1054 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
1055
1056 const __m128 vk22c0 = _mm_load_ps(w + 84);
1057
1058 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1059 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
1060
1061 const __m128 vk02c1 = _mm_load_ps(w + 88);
1062
1063 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
1064 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1065
1066 const __m128 vk12c1 = _mm_load_ps(w + 92);
1067
1068 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
1069 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
1070
1071 const __m128 vk22c1 = _mm_load_ps(w + 96);
1072
1073 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1074 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
1075
1076 const __m128 vk02c2 = _mm_load_ps(w + 100);
1077
1078 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
1079 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1080
1081 const __m128 vk12c2 = _mm_load_ps(w + 104);
1082
1083 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
1084 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
1085
1086 const __m128 vk22c2 = _mm_load_ps(w + 108);
1087
1088 vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1089 vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
1090 }
1091
1092 vo0x0 = _mm_max_ps(vo0x0, vmin);
1093 vo1x0 = _mm_max_ps(vo1x0, vmin);
1094 vo0x1 = _mm_max_ps(vo0x1, vmin);
1095 vo1x1 = _mm_max_ps(vo1x1, vmin);
1096
1097 vo0x0 = _mm_min_ps(vo0x0, vmax);
1098 vo1x0 = _mm_min_ps(vo1x0, vmax);
1099 vo0x1 = _mm_min_ps(vo0x1, vmax);
1100 vo1x1 = _mm_min_ps(vo1x1, vmax);
1101
1102 if (iw == 3) {
1103 // Exactly 2 output width elements remaining
1104 const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
1105 const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
1106 const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
1107 const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
1108
1109 _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
1110 _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
1111 _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
1112 _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
1113
1114 _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
1115 _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
1116 _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
1117 _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
1118 } else {
1119 // Exactly 1 output width element remaining
1120
1121 _mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
1122 _mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
1123 _mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
1124 _mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;
1125
1126 _mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
1127 _mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
1128 _mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
1129 _mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
1130 }
1131 }
1132 // Move output pointers back to the position of the first pixel in a row,
1133 // and forward to the next block of output channels.
1134 o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
1135 o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
1136 o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
1137 o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
1138 o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
1139 o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
1140 o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
1141 o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
1142 // Revert input pointers to the position of the first pixel in a row
1143 i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
1144 i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
1145 i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
1146 i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
1147 i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
1148 // Move to the block of weights for the next 4 output channels
1149 w += 112;
1150 c = doz(c, 4);
1151 } while (c != 0);
1152 // Move output pointers forward to the next two rows
1153 output0 = (float*) ((uintptr_t) output1 + output_height_stride);
1154 output1 = (float*) ((uintptr_t) output0 + output_height_stride);
1155 // Move input pointers forward to the next four rows
1156 i0 = i4;
1157 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1158 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1159 i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
1160 i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
1161 }
1162 }
1163
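// Depthwise convolution, unrolled over 25 kernel taps (e.g. a 5x5 window) and
// 8 channels per iteration ("up8x25"). Each output pixel receives its own set of
// 25 input row pointers via `input`; the per-channel weights start with 8 bias
// values followed by 8 weights for each of the 25 taps.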
1164 void xnn_f32_dwconv_minmax_ukernel_up8x25__sse(
1165 size_t channels,
1166 size_t output_width,
1167 const float** input,
1168 const float* weights,
1169 float* output,
1170 size_t input_stride,
1171 size_t output_increment,
1172 size_t input_offset,
1173 const float* zero,
1174 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1175 {
1176 assert(channels != 0);
1177 assert(output_width != 0);
1178
1179 const __m128 vmax = _mm_load_ps(params->sse.max);
1180 const __m128 vmin = _mm_load_ps(params->sse.min);
1181 do {
1182 const float* i0 = input[0];
1183 assert(i0 != NULL);
1184 if XNN_UNPREDICTABLE(i0 != zero) {
1185 i0 = (const float*) ((uintptr_t) i0 + input_offset);
1186 }
1187 const float* i1 = input[1];
1188 assert(i1 != NULL);
1189 if XNN_UNPREDICTABLE(i1 != zero) {
1190 i1 = (const float*) ((uintptr_t) i1 + input_offset);
1191 }
1192 const float* i2 = input[2];
1193 assert(i2 != NULL);
1194 if XNN_UNPREDICTABLE(i2 != zero) {
1195 i2 = (const float*) ((uintptr_t) i2 + input_offset);
1196 }
1197 const float* i3 = input[3];
1198 assert(i3 != NULL);
1199 if XNN_UNPREDICTABLE(i3 != zero) {
1200 i3 = (const float*) ((uintptr_t) i3 + input_offset);
1201 }
1202 const float* i4 = input[4];
1203 assert(i4 != NULL);
1204 if XNN_UNPREDICTABLE(i4 != zero) {
1205 i4 = (const float*) ((uintptr_t) i4 + input_offset);
1206 }
1207 const float* i5 = input[5];
1208 assert(i5 != NULL);
1209 if XNN_UNPREDICTABLE(i5 != zero) {
1210 i5 = (const float*) ((uintptr_t) i5 + input_offset);
1211 }
1212 const float* i6 = input[6];
1213 assert(i6 != NULL);
1214 if XNN_UNPREDICTABLE(i6 != zero) {
1215 i6 = (const float*) ((uintptr_t) i6 + input_offset);
1216 }
1217 const float* i7 = input[7];
1218 assert(i7 != NULL);
1219 if XNN_UNPREDICTABLE(i7 != zero) {
1220 i7 = (const float*) ((uintptr_t) i7 + input_offset);
1221 }
1222 const float* i8 = input[8];
1223 assert(i8 != NULL);
1224 if XNN_UNPREDICTABLE(i8 != zero) {
1225 i8 = (const float*) ((uintptr_t) i8 + input_offset);
1226 }
1227 const float* i9 = input[9];
1228 assert(i9 != NULL);
1229 if XNN_UNPREDICTABLE(i9 != zero) {
1230 i9 = (const float*) ((uintptr_t) i9 + input_offset);
1231 }
1232 const float* i10 = input[10];
1233 assert(i10 != NULL);
1234 if XNN_UNPREDICTABLE(i10 != zero) {
1235 i10 = (const float*) ((uintptr_t) i10 + input_offset);
1236 }
1237 const float* i11 = input[11];
1238 assert(i11 != NULL);
1239 if XNN_UNPREDICTABLE(i11 != zero) {
1240 i11 = (const float*) ((uintptr_t) i11 + input_offset);
1241 }
1242 const float* i12 = input[12];
1243 assert(i12 != NULL);
1244 if XNN_UNPREDICTABLE(i12 != zero) {
1245 i12 = (const float*) ((uintptr_t) i12 + input_offset);
1246 }
1247 const float* i13 = input[13];
1248 assert(i13 != NULL);
1249 if XNN_UNPREDICTABLE(i13 != zero) {
1250 i13 = (const float*) ((uintptr_t) i13 + input_offset);
1251 }
1252 const float* i14 = input[14];
1253 assert(i14 != NULL);
1254 if XNN_UNPREDICTABLE(i14 != zero) {
1255 i14 = (const float*) ((uintptr_t) i14 + input_offset);
1256 }
1257 const float* i15 = input[15];
1258 assert(i15 != NULL);
1259 if XNN_UNPREDICTABLE(i15 != zero) {
1260 i15 = (const float*) ((uintptr_t) i15 + input_offset);
1261 }
1262 const float* i16 = input[16];
1263 assert(i16 != NULL);
1264 if XNN_UNPREDICTABLE(i16 != zero) {
1265 i16 = (const float*) ((uintptr_t) i16 + input_offset);
1266 }
1267 const float* i17 = input[17];
1268 assert(i17 != NULL);
1269 if XNN_UNPREDICTABLE(i17 != zero) {
1270 i17 = (const float*) ((uintptr_t) i17 + input_offset);
1271 }
1272 const float* i18 = input[18];
1273 assert(i18 != NULL);
1274 if XNN_UNPREDICTABLE(i18 != zero) {
1275 i18 = (const float*) ((uintptr_t) i18 + input_offset);
1276 }
1277 const float* i19 = input[19];
1278 assert(i19 != NULL);
1279 if XNN_UNPREDICTABLE(i19 != zero) {
1280 i19 = (const float*) ((uintptr_t) i19 + input_offset);
1281 }
1282 const float* i20 = input[20];
1283 assert(i20 != NULL);
1284 if XNN_UNPREDICTABLE(i20 != zero) {
1285 i20 = (const float*) ((uintptr_t) i20 + input_offset);
1286 }
1287 const float* i21 = input[21];
1288 assert(i21 != NULL);
1289 if XNN_UNPREDICTABLE(i21 != zero) {
1290 i21 = (const float*) ((uintptr_t) i21 + input_offset);
1291 }
1292 const float* i22 = input[22];
1293 assert(i22 != NULL);
1294 if XNN_UNPREDICTABLE(i22 != zero) {
1295 i22 = (const float*) ((uintptr_t) i22 + input_offset);
1296 }
1297 const float* i23 = input[23];
1298 assert(i23 != NULL);
1299 if XNN_UNPREDICTABLE(i23 != zero) {
1300 i23 = (const float*) ((uintptr_t) i23 + input_offset);
1301 }
1302 const float* i24 = input[24];
1303 assert(i24 != NULL);
1304 if XNN_UNPREDICTABLE(i24 != zero) {
1305 i24 = (const float*) ((uintptr_t) i24 + input_offset);
1306 }
1307 input = (const float**) ((uintptr_t) input + input_stride);
1308
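// Main channel loop: 8 channels at a time, starting from the bias and accumulating
// the 25 tap products before the [min, max] clamp (vmin/vmax loaded above) is applied.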
1309 size_t c = channels;
1310 const float* w = weights;
1311 for (; c >= 8; c -= 8) {
1312 __m128 vacc0123p0 = _mm_load_ps(w);
1313 __m128 vacc4567p0 = _mm_load_ps(w + 4);
1314
1315
1316 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1317 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1318 i0 += 8;
1319
1320 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1321 const __m128 vk0x4567 = _mm_load_ps(w + 12);
1322 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1323 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1324
1325 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1326 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1327 i1 += 8;
1328
1329 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1330 const __m128 vk1x4567 = _mm_load_ps(w + 20);
1331 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1332 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1333
1334 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1335 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1336 i2 += 8;
1337
1338 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1339 const __m128 vk2x4567 = _mm_load_ps(w + 28);
1340 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1341 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1342
1343 const __m128 vi3x0123 = _mm_loadu_ps(i3);
1344 const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
1345 i3 += 8;
1346
1347 const __m128 vk3x0123 = _mm_load_ps(w + 32);
1348 const __m128 vk3x4567 = _mm_load_ps(w + 36);
1349 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1350 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
1351
1352 const __m128 vi4x0123 = _mm_loadu_ps(i4);
1353 const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
1354 i4 += 8;
1355
1356 const __m128 vk4x0123 = _mm_load_ps(w + 40);
1357 const __m128 vk4x4567 = _mm_load_ps(w + 44);
1358 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1359 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
1360
1361 const __m128 vi5x0123 = _mm_loadu_ps(i5);
1362 const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
1363 i5 += 8;
1364
1365 const __m128 vk5x0123 = _mm_load_ps(w + 48);
1366 const __m128 vk5x4567 = _mm_load_ps(w + 52);
1367 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1368 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
1369
1370 const __m128 vi6x0123 = _mm_loadu_ps(i6);
1371 const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
1372 i6 += 8;
1373
1374 const __m128 vk6x0123 = _mm_load_ps(w + 56);
1375 const __m128 vk6x4567 = _mm_load_ps(w + 60);
1376 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1377 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
1378
1379 const __m128 vi7x0123 = _mm_loadu_ps(i7);
1380 const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
1381 i7 += 8;
1382
1383 const __m128 vk7x0123 = _mm_load_ps(w + 64);
1384 const __m128 vk7x4567 = _mm_load_ps(w + 68);
1385 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1386 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
1387
1388 const __m128 vi8x0123 = _mm_loadu_ps(i8);
1389 const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
1390 i8 += 8;
1391
1392 const __m128 vk8x0123 = _mm_load_ps(w + 72);
1393 const __m128 vk8x4567 = _mm_load_ps(w + 76);
1394 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1395 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
1396
1397 const __m128 vi9x0123 = _mm_loadu_ps(i9);
1398 const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
1399 i9 += 8;
1400
1401 const __m128 vk9x0123 = _mm_load_ps(w + 80);
1402 const __m128 vk9x4567 = _mm_load_ps(w + 84);
1403 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1404 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi9x4567, vk9x4567));
1405
1406 const __m128 vi10x0123 = _mm_loadu_ps(i10);
1407 const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
1408 i10 += 8;
1409
1410 const __m128 vk10x0123 = _mm_load_ps(w + 88);
1411 const __m128 vk10x4567 = _mm_load_ps(w + 92);
1412 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1413 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
1414
1415 const __m128 vi11x0123 = _mm_loadu_ps(i11);
1416 const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
1417 i11 += 8;
1418
1419 const __m128 vk11x0123 = _mm_load_ps(w + 96);
1420 const __m128 vk11x4567 = _mm_load_ps(w + 100);
1421 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1422 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi11x4567, vk11x4567));
1423
1424 const __m128 vi12x0123 = _mm_loadu_ps(i12);
1425 const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
1426 i12 += 8;
1427
1428 const __m128 vk12x0123 = _mm_load_ps(w + 104);
1429 const __m128 vk12x4567 = _mm_load_ps(w + 108);
1430 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1431 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
1432
1433 const __m128 vi13x0123 = _mm_loadu_ps(i13);
1434 const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
1435 i13 += 8;
1436
1437 const __m128 vk13x0123 = _mm_load_ps(w + 112);
1438 const __m128 vk13x4567 = _mm_load_ps(w + 116);
1439 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1440 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi13x4567, vk13x4567));
1441
1442 const __m128 vi14x0123 = _mm_loadu_ps(i14);
1443 const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
1444 i14 += 8;
1445
1446 const __m128 vk14x0123 = _mm_load_ps(w + 120);
1447 const __m128 vk14x4567 = _mm_load_ps(w + 124);
1448 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1449 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
1450
1451 const __m128 vi15x0123 = _mm_loadu_ps(i15);
1452 const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
1453 i15 += 8;
1454
1455 const __m128 vk15x0123 = _mm_load_ps(w + 128);
1456 const __m128 vk15x4567 = _mm_load_ps(w + 132);
1457 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1458 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi15x4567, vk15x4567));
1459
1460 const __m128 vi16x0123 = _mm_loadu_ps(i16);
1461 const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
1462 i16 += 8;
1463
1464 const __m128 vk16x0123 = _mm_load_ps(w + 136);
1465 const __m128 vk16x4567 = _mm_load_ps(w + 140);
1466 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1467 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
1468
1469 const __m128 vi17x0123 = _mm_loadu_ps(i17);
1470 const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
1471 i17 += 8;
1472
1473 const __m128 vk17x0123 = _mm_load_ps(w + 144);
1474 const __m128 vk17x4567 = _mm_load_ps(w + 148);
1475 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1476 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi17x4567, vk17x4567));
1477
1478 const __m128 vi18x0123 = _mm_loadu_ps(i18);
1479 const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
1480 i18 += 8;
1481
1482 const __m128 vk18x0123 = _mm_load_ps(w + 152);
1483 const __m128 vk18x4567 = _mm_load_ps(w + 156);
1484 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1485 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
1486
1487 const __m128 vi19x0123 = _mm_loadu_ps(i19);
1488 const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
1489 i19 += 8;
1490
1491 const __m128 vk19x0123 = _mm_load_ps(w + 160);
1492 const __m128 vk19x4567 = _mm_load_ps(w + 164);
1493 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1494 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi19x4567, vk19x4567));
1495
1496 const __m128 vi20x0123 = _mm_loadu_ps(i20);
1497 const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
1498 i20 += 8;
1499
1500 const __m128 vk20x0123 = _mm_load_ps(w + 168);
1501 const __m128 vk20x4567 = _mm_load_ps(w + 172);
1502 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1503 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
1504
1505 const __m128 vi21x0123 = _mm_loadu_ps(i21);
1506 const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
1507 i21 += 8;
1508
1509 const __m128 vk21x0123 = _mm_load_ps(w + 176);
1510 const __m128 vk21x4567 = _mm_load_ps(w + 180);
1511 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1512 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi21x4567, vk21x4567));
1513
1514 const __m128 vi22x0123 = _mm_loadu_ps(i22);
1515 const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
1516 i22 += 8;
1517
1518 const __m128 vk22x0123 = _mm_load_ps(w + 184);
1519 const __m128 vk22x4567 = _mm_load_ps(w + 188);
1520 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1521 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
1522
1523 const __m128 vi23x0123 = _mm_loadu_ps(i23);
1524 const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
1525 i23 += 8;
1526
1527 const __m128 vk23x0123 = _mm_load_ps(w + 192);
1528 const __m128 vk23x4567 = _mm_load_ps(w + 196);
1529 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1530 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi23x4567, vk23x4567));
1531
1532 const __m128 vi24x0123 = _mm_loadu_ps(i24);
1533 const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
1534 i24 += 8;
1535
1536 const __m128 vk24x0123 = _mm_load_ps(w + 200);
1537 const __m128 vk24x4567 = _mm_load_ps(w + 204);
1538 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1539 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
1540
1541 w += 208;
1542
1543
1544 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1545 __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1546 vacc0123 = _mm_min_ps(vacc0123, vmax);
1547 vacc4567 = _mm_min_ps(vacc4567, vmax);
1548
1549 _mm_storeu_ps(output, vacc0123);
1550 _mm_storeu_ps(output + 4, vacc4567);
1551 output += 8;
1552 }
1553 for (; c >= 4; c -= 4) {
1554 __m128 vacc0123p0 = _mm_load_ps(w);
1555
1556 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1557 i0 += 4;
1558
1559 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1560 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1561
1562 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1563 i1 += 4;
1564
1565 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1566 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1567
1568 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1569 i2 += 4;
1570
1571 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1572 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1573
1574 const __m128 vi3x0123 = _mm_loadu_ps(i3);
1575 i3 += 4;
1576
1577 const __m128 vk3x0123 = _mm_load_ps(w + 32);
1578 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1579
1580 const __m128 vi4x0123 = _mm_loadu_ps(i4);
1581 i4 += 4;
1582
1583 const __m128 vk4x0123 = _mm_load_ps(w + 40);
1584 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1585
1586 const __m128 vi5x0123 = _mm_loadu_ps(i5);
1587 i5 += 4;
1588
1589 const __m128 vk5x0123 = _mm_load_ps(w + 48);
1590 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1591
1592 const __m128 vi6x0123 = _mm_loadu_ps(i6);
1593 i6 += 4;
1594
1595 const __m128 vk6x0123 = _mm_load_ps(w + 56);
1596 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1597
1598 const __m128 vi7x0123 = _mm_loadu_ps(i7);
1599 i7 += 4;
1600
1601 const __m128 vk7x0123 = _mm_load_ps(w + 64);
1602 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1603
1604 const __m128 vi8x0123 = _mm_loadu_ps(i8);
1605 i8 += 4;
1606
1607 const __m128 vk8x0123 = _mm_load_ps(w + 72);
1608 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1609
1610 const __m128 vi9x0123 = _mm_loadu_ps(i9);
1611 i9 += 4;
1612
1613 const __m128 vk9x0123 = _mm_load_ps(w + 80);
1614 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1615
1616 const __m128 vi10x0123 = _mm_loadu_ps(i10);
1617 i10 += 4;
1618
1619 const __m128 vk10x0123 = _mm_load_ps(w + 88);
1620 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1621
1622 const __m128 vi11x0123 = _mm_loadu_ps(i11);
1623 i11 += 4;
1624
1625 const __m128 vk11x0123 = _mm_load_ps(w + 96);
1626 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1627
1628 const __m128 vi12x0123 = _mm_loadu_ps(i12);
1629 i12 += 4;
1630
1631 const __m128 vk12x0123 = _mm_load_ps(w + 104);
1632 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1633
1634 const __m128 vi13x0123 = _mm_loadu_ps(i13);
1635 i13 += 4;
1636
1637 const __m128 vk13x0123 = _mm_load_ps(w + 112);
1638 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1639
1640 const __m128 vi14x0123 = _mm_loadu_ps(i14);
1641 i14 += 4;
1642
1643 const __m128 vk14x0123 = _mm_load_ps(w + 120);
1644 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1645
1646 const __m128 vi15x0123 = _mm_loadu_ps(i15);
1647 i15 += 4;
1648
1649 const __m128 vk15x0123 = _mm_load_ps(w + 128);
1650 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1651
1652 const __m128 vi16x0123 = _mm_loadu_ps(i16);
1653 i16 += 4;
1654
1655 const __m128 vk16x0123 = _mm_load_ps(w + 136);
1656 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1657
1658 const __m128 vi17x0123 = _mm_loadu_ps(i17);
1659 i17 += 4;
1660
1661 const __m128 vk17x0123 = _mm_load_ps(w + 144);
1662 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1663
1664 const __m128 vi18x0123 = _mm_loadu_ps(i18);
1665 i18 += 4;
1666
1667 const __m128 vk18x0123 = _mm_load_ps(w + 152);
1668 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1669
1670 const __m128 vi19x0123 = _mm_loadu_ps(i19);
1671 i19 += 4;
1672
1673 const __m128 vk19x0123 = _mm_load_ps(w + 160);
1674 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1675
1676 const __m128 vi20x0123 = _mm_loadu_ps(i20);
1677 i20 += 4;
1678
1679 const __m128 vk20x0123 = _mm_load_ps(w + 168);
1680 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1681
1682 const __m128 vi21x0123 = _mm_loadu_ps(i21);
1683 i21 += 4;
1684
1685 const __m128 vk21x0123 = _mm_load_ps(w + 176);
1686 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1687
1688 const __m128 vi22x0123 = _mm_loadu_ps(i22);
1689 i22 += 4;
1690
1691 const __m128 vk22x0123 = _mm_load_ps(w + 184);
1692 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1693
1694 const __m128 vi23x0123 = _mm_loadu_ps(i23);
1695 i23 += 4;
1696
1697 const __m128 vk23x0123 = _mm_load_ps(w + 192);
1698 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1699
1700 const __m128 vi24x0123 = _mm_loadu_ps(i24);
1701 i24 += 4;
1702
1703 const __m128 vk24x0123 = _mm_load_ps(w + 200);
1704 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1705
1706 w += 4;
1707
1708
1709 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1710 vacc0123 = _mm_min_ps(vacc0123, vmax);
1711
1712 _mm_storeu_ps(output, vacc0123);
1713 output += 4;
1714 }
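    /* Explanatory note: the tail below handles the last 1..3 channels. It
     * still issues full 4-lane loads (the XNN_OOB_READS annotation on these
     * kernels allows reading past the last valid channel) and then narrows
     * the store to 2 and/or 1 lanes based on the low bits of c. */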
1715 if XNN_UNLIKELY(c != 0) {
1716 __m128 vacc0123p0 = _mm_load_ps(w);
1717
1718 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1719 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1720 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1721
1722 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1723 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1724 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1725
1726 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1727 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1728 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1729
1730 const __m128 vi3x0123 = _mm_loadu_ps(i3);
1731 const __m128 vk3x0123 = _mm_load_ps(w + 32);
1732 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1733
1734 const __m128 vi4x0123 = _mm_loadu_ps(i4);
1735 const __m128 vk4x0123 = _mm_load_ps(w + 40);
1736 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1737
1738 const __m128 vi5x0123 = _mm_loadu_ps(i5);
1739 const __m128 vk5x0123 = _mm_load_ps(w + 48);
1740 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1741
1742 const __m128 vi6x0123 = _mm_loadu_ps(i6);
1743 const __m128 vk6x0123 = _mm_load_ps(w + 56);
1744 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1745
1746 const __m128 vi7x0123 = _mm_loadu_ps(i7);
1747 const __m128 vk7x0123 = _mm_load_ps(w + 64);
1748 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1749
1750 const __m128 vi8x0123 = _mm_loadu_ps(i8);
1751 const __m128 vk8x0123 = _mm_load_ps(w + 72);
1752 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1753
1754 const __m128 vi9x0123 = _mm_loadu_ps(i9);
1755 const __m128 vk9x0123 = _mm_load_ps(w + 80);
1756 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1757
1758 const __m128 vi10x0123 = _mm_loadu_ps(i10);
1759 const __m128 vk10x0123 = _mm_load_ps(w + 88);
1760 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1761
1762 const __m128 vi11x0123 = _mm_loadu_ps(i11);
1763 const __m128 vk11x0123 = _mm_load_ps(w + 96);
1764 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1765
1766 const __m128 vi12x0123 = _mm_loadu_ps(i12);
1767 const __m128 vk12x0123 = _mm_load_ps(w + 104);
1768 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1769
1770 const __m128 vi13x0123 = _mm_loadu_ps(i13);
1771 const __m128 vk13x0123 = _mm_load_ps(w + 112);
1772 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1773
1774 const __m128 vi14x0123 = _mm_loadu_ps(i14);
1775 const __m128 vk14x0123 = _mm_load_ps(w + 120);
1776 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1777
1778 const __m128 vi15x0123 = _mm_loadu_ps(i15);
1779 const __m128 vk15x0123 = _mm_load_ps(w + 128);
1780 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1781
1782 const __m128 vi16x0123 = _mm_loadu_ps(i16);
1783 const __m128 vk16x0123 = _mm_load_ps(w + 136);
1784 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1785
1786 const __m128 vi17x0123 = _mm_loadu_ps(i17);
1787 const __m128 vk17x0123 = _mm_load_ps(w + 144);
1788 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1789
1790 const __m128 vi18x0123 = _mm_loadu_ps(i18);
1791 const __m128 vk18x0123 = _mm_load_ps(w + 152);
1792 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1793
1794 const __m128 vi19x0123 = _mm_loadu_ps(i19);
1795 const __m128 vk19x0123 = _mm_load_ps(w + 160);
1796 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1797
1798 const __m128 vi20x0123 = _mm_loadu_ps(i20);
1799 const __m128 vk20x0123 = _mm_load_ps(w + 168);
1800 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1801
1802 const __m128 vi21x0123 = _mm_loadu_ps(i21);
1803 const __m128 vk21x0123 = _mm_load_ps(w + 176);
1804 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1805
1806 const __m128 vi22x0123 = _mm_loadu_ps(i22);
1807 const __m128 vk22x0123 = _mm_load_ps(w + 184);
1808 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1809
1810 const __m128 vi23x0123 = _mm_loadu_ps(i23);
1811 const __m128 vk23x0123 = _mm_load_ps(w + 192);
1812 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1813
1814 const __m128 vi24x0123 = _mm_loadu_ps(i24);
1815 const __m128 vk24x0123 = _mm_load_ps(w + 200);
1816 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1817
1818
1819 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1820 vacc0123 = _mm_min_ps(vacc0123, vmax);
1821
1822 if (c & 2) {
1823 _mm_storel_pi((__m64*) output, vacc0123);
1824 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1825 output += 2;
1826 }
1827 if (c & 1) {
1828 _mm_store_ss(output, vacc0123);
1829 output += 1;
1830 }
1831 }
1832
1833 output = (float*) ((uintptr_t) output + output_increment);
1834 } while (--output_width != 0);
1835 }
1836
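/*
 * Explanatory sketch (not part of XNNPACK): the "up" depthwise-convolution
 * microkernels below all consume weights packed per channel tile as
 *   [ bias[0..tile-1], k0[0..tile-1], k1[0..tile-1], ..., k{K-1}[0..tile-1] ],
 * so each 8-channel group occupies (1 + kernel_size) * 8 floats. The scalar
 * reference below shows the same computation one channel at a time; the name,
 * the signature, and the omission of the zero/input_offset handling are
 * simplifications for illustration only.
 */
static inline void dwconv_up_scalar_reference_sketch(
    size_t channels,
    size_t kernel_size,
    size_t channel_tile,
    const float** indirect_input,  // kernel_size row pointers for one output pixel
    const float* packed_weights,
    float* output,
    float output_min,
    float output_max)
{
  for (size_t c = 0; c < channels; c++) {
    const size_t group = c / channel_tile;
    const size_t lane = c % channel_tile;
    const float* w = packed_weights + group * (1 + kernel_size) * channel_tile;
    float acc = w[lane];  // bias lane for this channel
    for (size_t k = 0; k < kernel_size; k++) {
      acc += indirect_input[k][c] * w[(1 + k) * channel_tile + lane];
    }
    acc = acc < output_min ? output_min : acc;  // clamp, like _mm_max_ps(..., vmin)
    acc = acc > output_max ? output_max : acc;  // clamp, like _mm_min_ps(..., vmax)
    output[c] = acc;
  }
}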
1837 void xnn_f32_dwconv_minmax_ukernel_up8x3__sse(
1838 size_t channels,
1839 size_t output_width,
1840 const float** input,
1841 const float* weights,
1842 float* output,
1843 size_t input_stride,
1844 size_t output_increment,
1845 size_t input_offset,
1846 const float* zero,
1847 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1848 {
1849 assert(channels != 0);
1850 assert(output_width != 0);
1851
1852 const __m128 vmax = _mm_load_ps(params->sse.max);
1853 const __m128 vmin = _mm_load_ps(params->sse.min);
1854 do {
1855 const float* i0 = input[0];
1856 assert(i0 != NULL);
1857 if XNN_UNPREDICTABLE(i0 != zero) {
1858 i0 = (const float*) ((uintptr_t) i0 + input_offset);
1859 }
1860 const float* i1 = input[1];
1861 assert(i1 != NULL);
1862 if XNN_UNPREDICTABLE(i1 != zero) {
1863 i1 = (const float*) ((uintptr_t) i1 + input_offset);
1864 }
1865 const float* i2 = input[2];
1866 assert(i2 != NULL);
1867 if XNN_UNPREDICTABLE(i2 != zero) {
1868 i2 = (const float*) ((uintptr_t) i2 + input_offset);
1869 }
1870 input = (const float**) ((uintptr_t) input + input_stride);
1871
1872 size_t c = channels;
1873 const float* w = weights;
1874 for (; c >= 8; c -= 8) {
1875 __m128 vacc0123p0 = _mm_load_ps(w);
1876 __m128 vacc4567p0 = _mm_load_ps(w + 4);
1877
1878
1879 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1880 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1881 i0 += 8;
1882
1883 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1884 const __m128 vk0x4567 = _mm_load_ps(w + 12);
1885 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1886 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1887
1888 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1889 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1890 i1 += 8;
1891
1892 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1893 const __m128 vk1x4567 = _mm_load_ps(w + 20);
1894 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1895 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1896
1897 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1898 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1899 i2 += 8;
1900
1901 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1902 const __m128 vk2x4567 = _mm_load_ps(w + 28);
1903 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1904 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1905
1906 w += 32;
1907
1908
1909 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1910 __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1911 vacc0123 = _mm_min_ps(vacc0123, vmax);
1912 vacc4567 = _mm_min_ps(vacc4567, vmax);
1913
1914 _mm_storeu_ps(output, vacc0123);
1915 _mm_storeu_ps(output + 4, vacc4567);
1916 output += 8;
1917 }
1918 for (; c >= 4; c -= 4) {
1919 __m128 vacc0123p0 = _mm_load_ps(w);
1920
1921 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1922 i0 += 4;
1923
1924 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1925 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1926
1927 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1928 i1 += 4;
1929
1930 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1931 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1932
1933 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1934 i2 += 4;
1935
1936 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1937 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1938
1939 w += 4;
1940
1941
1942 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1943 vacc0123 = _mm_min_ps(vacc0123, vmax);
1944
1945 _mm_storeu_ps(output, vacc0123);
1946 output += 4;
1947 }
1948 if XNN_UNLIKELY(c != 0) {
1949 __m128 vacc0123p0 = _mm_load_ps(w);
1950
1951 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1952 const __m128 vk0x0123 = _mm_load_ps(w + 8);
1953 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1954
1955 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1956 const __m128 vk1x0123 = _mm_load_ps(w + 16);
1957 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1958
1959 const __m128 vi2x0123 = _mm_loadu_ps(i2);
1960 const __m128 vk2x0123 = _mm_load_ps(w + 24);
1961 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1962
1963
1964 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1965 vacc0123 = _mm_min_ps(vacc0123, vmax);
1966
1967 if (c & 2) {
1968 _mm_storel_pi((__m64*) output, vacc0123);
1969 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1970 output += 2;
1971 }
1972 if (c & 1) {
1973 _mm_store_ss(output, vacc0123);
1974 output += 1;
1975 }
1976 }
1977
1978 output = (float*) ((uintptr_t) output + output_increment);
1979 } while (--output_width != 0);
1980 }
1981
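/*
 * Explanatory note: xnn_f32_dwconv_minmax_ukernel_up8x4__sse below is the
 * 4-tap variant of the same scheme; each 8-channel weight group holds
 * (1 + 4) * 8 = 40 floats, which is why the main loop advances w by 40.
 */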
1982 void xnn_f32_dwconv_minmax_ukernel_up8x4__sse(
1983 size_t channels,
1984 size_t output_width,
1985 const float** input,
1986 const float* weights,
1987 float* output,
1988 size_t input_stride,
1989 size_t output_increment,
1990 size_t input_offset,
1991 const float* zero,
1992 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1993 {
1994 assert(channels != 0);
1995 assert(output_width != 0);
1996
1997 const __m128 vmax = _mm_load_ps(params->sse.max);
1998 const __m128 vmin = _mm_load_ps(params->sse.min);
1999 do {
2000 const float* i0 = input[0];
2001 assert(i0 != NULL);
2002 if XNN_UNPREDICTABLE(i0 != zero) {
2003 i0 = (const float*) ((uintptr_t) i0 + input_offset);
2004 }
2005 const float* i1 = input[1];
2006 assert(i1 != NULL);
2007 if XNN_UNPREDICTABLE(i1 != zero) {
2008 i1 = (const float*) ((uintptr_t) i1 + input_offset);
2009 }
2010 const float* i2 = input[2];
2011 assert(i2 != NULL);
2012 if XNN_UNPREDICTABLE(i2 != zero) {
2013 i2 = (const float*) ((uintptr_t) i2 + input_offset);
2014 }
2015 const float* i3 = input[3];
2016 assert(i3 != NULL);
2017 if XNN_UNPREDICTABLE(i3 != zero) {
2018 i3 = (const float*) ((uintptr_t) i3 + input_offset);
2019 }
2020 input = (const float**) ((uintptr_t) input + input_stride);
2021
2022 size_t c = channels;
2023 const float* w = weights;
2024 for (; c >= 8; c -= 8) {
2025 __m128 vacc0123p0 = _mm_load_ps(w);
2026 __m128 vacc4567p0 = _mm_load_ps(w + 4);
2027
2028
2029 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2030 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2031 i0 += 8;
2032
2033 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2034 const __m128 vk0x4567 = _mm_load_ps(w + 12);
2035 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2036 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2037
2038 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2039 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2040 i1 += 8;
2041
2042 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2043 const __m128 vk1x4567 = _mm_load_ps(w + 20);
2044 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2045 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2046
2047 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2048 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2049 i2 += 8;
2050
2051 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2052 const __m128 vk2x4567 = _mm_load_ps(w + 28);
2053 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2054 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2055
2056 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2057 const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2058 i3 += 8;
2059
2060 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2061 const __m128 vk3x4567 = _mm_load_ps(w + 36);
2062 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2063 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2064
2065 w += 40;
2066
2067
2068 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2069 __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2070 vacc0123 = _mm_min_ps(vacc0123, vmax);
2071 vacc4567 = _mm_min_ps(vacc4567, vmax);
2072
2073 _mm_storeu_ps(output, vacc0123);
2074 _mm_storeu_ps(output + 4, vacc4567);
2075 output += 8;
2076 }
2077 for (; c >= 4; c -= 4) {
2078 __m128 vacc0123p0 = _mm_load_ps(w);
2079
2080 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2081 i0 += 4;
2082
2083 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2084 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2085
2086 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2087 i1 += 4;
2088
2089 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2090 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2091
2092 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2093 i2 += 4;
2094
2095 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2096 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2097
2098 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2099 i3 += 4;
2100
2101 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2102 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2103
2104 w += 4;
2105
2106
2107 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2108 vacc0123 = _mm_min_ps(vacc0123, vmax);
2109
2110 _mm_storeu_ps(output, vacc0123);
2111 output += 4;
2112 }
2113 if XNN_UNLIKELY(c != 0) {
2114 __m128 vacc0123p0 = _mm_load_ps(w);
2115
2116 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2117 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2118 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2119
2120 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2121 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2122 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2123
2124 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2125 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2126 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2127
2128 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2129 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2130 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2131
2132
2133 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2134 vacc0123 = _mm_min_ps(vacc0123, vmax);
2135
2136 if (c & 2) {
2137 _mm_storel_pi((__m64*) output, vacc0123);
2138 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2139 output += 2;
2140 }
2141 if (c & 1) {
2142 _mm_store_ss(output, vacc0123);
2143 output += 1;
2144 }
2145 }
2146
2147 output = (float*) ((uintptr_t) output + output_increment);
2148 } while (--output_width != 0);
2149 }
2150
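/*
 * Explanatory note: xnn_f32_dwconv_minmax_ukernel_up8x9__sse below handles
 * 9 taps (a 3x3 depthwise kernel); each 8-channel weight group holds
 * (1 + 9) * 8 = 80 floats, matching the w += 80 in the main loop.
 */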
2151 void xnn_f32_dwconv_minmax_ukernel_up8x9__sse(
2152 size_t channels,
2153 size_t output_width,
2154 const float** input,
2155 const float* weights,
2156 float* output,
2157 size_t input_stride,
2158 size_t output_increment,
2159 size_t input_offset,
2160 const float* zero,
2161 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2162 {
2163 assert(channels != 0);
2164 assert(output_width != 0);
2165
2166 const __m128 vmax = _mm_load_ps(params->sse.max);
2167 const __m128 vmin = _mm_load_ps(params->sse.min);
2168 do {
2169 const float* i0 = input[0];
2170 assert(i0 != NULL);
2171 if XNN_UNPREDICTABLE(i0 != zero) {
2172 i0 = (const float*) ((uintptr_t) i0 + input_offset);
2173 }
2174 const float* i1 = input[1];
2175 assert(i1 != NULL);
2176 if XNN_UNPREDICTABLE(i1 != zero) {
2177 i1 = (const float*) ((uintptr_t) i1 + input_offset);
2178 }
2179 const float* i2 = input[2];
2180 assert(i2 != NULL);
2181 if XNN_UNPREDICTABLE(i2 != zero) {
2182 i2 = (const float*) ((uintptr_t) i2 + input_offset);
2183 }
2184 const float* i3 = input[3];
2185 assert(i3 != NULL);
2186 if XNN_UNPREDICTABLE(i3 != zero) {
2187 i3 = (const float*) ((uintptr_t) i3 + input_offset);
2188 }
2189 const float* i4 = input[4];
2190 assert(i4 != NULL);
2191 if XNN_UNPREDICTABLE(i4 != zero) {
2192 i4 = (const float*) ((uintptr_t) i4 + input_offset);
2193 }
2194 const float* i5 = input[5];
2195 assert(i5 != NULL);
2196 if XNN_UNPREDICTABLE(i5 != zero) {
2197 i5 = (const float*) ((uintptr_t) i5 + input_offset);
2198 }
2199 const float* i6 = input[6];
2200 assert(i6 != NULL);
2201 if XNN_UNPREDICTABLE(i6 != zero) {
2202 i6 = (const float*) ((uintptr_t) i6 + input_offset);
2203 }
2204 const float* i7 = input[7];
2205 assert(i7 != NULL);
2206 if XNN_UNPREDICTABLE(i7 != zero) {
2207 i7 = (const float*) ((uintptr_t) i7 + input_offset);
2208 }
2209 const float* i8 = input[8];
2210 assert(i8 != NULL);
2211 if XNN_UNPREDICTABLE(i8 != zero) {
2212 i8 = (const float*) ((uintptr_t) i8 + input_offset);
2213 }
2214 input = (const float**) ((uintptr_t) input + input_stride);
2215
2216 size_t c = channels;
2217 const float* w = weights;
2218 for (; c >= 8; c -= 8) {
2219 __m128 vacc0123p0 = _mm_load_ps(w);
2220 __m128 vacc4567p0 = _mm_load_ps(w + 4);
2221
2222
2223 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2224 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2225 i0 += 8;
2226
2227 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2228 const __m128 vk0x4567 = _mm_load_ps(w + 12);
2229 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2230 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2231
2232 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2233 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2234 i1 += 8;
2235
2236 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2237 const __m128 vk1x4567 = _mm_load_ps(w + 20);
2238 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2239 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2240
2241 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2242 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2243 i2 += 8;
2244
2245 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2246 const __m128 vk2x4567 = _mm_load_ps(w + 28);
2247 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2248 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2249
2250 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2251 const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2252 i3 += 8;
2253
2254 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2255 const __m128 vk3x4567 = _mm_load_ps(w + 36);
2256 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2257 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2258
2259 const __m128 vi4x0123 = _mm_loadu_ps(i4);
2260 const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
2261 i4 += 8;
2262
2263 const __m128 vk4x0123 = _mm_load_ps(w + 40);
2264 const __m128 vk4x4567 = _mm_load_ps(w + 44);
2265 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2266 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
2267
2268 const __m128 vi5x0123 = _mm_loadu_ps(i5);
2269 const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
2270 i5 += 8;
2271
2272 const __m128 vk5x0123 = _mm_load_ps(w + 48);
2273 const __m128 vk5x4567 = _mm_load_ps(w + 52);
2274 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2275 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
2276
2277 const __m128 vi6x0123 = _mm_loadu_ps(i6);
2278 const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
2279 i6 += 8;
2280
2281 const __m128 vk6x0123 = _mm_load_ps(w + 56);
2282 const __m128 vk6x4567 = _mm_load_ps(w + 60);
2283 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2284 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
2285
2286 const __m128 vi7x0123 = _mm_loadu_ps(i7);
2287 const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
2288 i7 += 8;
2289
2290 const __m128 vk7x0123 = _mm_load_ps(w + 64);
2291 const __m128 vk7x4567 = _mm_load_ps(w + 68);
2292 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2293 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
2294
2295 const __m128 vi8x0123 = _mm_loadu_ps(i8);
2296 const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
2297 i8 += 8;
2298
2299 const __m128 vk8x0123 = _mm_load_ps(w + 72);
2300 const __m128 vk8x4567 = _mm_load_ps(w + 76);
2301 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2302 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
2303
2304 w += 80;
2305
2306
2307 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2308 __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2309 vacc0123 = _mm_min_ps(vacc0123, vmax);
2310 vacc4567 = _mm_min_ps(vacc4567, vmax);
2311
2312 _mm_storeu_ps(output, vacc0123);
2313 _mm_storeu_ps(output + 4, vacc4567);
2314 output += 8;
2315 }
2316 for (; c >= 4; c -= 4) {
2317 __m128 vacc0123p0 = _mm_load_ps(w);
2318
2319 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2320 i0 += 4;
2321
2322 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2323 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2324
2325 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2326 i1 += 4;
2327
2328 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2329 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2330
2331 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2332 i2 += 4;
2333
2334 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2335 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2336
2337 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2338 i3 += 4;
2339
2340 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2341 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2342
2343 const __m128 vi4x0123 = _mm_loadu_ps(i4);
2344 i4 += 4;
2345
2346 const __m128 vk4x0123 = _mm_load_ps(w + 40);
2347 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2348
2349 const __m128 vi5x0123 = _mm_loadu_ps(i5);
2350 i5 += 4;
2351
2352 const __m128 vk5x0123 = _mm_load_ps(w + 48);
2353 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2354
2355 const __m128 vi6x0123 = _mm_loadu_ps(i6);
2356 i6 += 4;
2357
2358 const __m128 vk6x0123 = _mm_load_ps(w + 56);
2359 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2360
2361 const __m128 vi7x0123 = _mm_loadu_ps(i7);
2362 i7 += 4;
2363
2364 const __m128 vk7x0123 = _mm_load_ps(w + 64);
2365 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2366
2367 const __m128 vi8x0123 = _mm_loadu_ps(i8);
2368 i8 += 4;
2369
2370 const __m128 vk8x0123 = _mm_load_ps(w + 72);
2371 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2372
2373 w += 4;
2374
2375
2376 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2377 vacc0123 = _mm_min_ps(vacc0123, vmax);
2378
2379 _mm_storeu_ps(output, vacc0123);
2380 output += 4;
2381 }
2382 if XNN_UNLIKELY(c != 0) {
2383 __m128 vacc0123p0 = _mm_load_ps(w);
2384
2385 const __m128 vi0x0123 = _mm_loadu_ps(i0);
2386 const __m128 vk0x0123 = _mm_load_ps(w + 8);
2387 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2388
2389 const __m128 vi1x0123 = _mm_loadu_ps(i1);
2390 const __m128 vk1x0123 = _mm_load_ps(w + 16);
2391 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2392
2393 const __m128 vi2x0123 = _mm_loadu_ps(i2);
2394 const __m128 vk2x0123 = _mm_load_ps(w + 24);
2395 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2396
2397 const __m128 vi3x0123 = _mm_loadu_ps(i3);
2398 const __m128 vk3x0123 = _mm_load_ps(w + 32);
2399 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2400
2401 const __m128 vi4x0123 = _mm_loadu_ps(i4);
2402 const __m128 vk4x0123 = _mm_load_ps(w + 40);
2403 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2404
2405 const __m128 vi5x0123 = _mm_loadu_ps(i5);
2406 const __m128 vk5x0123 = _mm_load_ps(w + 48);
2407 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2408
2409 const __m128 vi6x0123 = _mm_loadu_ps(i6);
2410 const __m128 vk6x0123 = _mm_load_ps(w + 56);
2411 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2412
2413 const __m128 vi7x0123 = _mm_loadu_ps(i7);
2414 const __m128 vk7x0123 = _mm_load_ps(w + 64);
2415 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2416
2417 const __m128 vi8x0123 = _mm_loadu_ps(i8);
2418 const __m128 vk8x0123 = _mm_load_ps(w + 72);
2419 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2420
2421
2422 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2423 vacc0123 = _mm_min_ps(vacc0123, vmax);
2424
2425 if (c & 2) {
2426 _mm_storel_pi((__m64*) output, vacc0123);
2427 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2428 output += 2;
2429 }
2430 if (c & 1) {
2431 _mm_store_ss(output, vacc0123);
2432 output += 1;
2433 }
2434 }
2435
2436 output = (float*) ((uintptr_t) output + output_increment);
2437 } while (--output_width != 0);
2438 }
2439
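/*
 * Explanatory note: the CHW (single-channel, planar) kernel below computes a
 * 3x3 depthwise convolution with 1 pixel of padding on every side. Per
 * iteration it produces 2 output rows by 4 pixels, keeping 2 partial
 * accumulators per row (the "_2x4_acc2" suffix). The 10 weights (bias plus
 * 9 taps) are broadcast once with _mm_load1_ps, and the left/right neighbor
 * columns are formed by rotating the current 4-pixel vector against the
 * previous/next block instead of reloading them from memory.
 */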
2440 void xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2(
2441 size_t input_height,
2442 size_t input_width,
2443 const float* input,
2444 const float* weights,
2445 const float* zero,
2446 float* output,
2447 uint32_t padding_top,
2448 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2449 {
2450 assert(input_height != 0);
2451 assert(input_width != 0);
2452 assert(input_width % sizeof(float) == 0);
2453 assert(padding_top == 1);
2454
2455 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2456 const __m128 vmax = _mm_load_ps(params->sse.max);
2457 const __m128 vmin = _mm_load_ps(params->sse.min);
2458
2459 const __m128 vbias = _mm_load1_ps(weights);
2460 const __m128 vk00 = _mm_load1_ps(weights + 1);
2461 const __m128 vk01 = _mm_load1_ps(weights + 2);
2462 const __m128 vk02 = _mm_load1_ps(weights + 3);
2463 const __m128 vk10 = _mm_load1_ps(weights + 4);
2464 const __m128 vk11 = _mm_load1_ps(weights + 5);
2465 const __m128 vk12 = _mm_load1_ps(weights + 6);
2466 const __m128 vk20 = _mm_load1_ps(weights + 7);
2467 const __m128 vk21 = _mm_load1_ps(weights + 8);
2468 const __m128 vk22 = _mm_load1_ps(weights + 9);
2469
2470 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2471
2472 const float* i0 = zero;
2473 const float* i1 = input;
2474 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2475 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2476
2477 float* o0 = output;
2478 float* o1 = (float*) ((uintptr_t) o0 + input_width);
2479
2480 size_t output_height = input_height;
2481 do {
2482 if XNN_UNPREDICTABLE(output_height < 2) {
2483 i2 = zero;
2484 o1 = o0;
2485 }
2486 if XNN_UNPREDICTABLE(output_height < 3) {
2487 i3 = zero;
2488 }
2489
2490 // vi0x3012 = ( vi02, vi01, vi{M}0, vi{M}3 )
2491 __m128 vi0x3012 = _mm_setzero_ps();
2492 // vi1x3012 = ( vi12, vi11, vi{M}0, vi{M}3 )
2493 __m128 vi1x3012 = _mm_setzero_ps();
2494 // vi2x3012 = ( vi22, vi21, vi{M}0, vi{M}3 )
2495 __m128 vi2x3012 = _mm_setzero_ps();
2496 // vi3x3012 = ( vi32, vi31, vi{M}0, vi{M}3 )
2497 __m128 vi3x3012 = _mm_setzero_ps();
2498
2499 __m128 vi0x4567 = _mm_loadu_ps(i0);
2500 i0 += 4;
2501 __m128 vi1x4567 = _mm_loadu_ps(i1);
2502 i1 += 4;
2503 __m128 vi2x4567 = _mm_loadu_ps(i2);
2504 i2 += 4;
2505 __m128 vi3x4567 = _mm_loadu_ps(i3);
2506 i3 += 4;
2507
2508 size_t w = input_width;
2509 for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
2510 // vi0x89AB = ( vi0B, vi0A, vi09, vi08 )
2511 const __m128 vi0x89AB = _mm_loadu_ps(i0);
2512 i0 += 4;
2513 // vi1x89AB = ( vi1B, vi1A, vi19, vi18 )
2514 const __m128 vi1x89AB = _mm_loadu_ps(i1);
2515 i1 += 4;
2516 // vi2x89AB = ( vi2B, vi2A, vi29, vi28 )
2517 const __m128 vi2x89AB = _mm_loadu_ps(i2);
2518 i2 += 4;
2519 // vi3x89AB = ( vi3B, vi3A, vi39, vi38 )
2520 const __m128 vi3x89AB = _mm_loadu_ps(i3);
2521 i3 += 4;
2522
2523 // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2524 const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2525 // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2526 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2527 // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2528 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2529 // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2530 const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2531
2532 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2533 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2534 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2535 __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2536 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2537 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2538
2539 // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2540 const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2541 // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2542 const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2543 // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2544 const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2545 // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2546 const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2547
2548 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2549 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2550 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2551 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2552 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2553 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2554
2555 vi0x3012 = vi0x7456;
2556 vi1x3012 = vi1x7456;
2557 vi2x3012 = vi2x7456;
2558 vi3x3012 = vi3x7456;
2559
2560 // vi0x8567 = ( vi07, vi06, vi05, vi08 )
2561 const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
2562 // vi1x8567 = ( vi17, vi16, vi15, vi18 )
2563 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
2564 // vi2x8567 = ( vi27, vi26, vi25, vi28 )
2565 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
2566 // vi3x8567 = ( vi37, vi36, vi35, vi38 )
2567 const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
2568
2569 // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2570 const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2571 // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2572 const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2573 // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2574 const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2575 // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2576 const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2577
2578 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2579 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2580 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2581 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2582 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2583 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2584
2585 vi0x4567 = vi0x89AB;
2586 vi1x4567 = vi1x89AB;
2587 vi2x4567 = vi2x89AB;
2588 vi3x4567 = vi3x89AB;
2589
2590 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2591 vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2592
2593 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2594 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2595
2596 vo0 = _mm_min_ps(vo0, vmax);
2597 vo1 = _mm_min_ps(vo1, vmax);
2598
2599 _mm_storeu_ps(o1, vo1);
2600 o1 += 4;
2601 _mm_storeu_ps(o0, vo0);
2602 o0 += 4;
2603 }
2604 // Always process the last block of 1..4 pixels.
2605 assert(w >= 1 * sizeof(float));
2606 assert(w <= 4 * sizeof(float));
2607 {
2608 vi0x4567 = _mm_and_ps(vmask, vi0x4567);
2609 vi1x4567 = _mm_and_ps(vmask, vi1x4567);
2610 vi2x4567 = _mm_and_ps(vmask, vi2x4567);
2611 vi3x4567 = _mm_and_ps(vmask, vi3x4567);
2612
2613 // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2614 const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2615 // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2616 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2617 // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2618 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2619 // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2620 const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2621
2622 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2623 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2624 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2625 __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2626 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2627 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2628
2629 // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2630 const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2631 // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2632 const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2633 // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2634 const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2635 // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2636 const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2637
2638 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2639 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2640 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2641 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2642 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2643 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2644
2645 const __m128 vzero = _mm_setzero_ps();
2646 // vi0x8567 = ( vi07, vi06, vi05, 0.0 )
2647 const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
2648 // vi1x8567 = ( vi17, vi16, vi15, 0.0 )
2649 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
2650 // vi2x8567 = ( vi27, vi26, vi25, 0.0 )
2651 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
2652 // vi3x8567 = ( vi37, vi36, vi35, 0.0 )
2653 const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
2654
2655 // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2656 const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2657 // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2658 const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2659 // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2660 const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2661 // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2662 const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2663
2664 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2665 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2666 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2667 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2668 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2669 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2670
2671 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2672 vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2673
2674 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2675 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2676
2677 vo0 = _mm_min_ps(vo0, vmax);
2678 vo1 = _mm_min_ps(vo1, vmax);
2679
2680 if XNN_LIKELY(w == 4 * sizeof(float)) {
2681 _mm_storeu_ps(o1, vo1);
2682 o1 += 4;
2683 _mm_storeu_ps(o0, vo0);
2684 o0 += 4;
2685 } else {
2686 if (w & (2 * sizeof(float))) {
2687 _mm_storel_pi((__m64*) o1, vo1);
2688 o1 += 2;
2689 _mm_storel_pi((__m64*) o0, vo0);
2690 o0 += 2;
2691
2692 vo0 = _mm_movehl_ps(vo0, vo0);
2693 vo1 = _mm_movehl_ps(vo1, vo1);
2694 }
2695 if (w & (1 * sizeof(float))) {
2696 _mm_store_ss(o1, vo1);
2697 o1 += 1;
2698 _mm_store_ss(o0, vo0);
2699 o0 += 1;
2700 }
2701 }
2702 }
2703
2704 i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2705 i1 = (const float*) ((uintptr_t) i3 - input_decrement);
2706 i2 = (const float*) ((uintptr_t) i1 + input_width);
2707 i3 = (const float*) ((uintptr_t) i2 + input_width);
2708
2709 o0 = o1;
2710 o1 = (float*) ((uintptr_t) o0 + input_width);
2711
2712 output_height = doz(output_height, 2);
2713 } while (output_height != 0);
2714 }
2715
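/*
 * Explanatory note: the kernel below is the stride-2 counterpart
 * ("3x3s2p1"). It loads 8 consecutive input columns per step, deinterleaves
 * them into even (x8ACE) and odd (x9BDF) lanes with _mm_shuffle_ps, and emits
 * one output row of 4 pixels per iteration using 3 partial accumulators
 * ("_1x4_acc3"). padding_top may be 0 or 1; when it is 1, i0 is redirected to
 * the shared zero row.
 */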
2716 void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
2717 size_t input_height,
2718 size_t input_width,
2719 const float* input,
2720 const float* weights,
2721 const float* zero,
2722 float* output,
2723 uint32_t padding_top,
2724 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2725 {
2726 assert(input_height != 0);
2727 assert(input_width != 0);
2728 assert(input_width % sizeof(float) == 0);
2729 assert(padding_top >= 0);
2730 assert(padding_top <= 1);
2731
2732 const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
2733 const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
2734 const __m128 vmax = _mm_load_ps(params->sse.max);
2735 const __m128 vmin = _mm_load_ps(params->sse.min);
2736
2737 const __m128 vbias = _mm_load1_ps(weights);
2738 const __m128 vk00 = _mm_load1_ps(weights + 1);
2739 const __m128 vk01 = _mm_load1_ps(weights + 2);
2740 const __m128 vk02 = _mm_load1_ps(weights + 3);
2741 const __m128 vk10 = _mm_load1_ps(weights + 4);
2742 const __m128 vk11 = _mm_load1_ps(weights + 5);
2743 const __m128 vk12 = _mm_load1_ps(weights + 6);
2744 const __m128 vk20 = _mm_load1_ps(weights + 7);
2745 const __m128 vk21 = _mm_load1_ps(weights + 8);
2746 const __m128 vk22 = _mm_load1_ps(weights + 9);
2747
2748 const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
2749
2750 const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
2751 const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
2752 if XNN_UNPREDICTABLE(padding_top != 0) {
2753 i0 = zero;
2754 }
2755 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2756
2757 float* o0 = output;
2758
2759 size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
2760 size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
2761 do {
2762 if XNN_UNPREDICTABLE(padded_input_height < 4) {
2763 i2 = zero;
2764 }
2765
2766 __m128 vi0x7531 = _mm_setzero_ps();
2767 __m128 vi1x7531 = _mm_setzero_ps();
2768 __m128 vi2x7531 = _mm_setzero_ps();
2769
2770 size_t w = input_width;
2771 for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
2772 const __m128 vi0x89AB = _mm_loadu_ps(i0);
2773 const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2774 i0 += 8;
2775 const __m128 vi1x89AB = _mm_loadu_ps(i1);
2776 const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2777 i1 += 8;
2778 const __m128 vi2x89AB = _mm_loadu_ps(i2);
2779 const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2780 i2 += 8;
2781
2782 const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2783 const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2784 const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2785 const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2786 const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2787 const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2788
2789 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2790 __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2791 __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2792
2793 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2794 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2795 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2796
2797 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2798 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2799 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2800
2801 const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2802 const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2803 const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2804
2805 vi0x7531 = vi0xF9BD;
2806 vi1x7531 = vi1xF9BD;
2807 vi2x7531 = vi2xF9BD;
2808
2809 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2810 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2811 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2812
2813 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2814 vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2815
2816 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2817
2818 vo0 = _mm_min_ps(vo0, vmax);
2819
2820 _mm_storeu_ps(o0, vo0);
2821 o0 += 4;
2822 }
2823 // Potentially process the last block of 0..7 pixels.
2824 assert(w < 8 * sizeof(float));
2825 if XNN_LIKELY(w != 0) {
2826 const __m128 vi0x89AB = _mm_loadu_ps(i0);
2827 const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2828 const __m128 vi1x89AB = _mm_loadu_ps(i1);
2829 const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2830 const __m128 vi2x89AB = _mm_loadu_ps(i2);
2831 const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2832
2833 const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2834 const __m128 vi0x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2835 const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2836 const __m128 vi1x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2837 const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2838 const __m128 vi2x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2839
2840 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2841 __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2842 __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2843
2844 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2845 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2846 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2847
2848 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2849 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2850 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2851
2852 const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2853 const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2854 const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2855
2856 vi0x7531 = vi0xF9BD;
2857 vi1x7531 = vi1xF9BD;
2858 vi2x7531 = vi2xF9BD;
2859
2860 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2861 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2862 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2863
2864 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2865 vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2866
2867 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2868
2869 vo0 = _mm_min_ps(vo0, vmax);
2870
2871 if (w == 7 * sizeof(float)) {
2872 _mm_storeu_ps(o0, vo0);
2873 o0 += 4;
2874 } else {
2875 w += 1 * sizeof(float);
2876 if (w & (4 * sizeof(float))) {
2877 _mm_storel_pi((__m64*) o0, vo0);
2878 o0 += 2;
2879
2880 vo0 = _mm_movehl_ps(vo0, vo0);
2881 }
2882 if (w & (2 * sizeof(float))) {
2883 _mm_store_ss(o0, vo0);
2884 o0 += 1;
2885 }
2886 }
2887 }
2888
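// Step down two input rows (stride 2): the previous i2, rewound by input_decrement, becomes the new i0.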
2889 i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2890 i1 = (const float*) ((uintptr_t) i0 + input_width);
2891 i2 = (const float*) ((uintptr_t) i1 + input_width);
2892
2893
2894 output_height -= 1;
2895 padded_input_height -= 2;
2896 } while (output_height != 0);
2897 }
2898
2899 void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
2900 size_t input_height,
2901 size_t input_width,
2902 const float* input,
2903 const float* weights,
2904 const float* zero,
2905 float* output,
2906 uint32_t padding_top,
2907 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2908 {
2909 assert(input_height != 0);
2910 assert(input_width != 0);
2911 assert(input_width % sizeof(float) == 0);
2912 assert(padding_top == 2);
2913
2914 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2915 const __m128 vmax = _mm_load_ps(params->sse.max);
2916 const __m128 vmin = _mm_load_ps(params->sse.min);
2917
2918 const __m128 vbias = _mm_load1_ps(weights);
2919 const __m128 vk00 = _mm_load1_ps(weights + 1);
2920 const __m128 vk01 = _mm_load1_ps(weights + 2);
2921 const __m128 vk02 = _mm_load1_ps(weights + 3);
2922 const __m128 vk03 = _mm_load1_ps(weights + 4);
2923 const __m128 vk04 = _mm_load1_ps(weights + 5);
2924 const __m128 vk10 = _mm_load1_ps(weights + 6);
2925 const __m128 vk11 = _mm_load1_ps(weights + 7);
2926 const __m128 vk12 = _mm_load1_ps(weights + 8);
2927 const __m128 vk13 = _mm_load1_ps(weights + 9);
2928 const __m128 vk14 = _mm_load1_ps(weights + 10);
2929 const __m128 vk20 = _mm_load1_ps(weights + 11);
2930 const __m128 vk21 = _mm_load1_ps(weights + 12);
2931 const __m128 vk22 = _mm_load1_ps(weights + 13);
2932 const __m128 vk23 = _mm_load1_ps(weights + 14);
2933 const __m128 vk24 = _mm_load1_ps(weights + 15);
2934 const __m128 vk30 = _mm_load1_ps(weights + 16);
2935 const __m128 vk31 = _mm_load1_ps(weights + 17);
2936 const __m128 vk32 = _mm_load1_ps(weights + 18);
2937 const __m128 vk33 = _mm_load1_ps(weights + 19);
2938 const __m128 vk34 = _mm_load1_ps(weights + 20);
2939 const __m128 vk40 = _mm_load1_ps(weights + 21);
2940 const __m128 vk41 = _mm_load1_ps(weights + 22);
2941 const __m128 vk42 = _mm_load1_ps(weights + 23);
2942 const __m128 vk43 = _mm_load1_ps(weights + 24);
2943 const __m128 vk44 = _mm_load1_ps(weights + 25);
2944
2945 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2946
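// With padding_top == 2, the two rows above the image are read from the zero buffer.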
2947 const float* i0 = zero;
2948 const float* i1 = zero;
2949 const float* i2 = input;
2950 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2951 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
2952 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
2953 const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
2954 const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
2955
2956 float* o0 = output;
2957 float* o1 = (float*) ((uintptr_t) o0 + input_width);
2958 float* o2 = (float*) ((uintptr_t) o1 + input_width);
2959 float* o3 = (float*) ((uintptr_t) o2 + input_width);
2960
2961 size_t output_height = input_height;
2962 do {
2963 if XNN_UNPREDICTABLE(output_height < 2) {
2964 i3 = zero;
2965 o1 = o0;
2966 }
2967 if XNN_UNPREDICTABLE(output_height < 3) {
2968 i4 = zero;
2969 o2 = o1;
2970 }
2971 if XNN_UNPREDICTABLE(output_height < 4) {
2972 i5 = zero;
2973 o3 = o2;
2974 }
2975 if XNN_UNPREDICTABLE(output_height < 5) {
2976 i6 = zero;
2977 }
2978 if XNN_UNPREDICTABLE(output_height < 6) {
2979 i7 = zero;
2980 }
2981
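// Rotating left-context registers: each x3012 holds the previous block's last column in lane 0;
// starting them at zero models the implicit left padding.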
2982 __m128 vi0x3012 = _mm_setzero_ps();
2983 __m128 vi1x3012 = _mm_setzero_ps();
2984 __m128 vi2x3012 = _mm_setzero_ps();
2985 __m128 vi3x3012 = _mm_setzero_ps();
2986 __m128 vi4x3012 = _mm_setzero_ps();
2987 __m128 vi5x3012 = _mm_setzero_ps();
2988 __m128 vi6x3012 = _mm_setzero_ps();
2989 __m128 vi7x3012 = _mm_setzero_ps();
2990
2991 __m128 vi0x4567 = _mm_loadu_ps(i0);
2992 i0 += 4;
2993 __m128 vi1x4567 = _mm_loadu_ps(i1);
2994 i1 += 4;
2995 __m128 vi2x4567 = _mm_loadu_ps(i2);
2996 i2 += 4;
2997 __m128 vi3x4567 = _mm_loadu_ps(i3);
2998 i3 += 4;
2999 __m128 vi4x4567 = _mm_loadu_ps(i4);
3000 i4 += 4;
3001 __m128 vi5x4567 = _mm_loadu_ps(i5);
3002 i5 += 4;
3003 __m128 vi6x4567 = _mm_loadu_ps(i6);
3004 i6 += 4;
3005 __m128 vi7x4567 = _mm_loadu_ps(i7);
3006 i7 += 4;
3007
3008 size_t w = input_width;
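// Main loop: 4 output columns per iteration across all 4 output rows; exits with 5..8 columns
// remaining, which are handled by the two tail blocks below.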
3009 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
3010 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3011 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3012 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3013 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3014 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3015 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3016 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3017 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3018 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3019 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3020 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3021 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3022 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3023 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3024 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3025 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3026 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3027 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3028 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3029 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3030
3031 const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3032 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3033 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3034 const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3035 const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3036 const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3037 const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3038 const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3039
3040 const __m128 vi0x89AB = _mm_loadu_ps(i0);
3041 i0 += 4;
3042 const __m128 vi1x89AB = _mm_loadu_ps(i1);
3043 i1 += 4;
3044 const __m128 vi2x89AB = _mm_loadu_ps(i2);
3045 i2 += 4;
3046 const __m128 vi3x89AB = _mm_loadu_ps(i3);
3047 i3 += 4;
3048 const __m128 vi4x89AB = _mm_loadu_ps(i4);
3049 i4 += 4;
3050 const __m128 vi5x89AB = _mm_loadu_ps(i5);
3051 i5 += 4;
3052 const __m128 vi6x89AB = _mm_loadu_ps(i6);
3053 i6 += 4;
3054 const __m128 vi7x89AB = _mm_loadu_ps(i7);
3055 i7 += 4;
3056
3057 const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3058 const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3059 const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3060 const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3061 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3062 const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3063 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3064 const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3065
3066 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3067 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3068 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3069 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3070 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3071 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3072 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3073 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3074 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3075 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3076 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3077 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3078 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3079 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3080 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3081 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3082 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3083 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3084 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3085 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3086
3087 const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3088 vi0x3012 = vi0x7456;
3089 const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3090 vi1x3012 = vi1x7456;
3091 const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3092 vi2x3012 = vi2x7456;
3093 const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3094 vi3x3012 = vi3x7456;
3095 const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3096 vi4x3012 = vi4x7456;
3097 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3098 vi5x3012 = vi5x7456;
3099 const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3100 vi6x3012 = vi6x7456;
3101 const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3102 vi7x3012 = vi7x7456;
3103
3104 const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3105 vi0x4567 = vi0x89AB;
3106 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3107 vi1x4567 = vi1x89AB;
3108 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3109 vi2x4567 = vi2x89AB;
3110 const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3111 vi3x4567 = vi3x89AB;
3112 const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3113 vi4x4567 = vi4x89AB;
3114 const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3115 vi5x4567 = vi5x89AB;
3116 const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3117 vi6x4567 = vi6x89AB;
3118 const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3119 vi7x4567 = vi7x89AB;
3120
3121 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3122 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3123 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3124 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3125 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3126 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3127 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3128 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3129 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3130 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3131 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3132 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3133 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3134 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3135 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3136 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3137 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3138 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3139 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3140 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3141
3142 const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3143 const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3144 const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3145 const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3146 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3147 const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3148 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3149 const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3150
3151 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3152 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3153 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3154 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3155 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3156 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3157 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3158 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3159 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3160 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3161 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3162 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3163 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3164 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3165 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3166 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3167 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3168 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3169 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3170 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3171
3172 const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3173 const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3174 const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3175 const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3176 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3177 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3178 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3179 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3180
3181 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3182 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3183 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3184 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3185 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3186 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3187 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3188 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3189 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3190 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3191 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3192 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3193 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3194 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3195 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3196 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3197 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3198 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3199 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3200 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3201
3202
3203 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3204 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3205 __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3206 __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3207
3208 vo0 = _mm_min_ps(vo0, vmax);
3209 vo1 = _mm_min_ps(vo1, vmax);
3210 vo2 = _mm_min_ps(vo2, vmax);
3211 vo3 = _mm_min_ps(vo3, vmax);
3212
3213 _mm_storeu_ps(o3, vo3);
3214 o3 += 4;
3215 _mm_storeu_ps(o2, vo2);
3216 o2 += 4;
3217 _mm_storeu_ps(o1, vo1);
3218 o1 += 4;
3219 _mm_storeu_ps(o0, vo0);
3220 o0 += 4;
3221 }
3222 // Always process the last block of 5..8 pixels.
3223 if XNN_LIKELY(w > 4 * sizeof(float)) {
3224 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3225 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3226 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3227 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3228 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3229 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3230 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3231 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3232 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3233 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3234 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3235 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3236 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3237 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3238 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3239 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3240 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3241 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3242 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3243 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3244
3245 const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3246 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3247 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3248 const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3249 const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3250 const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3251 const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3252 const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3253
3254 const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
3255 i0 += 4;
3256 const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
3257 i1 += 4;
3258 const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
3259 i2 += 4;
3260 const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
3261 i3 += 4;
3262 const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
3263 i4 += 4;
3264 const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
3265 i5 += 4;
3266 const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
3267 i6 += 4;
3268 const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
3269 i7 += 4;
3270
3271 const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3272 const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3273 const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3274 const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3275 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3276 const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3277 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3278 const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3279
3280 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3281 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3282 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3283 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3284 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3285 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3286 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3287 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3288 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3289 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3290 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3291 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3292 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3293 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3294 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3295 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3296 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3297 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3298 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3299 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3300
3301 const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3302 vi0x3012 = vi0x7456;
3303 const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3304 vi1x3012 = vi1x7456;
3305 const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3306 vi2x3012 = vi2x7456;
3307 const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3308 vi3x3012 = vi3x7456;
3309 const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3310 vi4x3012 = vi4x7456;
3311 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3312 vi5x3012 = vi5x7456;
3313 const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3314 vi6x3012 = vi6x7456;
3315 const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3316 vi7x3012 = vi7x7456;
3317
3318 const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3319 vi0x4567 = vi0x89AB;
3320 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3321 vi1x4567 = vi1x89AB;
3322 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3323 vi2x4567 = vi2x89AB;
3324 const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3325 vi3x4567 = vi3x89AB;
3326 const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3327 vi4x4567 = vi4x89AB;
3328 const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3329 vi5x4567 = vi5x89AB;
3330 const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3331 vi6x4567 = vi6x89AB;
3332 const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3333 vi7x4567 = vi7x89AB;
3334
3335 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3336 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3337 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3338 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3339 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3340 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3341 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3342 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3343 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3344 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3345 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3346 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3347 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3348 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3349 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3350 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3351 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3352 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3353 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3354 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3355
3356 const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3357 const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3358 const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3359 const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3360 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3361 const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3362 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3363 const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3364
3365 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3366 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3367 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3368 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3369 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3370 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3371 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3372 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3373 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3374 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3375 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3376 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3377 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3378 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3379 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3380 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3381 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3382 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3383 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3384 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3385
3386 const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3387 const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3388 const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3389 const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3390 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3391 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3392 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3393 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3394
3395 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3396 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3397 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3398 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3399 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3400 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3401 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3402 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3403 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3404 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3405 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3406 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3407 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3408 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3409 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3410 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3411 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3412 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3413 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3414 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3415
3416
3417 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3418 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3419 __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3420 __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3421
3422 vo0 = _mm_min_ps(vo0, vmax);
3423 vo1 = _mm_min_ps(vo1, vmax);
3424 vo2 = _mm_min_ps(vo2, vmax);
3425 vo3 = _mm_min_ps(vo3, vmax);
3426
3427 _mm_storeu_ps(o3, vo3);
3428 o3 += 4;
3429 _mm_storeu_ps(o2, vo2);
3430 o2 += 4;
3431 _mm_storeu_ps(o1, vo1);
3432 o1 += 4;
3433 _mm_storeu_ps(o0, vo0);
3434 o0 += 4;
3435
3436 w -= 4 * sizeof(float);
3437 }
3438 assert(w >= 1 * sizeof(float));
3439 assert(w <= 4 * sizeof(float));
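// Final 1..4 columns: mask off lanes past the end of the row before accumulating.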
3440 {
3441 vi0x4567 = _mm_and_ps(vi0x4567, vmask);
3442 vi1x4567 = _mm_and_ps(vi1x4567, vmask);
3443 vi2x4567 = _mm_and_ps(vi2x4567, vmask);
3444 vi3x4567 = _mm_and_ps(vi3x4567, vmask);
3445 vi4x4567 = _mm_and_ps(vi4x4567, vmask);
3446 vi5x4567 = _mm_and_ps(vi5x4567, vmask);
3447 vi6x4567 = _mm_and_ps(vi6x4567, vmask);
3448 vi7x4567 = _mm_and_ps(vi7x4567, vmask);
3449
3450 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3451 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3452 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3453 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3454 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3455 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3456 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3457 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3458 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3459 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3460 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3461 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3462 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3463 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3464 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3465 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3466 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3467 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3468 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3469 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3470
3471 const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3472 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3473 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3474 const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3475 const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3476 const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3477 const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3478 const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3479
3480 const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3481 const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3482 const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3483 const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3484 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3485 const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3486 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3487 const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3488
3489 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3490 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3491 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3492 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3493 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3494 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3495 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3496 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3497 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3498 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3499 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3500 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3501 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3502 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3503 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3504 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3505 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3506 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3507 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3508 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3509
3510 const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3511 const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3512 const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3513 const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3514 const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3515 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3516 const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3517 const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3518
3519 const __m128 vzero = _mm_setzero_ps();
3520 const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
3521 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
3522 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
3523 const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
3524 const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
3525 const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
3526 const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
3527 const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);
3528
3529 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3530 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3531 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3532 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3533 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3534 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3535 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3536 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3537 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3538 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3539 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3540 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3541 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3542 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3543 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3544 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3545 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3546 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3547 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3548 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3549
3550 const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3551 const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3552 const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3553 const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3554 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3555 const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3556 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3557 const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3558
3559 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3560 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3561 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3562 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3563 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3564 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3565 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3566 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3567 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3568 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3569 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3570 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3571 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3572 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3573 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3574 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3575 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3576 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3577 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3578 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3579
3580 const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3581 const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3582 const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3583 const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3584 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3585 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3586 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3587 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3588
3589 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3590 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3591 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3592 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3593 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3594 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3595 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3596 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3597 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3598 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3599 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3600 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3601 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3602 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3603 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3604 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3605 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3606 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3607 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3608 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3609
3610
3611 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3612 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3613 __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3614 __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3615
3616 vo0 = _mm_min_ps(vo0, vmax);
3617 vo1 = _mm_min_ps(vo1, vmax);
3618 vo2 = _mm_min_ps(vo2, vmax);
3619 vo3 = _mm_min_ps(vo3, vmax);
3620
3621 if XNN_LIKELY(w & (4 * sizeof(float))) {
3622 _mm_storeu_ps(o3, vo3);
3623 o3 += 4;
3624 _mm_storeu_ps(o2, vo2);
3625 o2 += 4;
3626 _mm_storeu_ps(o1, vo1);
3627 o1 += 4;
3628 _mm_storeu_ps(o0, vo0);
3629 o0 += 4;
3630 } else {
3631 if (w & (2 * sizeof(float))) {
3632 _mm_storel_pi((__m64*) o3, vo3);
3633 o3 += 2;
3634 _mm_storel_pi((__m64*) o2, vo2);
3635 o2 += 2;
3636 _mm_storel_pi((__m64*) o1, vo1);
3637 o1 += 2;
3638 _mm_storel_pi((__m64*) o0, vo0);
3639 o0 += 2;
3640
3641 vo0 = _mm_movehl_ps(vo0, vo0);
3642 vo1 = _mm_movehl_ps(vo1, vo1);
3643 vo2 = _mm_movehl_ps(vo2, vo2);
3644 vo3 = _mm_movehl_ps(vo3, vo3);
3645 }
3646 if (w & (1 * sizeof(float))) {
3647 _mm_store_ss(o3, vo3);
3648 o3 += 1;
3649 _mm_store_ss(o2, vo2);
3650 o2 += 1;
3651 _mm_store_ss(o1, vo1);
3652 o1 += 1;
3653 _mm_store_ss(o0, vo0);
3654 o0 += 1;
3655 }
3656 }
3657 }
3658
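// Advance four rows: this iteration's i4 and i5, rewound by input_decrement, become the next
// iteration's i0 and i1.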
3659 i0 = (const float*) ((uintptr_t) i4 - input_decrement);
3660 i1 = (const float*) ((uintptr_t) i5 - input_decrement);
3661 i2 = (const float*) ((uintptr_t) i1 + input_width);
3662 i3 = (const float*) ((uintptr_t) i2 + input_width);
3663 i4 = (const float*) ((uintptr_t) i3 + input_width);
3664 i5 = (const float*) ((uintptr_t) i4 + input_width);
3665 i6 = (const float*) ((uintptr_t) i5 + input_width);
3666 i7 = (const float*) ((uintptr_t) i6 + input_width);
3667
3668 o0 = o3;
3669 o1 = (float*) ((uintptr_t) o0 + input_width);
3670 o2 = (float*) ((uintptr_t) o1 + input_width);
3671 o3 = (float*) ((uintptr_t) o2 + input_width);
3672
3673 output_height = doz(output_height, 4);
3674 } while (output_height != 0);
3675 }
3676
3677 void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4(
3678 size_t input_height,
3679 size_t input_width,
3680 const float* input,
3681 const float* weights,
3682 const float* zero,
3683 float* output,
3684 uint32_t padding_top,
3685 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3686 {
3687 assert(input_height != 0);
3688 assert(input_width != 0);
3689 assert(input_width % sizeof(float) == 0);
3690 assert(padding_top >= 1);
3691 assert(padding_top <= 2);
3692
3693 const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
3694 const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
3695 const __m128 vmax = _mm_load_ps(params->sse.max);
3696 const __m128 vmin = _mm_load_ps(params->sse.min);
3697
3698 const __m128 vbias = _mm_load1_ps(weights);
3699 const __m128 vk00 = _mm_load1_ps(weights + 1);
3700 const __m128 vk01 = _mm_load1_ps(weights + 2);
3701 const __m128 vk02 = _mm_load1_ps(weights + 3);
3702 const __m128 vk03 = _mm_load1_ps(weights + 4);
3703 const __m128 vk04 = _mm_load1_ps(weights + 5);
3704 const __m128 vk10 = _mm_load1_ps(weights + 6);
3705 const __m128 vk11 = _mm_load1_ps(weights + 7);
3706 const __m128 vk12 = _mm_load1_ps(weights + 8);
3707 const __m128 vk13 = _mm_load1_ps(weights + 9);
3708 const __m128 vk14 = _mm_load1_ps(weights + 10);
3709 const __m128 vk20 = _mm_load1_ps(weights + 11);
3710 const __m128 vk21 = _mm_load1_ps(weights + 12);
3711 const __m128 vk22 = _mm_load1_ps(weights + 13);
3712 const __m128 vk23 = _mm_load1_ps(weights + 14);
3713 const __m128 vk24 = _mm_load1_ps(weights + 15);
3714 const __m128 vk30 = _mm_load1_ps(weights + 16);
3715 const __m128 vk31 = _mm_load1_ps(weights + 17);
3716 const __m128 vk32 = _mm_load1_ps(weights + 18);
3717 const __m128 vk33 = _mm_load1_ps(weights + 19);
3718 const __m128 vk34 = _mm_load1_ps(weights + 20);
3719 const __m128 vk40 = _mm_load1_ps(weights + 21);
3720 const __m128 vk41 = _mm_load1_ps(weights + 22);
3721 const __m128 vk42 = _mm_load1_ps(weights + 23);
3722 const __m128 vk43 = _mm_load1_ps(weights + 24);
3723 const __m128 vk44 = _mm_load1_ps(weights + 25);
3724
3725 const uint32_t padding_top_less_1 = padding_top - 1;
3726 const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));
3727
3728 const float* i0 = zero;
3729 const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
3730 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3731 if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
3732 i1 = zero;
3733 }
3734 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3735 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3736 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3737 const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
3738
3739 const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
3740
3741 float* o0 = output;
3742 float* o1 = (float*) ((uintptr_t) o0 + output_width);
3743
3744 size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
3745 size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
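// Worked example: input_height = 10, padding_top = 2 gives padded_input_height = 14 and
// output_height = (14 - 5 + 2) / 2 = 5.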
3746 do {
3747 if XNN_UNPREDICTABLE(padded_input_height < 6) {
3748 i3 = zero;
3749 }
3750 if XNN_UNPREDICTABLE(padded_input_height < 7) {
3751 i4 = zero;
3752 o1 = o0;
3753 }
3754 if XNN_UNPREDICTABLE(padded_input_height < 8) {
3755 i5 = zero;
3756 }
3757 if XNN_UNPREDICTABLE(padded_input_height < 9) {
3758 i6 = zero;
3759 }
3760
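// Even-column (x6024) and odd-column (x7135) left-context registers, zero-initialized for the
// implicit left padding.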
3761 __m128 vi0x6024 = _mm_setzero_ps();
3762 __m128 vi1x6024 = _mm_setzero_ps();
3763 __m128 vi2x6024 = _mm_setzero_ps();
3764 __m128 vi3x6024 = _mm_setzero_ps();
3765 __m128 vi4x6024 = _mm_setzero_ps();
3766 __m128 vi5x6024 = _mm_setzero_ps();
3767 __m128 vi6x6024 = _mm_setzero_ps();
3768
3769 __m128 vi0x7135 = _mm_setzero_ps();
3770 __m128 vi1x7135 = _mm_setzero_ps();
3771 __m128 vi2x7135 = _mm_setzero_ps();
3772 __m128 vi3x7135 = _mm_setzero_ps();
3773 __m128 vi4x7135 = _mm_setzero_ps();
3774 __m128 vi5x7135 = _mm_setzero_ps();
3775 __m128 vi6x7135 = _mm_setzero_ps();
3776
3777 const __m128 vi0x89AB = _mm_loadu_ps(i0);
3778 const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
3779 i0 += 8;
3780 const __m128 vi1x89AB = _mm_loadu_ps(i1);
3781 const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
3782 i1 += 8;
3783 const __m128 vi2x89AB = _mm_loadu_ps(i2);
3784 const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
3785 i2 += 8;
3786 const __m128 vi3x89AB = _mm_loadu_ps(i3);
3787 const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
3788 i3 += 8;
3789 const __m128 vi4x89AB = _mm_loadu_ps(i4);
3790 const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
3791 i4 += 8;
3792 const __m128 vi5x89AB = _mm_loadu_ps(i5);
3793 const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
3794 i5 += 8;
3795 const __m128 vi6x89AB = _mm_loadu_ps(i6);
3796 const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
3797 i6 += 8;
3798
3799 __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3800 __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3801 __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3802 __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3803 __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3804 __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3805 __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3806 __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3807 __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3808 __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3809 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3810 __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3811 __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3812 __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3813
3814 size_t w = input_width;
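// Main loop: 8 input columns -> 4 output columns per iteration, computed for two output rows
// at once (input rows i0..i4 feed o0, rows i2..i6 feed o1).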
3815 for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
3816 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
3817 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
3818 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
3819 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
3820 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
3821 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
3822 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
3823 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
3824 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
3825 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
3826
3827 const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3828 const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3829 const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3830 const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3831 const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3832 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3833 const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3834
3835 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
3836 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
3837 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
3838 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
3839 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
3840 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
3841 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
3842 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
3843 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
3844 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
3845
3846 const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
3847 vi0x6024 = vi0xE8AC;
3848 const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
3849 vi1x6024 = vi1xE8AC;
3850 const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
3851 vi2x6024 = vi2xE8AC;
3852 const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
3853 vi3x6024 = vi3xE8AC;
3854 const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
3855 vi4x6024 = vi4xE8AC;
3856 const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
3857 vi5x6024 = vi5xE8AC;
3858 const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
3859 vi6x6024 = vi6xE8AC;
3860
3861 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3862 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3863 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3864 const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3865 const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3866 const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3867 const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3868
3869 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
3870 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
3871 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
3872 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
3873 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
3874 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
3875 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
3876 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
3877 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
3878 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
3879
3880 const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
3881 const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
3882 i0 += 8;
3883 const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
3884 const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
3885 i1 += 8;
3886 const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
3887 const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
3888 i2 += 8;
3889 const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
3890 const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
3891 i3 += 8;
3892 const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
3893 const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
3894 i4 += 8;
3895 const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
3896 const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
3897 i5 += 8;
3898 const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
3899 const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
3900 i6 += 8;
3901
3902 const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
3903 vi0x7135 = vi0xF9BD;
3904 const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
3905 vi1x7135 = vi1xF9BD;
3906 const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
3907 vi2x7135 = vi2xF9BD;
3908 const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
3909 vi3x7135 = vi3xF9BD;
3910 const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
3911 vi4x7135 = vi4xF9BD;
3912 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
3913 vi5x7135 = vi5xF9BD;
3914 const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
3915 vi6x7135 = vi6xF9BD;
3916
3917 const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3918 const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3919 vi0x9BDF = vi0xHJLN;
3920 const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3921 const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3922 vi1x9BDF = vi1xHJLN;
3923 const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3924 const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3925 vi2x9BDF = vi2xHJLN;
3926 const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3927 const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3928 vi3x9BDF = vi3xHJLN;
3929 const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3930 const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3931 vi4x9BDF = vi4xHJLN;
3932 const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3933 const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3934 vi5x9BDF = vi5xHJLN;
3935 const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3936 const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3937 vi6x9BDF = vi6xHJLN;
3938
3939 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
3940 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
3941 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
3942 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
3943 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
3944 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
3945 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
3946 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
3947 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
3948 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
3949
3950 const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
3951 vi0x8ACE = vi0xGIKM;
3952 const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
3953 vi1x8ACE = vi1xGIKM;
3954 const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
3955 vi2x8ACE = vi2xGIKM;
3956 const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
3957 vi3x8ACE = vi3xGIKM;
3958 const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
3959 vi4x8ACE = vi4xGIKM;
3960 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
3961 vi5x8ACE = vi5xGIKM;
3962 const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
3963 vi6x8ACE = vi6xGIKM;
3964
3965 const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3966 const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3967 const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3968 const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3969 const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3970 const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3971 const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3972
3973 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
3974 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
3975 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
3976 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
3977 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
3978 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
3979 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
3980 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
3981 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
3982 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
3983
3984
3985 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3986 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3987
3988 vo0 = _mm_min_ps(vo0, vmax);
3989 vo1 = _mm_min_ps(vo1, vmax);
3990
3991 _mm_storeu_ps(o1, vo1);
3992 o1 += 4;
3993 _mm_storeu_ps(o0, vo0);
3994 o0 += 4;
3995 }
3996 // Last block has 1-8 pixels to process.
3997 assert(w <= 8 * sizeof(float));
3998 assert(w >= 1 * sizeof(float));
3999 {
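      // Columns past the end of the row may have been loaded speculatively;
      // the even/odd column masks zero them so they do not contribute to the
      // convolution sums computed below.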
4000 vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
4001 vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
4002 vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
4003 vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
4004 vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
4005 vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
4006 vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
4007 vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
4008 vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
4009 vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
4010 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
4011 vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
4012 vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
4013 vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
4014
4015 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
4016 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
4017 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
4018 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
4019 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
4020 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
4021 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
4022 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
4023 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
4024 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
4025
4026 const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4027 const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4028 const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4029 const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4030 const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4031 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4032 const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4033
4034 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
4035 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
4036 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
4037 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
4038 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
4039 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
4040 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
4041 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
4042 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
4043 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
4044
4045 const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
4046 const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
4047 const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
4048 const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
4049 const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
4050 const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
4051 const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
4052
4053 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4054 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4055 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4056 const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4057 const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4058 const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4059 const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4060
4061 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
4062 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
4063 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
4064 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
4065 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
4066 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
4067 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
4068 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
4069 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
4070 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
4071
4072 const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
4073 const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
4074 const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
4075 const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
4076 const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
4077 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
4078 const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
4079
4080 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
4081 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
4082 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
4083 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
4084 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
4085 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
4086 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
4087 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
4088 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
4089 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
4090
4091 const __m128 vzero = _mm_setzero_ps();
4092 const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
4093 const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
4094 const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
4095 const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
4096 const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
4097 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
4098 const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
4099
4100 const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4101 const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4102 const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4103 const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4104 const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4105 const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4106 const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4107
4108 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
4109 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
4110 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
4111 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
4112 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
4113 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
4114 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
4115 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
4116 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
4117 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
4118
4119
4120 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
4121 __m128 vo1 = _mm_max_ps(vo1p0, vmin);
4122
4123 vo0 = _mm_min_ps(vo0, vmax);
4124 vo1 = _mm_min_ps(vo1, vmax);
4125
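      // w counts the remaining input columns in bytes; with the even/odd
      // column split used above, (w + 1 float) / (2 floats) gives the number
      // of output columns still to store, written out in 4-, 2- and 1-element
      // chunks below.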
4126 size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
4127 if XNN_LIKELY(w_tmp >= 4) {
4128 _mm_storeu_ps(o1, vo1);
4129 o1 += 4;
4130 _mm_storeu_ps(o0, vo0);
4131 o0 += 4;
4132 } else {
4133 if (w_tmp & 2) {
4134 _mm_storel_pi((__m64*) o1, vo1);
4135 o1 += 2;
4136 _mm_storel_pi((__m64*) o0, vo0);
4137 o0 += 2;
4138
4139 vo0 = _mm_movehl_ps(vo0, vo0);
4140 vo1 = _mm_movehl_ps(vo1, vo1);
4141 }
4142 if (w_tmp & 1) {
4143 _mm_store_ss(o1, vo1);
4144 o1 += 1;
4145 _mm_store_ss(o0, vo0);
4146 o0 += 1;
4147 }
4148 }
4149 }
4150
4151 i0 = (const float*) ((uintptr_t) i4 - input_decrement);
4152 i1 = (const float*) ((uintptr_t) i5 - input_decrement);
4153 i2 = (const float*) ((uintptr_t) i6 - input_decrement);
4154 i3 = (const float*) ((uintptr_t) i2 + input_width);
4155 i4 = (const float*) ((uintptr_t) i3 + input_width);
4156 i5 = (const float*) ((uintptr_t) i4 + input_width);
4157 i6 = (const float*) ((uintptr_t) i5 + input_width);
4158
4159 o0 = o1;
4160 o1 = (float*) ((uintptr_t) o0 + output_width);
4161
4162 output_height = doz(output_height, 2);
4163 padded_input_height = doz(padded_input_height, 4);
4164 } while (output_height != 0);
4165 }
4166
4167 void xnn_f32_gavgpool_cw_ukernel__sse_x4(
4168 size_t elements,
4169 size_t channels,
4170 const float* input,
4171 float* output,
4172 const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4173 {
4174 assert(elements != 0);
4175 assert(elements % sizeof(float) == 0);
4176 assert(channels != 0);
4177
4178 const float* i0 = input;
4179 const float* i1 = (const float*) ((uintptr_t) i0 + elements);
4180 const float* i2 = (const float*) ((uintptr_t) i1 + elements);
4181 const float* i3 = (const float*) ((uintptr_t) i2 + elements);
4182
4183 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
4184 const __m128 vmultiplier = _mm_load_ps(params->sse.multiplier);
4185 const __m128 voutput_min = _mm_load_ps(params->sse.output_min);
4186 const __m128 voutput_max = _mm_load_ps(params->sse.output_max);
4187
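  // Process 4 channels per iteration: each of i0..i3 walks one channel's
  // spatial data (elements bytes) and keeps its own vector accumulator; a
  // masked load handles the tail of each channel before the sums are reduced.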
4188 while (channels >= 4) {
4189 __m128 vsum0 = _mm_setzero_ps();
4190 __m128 vsum1 = _mm_setzero_ps();
4191 __m128 vsum2 = _mm_setzero_ps();
4192 __m128 vsum3 = _mm_setzero_ps();
4193 size_t n = elements;
4194 while (n >= 4 * sizeof(float)) {
4195 const __m128 vi0 = _mm_loadu_ps(i0);
4196 i0 += 4;
4197 const __m128 vi1 = _mm_loadu_ps(i1);
4198 i1 += 4;
4199 const __m128 vi2 = _mm_loadu_ps(i2);
4200 i2 += 4;
4201 const __m128 vi3 = _mm_loadu_ps(i3);
4202 i3 += 4;
4203
4204 vsum0 = _mm_add_ps(vsum0, vi0);
4205 vsum1 = _mm_add_ps(vsum1, vi1);
4206 vsum2 = _mm_add_ps(vsum2, vi2);
4207 vsum3 = _mm_add_ps(vsum3, vi3);
4208 n -= 4 * sizeof(float);
4209 }
4210
4211 if XNN_UNLIKELY(n != 0) {
4212 const __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4213 i0 = (const float*) ((uintptr_t) i0 + n);
4214 const __m128 vi1 = _mm_and_ps(_mm_loadu_ps(i1), vmask);
4215 i1 = (const float*) ((uintptr_t) i1 + n);
4216 const __m128 vi2 = _mm_and_ps(_mm_loadu_ps(i2), vmask);
4217 i2 = (const float*) ((uintptr_t) i2 + n);
4218 const __m128 vi3 = _mm_and_ps(_mm_loadu_ps(i3), vmask);
4219 i3 = (const float*) ((uintptr_t) i3 + n);
4220
4221 vsum0 = _mm_add_ps(vsum0, vi0);
4222 vsum1 = _mm_add_ps(vsum1, vi1);
4223 vsum2 = _mm_add_ps(vsum2, vi2);
4224 vsum3 = _mm_add_ps(vsum3, vi3);
4225 }
4226
4227     // With exactly 4 accumulators (one per channel), the transpose-and-add
4228     // below leaves the 4 totals in 4 different lanes of the same vector.
4229 const __m128 vsum01 = _mm_add_ps(_mm_unpacklo_ps(vsum0, vsum1), _mm_unpackhi_ps(vsum0, vsum1));
4230 const __m128 vsum23 = _mm_add_ps(_mm_unpacklo_ps(vsum2, vsum3), _mm_unpackhi_ps(vsum2, vsum3));
4231 const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));
4232 __m128 vout = _mm_mul_ps(vsum, vmultiplier);
4233
4234 vout = _mm_max_ps(vout, voutput_min);
4235 vout = _mm_min_ps(vout, voutput_max);
4236
4237 _mm_storeu_ps(output, vout);
4238 output += 4;
4239 i0 = i3;
4240 i1 = (const float*) ((uintptr_t) i0 + elements);
4241 i2 = (const float*) ((uintptr_t) i1 + elements);
4242 i3 = (const float*) ((uintptr_t) i2 + elements);
4243 channels -= 4;
4244 }
4245
4246 while (channels != 0) {
4247 __m128 vsum = _mm_setzero_ps();
4248 size_t n = elements;
4249 while (n >= 4 * sizeof(float)) {
4250 const __m128 vi0 = _mm_loadu_ps(i0);
4251 i0 += 4;
4252 vsum = _mm_add_ps(vsum, vi0);
4253 n -= 4 * sizeof(float);
4254 }
4255
4256 if XNN_UNLIKELY(n != 0) {
4257 __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4258 i0 = (const float*) ((uintptr_t) i0 + n);
4259 vsum = _mm_add_ps(vsum, vi0);
4260 }
4261
4262 vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
4263 vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
4264
4265 __m128 vout = _mm_mul_ss(vsum, vmultiplier);
4266
4267 vout = _mm_max_ss(vout, voutput_min);
4268 vout = _mm_min_ss(vout, voutput_max);
4269
4270 _mm_store_ss(output, vout);
4271 output += 1;
4272 channels -= 1;
4273 }
4274 }
4275
4276 void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(
4277 size_t rows,
4278 size_t channels,
4279 const float* input,
4280 size_t input_stride,
4281 const float* zero,
4282 float* buffer,
4283 float* output,
4284 const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4285 {
4286 assert(rows > 7);
4287 assert(channels != 0);
4288
4289 const float* i0 = input;
4290 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4291 const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4292 const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4293 const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4294 const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4295 const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4296 const size_t packed_channels = round_up_po2(channels, 4);
4297 const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);
4298
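  // First pass: accumulate the first 7 rows of every group of 4 channels into
  // the scratch buffer.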
4299 float* b = buffer;
4300 for (size_t c = 0; c < channels; c += 4) {
4301 const __m128 vi0 = _mm_loadu_ps(i0);
4302 i0 += 4;
4303 const __m128 vi1 = _mm_loadu_ps(i1);
4304 i1 += 4;
4305 const __m128 vi2 = _mm_loadu_ps(i2);
4306 i2 += 4;
4307 const __m128 vi3 = _mm_loadu_ps(i3);
4308 i3 += 4;
4309 const __m128 vi4 = _mm_loadu_ps(i4);
4310 i4 += 4;
4311 const __m128 vi5 = _mm_loadu_ps(i5);
4312 i5 += 4;
4313 const __m128 vi6 = _mm_loadu_ps(i6);
4314 i6 += 4;
4315
4316 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4317 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4318 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4319
4320 const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4321 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4322
4323 const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4324
4325 _mm_store_ps(b, vsum); b += 4;
4326 }
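  // Intermediate passes: add 7 more rows on top of the running sums in the
  // scratch buffer until at most 7 rows remain.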
4327 for (rows -= 7; rows > 7; rows -= 7) {
4328 b = buffer;
4329
4330 i0 = (const float*) ((uintptr_t) i0 + input_increment);
4331 i1 = (const float*) ((uintptr_t) i1 + input_increment);
4332 i2 = (const float*) ((uintptr_t) i2 + input_increment);
4333 i3 = (const float*) ((uintptr_t) i3 + input_increment);
4334 i4 = (const float*) ((uintptr_t) i4 + input_increment);
4335 i5 = (const float*) ((uintptr_t) i5 + input_increment);
4336 i6 = (const float*) ((uintptr_t) i6 + input_increment);
4337
4338 for (size_t c = 0; c < channels; c += 4) {
4339 const __m128 vi0 = _mm_loadu_ps(i0);
4340 i0 += 4;
4341 const __m128 vi1 = _mm_loadu_ps(i1);
4342 i1 += 4;
4343 const __m128 vi2 = _mm_loadu_ps(i2);
4344 i2 += 4;
4345 const __m128 vi3 = _mm_loadu_ps(i3);
4346 i3 += 4;
4347 const __m128 vi4 = _mm_loadu_ps(i4);
4348 i4 += 4;
4349 const __m128 vi5 = _mm_loadu_ps(i5);
4350 i5 += 4;
4351 const __m128 vi6 = _mm_loadu_ps(i6);
4352 i6 += 4;
4353 const __m128 vacc = _mm_load_ps(b);
4354
4355 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4356 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4357 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4358 const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4359
4360 const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4361 const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4362
4363 const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4364
4365 _mm_store_ps(b, vsum); b += 4;
4366 }
4367 }
4368
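  // Final pass: up to 7 rows remain; pointers for rows past the end are
  // redirected to the zero buffer, then the accumulated sums are scaled and
  // clamped to [min, max].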
4369 i0 = (const float*) ((uintptr_t) i0 + input_increment);
4370 i1 = (const float*) ((uintptr_t) i1 + input_increment);
4371 if (rows < 2) {
4372 i1 = zero;
4373 }
4374 i2 = (const float*) ((uintptr_t) i2 + input_increment);
4375 if (rows <= 2) {
4376 i2 = zero;
4377 }
4378 i3 = (const float*) ((uintptr_t) i3 + input_increment);
4379 if (rows < 4) {
4380 i3 = zero;
4381 }
4382 i4 = (const float*) ((uintptr_t) i4 + input_increment);
4383 if (rows <= 4) {
4384 i4 = zero;
4385 }
4386 i5 = (const float*) ((uintptr_t) i5 + input_increment);
4387 if (rows < 6) {
4388 i5 = zero;
4389 }
4390 i6 = (const float*) ((uintptr_t) i6 + input_increment);
4391 if (rows <= 6) {
4392 i6 = zero;
4393 }
4394 const __m128 vscale = _mm_load_ps(params->sse.scale);
4395 const __m128 vmin = _mm_load_ps(params->sse.min);
4396 const __m128 vmax = _mm_load_ps(params->sse.max);
4397
4398 b = buffer;
4399 while (channels >= 4) {
4400 const __m128 vi0 = _mm_loadu_ps(i0);
4401 i0 += 4;
4402 const __m128 vi1 = _mm_loadu_ps(i1);
4403 i1 += 4;
4404 const __m128 vi2 = _mm_loadu_ps(i2);
4405 i2 += 4;
4406 const __m128 vi3 = _mm_loadu_ps(i3);
4407 i3 += 4;
4408 const __m128 vi4 = _mm_loadu_ps(i4);
4409 i4 += 4;
4410 const __m128 vi5 = _mm_loadu_ps(i5);
4411 i5 += 4;
4412 const __m128 vi6 = _mm_loadu_ps(i6);
4413 i6 += 4;
4414 const __m128 vacc = _mm_load_ps(b);
4415 b += 4;
4416
4417 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4418 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4419 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4420 const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4421
4422 const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4423 const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4424
4425 const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4426
4427 __m128 vout = _mm_mul_ps(vsum, vscale);
4428 vout = _mm_max_ps(vout, vmin);
4429 vout = _mm_min_ps(vout, vmax);
4430
4431 _mm_storeu_ps(output, vout);
4432 output += 4;
4433
4434 channels -= 4;
4435 }
4436 if (channels != 0) {
4437 const __m128 vi0 = _mm_loadu_ps(i0);
4438 const __m128 vi1 = _mm_loadu_ps(i1);
4439 const __m128 vi2 = _mm_loadu_ps(i2);
4440 const __m128 vi3 = _mm_loadu_ps(i3);
4441 const __m128 vi4 = _mm_loadu_ps(i4);
4442 const __m128 vi5 = _mm_loadu_ps(i5);
4443 const __m128 vi6 = _mm_loadu_ps(i6);
4444 const __m128 vacc = _mm_loadu_ps(b);
4445
4446 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4447 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4448 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4449 const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4450
4451 const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4452 const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4453
4454 const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4455
4456 __m128 vout = _mm_mul_ps(vsum, vscale);
4457 vout = _mm_max_ps(vout, vmin);
4458 vout = _mm_min_ps(vout, vmax);
4459
4460 if (channels & 2) {
4461 _mm_storel_pi((__m64*) output, vout);
4462 vout = _mm_movehl_ps(vout, vout);
4463 output += 2;
4464 }
4465 if (channels & 1) {
4466 _mm_store_ss(output, vout);
4467 }
4468 }
4469 }
4470
4471 void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(
4472 size_t rows,
4473 size_t channels,
4474 const float* input,
4475 size_t input_stride,
4476 const float* zero,
4477 float* output,
4478 const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4479 {
4480 assert(rows != 0);
4481 assert(rows <= 7);
4482 assert(channels != 0);
4483
4484 const float* i0 = input;
4485 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4486 if (rows < 2) {
4487 i1 = zero;
4488 }
4489 const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4490 if (rows <= 2) {
4491 i2 = zero;
4492 }
4493 const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4494 if (rows < 4) {
4495 i3 = zero;
4496 }
4497 const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4498 if (rows <= 4) {
4499 i4 = zero;
4500 }
4501 const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4502 if (rows < 6) {
4503 i5 = zero;
4504 }
4505 const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4506 if (rows <= 6) {
4507 i6 = zero;
4508 }
4509 const __m128 vscale = _mm_load_ps(params->sse.scale);
4510 const __m128 vmin = _mm_load_ps(params->sse.min);
4511 const __m128 vmax = _mm_load_ps(params->sse.max);
4512
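  // Single-pass variant: rows <= 7, so each group of 4 channels is summed
  // directly; row pointers beyond `rows` were pointed at the zero buffer above.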
4513 while (channels >= 4) {
4514 const __m128 vi0 = _mm_loadu_ps(i0);
4515 i0 += 4;
4516 const __m128 vi1 = _mm_loadu_ps(i1);
4517 i1 += 4;
4518 const __m128 vi2 = _mm_loadu_ps(i2);
4519 i2 += 4;
4520 const __m128 vi3 = _mm_loadu_ps(i3);
4521 i3 += 4;
4522 const __m128 vi4 = _mm_loadu_ps(i4);
4523 i4 += 4;
4524 const __m128 vi5 = _mm_loadu_ps(i5);
4525 i5 += 4;
4526 const __m128 vi6 = _mm_loadu_ps(i6);
4527 i6 += 4;
4528
4529 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4530 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4531 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4532
4533 const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4534 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4535
4536 const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4537
4538 __m128 vout = _mm_mul_ps(vsum, vscale);
4539 vout = _mm_max_ps(vout, vmin);
4540 vout = _mm_min_ps(vout, vmax);
4541
4542 _mm_storeu_ps(output, vout);
4543 output += 4;
4544
4545 channels -= 4;
4546 }
4547 if (channels != 0) {
4548 const __m128 vi0 = _mm_loadu_ps(i0);
4549 const __m128 vi1 = _mm_loadu_ps(i1);
4550 const __m128 vi2 = _mm_loadu_ps(i2);
4551 const __m128 vi3 = _mm_loadu_ps(i3);
4552 const __m128 vi4 = _mm_loadu_ps(i4);
4553 const __m128 vi5 = _mm_loadu_ps(i5);
4554 const __m128 vi6 = _mm_loadu_ps(i6);
4555
4556 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4557 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4558 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4559
4560 const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4561 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4562
4563 const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4564
4565 __m128 vout = _mm_mul_ps(vsum, vscale);
4566 vout = _mm_max_ps(vout, vmin);
4567 vout = _mm_min_ps(vout, vmax);
4568
4569 if (channels & 2) {
4570 _mm_storel_pi((__m64*) output, vout);
4571 vout = _mm_movehl_ps(vout, vout);
4572 output += 2;
4573 }
4574 if (channels & 1) {
4575 _mm_store_ss(output, vout);
4576 }
4577 }
4578 }
4579
4580 void xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(
4581 size_t mr,
4582 size_t nc,
4583 size_t kc,
4584 const float*restrict a,
4585 size_t a_stride,
4586 const float*restrict w,
4587 float*restrict c,
4588 size_t cm_stride,
4589 size_t cn_stride,
4590 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4591 {
4592 assert(mr != 0);
4593 assert(mr <= 1);
4594 assert(nc != 0);
4595 assert(kc != 0);
4596 assert(kc % sizeof(float) == 0);
4597 assert(a != NULL);
4598 assert(w != NULL);
4599 assert(c != NULL);
4600
4601 const float* a0 = a;
4602 float* c0 = c;
4603
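  // The first 8 packed values seed the accumulators (the bias in the packed
  // weight layout); each k iteration then broadcasts one element of the A row
  // and accumulates it against the next 8 packed weights.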
4604 do {
4605 __m128 vacc0x0123 = _mm_load_ps(w + 0);
4606 __m128 vacc0x4567 = _mm_load_ps(w + 4);
4607 w += 8;
4608
4609 size_t k = kc;
4610 do {
4611 const __m128 va0 = _mm_load1_ps(a0);
4612 a0 += 1;
4613
4614 const __m128 vb0123 = _mm_load_ps(w);
4615 const __m128 vb4567 = _mm_load_ps(w + 4);
4616 w += 8;
4617
4618 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4619 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4620
4621 k -= sizeof(float);
4622 } while (k != 0);
4623
4624 const __m128 vmax = _mm_load_ps(params->sse.max);
4625 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4626 vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4627
4628 const __m128 vmin = _mm_load_ps(params->sse.min);
4629 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4630 vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4631
4632 if XNN_LIKELY(nc >= 8) {
4633 _mm_storeu_ps(c0, vacc0x0123);
4634 _mm_storeu_ps(c0 + 4, vacc0x4567);
4635 c0 = (float*) ((uintptr_t) c0 + cn_stride);
4636
4637 a0 = (const float*) ((uintptr_t) a0 - kc);
4638
4639 nc -= 8;
4640 } else {
4641 if (nc & 4) {
4642 _mm_storeu_ps(c0, vacc0x0123);
4643
4644 vacc0x0123 = vacc0x4567;
4645
4646 c0 += 4;
4647 }
4648 if (nc & 2) {
4649 _mm_storel_pi((__m64*) c0, vacc0x0123);
4650
4651 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4652
4653 c0 += 2;
4654 }
4655 if (nc & 1) {
4656 _mm_store_ss(c0, vacc0x0123);
4657 }
4658
4659 nc = 0;
4660 }
4661 } while (nc != 0);
4662 }
4663
4664 void xnn_f32_gemm_minmax_ukernel_4x2c4__sse(
4665 size_t mr,
4666 size_t nc,
4667 size_t kc,
4668 const float* restrict a,
4669 size_t a_stride,
4670 const float* restrict w,
4671 float* restrict c,
4672 size_t cm_stride,
4673 size_t cn_stride,
4674 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4675 {
4676 assert(mr != 0);
4677 assert(mr <= 4);
4678 assert(nc != 0);
4679 assert(kc != 0);
4680 assert(kc % sizeof(float) == 0);
4681 assert(a != NULL);
4682 assert(w != NULL);
4683 assert(c != NULL);
4684
4685 const float* a0 = a;
4686 float* c0 = c;
4687 const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4688 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4689 if XNN_UNPREDICTABLE(mr < 2) {
4690 a1 = a0;
4691 c1 = c0;
4692 }
4693 const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4694 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4695 if XNN_UNPREDICTABLE(mr <= 2) {
4696 a2 = a1;
4697 c2 = c1;
4698 }
4699 const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4700 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4701 if XNN_UNPREDICTABLE(mr != 4) {
4702 a3 = a2;
4703 c3 = c2;
4704 }
4705
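  // c4 layout: each accumulator holds 4 partial sums for one (row, output
  // column) pair. The k loop consumes 4 elements of A per row and 2x4 packed
  // weights per step; the partial sums are reduced after the loop.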
4706 do {
4707 __m128 vacc0x0c4 = _mm_load_ss(w);
4708 __m128 vacc0x1c4 = _mm_load_ss(w + 1);
4709 __m128 vacc1x0c4 = vacc0x0c4;
4710 __m128 vacc1x1c4 = vacc0x1c4;
4711 __m128 vacc2x0c4 = vacc0x0c4;
4712 __m128 vacc2x1c4 = vacc0x1c4;
4713 __m128 vacc3x0c4 = vacc0x0c4;
4714 __m128 vacc3x1c4 = vacc0x1c4;
4715 w += 2;
4716
4717 size_t k = kc;
4718 for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
4719 const __m128 va0 = _mm_loadu_ps(a0);
4720 a0 += 4;
4721 const __m128 va1 = _mm_loadu_ps(a1);
4722 a1 += 4;
4723 const __m128 va2 = _mm_loadu_ps(a2);
4724 a2 += 4;
4725 const __m128 va3 = _mm_loadu_ps(a3);
4726 a3 += 4;
4727
4728 const __m128 vb0 = _mm_loadu_ps(w);
4729 const __m128 vb1 = _mm_loadu_ps(w + 4);
4730 w += 8;
4731
4732 vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
4733 vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
4734 vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
4735 vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
4736 vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
4737 vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
4738 vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
4739 vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
4740 }
4741 if XNN_UNLIKELY(k != 0) {
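      // The k remainder of A is loaded speculatively and may contain arbitrary
      // values; the corresponding packed weights are expected to be zero-padded,
      // so lanes whose weight is zero are masked out of A to keep those
      // products at exactly zero.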
4742 const __m128 va0 = _mm_loadu_ps(a0);
4743 a0 = (const float*) ((uintptr_t) a0 + k);
4744 const __m128 va1 = _mm_loadu_ps(a1);
4745 a1 = (const float*) ((uintptr_t) a1 + k);
4746 const __m128 va2 = _mm_loadu_ps(a2);
4747 a2 = (const float*) ((uintptr_t) a2 + k);
4748 const __m128 va3 = _mm_loadu_ps(a3);
4749 a3 = (const float*) ((uintptr_t) a3 + k);
4750
4751 const __m128 vb0 = _mm_loadu_ps(w);
4752 const __m128 vb1 = _mm_loadu_ps(w + 4);
4753 w += 8;
4754
4755 const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
4756 const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
4757
4758 vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
4759 vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
4760 vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
4761 vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
4762 vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
4763 vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
4764 vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
4765 vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
4766 }
4767
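    // Reduce the 4 partial sums of each accumulator: the unpack/add pairs and
    // the movelh/movehl adds transpose the lanes so that each output row ends
    // up with its two column results side by side.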
4768 const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
4769 const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
4770 const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
4771 const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
4772
4773 __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
4774 __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
4775
4776 const __m128 vmax = _mm_load_ps(params->sse.max);
4777 vacc01x01 = _mm_min_ps(vacc01x01, vmax);
4778 vacc23x01 = _mm_min_ps(vacc23x01, vmax);
4779
4780 const __m128 vmin = _mm_load_ps(params->sse.min);
4781 vacc01x01 = _mm_max_ps(vacc01x01, vmin);
4782 vacc23x01 = _mm_max_ps(vacc23x01, vmin);
4783
4784 if XNN_LIKELY(nc >= 2) {
4785 _mm_storel_pi((__m64*) c2, vacc23x01);
4786 c2 = (float*) ((uintptr_t) c2 + cn_stride);
4787 a2 = (const float*) ((uintptr_t) a2 - kc);
4788 _mm_storeh_pi((__m64*) c3, vacc23x01);
4789 c3 = (float*) ((uintptr_t) c3 + cn_stride);
4790 a3 = (const float*) ((uintptr_t) a3 - kc);
4791 _mm_storel_pi((__m64*) c0, vacc01x01);
4792 c0 = (float*) ((uintptr_t) c0 + cn_stride);
4793 a0 = (const float*) ((uintptr_t) a0 - kc);
4794 _mm_storeh_pi((__m64*) c1, vacc01x01);
4795 c1 = (float*) ((uintptr_t) c1 + cn_stride);
4796 a1 = (const float*) ((uintptr_t) a1 - kc);
4797
4798 nc -= 2;
4799 } else {
4800 assert(nc == 1);
4801 _mm_store_ss(c2, vacc23x01);
4802 _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
4803 _mm_store_ss(c0, vacc01x01);
4804 _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
4805
4806 nc = 0;
4807 }
4808 } while (nc != 0);
4809 }
4810
4811 void xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(
4812 size_t mr,
4813 size_t nc,
4814 size_t kc,
4815 const float*restrict a,
4816 size_t a_stride,
4817 const float*restrict w,
4818 float*restrict c,
4819 size_t cm_stride,
4820 size_t cn_stride,
4821 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4822 {
4823 assert(mr != 0);
4824 assert(mr <= 4);
4825 assert(nc != 0);
4826 assert(kc != 0);
4827 assert(kc % sizeof(float) == 0);
4828 assert(a != NULL);
4829 assert(w != NULL);
4830 assert(c != NULL);
4831
4832 const float* a0 = a;
4833 float* c0 = c;
4834 const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4835 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4836 if XNN_UNPREDICTABLE(mr < 2) {
4837 a1 = a0;
4838 c1 = c0;
4839 }
4840 const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4841 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4842 if XNN_UNPREDICTABLE(mr <= 2) {
4843 a2 = a1;
4844 c2 = c1;
4845 }
4846 const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4847 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4848 if XNN_UNPREDICTABLE(mr != 4) {
4849 a3 = a2;
4850 c3 = c2;
4851 }
4852
4853 do {
4854 __m128 vacc0x0123 = _mm_load_ps(w + 0);
4855 __m128 vacc0x4567 = _mm_load_ps(w + 4);
4856 __m128 vacc1x0123 = vacc0x0123;
4857 __m128 vacc1x4567 = vacc0x4567;
4858 __m128 vacc2x0123 = vacc0x0123;
4859 __m128 vacc2x4567 = vacc0x4567;
4860 __m128 vacc3x0123 = vacc0x0123;
4861 __m128 vacc3x4567 = vacc0x4567;
4862 w += 8;
4863
4864 size_t k = kc;
4865 do {
4866 const __m128 va0 = _mm_load1_ps(a0);
4867 a0 += 1;
4868 const __m128 va1 = _mm_load1_ps(a1);
4869 a1 += 1;
4870 const __m128 va2 = _mm_load1_ps(a2);
4871 a2 += 1;
4872 const __m128 va3 = _mm_load1_ps(a3);
4873 a3 += 1;
4874
4875 const __m128 vb0123 = _mm_load_ps(w);
4876 const __m128 vb4567 = _mm_load_ps(w + 4);
4877 w += 8;
4878
4879 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4880 vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
4881 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
4882 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
4883 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4884 vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
4885 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
4886 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
4887
4888 k -= sizeof(float);
4889 } while (k != 0);
4890
4891 const __m128 vmax = _mm_load_ps(params->sse.max);
4892 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4893 vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
4894 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
4895 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
4896 vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4897 vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
4898 vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
4899 vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
4900
4901 const __m128 vmin = _mm_load_ps(params->sse.min);
4902 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4903 vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
4904 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
4905 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
4906 vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4907 vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
4908 vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
4909 vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
4910
4911 if XNN_LIKELY(nc >= 8) {
4912 _mm_storeu_ps(c3, vacc3x0123);
4913 _mm_storeu_ps(c3 + 4, vacc3x4567);
4914 c3 = (float*) ((uintptr_t) c3 + cn_stride);
4915 _mm_storeu_ps(c2, vacc2x0123);
4916 _mm_storeu_ps(c2 + 4, vacc2x4567);
4917 c2 = (float*) ((uintptr_t) c2 + cn_stride);
4918 _mm_storeu_ps(c1, vacc1x0123);
4919 _mm_storeu_ps(c1 + 4, vacc1x4567);
4920 c1 = (float*) ((uintptr_t) c1 + cn_stride);
4921 _mm_storeu_ps(c0, vacc0x0123);
4922 _mm_storeu_ps(c0 + 4, vacc0x4567);
4923 c0 = (float*) ((uintptr_t) c0 + cn_stride);
4924
4925 a3 = (const float*) ((uintptr_t) a3 - kc);
4926 a2 = (const float*) ((uintptr_t) a2 - kc);
4927 a1 = (const float*) ((uintptr_t) a1 - kc);
4928 a0 = (const float*) ((uintptr_t) a0 - kc);
4929
4930 nc -= 8;
4931 } else {
4932 if (nc & 4) {
4933 _mm_storeu_ps(c3, vacc3x0123);
4934 _mm_storeu_ps(c2, vacc2x0123);
4935 _mm_storeu_ps(c1, vacc1x0123);
4936 _mm_storeu_ps(c0, vacc0x0123);
4937
4938 vacc3x0123 = vacc3x4567;
4939 vacc2x0123 = vacc2x4567;
4940 vacc1x0123 = vacc1x4567;
4941 vacc0x0123 = vacc0x4567;
4942
4943 c3 += 4;
4944 c2 += 4;
4945 c1 += 4;
4946 c0 += 4;
4947 }
4948 if (nc & 2) {
4949 _mm_storel_pi((__m64*) c3, vacc3x0123);
4950 _mm_storel_pi((__m64*) c2, vacc2x0123);
4951 _mm_storel_pi((__m64*) c1, vacc1x0123);
4952 _mm_storel_pi((__m64*) c0, vacc0x0123);
4953
4954 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
4955 vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
4956 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
4957 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4958
4959 c3 += 2;
4960 c2 += 2;
4961 c1 += 2;
4962 c0 += 2;
4963 }
4964 if (nc & 1) {
4965 _mm_store_ss(c3, vacc3x0123);
4966 _mm_store_ss(c2, vacc2x0123);
4967 _mm_store_ss(c1, vacc1x0123);
4968 _mm_store_ss(c0, vacc0x0123);
4969 }
4970
4971 nc = 0;
4972 }
4973 } while (nc != 0);
4974 }
4975
4976 void xnn_f32_ibilinear_chw_ukernel__sse_p8(
4977 size_t output_pixels,
4978 size_t channels,
4979 const float**restrict input,
4980 size_t input_offset,
4981 const float*restrict weights,
4982 float*restrict output,
4983 size_t input_increment) XNN_OOB_READS
4984 {
4985 assert(output_pixels != 0);
4986 assert(channels != 0);
4987 assert(input_increment % sizeof(float) == 0);
4988
4989 do {
4990 const float** i = input;
4991 const float* w = weights;
4992 size_t p = output_pixels;
4993 for (; p >= 8; p -= 8) {
4994 const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
4995 const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
4996 const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
4997 const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
4998 const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
4999 const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5000 const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5001 const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5002 const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
5003 const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
5004 const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
5005 const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
5006 const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
5007 const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
5008 const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
5009 const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
5010 i += 2 * 8;
5011
5012 const __m128 vw0123p0 = _mm_loadu_ps(w + 0);
5013 const __m128 vw0123p1 = _mm_loadu_ps(w + 4);
5014 const __m128 vw4567p0 = _mm_loadu_ps(w + 8);
5015 const __m128 vw4567p1 = _mm_loadu_ps(w + 12);
5016 w += 2 * 8;
5017
5018 const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5019 const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5020 const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5021 const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5022 const __m128 vtltr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl4);
5023 const __m128 vblbr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl4);
5024 const __m128 vtltr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl6);
5025 const __m128 vblbr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl6);
5026
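      // Each output pixel carries two interleaved weights (alpha_h, alpha_v);
      // the shuffles below deinterleave them into one vector of horizontal and
      // one vector of vertical blending factors for 4 pixels at a time.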
5027 const __m128 valphah0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(2, 0, 2, 0));
5028 const __m128 valphav0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(3, 1, 3, 1));
5029 const __m128 valphah4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(2, 0, 2, 0));
5030 const __m128 valphav4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(3, 1, 3, 1));
5031
5032 const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5033 const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5034 const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5035 const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5036 const __m128 vtltr45 = _mm_loadh_pi(vtltr4, (const __m64*) itl5);
5037 const __m128 vblbr45 = _mm_loadh_pi(vblbr4, (const __m64*) ibl5);
5038 const __m128 vtltr67 = _mm_loadh_pi(vtltr6, (const __m64*) itl7);
5039 const __m128 vblbr67 = _mm_loadh_pi(vblbr6, (const __m64*) ibl7);
5040
5041 const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5042 const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5043 const __m128 vldrd45 = _mm_sub_ps(vblbr45, vtltr45);
5044 const __m128 vldrd67 = _mm_sub_ps(vblbr67, vtltr67);
5045
5046 const __m128 vld0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5047 const __m128 vrd0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5048 const __m128 vld4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(2, 0, 2, 0));
5049 const __m128 vrd4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(3, 1, 3, 1));
5050
5051 const __m128 vtl0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5052 const __m128 vtr0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5053 const __m128 vtl4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(2, 0, 2, 0));
5054 const __m128 vtr4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(3, 1, 3, 1));
5055
5056 const __m128 vl0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vld0123, valphav0123));
5057 const __m128 vr0123 = _mm_add_ps(vtr0123, _mm_mul_ps(vrd0123, valphav0123));
5058 const __m128 vl4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vld4567, valphav4567));
5059 const __m128 vr4567 = _mm_add_ps(vtr4567, _mm_mul_ps(vrd4567, valphav4567));
5060
5061 const __m128 vd0123 = _mm_sub_ps(vr0123, vl0123);
5062 const __m128 vd4567 = _mm_sub_ps(vr4567, vl4567);
5063
5064 const __m128 vo0123 = _mm_add_ps(vl0123, _mm_mul_ps(vd0123, valphah0123));
5065 const __m128 vo4567 = _mm_add_ps(vl4567, _mm_mul_ps(vd4567, valphah4567));
5066
5067 _mm_storeu_ps(output + 0, vo0123);
5068 _mm_storeu_ps(output + 4, vo4567);
5069 output += 8;
5070 }
5071
5072 for (; p >= 4; p -= 4) {
5073 const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5074 const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5075 const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5076 const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5077 const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
5078 const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5079 const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5080 const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5081 i += 8;
5082
5083 const __m128 vw0 = _mm_loadu_ps(w);
5084 const __m128 vw1 = _mm_loadu_ps(w + 4);
5085 w += 8;
5086
5087 const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5088 const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5089 const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5090 const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5091
5092 const __m128 valphah = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(2, 0, 2, 0));
5093 const __m128 valphav = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(3, 1, 3, 1));
5094
5095 const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5096 const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5097 const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5098 const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5099
5100 const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5101 const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5102
5103 const __m128 vld = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5104 const __m128 vrd = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5105
5106 const __m128 vtl = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5107 const __m128 vtr = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5108
5109 const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5110 const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5111
5112 const __m128 vd = _mm_sub_ps(vr, vl);
5113 const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5114
5115 _mm_storeu_ps(output, vo);
5116 output += 4;
5117 }
5118
5119 if XNN_UNLIKELY(p != 0) {
5120 if (p & 2) {
5121 const __m128 vw = _mm_loadu_ps(w);
5122 w += 4;
5123
5124 const __m128 valphah = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(2, 0, 2, 0));
5125 const __m128 valphav = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(3, 1, 3, 1));
5126
5127 const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5128 const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5129 const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5130 const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5131 i += 4;
5132
5133 const __m128 vtltr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0), (const __m64*) itl1);
5134 const __m128 vblbr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0), (const __m64*) ibl1);
5135
5136 const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5137 const __m128 vld = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(2, 0, 2, 0));
5138 const __m128 vrd = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(3, 1, 3, 1));
5139
5140 const __m128 vtl = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(2, 0, 2, 0));
5141 const __m128 vtr = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(3, 1, 3, 1));
5142
5143 const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5144 const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5145
5146 const __m128 vd = _mm_sub_ps(vr, vl);
5147 const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5148
5149 _mm_storel_pi((__m64*) output, vo);
5150 output += 2;
5151 }
5152
5153 if (p & 1) {
5154 // We are computing the following formula:
5155 // result = (1 - alpha_h) * (1 - alpha_v) * top_left +
5156 // alpha_h * (1 - alpha_v) * top_right +
5157 // (1 - alpha_h) * alpha_v * bottom_left +
5158 // alpha_h * alpha_v * bottom_right.
5159 //
5160 // Rearranging gives
5161 // result = left + alpha_h * (right - left),
5162 // where
5163 // left = top_left + alpha_v * (bottom_left - top_left),
5164 // right = top_right + alpha_v * (bottom_right - top_right).
5165
5166 const float alphah = *w;
5167 const __m128 valphav = _mm_load_ps1(w + 1);
5168 w += 2;
5169
5170 const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
5171 const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
5172 i += 2;
5173
5174 const __m128 vtltr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl);
5175 const __m128 vblbr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl);
5176
5177         // Compute both differences at once:
5178         //   left_diff  = bottom_left  - top_left
5179         //   right_diff = bottom_right - top_right
5180 const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5181 const __m128 vlr = _mm_add_ps(vtltr, _mm_mul_ps(vldrd, valphav));
5182
5183 // Extract them and compute the result.
5184 const float l = _mm_cvtss_f32(vlr);
5185 const float r = _mm_cvtss_f32(_mm_shuffle_ps(vlr, vlr, 1));
5186
5187 *output++ = l + alphah * (r - l);
5188 }
5189 }
5190
5191 input_offset += input_increment;
5192 } while (--channels != 0);
5193 }
5194
5195 void xnn_f32_ibilinear_ukernel__sse_c8(
5196 size_t output_pixels,
5197 size_t channels,
5198 const float**restrict input,
5199 size_t input_offset,
5200 const float*restrict weights,
5201 float*restrict output,
5202 size_t output_increment) XNN_OOB_READS
5203 {
5204 assert(output_pixels != 0);
5205 assert(channels != 0);
5206 assert(channels % sizeof(float) == 0);
5207
5208 do {
5209 const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset);
5210 const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset);
5211 const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset);
5212 const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset);
5213 input += 4;
5214
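    // Broadcast the two per-pixel weights: the unpack and movelh/movehl below
    // leave alpha_h in all four lanes of valphah and alpha_v in all four lanes
    // of valphav.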
5215 __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights);
5216 valphahv = _mm_unpacklo_ps(valphahv, valphahv);
5217 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv);
5218 const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
5219 weights += 2;
5220
5221 size_t c = channels;
5222 for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
5223 const __m128 vtl0123 = _mm_loadu_ps(i0);
5224 const __m128 vtr0123 = _mm_loadu_ps(i1);
5225 const __m128 vbl0123 = _mm_loadu_ps(i2);
5226 const __m128 vbr0123 = _mm_loadu_ps(i3);
5227 const __m128 vtl4567 = _mm_loadu_ps(i0 + 4);
5228 const __m128 vtr4567 = _mm_loadu_ps(i1 + 4);
5229 const __m128 vbl4567 = _mm_loadu_ps(i2 + 4);
5230 const __m128 vbr4567 = _mm_loadu_ps(i3 + 4);
5231 i0 += 8;
5232 i1 += 8;
5233 i2 += 8;
5234 i3 += 8;
5235
5236 const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5237 const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5238 const __m128 vtd4567 = _mm_sub_ps(vtr4567, vtl4567);
5239 const __m128 vbd4567 = _mm_sub_ps(vbr4567, vbl4567);
5240
5241 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5242 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5243 const __m128 vt4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vtd4567, valphah));
5244 const __m128 vb4567 = _mm_add_ps(vbl4567, _mm_mul_ps(vbd4567, valphah));
5245
5246 const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5247 const __m128 vd4567 = _mm_sub_ps(vb4567, vt4567);
5248
5249 const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5250 const __m128 vo4567 = _mm_add_ps(vt4567, _mm_mul_ps(vd4567, valphav));
5251
5252 _mm_storeu_ps(output, vo0123);
5253 _mm_storeu_ps(output + 4, vo4567);
5254 output += 8;
5255 }
5256 for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
5257 const __m128 vtl0123 = _mm_loadu_ps(i0);
5258 const __m128 vtr0123 = _mm_loadu_ps(i1);
5259 const __m128 vbl0123 = _mm_loadu_ps(i2);
5260 const __m128 vbr0123 = _mm_loadu_ps(i3);
5261 i0 += 4;
5262 i1 += 4;
5263 i2 += 4;
5264 i3 += 4;
5265
5266 const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5267 const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5268
5269 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5270 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5271
5272 const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5273
5274 const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5275
5276 _mm_storeu_ps(output, vo0123);
5277 output += 4;
5278 }
5279 if XNN_UNLIKELY(c != 0) {
5280 const __m128 vtl0123 = _mm_loadu_ps(i0);
5281 const __m128 vtr0123 = _mm_loadu_ps(i1);
5282 const __m128 vbl0123 = _mm_loadu_ps(i2);
5283 const __m128 vbr0123 = _mm_loadu_ps(i3);
5284
5285 const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5286 const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5287
5288 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5289 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5290
5291 const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5292
5293 __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5294
5295 if (c & (2 * sizeof(float))) {
5296 _mm_storel_pi((__m64*) output, vo0123);
5297 vo0123 = _mm_movehl_ps(vo0123, vo0123);
5298 output += 2;
5299 }
5300 if (c & (1 * sizeof(float))) {
5301 _mm_store_ss(output, vo0123);
5302 output += 1;
5303 }
5304 }
5305
5306 output = (float*) ((uintptr_t) output + output_increment);
5307 } while (--output_pixels != 0);
5308 }
5309
5310 void xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(
5311 size_t mr,
5312 size_t nc,
5313 size_t kc,
5314 size_t ks,
5315 const float**restrict a,
5316 const float*restrict w,
5317 float*restrict c,
5318 size_t cm_stride,
5319 size_t cn_stride,
5320 size_t a_offset,
5321 const float* zero,
5322 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5323 {
5324 assert(mr != 0);
5325 assert(mr <= 1);
5326 assert(nc != 0);
5327 assert(kc != 0);
5328 assert(kc % sizeof(float) == 0);
5329 assert(ks != 0);
5330 assert(ks % (1 * sizeof(void*)) == 0);
5331 assert(a_offset % sizeof(float) == 0);
5332 assert(a != NULL);
5333 assert(w != NULL);
5334 assert(c != NULL);
5335
5336 float* c0 = c;
5337
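  // `a` is an indirection buffer of ks input-row pointers per output pixel;
  // pointers equal to `zero` reference the padding row and are not offset by
  // a_offset.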
5338 do {
5339 __m128 vacc0x0123 = _mm_load_ps(w);
5340 __m128 vacc0x4567 = _mm_load_ps(w + 4);
5341 w += 8;
5342
5343 size_t p = ks;
5344 do {
5345 const float* restrict a0 = a[0];
5346 assert(a0 != NULL);
5347 if XNN_UNPREDICTABLE(a0 != zero) {
5348 a0 = (const float*) ((uintptr_t) a0 + a_offset);
5349 }
5350 a += 1;
5351
5352 size_t k = kc;
5353 do {
5354 const __m128 vb0123 = _mm_load_ps(w);
5355 const __m128 vb4567 = _mm_load_ps(w + 4);
5356 w += 8;
5357
5358 const __m128 va0 = _mm_load1_ps(a0);
5359 a0 += 1;
5360
5361 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5362 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5363 k -= sizeof(float);
5364 } while (k != 0);
5365 p -= 1 * sizeof(void*);
5366 } while (p != 0);
5367
5368 const __m128 vmax = _mm_load_ps(params->sse.max);
5369 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5370 vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5371
5372 const __m128 vmin = _mm_load_ps(params->sse.min);
5373 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5374 vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5375
5376 if XNN_LIKELY(nc >= 8) {
5377 _mm_storeu_ps(c0, vacc0x0123);
5378 _mm_storeu_ps(c0 + 4, vacc0x4567);
5379 c0 = (float*) ((uintptr_t) c0 + cn_stride);
5380
5381 a = (const float**restrict) ((uintptr_t) a - ks);
5382 nc -= 8;
5383 } else {
5384 if (nc & 4) {
5385 _mm_storeu_ps(c0, vacc0x0123);
5386
5387 vacc0x0123 = vacc0x4567;
5388
5389 c0 += 4;
5390 }
5391 if (nc & 2) {
5392 _mm_storel_pi((__m64*) c0, vacc0x0123);
5393
5394 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5395
5396 c0 += 2;
5397 }
5398 if (nc & 1) {
5399 _mm_store_ss(c0, vacc0x0123);
5400 }
5401
5402 nc = 0;
5403 }
5404 } while (nc != 0);
5405 }
5406
5407 void xnn_f32_igemm_minmax_ukernel_4x2c4__sse(
5408 size_t mr,
5409 size_t nc,
5410 size_t kc,
5411 size_t ks,
5412 const float**restrict a,
5413 const float*restrict w,
5414 float*restrict c,
5415 size_t cm_stride,
5416 size_t cn_stride,
5417 size_t a_offset,
5418 const float* zero,
5419 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5420 {
5421 assert(mr != 0);
5422 assert(mr <= 4);
5423 assert(nc != 0);
5424 assert(kc != 0);
5425 assert(kc % sizeof(float) == 0);
5426 assert(ks != 0);
5427 assert(ks % (4 * sizeof(void*)) == 0);
5428 assert(a_offset % sizeof(float) == 0);
5429 assert(a != NULL);
5430 assert(w != NULL);
5431 assert(c != NULL);
5432
5433 float* c0 = c;
5434 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5435 if XNN_UNPREDICTABLE(mr < 2) {
5436 c1 = c0;
5437 }
5438 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5439 if XNN_UNPREDICTABLE(mr <= 2) {
5440 c2 = c1;
5441 }
5442 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5443 if XNN_UNPREDICTABLE(mr != 4) {
5444 c3 = c2;
5445 }
5446
5447 do {
5448 __m128 vacc0x0c4 = _mm_load_ss(w);
5449 __m128 vacc0x1c4 = _mm_load_ss(w + 1);
5450 __m128 vacc1x0c4 = vacc0x0c4;
5451 __m128 vacc1x1c4 = vacc0x1c4;
5452 __m128 vacc2x0c4 = vacc0x0c4;
5453 __m128 vacc2x1c4 = vacc0x1c4;
5454 __m128 vacc3x0c4 = vacc0x0c4;
5455 __m128 vacc3x1c4 = vacc0x1c4;
5456 w += 2;
5457
5458 size_t p = ks;
5459 do {
5460 const float* restrict a0 = a[0];
5461 assert(a0 != NULL);
5462 if XNN_UNPREDICTABLE(a0 != zero) {
5463 a0 = (const float*) ((uintptr_t) a0 + a_offset);
5464 }
5465 const float* restrict a1 = a[1];
5466 assert(a1 != NULL);
5467 if XNN_UNPREDICTABLE(a1 != zero) {
5468 a1 = (const float*) ((uintptr_t) a1 + a_offset);
5469 }
5470 const float* restrict a2 = a[2];
5471 assert(a2 != NULL);
5472 if XNN_UNPREDICTABLE(a2 != zero) {
5473 a2 = (const float*) ((uintptr_t) a2 + a_offset);
5474 }
5475 const float* restrict a3 = a[3];
5476 assert(a3 != NULL);
5477 if XNN_UNPREDICTABLE(a3 != zero) {
5478 a3 = (const float*) ((uintptr_t) a3 + a_offset);
5479 }
5480 a += 4;
5481
5482 size_t k = kc;
5483 for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
5484 const __m128 va0 = _mm_loadu_ps(a0);
5485 a0 += 4;
5486 const __m128 va1 = _mm_loadu_ps(a1);
5487 a1 += 4;
5488 const __m128 va2 = _mm_loadu_ps(a2);
5489 a2 += 4;
5490 const __m128 va3 = _mm_loadu_ps(a3);
5491 a3 += 4;
5492
5493 const __m128 vb0 = _mm_loadu_ps(w);
5494 const __m128 vb1 = _mm_loadu_ps(w + 4);
5495 w += 8;
5496
5497 vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
5498 vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
5499 vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
5500 vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
5501 vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
5502 vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
5503 vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
5504 vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
5505 }
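      // Remainder channels (k < 4 floats): packed weights are zero in the unused
      // lanes, so the corresponding input lanes are masked to zero before the
      // multiply-add, keeping over-read activation values out of the accumulators.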
5506 if XNN_UNLIKELY(k != 0) {
5507 const __m128 va0 = _mm_loadu_ps(a0);
5508 const __m128 va1 = _mm_loadu_ps(a1);
5509 const __m128 va2 = _mm_loadu_ps(a2);
5510 const __m128 va3 = _mm_loadu_ps(a3);
5511
5512 const __m128 vb0 = _mm_loadu_ps(w);
5513 const __m128 vb1 = _mm_loadu_ps(w + 4);
5514 w += 8;
5515
5516 const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
5517 const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
5518
5519 vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
5520 vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
5521 vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
5522 vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
5523 vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
5524 vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
5525 vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
5526 vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
5527 }
5528 p -= 4 * sizeof(void*);
5529 } while (p != 0);
5530
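    // Horizontal reduction: unpacklo/unpackhi interleave each row's column-0 and
    // column-1 partial sums, and movelh/movehl pair two rows at a time, leaving
    // vacc01x01 and vacc23x01 ordered as (r0c0, r0c1, r1c0, r1c1) and
    // (r2c0, r2c1, r3c0, r3c1).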
5531 const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
5532 const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
5533 const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
5534 const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
5535
5536 __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
5537 __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
5538
5539 const __m128 vmax = _mm_load_ps(params->sse.max);
5540 vacc01x01 = _mm_min_ps(vacc01x01, vmax);
5541 vacc23x01 = _mm_min_ps(vacc23x01, vmax);
5542
5543 const __m128 vmin = _mm_load_ps(params->sse.min);
5544 vacc01x01 = _mm_max_ps(vacc01x01, vmin);
5545 vacc23x01 = _mm_max_ps(vacc23x01, vmin);
5546
5547 if XNN_LIKELY(nc >= 2) {
5548 _mm_storeh_pi((__m64*) c3, vacc23x01);
5549 c3 = (float*) ((uintptr_t) c3 + cn_stride);
5550 _mm_storel_pi((__m64*) c2, vacc23x01);
5551 c2 = (float*) ((uintptr_t) c2 + cn_stride);
5552 _mm_storeh_pi((__m64*) c1, vacc01x01);
5553 c1 = (float*) ((uintptr_t) c1 + cn_stride);
5554 _mm_storel_pi((__m64*) c0, vacc01x01);
5555 c0 = (float*) ((uintptr_t) c0 + cn_stride);
5556
5557 a = (const float**restrict) ((uintptr_t) a - ks);
5558 nc -= 2;
5559 } else {
5560 assert(nc == 1);
5561 _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
5562 _mm_store_ss(c2, vacc23x01);
5563 _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
5564 _mm_store_ss(c0, vacc01x01);
5565
5566 nc = 0;
5567 }
5568 } while (nc != 0);
5569 }
5570
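// Indirect GEMM microkernel (4 rows x 8 columns): each activation element is
// broadcast with _mm_load1_ps and multiplied against two aligned 4-wide panels
// of packed weights; results are clamped to [min, max] before the store.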
5571 void xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(
5572 size_t mr,
5573 size_t nc,
5574 size_t kc,
5575 size_t ks,
5576 const float**restrict a,
5577 const float*restrict w,
5578 float*restrict c,
5579 size_t cm_stride,
5580 size_t cn_stride,
5581 size_t a_offset,
5582 const float* zero,
5583 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5584 {
5585 assert(mr != 0);
5586 assert(mr <= 4);
5587 assert(nc != 0);
5588 assert(kc != 0);
5589 assert(kc % sizeof(float) == 0);
5590 assert(ks != 0);
5591 assert(ks % (4 * sizeof(void*)) == 0);
5592 assert(a_offset % sizeof(float) == 0);
5593 assert(a != NULL);
5594 assert(w != NULL);
5595 assert(c != NULL);
5596
5597 float* c0 = c;
5598 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5599 if XNN_UNPREDICTABLE(mr < 2) {
5600 c1 = c0;
5601 }
5602 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5603 if XNN_UNPREDICTABLE(mr <= 2) {
5604 c2 = c1;
5605 }
5606 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5607 if XNN_UNPREDICTABLE(mr != 4) {
5608 c3 = c2;
5609 }
5610
5611 do {
5612 __m128 vacc0x0123 = _mm_load_ps(w);
5613 __m128 vacc0x4567 = _mm_load_ps(w + 4);
5614 __m128 vacc1x0123 = vacc0x0123;
5615 __m128 vacc1x4567 = vacc0x4567;
5616 __m128 vacc2x0123 = vacc0x0123;
5617 __m128 vacc2x4567 = vacc0x4567;
5618 __m128 vacc3x0123 = vacc0x0123;
5619 __m128 vacc3x4567 = vacc0x4567;
5620 w += 8;
5621
5622 size_t p = ks;
5623 do {
5624 const float* restrict a0 = a[0];
5625 assert(a0 != NULL);
5626 if XNN_UNPREDICTABLE(a0 != zero) {
5627 a0 = (const float*) ((uintptr_t) a0 + a_offset);
5628 }
5629 const float* restrict a1 = a[1];
5630 assert(a1 != NULL);
5631 if XNN_UNPREDICTABLE(a1 != zero) {
5632 a1 = (const float*) ((uintptr_t) a1 + a_offset);
5633 }
5634 const float* restrict a2 = a[2];
5635 assert(a2 != NULL);
5636 if XNN_UNPREDICTABLE(a2 != zero) {
5637 a2 = (const float*) ((uintptr_t) a2 + a_offset);
5638 }
5639 const float* restrict a3 = a[3];
5640 assert(a3 != NULL);
5641 if XNN_UNPREDICTABLE(a3 != zero) {
5642 a3 = (const float*) ((uintptr_t) a3 + a_offset);
5643 }
5644 a += 4;
5645
5646 size_t k = kc;
5647 do {
5648 const __m128 vb0123 = _mm_load_ps(w);
5649 const __m128 vb4567 = _mm_load_ps(w + 4);
5650 w += 8;
5651
5652 const __m128 va0 = _mm_load1_ps(a0);
5653 a0 += 1;
5654 const __m128 va1 = _mm_load1_ps(a1);
5655 a1 += 1;
5656 const __m128 va2 = _mm_load1_ps(a2);
5657 a2 += 1;
5658 const __m128 va3 = _mm_load1_ps(a3);
5659 a3 += 1;
5660
5661 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5662 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5663 vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
5664 vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
5665 vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
5666 vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
5667 vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
5668 vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
5669 k -= sizeof(float);
5670 } while (k != 0);
5671 p -= 4 * sizeof(void*);
5672 } while (p != 0);
5673
5674 const __m128 vmax = _mm_load_ps(params->sse.max);
5675 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5676 vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
5677 vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
5678 vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
5679 vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5680 vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
5681 vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
5682 vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
5683
5684 const __m128 vmin = _mm_load_ps(params->sse.min);
5685 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5686 vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
5687 vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
5688 vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
5689 vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5690 vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
5691 vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
5692 vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
5693
5694 if XNN_LIKELY(nc >= 8) {
5695 _mm_storeu_ps(c3, vacc3x0123);
5696 _mm_storeu_ps(c3 + 4, vacc3x4567);
5697 c3 = (float*) ((uintptr_t) c3 + cn_stride);
5698 _mm_storeu_ps(c2, vacc2x0123);
5699 _mm_storeu_ps(c2 + 4, vacc2x4567);
5700 c2 = (float*) ((uintptr_t) c2 + cn_stride);
5701 _mm_storeu_ps(c1, vacc1x0123);
5702 _mm_storeu_ps(c1 + 4, vacc1x4567);
5703 c1 = (float*) ((uintptr_t) c1 + cn_stride);
5704 _mm_storeu_ps(c0, vacc0x0123);
5705 _mm_storeu_ps(c0 + 4, vacc0x4567);
5706 c0 = (float*) ((uintptr_t) c0 + cn_stride);
5707
5708 a = (const float**restrict) ((uintptr_t) a - ks);
5709 nc -= 8;
5710 } else {
5711 if (nc & 4) {
5712 _mm_storeu_ps(c3, vacc3x0123);
5713 _mm_storeu_ps(c2, vacc2x0123);
5714 _mm_storeu_ps(c1, vacc1x0123);
5715 _mm_storeu_ps(c0, vacc0x0123);
5716
5717 vacc3x0123 = vacc3x4567;
5718 vacc2x0123 = vacc2x4567;
5719 vacc1x0123 = vacc1x4567;
5720 vacc0x0123 = vacc0x4567;
5721
5722 c3 += 4;
5723 c2 += 4;
5724 c1 += 4;
5725 c0 += 4;
5726 }
5727 if (nc & 2) {
5728 _mm_storel_pi((__m64*) c3, vacc3x0123);
5729 _mm_storel_pi((__m64*) c2, vacc2x0123);
5730 _mm_storel_pi((__m64*) c1, vacc1x0123);
5731 _mm_storel_pi((__m64*) c0, vacc0x0123);
5732
5733 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
5734 vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
5735 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
5736 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5737
5738 c3 += 2;
5739 c2 += 2;
5740 c1 += 2;
5741 c0 += 2;
5742 }
5743 if (nc & 1) {
5744 _mm_store_ss(c3, vacc3x0123);
5745 _mm_store_ss(c2, vacc2x0123);
5746 _mm_store_ss(c1, vacc1x0123);
5747 _mm_store_ss(c0, vacc0x0123);
5748 }
5749
5750 nc = 0;
5751 }
5752 } while (nc != 0);
5753 }
5754
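// Max-pooling microkernel for pooling windows larger than 9 elements: the
// first pass reduces 9 input rows directly into the output row, and each
// following pass folds up to 8 more rows into it, re-clamping to [min, max].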
5755 void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
5756 size_t output_pixels,
5757 size_t kernel_elements,
5758 size_t channels,
5759 const float** input,
5760 size_t input_offset,
5761 float* output,
5762 size_t input_increment,
5763 size_t output_increment,
5764 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5765 {
5766 assert(output_pixels != 0);
5767 assert(kernel_elements != 0);
5768 assert(channels != 0);
5769
5770 const __m128 voutput_max = _mm_load_ps(params->sse.max);
5771 const __m128 voutput_min = _mm_load_ps(params->sse.min);
5772 do {
5773 float* o = output;
5774 {
5775 const float* i0 = *input++;
5776 const float* i1 = *input++;
5777 const float* i2 = *input++;
5778 const float* i3 = *input++;
5779 const float* i4 = *input++;
5780 const float* i5 = *input++;
5781 const float* i6 = *input++;
5782 const float* i7 = *input++;
5783 const float* i8 = *input++;
5784 i0 = (const float*) ((uintptr_t) i0 + input_offset);
5785 i1 = (const float*) ((uintptr_t) i1 + input_offset);
5786 i2 = (const float*) ((uintptr_t) i2 + input_offset);
5787 i3 = (const float*) ((uintptr_t) i3 + input_offset);
5788 i4 = (const float*) ((uintptr_t) i4 + input_offset);
5789 i5 = (const float*) ((uintptr_t) i5 + input_offset);
5790 i6 = (const float*) ((uintptr_t) i6 + input_offset);
5791 i7 = (const float*) ((uintptr_t) i7 + input_offset);
5792 i8 = (const float*) ((uintptr_t) i8 + input_offset);
5793 if (kernel_elements < 2) {
5794 i1 = i0;
5795 }
5796 if (kernel_elements <= 2) {
5797 i2 = i0;
5798 }
5799 if (kernel_elements < 4) {
5800 i3 = i0;
5801 }
5802 if (kernel_elements <= 4) {
5803 i4 = i0;
5804 }
5805 if (kernel_elements < 6) {
5806 i5 = i0;
5807 }
5808 if (kernel_elements <= 6) {
5809 i6 = i0;
5810 }
5811 if (kernel_elements < 8) {
5812 i7 = i0;
5813 }
5814 if (kernel_elements <= 8) {
5815 i8 = i0;
5816 }
5817
5818 size_t c = channels;
5819 for (; c >= 4; c -= 4) {
5820 const __m128 vi0 = _mm_loadu_ps(i0);
5821 i0 += 4;
5822 const __m128 vi1 = _mm_loadu_ps(i1);
5823 i1 += 4;
5824 const __m128 vi2 = _mm_loadu_ps(i2);
5825 i2 += 4;
5826 const __m128 vi3 = _mm_loadu_ps(i3);
5827 i3 += 4;
5828 const __m128 vi4 = _mm_loadu_ps(i4);
5829 i4 += 4;
5830 const __m128 vi5 = _mm_loadu_ps(i5);
5831 i5 += 4;
5832 const __m128 vi6 = _mm_loadu_ps(i6);
5833 i6 += 4;
5834 const __m128 vi7 = _mm_loadu_ps(i7);
5835 i7 += 4;
5836 const __m128 vi8 = _mm_loadu_ps(i8);
5837 i8 += 4;
5838
5839 const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5840 const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5841 const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5842 const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5843
5844 const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5845 const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5846 const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5847 const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5848
5849 _mm_storeu_ps(o, vout);
5850 o += 4;
5851 }
5852 if (c != 0) {
5853 const __m128 vi0 = _mm_loadu_ps(i0);
5854 i0 += 4;
5855 const __m128 vi1 = _mm_loadu_ps(i1);
5856 i1 += 4;
5857 const __m128 vi2 = _mm_loadu_ps(i2);
5858 i2 += 4;
5859 const __m128 vi3 = _mm_loadu_ps(i3);
5860 i3 += 4;
5861 const __m128 vi4 = _mm_loadu_ps(i4);
5862 i4 += 4;
5863 const __m128 vi5 = _mm_loadu_ps(i5);
5864 i5 += 4;
5865 const __m128 vi6 = _mm_loadu_ps(i6);
5866 i6 += 4;
5867 const __m128 vi7 = _mm_loadu_ps(i7);
5868 i7 += 4;
5869 const __m128 vi8 = _mm_loadu_ps(i8);
5870 i8 += 4;
5871
5872 const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5873 const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5874 const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5875 const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5876
5877 const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5878 const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5879 const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5880 __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5881
5882 if (c & 2) {
5883 _mm_storel_pi((__m64*) o, vout);
5884 o += 2;
5885 vout = _mm_movehl_ps(vout, vout);
5886 }
5887 if (c & 1) {
5888 _mm_store_ss(o, vout);
5889 o += 1;
5890 }
5891 }
5892 }
5893
5894 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
5895 const float* i0 = *input++;
5896 const float* i1 = *input++;
5897 const float* i2 = *input++;
5898 const float* i3 = *input++;
5899 const float* i4 = *input++;
5900 const float* i5 = *input++;
5901 const float* i6 = *input++;
5902 const float* i7 = *input++;
5903 i0 = (const float*) ((uintptr_t) i0 + input_offset);
5904 i1 = (const float*) ((uintptr_t) i1 + input_offset);
5905 i2 = (const float*) ((uintptr_t) i2 + input_offset);
5906 i3 = (const float*) ((uintptr_t) i3 + input_offset);
5907 i4 = (const float*) ((uintptr_t) i4 + input_offset);
5908 i5 = (const float*) ((uintptr_t) i5 + input_offset);
5909 i6 = (const float*) ((uintptr_t) i6 + input_offset);
5910 i7 = (const float*) ((uintptr_t) i7 + input_offset);
5911 if (k < 2) {
5912 i1 = i0;
5913 }
5914 if (k <= 2) {
5915 i2 = i0;
5916 }
5917 if (k < 4) {
5918 i3 = i0;
5919 }
5920 if (k <= 4) {
5921 i4 = i0;
5922 }
5923 if (k < 6) {
5924 i5 = i0;
5925 }
5926 if (k <= 6) {
5927 i6 = i0;
5928 }
5929 if (k < 8) {
5930 i7 = i0;
5931 }
5932
5933 o = output;
5934 size_t c = channels;
5935 for (; c >= 4; c -= 4) {
5936 const __m128 vi0 = _mm_loadu_ps(i0);
5937 i0 += 4;
5938 const __m128 vi1 = _mm_loadu_ps(i1);
5939 i1 += 4;
5940 const __m128 vi2 = _mm_loadu_ps(i2);
5941 i2 += 4;
5942 const __m128 vi3 = _mm_loadu_ps(i3);
5943 i3 += 4;
5944 const __m128 vi4 = _mm_loadu_ps(i4);
5945 i4 += 4;
5946 const __m128 vi5 = _mm_loadu_ps(i5);
5947 i5 += 4;
5948 const __m128 vi6 = _mm_loadu_ps(i6);
5949 i6 += 4;
5950 const __m128 vi7 = _mm_loadu_ps(i7);
5951 i7 += 4;
5952 const __m128 vo = _mm_loadu_ps(o);
5953
5954 const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5955 const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5956 const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5957 const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5958
5959 const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5960 const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5961 const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5962 const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5963
5964 _mm_storeu_ps(o, vout);
5965 o += 4;
5966 }
5967 if (c != 0) {
5968 const __m128 vi0 = _mm_loadu_ps(i0);
5969 const __m128 vi1 = _mm_loadu_ps(i1);
5970 const __m128 vi2 = _mm_loadu_ps(i2);
5971 const __m128 vi3 = _mm_loadu_ps(i3);
5972 const __m128 vi4 = _mm_loadu_ps(i4);
5973 const __m128 vi5 = _mm_loadu_ps(i5);
5974 const __m128 vi6 = _mm_loadu_ps(i6);
5975 const __m128 vi7 = _mm_loadu_ps(i7);
5976 const __m128 vo = _mm_loadu_ps(o);
5977
5978 const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5979 const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5980 const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5981 const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5982
5983 const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5984 const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5985 const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5986 __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5987
5988 if (c & 2) {
5989 _mm_storel_pi((__m64*) o, vout);
5990 o += 2;
5991 vout = _mm_movehl_ps(vout, vout);
5992 }
5993 if (c & 1) {
5994 _mm_store_ss(o, vout);
5995 o += 1;
5996 }
5997 }
5998 }
5999 input = (const float**) ((uintptr_t) input + input_increment);
6000 output = (float*) ((uintptr_t) o + output_increment);
6001 } while (--output_pixels != 0);
6002 }
6003
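// Pixel-wise average pooling for windows larger than 9 elements: running sums
// are kept in a scratch buffer, accumulated 9 and then 8 input rows at a time,
// and the final pass scales by the per-pixel multiplier and clamps.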
6004 void xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4(
6005 size_t output_pixels,
6006 size_t kernel_elements,
6007 size_t channels,
6008 const float** input,
6009 size_t input_offset,
6010 const float* zero,
6011 const float* multiplier,
6012 float* buffer,
6013 float* output,
6014 size_t input_increment,
6015 size_t output_increment,
6016 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6017 {
6018 assert(output_pixels != 0);
6019 assert(kernel_elements > 9);
6020 assert(channels != 0);
6021
6022 const __m128 voutput_min = _mm_load_ps(params->sse.min);
6023 const __m128 voutput_max = _mm_load_ps(params->sse.max);
6024
6025 do {
6026 {
6027 const float* i0 = *input++;
6028 assert(i0 != NULL);
6029 if XNN_UNPREDICTABLE(i0 != zero) {
6030 i0 = (const float*) ((uintptr_t) i0 + input_offset);
6031 }
6032 const float* i1 = *input++;
6033 assert(i1 != NULL);
6034 if XNN_UNPREDICTABLE(i1 != zero) {
6035 i1 = (const float*) ((uintptr_t) i1 + input_offset);
6036 }
6037 const float* i2 = *input++;
6038 assert(i2 != NULL);
6039 if XNN_UNPREDICTABLE(i2 != zero) {
6040 i2 = (const float*) ((uintptr_t) i2 + input_offset);
6041 }
6042 const float* i3 = *input++;
6043 assert(i3 != NULL);
6044 if XNN_UNPREDICTABLE(i3 != zero) {
6045 i3 = (const float*) ((uintptr_t) i3 + input_offset);
6046 }
6047 const float* i4 = *input++;
6048 assert(i4 != NULL);
6049 if XNN_UNPREDICTABLE(i4 != zero) {
6050 i4 = (const float*) ((uintptr_t) i4 + input_offset);
6051 }
6052 const float* i5 = *input++;
6053 assert(i5 != NULL);
6054 if XNN_UNPREDICTABLE(i5 != zero) {
6055 i5 = (const float*) ((uintptr_t) i5 + input_offset);
6056 }
6057 const float* i6 = *input++;
6058 assert(i6 != NULL);
6059 if XNN_UNPREDICTABLE(i6 != zero) {
6060 i6 = (const float*) ((uintptr_t) i6 + input_offset);
6061 }
6062 const float* i7 = *input++;
6063 assert(i7 != NULL);
6064 if XNN_UNPREDICTABLE(i7 != zero) {
6065 i7 = (const float*) ((uintptr_t) i7 + input_offset);
6066 }
6067 const float* i8 = *input++;
6068 assert(i8 != NULL);
6069 if XNN_UNPREDICTABLE(i8 != zero) {
6070 i8 = (const float*) ((uintptr_t) i8 + input_offset);
6071 }
6072
6073 float* b = buffer;
6074 for (size_t c = 0; c < channels; c += 4) {
6075 const __m128 vi0 = _mm_loadu_ps(i0);
6076 i0 += 4;
6077 const __m128 vi1 = _mm_loadu_ps(i1);
6078 i1 += 4;
6079 const __m128 vi2 = _mm_loadu_ps(i2);
6080 i2 += 4;
6081 const __m128 vi3 = _mm_loadu_ps(i3);
6082 i3 += 4;
6083 const __m128 vi4 = _mm_loadu_ps(i4);
6084 i4 += 4;
6085 const __m128 vi5 = _mm_loadu_ps(i5);
6086 i5 += 4;
6087 const __m128 vi6 = _mm_loadu_ps(i6);
6088 i6 += 4;
6089 const __m128 vi7 = _mm_loadu_ps(i7);
6090 i7 += 4;
6091 const __m128 vi8 = _mm_loadu_ps(i8);
6092 i8 += 4;
6093
6094 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6095 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6096 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6097 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6098 const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6099 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6100 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6101 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6102
6103 _mm_store_ps(b, vsum); b += 4;
6104 }
6105 }
6106
6107 size_t k = kernel_elements;
6108 for (k -= 9; k > 8; k -= 8) {
6109 const float* i0 = *input++;
6110 assert(i0 != NULL);
6111 if XNN_UNPREDICTABLE(i0 != zero) {
6112 i0 = (const float*) ((uintptr_t) i0 + input_offset);
6113 }
6114 const float* i1 = *input++;
6115 assert(i1 != NULL);
6116 if XNN_UNPREDICTABLE(i1 != zero) {
6117 i1 = (const float*) ((uintptr_t) i1 + input_offset);
6118 }
6119 const float* i2 = *input++;
6120 assert(i2 != NULL);
6121 if XNN_UNPREDICTABLE(i2 != zero) {
6122 i2 = (const float*) ((uintptr_t) i2 + input_offset);
6123 }
6124 const float* i3 = *input++;
6125 assert(i3 != NULL);
6126 if XNN_UNPREDICTABLE(i3 != zero) {
6127 i3 = (const float*) ((uintptr_t) i3 + input_offset);
6128 }
6129 const float* i4 = *input++;
6130 assert(i4 != NULL);
6131 if XNN_UNPREDICTABLE(i4 != zero) {
6132 i4 = (const float*) ((uintptr_t) i4 + input_offset);
6133 }
6134 const float* i5 = *input++;
6135 assert(i5 != NULL);
6136 if XNN_UNPREDICTABLE(i5 != zero) {
6137 i5 = (const float*) ((uintptr_t) i5 + input_offset);
6138 }
6139 const float* i6 = *input++;
6140 assert(i6 != NULL);
6141 if XNN_UNPREDICTABLE(i6 != zero) {
6142 i6 = (const float*) ((uintptr_t) i6 + input_offset);
6143 }
6144 const float* i7 = *input++;
6145 assert(i7 != NULL);
6146 if XNN_UNPREDICTABLE(i7 != zero) {
6147 i7 = (const float*) ((uintptr_t) i7 + input_offset);
6148 }
6149
6150 float* b = buffer;
6151 for (size_t c = 0; c < channels; c += 4) {
6152 const __m128 vi0 = _mm_loadu_ps(i0);
6153 i0 += 4;
6154 const __m128 vi1 = _mm_loadu_ps(i1);
6155 i1 += 4;
6156 const __m128 vi2 = _mm_loadu_ps(i2);
6157 i2 += 4;
6158 const __m128 vi3 = _mm_loadu_ps(i3);
6159 i3 += 4;
6160 const __m128 vi4 = _mm_loadu_ps(i4);
6161 i4 += 4;
6162 const __m128 vi5 = _mm_loadu_ps(i5);
6163 i5 += 4;
6164 const __m128 vi6 = _mm_loadu_ps(i6);
6165 i6 += 4;
6166 const __m128 vi7 = _mm_loadu_ps(i7);
6167 i7 += 4;
6168 const __m128 vacc = _mm_load_ps(b);
6169
6170 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6171 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6172 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6173 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6174 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6175 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6176 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6177 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6178
6179 _mm_store_ps(b, vsum); b += 4;
6180 }
6181 }
6182
6183 {
6184 const float* i0 = input[0];
6185 assert(i0 != NULL);
6186 const float* i1 = input[1];
6187 const float* i2 = input[2];
6188 const float* i3 = input[3];
6189 const float* i4 = input[4];
6190 const float* i5 = input[5];
6191 const float* i6 = input[6];
6192 const float* i7 = input[7];
6193 input = (const float**) ((uintptr_t) input + input_increment);
6194 if (k < 2) {
6195 i1 = zero;
6196 }
6197 assert(i1 != NULL);
6198 if (k <= 2) {
6199 i2 = zero;
6200 }
6201 assert(i2 != NULL);
6202 if (k < 4) {
6203 i3 = zero;
6204 }
6205 assert(i3 != NULL);
6206 if (k <= 4) {
6207 i4 = zero;
6208 }
6209 assert(i4 != NULL);
6210 if (k < 6) {
6211 i5 = zero;
6212 }
6213 assert(i5 != NULL);
6214 if (k <= 6) {
6215 i6 = zero;
6216 }
6217 assert(i6 != NULL);
6218 if (k < 8) {
6219 i7 = zero;
6220 }
6221 assert(i7 != NULL);
6222 if XNN_UNPREDICTABLE(i0 != zero) {
6223 i0 = (const float*) ((uintptr_t) i0 + input_offset);
6224 }
6225 if XNN_UNPREDICTABLE(i1 != zero) {
6226 i1 = (const float*) ((uintptr_t) i1 + input_offset);
6227 }
6228 if XNN_UNPREDICTABLE(i2 != zero) {
6229 i2 = (const float*) ((uintptr_t) i2 + input_offset);
6230 }
6231 if XNN_UNPREDICTABLE(i3 != zero) {
6232 i3 = (const float*) ((uintptr_t) i3 + input_offset);
6233 }
6234 if XNN_UNPREDICTABLE(i4 != zero) {
6235 i4 = (const float*) ((uintptr_t) i4 + input_offset);
6236 }
6237 if XNN_UNPREDICTABLE(i5 != zero) {
6238 i5 = (const float*) ((uintptr_t) i5 + input_offset);
6239 }
6240 if XNN_UNPREDICTABLE(i6 != zero) {
6241 i6 = (const float*) ((uintptr_t) i6 + input_offset);
6242 }
6243 if XNN_UNPREDICTABLE(i7 != zero) {
6244 i7 = (const float*) ((uintptr_t) i7 + input_offset);
6245 }
6246
6247 const __m128 vmultiplier = _mm_load1_ps(multiplier);
6248 multiplier += 1;
6249
6250 size_t c = channels;
6251 float* b = buffer;
6252 while (c >= 4) {
6253 const __m128 vi0 = _mm_loadu_ps(i0);
6254 i0 += 4;
6255 const __m128 vi1 = _mm_loadu_ps(i1);
6256 i1 += 4;
6257 const __m128 vi2 = _mm_loadu_ps(i2);
6258 i2 += 4;
6259 const __m128 vi3 = _mm_loadu_ps(i3);
6260 i3 += 4;
6261 const __m128 vi4 = _mm_loadu_ps(i4);
6262 i4 += 4;
6263 const __m128 vi5 = _mm_loadu_ps(i5);
6264 i5 += 4;
6265 const __m128 vi6 = _mm_loadu_ps(i6);
6266 i6 += 4;
6267 const __m128 vi7 = _mm_loadu_ps(i7);
6268 i7 += 4;
6269 const __m128 vacc = _mm_load_ps(b);
6270 b += 4;
6271
6272 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6273 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6274 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6275 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6276 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6277 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6278 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6279 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6280
6281 __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6282 vout = _mm_max_ps(vout, voutput_min);
6283 vout = _mm_min_ps(vout, voutput_max);
6284
6285 _mm_storeu_ps(output, vout);
6286 output += 4;
6287
6288 c -= 4;
6289 }
6290 if (c != 0) {
6291 const __m128 vi0 = _mm_loadu_ps(i0);
6292 const __m128 vi1 = _mm_loadu_ps(i1);
6293 const __m128 vi2 = _mm_loadu_ps(i2);
6294 const __m128 vi3 = _mm_loadu_ps(i3);
6295 const __m128 vi4 = _mm_loadu_ps(i4);
6296 const __m128 vi5 = _mm_loadu_ps(i5);
6297 const __m128 vi6 = _mm_loadu_ps(i6);
6298 const __m128 vi7 = _mm_loadu_ps(i7);
6299 const __m128 vacc = _mm_load_ps(b);
6300
6301 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6302 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6303 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6304 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6305 const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6306 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6307 const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6308 const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6309
6310 __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6311 vout = _mm_max_ps(vout, voutput_min);
6312 vout = _mm_min_ps(vout, voutput_max);
6313
6314 if (c & 2) {
6315 _mm_storel_pi((__m64*) output, vout);
6316 vout = _mm_movehl_ps(vout, vout);
6317 output += 2;
6318 }
6319 if (c & 1) {
6320 _mm_store_ss(output, vout);
6321 output += 1;
6322 }
6323 }
6324 }
6325 output = (float*) ((uintptr_t) output + output_increment);
6326 } while (--output_pixels != 0);
6327 }
6328
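// Pixel-wise average pooling for windows of at most 9 elements: unused input
// pointers are redirected to the zero buffer, all rows are summed in one pass,
// and the sum is scaled by the per-pixel multiplier and clamped.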
6329 void xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(
6330 size_t output_pixels,
6331 size_t kernel_elements,
6332 size_t channels,
6333 const float** input,
6334 size_t input_offset,
6335 const float* zero,
6336 const float* multiplier,
6337 float* output,
6338 size_t input_increment,
6339 size_t output_increment,
6340 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6341 {
6342 assert(output_pixels != 0);
6343 assert(kernel_elements != 0);
6344 assert(kernel_elements <= 9);
6345 assert(channels != 0);
6346
6347 const __m128 voutput_min = _mm_load_ps(params->sse.min);
6348 const __m128 voutput_max = _mm_load_ps(params->sse.max);
6349
6350 do {
6351 const float* i0 = input[0];
6352 assert(i0 != NULL);
6353 const float* i1 = input[1];
6354 const float* i2 = input[2];
6355 const float* i3 = input[3];
6356 const float* i4 = input[4];
6357 const float* i5 = input[5];
6358 const float* i6 = input[6];
6359 const float* i7 = input[7];
6360 const float* i8 = input[8];
6361 input = (const float**) ((uintptr_t) input + input_increment);
6362 if (kernel_elements < 2) {
6363 i1 = zero;
6364 }
6365 assert(i1 != NULL);
6366 if (kernel_elements <= 2) {
6367 i2 = zero;
6368 }
6369 assert(i2 != NULL);
6370 if (kernel_elements < 4) {
6371 i3 = zero;
6372 }
6373 assert(i3 != NULL);
6374 if (kernel_elements <= 4) {
6375 i4 = zero;
6376 }
6377 assert(i4 != NULL);
6378 if (kernel_elements < 6) {
6379 i5 = zero;
6380 }
6381 assert(i5 != NULL);
6382 if (kernel_elements <= 6) {
6383 i6 = zero;
6384 }
6385 assert(i6 != NULL);
6386 if (kernel_elements < 8) {
6387 i7 = zero;
6388 }
6389 assert(i7 != NULL);
6390 if (kernel_elements <= 8) {
6391 i8 = zero;
6392 }
6393 assert(i8 != NULL);
6394 if XNN_UNPREDICTABLE(i0 != zero) {
6395 i0 = (const float*) ((uintptr_t) i0 + input_offset);
6396 }
6397 if XNN_UNPREDICTABLE(i1 != zero) {
6398 i1 = (const float*) ((uintptr_t) i1 + input_offset);
6399 }
6400 if XNN_UNPREDICTABLE(i2 != zero) {
6401 i2 = (const float*) ((uintptr_t) i2 + input_offset);
6402 }
6403 if XNN_UNPREDICTABLE(i3 != zero) {
6404 i3 = (const float*) ((uintptr_t) i3 + input_offset);
6405 }
6406 if XNN_UNPREDICTABLE(i4 != zero) {
6407 i4 = (const float*) ((uintptr_t) i4 + input_offset);
6408 }
6409 if XNN_UNPREDICTABLE(i5 != zero) {
6410 i5 = (const float*) ((uintptr_t) i5 + input_offset);
6411 }
6412 if XNN_UNPREDICTABLE(i6 != zero) {
6413 i6 = (const float*) ((uintptr_t) i6 + input_offset);
6414 }
6415 if XNN_UNPREDICTABLE(i7 != zero) {
6416 i7 = (const float*) ((uintptr_t) i7 + input_offset);
6417 }
6418 if XNN_UNPREDICTABLE(i8 != zero) {
6419 i8 = (const float*) ((uintptr_t) i8 + input_offset);
6420 }
6421
6422 const __m128 vmultiplier = _mm_load1_ps(multiplier);
6423 multiplier += 1;
6424
6425 size_t c = channels;
6426 while (c >= 4) {
6427 const __m128 vi0 = _mm_loadu_ps(i0);
6428 i0 += 4;
6429 const __m128 vi1 = _mm_loadu_ps(i1);
6430 i1 += 4;
6431 const __m128 vi2 = _mm_loadu_ps(i2);
6432 i2 += 4;
6433 const __m128 vi3 = _mm_loadu_ps(i3);
6434 i3 += 4;
6435 const __m128 vi4 = _mm_loadu_ps(i4);
6436 i4 += 4;
6437 const __m128 vi5 = _mm_loadu_ps(i5);
6438 i5 += 4;
6439 const __m128 vi6 = _mm_loadu_ps(i6);
6440 i6 += 4;
6441 const __m128 vi7 = _mm_loadu_ps(i7);
6442 i7 += 4;
6443 const __m128 vi8 = _mm_loadu_ps(i8);
6444 i8 += 4;
6445
6446 const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
6447 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6448 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6449 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6450
6451 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6452 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6453 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6454
6455 __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6456 vout = _mm_max_ps(vout, voutput_min);
6457 vout = _mm_min_ps(vout, voutput_max);
6458
6459 _mm_storeu_ps(output, vout); output += 4;
6460
6461 c -= 4;
6462 }
6463 if (c != 0) {
6464 const __m128 vi0 = _mm_loadu_ps(i0);
6465 const __m128 vi1 = _mm_loadu_ps(i1);
6466 const __m128 vi2 = _mm_loadu_ps(i2);
6467 const __m128 vi3 = _mm_loadu_ps(i3);
6468 const __m128 vi4 = _mm_loadu_ps(i4);
6469 const __m128 vi5 = _mm_loadu_ps(i5);
6470 const __m128 vi6 = _mm_loadu_ps(i6);
6471 const __m128 vi7 = _mm_loadu_ps(i7);
6472 const __m128 vi8 = _mm_loadu_ps(i8);
6473
6474 const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6475 const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6476 const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6477 const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6478 const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6479 const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6480 const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6481 const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6482
6483 __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6484 vout = _mm_max_ps(vout, voutput_min);
6485 vout = _mm_min_ps(vout, voutput_max);
6486
6487 if (c & 2) {
6488 _mm_storel_pi((__m64*) output, vout);
6489 vout = _mm_movehl_ps(vout, vout);
6490 output += 2;
6491 }
6492 if (c & 1) {
6493 _mm_store_ss(output, vout);
6494 output += 1;
6495 }
6496 }
6497 output = (float*) ((uintptr_t) output + output_increment);
6498 } while (--output_pixels != 0);
6499 }
6500
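// Horizontal maximum over a buffer of floats: four accumulators consume 16
// elements per iteration, a single-register loop handles the next multiples
// of 4, and the remaining elements are folded in one at a time.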
6501 void xnn_f32_rmax_ukernel__sse(
6502 size_t n,
6503 const float* x,
6504 float* y)
6505 {
6506 assert(n != 0);
6507 assert(n % sizeof(float) == 0);
6508
6509 __m128 vmax0 = _mm_load_ss(x);
6510 vmax0 = _mm_shuffle_ps(vmax0, vmax0, _MM_SHUFFLE(0, 0, 0, 0));
6511 __m128 vmax1 = vmax0;
6512 __m128 vmax2 = vmax0;
6513 __m128 vmax3 = vmax0;
6514 for (; n >= 64; n -= 64) {
6515 const __m128 vx0 = _mm_loadu_ps(x);
6516 const __m128 vx1 = _mm_loadu_ps(x + 4);
6517 const __m128 vx2 = _mm_loadu_ps(x + 8);
6518 const __m128 vx3 = _mm_loadu_ps(x + 12);
6519 x += 16;
6520
6521 vmax0 = _mm_max_ps(vmax0, vx0);
6522 vmax1 = _mm_max_ps(vmax1, vx1);
6523 vmax2 = _mm_max_ps(vmax2, vx2);
6524 vmax3 = _mm_max_ps(vmax3, vx3);
6525 }
6526 __m128 vmax = _mm_max_ps(_mm_max_ps(vmax0, vmax1), _mm_max_ps(vmax2, vmax3));
6527 for (; n >= 16; n -= 16) {
6528 const __m128 vx = _mm_loadu_ps(x);
6529 vmax = _mm_max_ps(vmax, vx);
6530 x += 4;
6531 }
6532 __m128 vmax_lo = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
6533 vmax_lo = _mm_max_ss(vmax_lo, _mm_shuffle_ps(vmax_lo, vmax_lo, _MM_SHUFFLE(3, 3, 1, 1)));
6534 if XNN_UNLIKELY(n != 0) {
6535 do {
6536 vmax_lo = _mm_max_ss(vmax_lo, _mm_load_ss(x));
6537 x += 1;
6538 n -= 4;
6539 } while (n != 0);
6540 }
6541 _mm_store_ss(y, vmax_lo);
6542 }
6543
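// Sparse-times-dense multiplication (32x1 tile): for each output channel the
// non-zero weights are broadcast against 32 consecutive input elements, with
// the input pointer advanced by the recorded index differences; narrower
// paths handle 16/8/4/2/1 leftover elements.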
6544 void xnn_f32_spmm_minmax_ukernel_32x1__sse(
6545 size_t mc,
6546 size_t nc,
6547 const float*restrict input,
6548 const float*restrict weights,
6549 const int32_t*restrict widx_dmap,
6550 const uint32_t*restrict nidx_nnzmap,
6551 float*restrict output,
6552 size_t output_stride,
6553 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
6554 {
6555 assert(mc != 0);
6556 assert(mc % sizeof(float) == 0);
6557 assert(nc != 0);
6558
6559 const __m128 vmin = _mm_load_ps(params->sse.min);
6560 const __m128 vmax = _mm_load_ps(params->sse.max);
6561 size_t output_decrement = output_stride * nc - 32 * sizeof(float);
6562 while XNN_LIKELY(mc >= 32 * sizeof(float)) {
6563 const float*restrict w = weights;
6564 const int32_t* dmap = widx_dmap;
6565 const uint32_t* nnzmap = nidx_nnzmap;
6566 size_t n = nc;
6567 do {
6568 uint32_t nnz = *nnzmap++;
6569 __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6570 __m128 vacc4567 = vacc0123;
6571 __m128 vacc89AB = vacc0123;
6572 __m128 vaccCDEF = vacc0123;
6573 __m128 vaccGHIJ = vacc0123;
6574 __m128 vaccKLMN = vacc0123;
6575 __m128 vaccOPQR = vacc0123;
6576 __m128 vaccSTUV = vacc0123;
6577 if XNN_LIKELY(nnz != 0) {
6578 do {
6579 const intptr_t diff = *dmap++;
6580 const __m128 vi0123 = _mm_loadu_ps(input);
6581 const __m128 vi4567 = _mm_loadu_ps(input + 4);
6582 const __m128 vi89AB = _mm_loadu_ps(input + 8);
6583 const __m128 viCDEF = _mm_loadu_ps(input + 12);
6584 const __m128 viGHIJ = _mm_loadu_ps(input + 16);
6585 const __m128 viKLMN = _mm_loadu_ps(input + 20);
6586 const __m128 viOPQR = _mm_loadu_ps(input + 24);
6587 const __m128 viSTUV = _mm_loadu_ps(input + 28);
6588 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6589 const __m128 vw = _mm_load1_ps(w); w += 1;
6590 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6591 vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6592 vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6593 vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6594 vaccGHIJ = _mm_add_ps(vaccGHIJ, _mm_mul_ps(viGHIJ, vw));
6595 vaccKLMN = _mm_add_ps(vaccKLMN, _mm_mul_ps(viKLMN, vw));
6596 vaccOPQR = _mm_add_ps(vaccOPQR, _mm_mul_ps(viOPQR, vw));
6597 vaccSTUV = _mm_add_ps(vaccSTUV, _mm_mul_ps(viSTUV, vw));
6598 } while (--nnz != 0);
6599 }
6600 __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6601 __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6602 __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6603 __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6604 __m128 voutGHIJ = _mm_min_ps(vaccGHIJ, vmax);
6605 __m128 voutKLMN = _mm_min_ps(vaccKLMN, vmax);
6606 __m128 voutOPQR = _mm_min_ps(vaccOPQR, vmax);
6607 __m128 voutSTUV = _mm_min_ps(vaccSTUV, vmax);
6608 vout0123 = _mm_max_ps(vout0123, vmin);
6609 vout4567 = _mm_max_ps(vout4567, vmin);
6610 vout89AB = _mm_max_ps(vout89AB, vmin);
6611 voutCDEF = _mm_max_ps(voutCDEF, vmin);
6612 voutGHIJ = _mm_max_ps(voutGHIJ, vmin);
6613 voutKLMN = _mm_max_ps(voutKLMN, vmin);
6614 voutOPQR = _mm_max_ps(voutOPQR, vmin);
6615 voutSTUV = _mm_max_ps(voutSTUV, vmin);
6616 _mm_storeu_ps(output, vout0123);
6617 _mm_storeu_ps(output + 4, vout4567);
6618 _mm_storeu_ps(output + 8, vout89AB);
6619 _mm_storeu_ps(output + 12, voutCDEF);
6620 _mm_storeu_ps(output + 16, voutGHIJ);
6621 _mm_storeu_ps(output + 20, voutKLMN);
6622 _mm_storeu_ps(output + 24, voutOPQR);
6623 _mm_storeu_ps(output + 28, voutSTUV);
6624 output = (float*restrict) ((uintptr_t) output + output_stride);
6625 } while (--n != 0);
6626 output = (float*restrict) ((uintptr_t) output - output_decrement);
6627 input += 32;
6628 mc -= 32 * sizeof(float);
6629 }
6630 if XNN_UNLIKELY(mc != 0) {
6631 output_decrement += 16 * sizeof(float);
6632 if (mc & (16 * sizeof(float))) {
6633 const float*restrict w = weights;
6634 const int32_t* dmap = widx_dmap;
6635 const uint32_t* nnzmap = nidx_nnzmap;
6636 size_t n = nc;
6637 do {
6638 uint32_t nnz = *nnzmap++;
6639 __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6640 __m128 vacc4567 = vacc0123;
6641 __m128 vacc89AB = vacc0123;
6642 __m128 vaccCDEF = vacc0123;
6643 if XNN_LIKELY(nnz != 0) {
6644 do {
6645 const intptr_t diff = *dmap++;
6646 const __m128 vi0123 = _mm_loadu_ps(input);
6647 const __m128 vi4567 = _mm_loadu_ps(input + 4);
6648 const __m128 vi89AB = _mm_loadu_ps(input + 8);
6649 const __m128 viCDEF = _mm_loadu_ps(input + 12);
6650 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6651 const __m128 vw = _mm_load1_ps(w); w += 1;
6652 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6653 vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6654 vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6655 vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6656 } while (--nnz != 0);
6657 }
6658 __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6659 __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6660 __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6661 __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6662 vout0123 = _mm_max_ps(vout0123, vmin);
6663 vout4567 = _mm_max_ps(vout4567, vmin);
6664 vout89AB = _mm_max_ps(vout89AB, vmin);
6665 voutCDEF = _mm_max_ps(voutCDEF, vmin);
6666 _mm_storeu_ps(output, vout0123);
6667 _mm_storeu_ps(output + 4, vout4567);
6668 _mm_storeu_ps(output + 8, vout89AB);
6669 _mm_storeu_ps(output + 12, voutCDEF);
6670 output = (float*restrict) ((uintptr_t) output + output_stride);
6671 } while (--n != 0);
6672 output = (float*restrict) ((uintptr_t) output - output_decrement);
6673 input += 16;
6674 }
6675 output_decrement += 8 * sizeof(float);
6676 if (mc & (8 * sizeof(float))) {
6677 const float*restrict w = weights;
6678 const int32_t* dmap = widx_dmap;
6679 const uint32_t* nnzmap = nidx_nnzmap;
6680 size_t n = nc;
6681 do {
6682 uint32_t nnz = *nnzmap++;
6683 __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6684 __m128 vacc4567 = vacc0123;
6685 if XNN_LIKELY(nnz != 0) {
6686 do {
6687 const intptr_t diff = *dmap++;
6688 const __m128 vi0123 = _mm_loadu_ps(input);
6689 const __m128 vi4567 = _mm_loadu_ps(input + 4);
6690 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6691 const __m128 vw = _mm_load1_ps(w); w += 1;
6692 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6693 vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6694 } while (--nnz != 0);
6695 }
6696 __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6697 __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6698 vout0123 = _mm_max_ps(vout0123, vmin);
6699 vout4567 = _mm_max_ps(vout4567, vmin);
6700 _mm_storeu_ps(output, vout0123);
6701 _mm_storeu_ps(output + 4, vout4567);
6702 output = (float*restrict) ((uintptr_t) output + output_stride);
6703 } while (--n != 0);
6704 output = (float*restrict) ((uintptr_t) output - output_decrement);
6705 input += 8;
6706 }
6707 output_decrement += 4 * sizeof(float);
6708 if (mc & (4 * sizeof(float))) {
6709 const float*restrict w = weights;
6710 const int32_t* dmap = widx_dmap;
6711 const uint32_t* nnzmap = nidx_nnzmap;
6712 size_t n = nc;
6713 do {
6714 uint32_t nnz = *nnzmap++;
6715 __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6716 if XNN_LIKELY(nnz != 0) {
6717 do {
6718 const intptr_t diff = *dmap++;
6719 const __m128 vi0123 = _mm_loadu_ps(input);
6720 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6721 const __m128 vw = _mm_load1_ps(w); w += 1;
6722 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6723 } while (--nnz != 0);
6724 }
6725 __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6726 vout0123 = _mm_max_ps(vout0123, vmin);
6727 _mm_storeu_ps(output, vout0123);
6728 output = (float*restrict) ((uintptr_t) output + output_stride);
6729 } while (--n != 0);
6730 output = (float*restrict) ((uintptr_t) output - output_decrement);
6731 input += 4;
6732 }
6733 output_decrement += 2 * sizeof(float);
6734 if (mc & (2 * sizeof(float))) {
6735 const float*restrict w = weights;
6736 const int32_t* dmap = widx_dmap;
6737 const uint32_t* nnzmap = nidx_nnzmap;
6738 size_t n = nc;
6739 do {
6740 uint32_t nnz = *nnzmap++;
6741 __m128 vacc01 = _mm_load_ss(w); w += 1;
6742 vacc01 = _mm_unpacklo_ps(vacc01, vacc01);
6743 if XNN_LIKELY(nnz != 0) {
6744 do {
6745 const intptr_t diff = *dmap++;
6746 const __m128 vi01 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) input);
6747 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6748 __m128 vw = _mm_load_ss(w); w += 1;
6749 vw = _mm_unpacklo_ps(vw, vw);
6750 vacc01 = _mm_add_ps(vacc01, _mm_mul_ps(vi01, vw));
6751 } while (--nnz != 0);
6752 }
6753 __m128 vout01 = _mm_min_ps(vacc01, vmax);
6754 vout01 = _mm_max_ps(vout01, vmin);
6755 _mm_storel_pi((__m64*) output, vout01);
6756 output = (float*restrict) ((uintptr_t) output + output_stride);
6757 } while (--n != 0);
6758 output = (float*restrict) ((uintptr_t) output - output_decrement);
6759 input += 2;
6760 }
6761 output_decrement += 1 * sizeof(float);
6762 if (mc & (1 * sizeof(float))) {
6763 const float*restrict w = weights;
6764 const int32_t* dmap = widx_dmap;
6765 const uint32_t* nnzmap = nidx_nnzmap;
6766 size_t n = nc;
6767 do {
6768 uint32_t nnz = *nnzmap++;
6769 __m128 vacc0 = _mm_load_ss(w); w += 1;
6770 if XNN_LIKELY(nnz != 0) {
6771 do {
6772 const intptr_t diff = *dmap++;
6773 const __m128 vi0 = _mm_load_ss(input);
6774 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6775 const __m128 vw = _mm_load_ss(w); w += 1;
6776 vacc0 = _mm_add_ss(vacc0, _mm_mul_ss(vi0, vw));
6777 } while (--nnz != 0);
6778 }
6779 __m128 vout0 = _mm_min_ss(vacc0, vmax);
6780 vout0 = _mm_max_ss(vout0, vmin);
6781 _mm_store_ss(output, vout0);
6782 output = (float*restrict) ((uintptr_t) output + output_stride);
6783 } while (--n != 0);
6784 output = (float*restrict) ((uintptr_t) output - output_decrement);
6785 input += 1;
6786 }
6787 }
6788 }
6789
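// Element-wise addition with output clamping: 8 floats per main iteration,
// then 4, with a partial store for the last 1-3 elements.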
6790 void xnn_f32_vadd_minmax_ukernel__sse_x8(
6791 size_t n,
6792 const float* a,
6793 const float* b,
6794 float* y,
6795 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6796 {
6797 assert(n != 0);
6798 assert(n % sizeof(float) == 0);
6799 assert(a != NULL);
6800 assert(b != NULL);
6801 assert(y != NULL);
6802
6803 const __m128 vy_min = _mm_load_ps(params->sse.min);
6804 const __m128 vy_max = _mm_load_ps(params->sse.max);
6805
6806 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6807 const __m128 va0123 = _mm_loadu_ps(a);
6808 const __m128 va4567 = _mm_loadu_ps(a + 4);
6809 a += 8;
6810
6811 const __m128 vb0123 = _mm_loadu_ps(b);
6812 const __m128 vb4567 = _mm_loadu_ps(b + 4);
6813 b += 8;
6814
6815 __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6816 __m128 vy4567 = _mm_add_ps(va4567, vb4567);
6817
6818
6819 vy0123 = _mm_max_ps(vy0123, vy_min);
6820 vy4567 = _mm_max_ps(vy4567, vy_min);
6821
6822 vy0123 = _mm_min_ps(vy0123, vy_max);
6823 vy4567 = _mm_min_ps(vy4567, vy_max);
6824
6825 _mm_storeu_ps(y, vy0123);
6826 _mm_storeu_ps(y + 4, vy4567);
6827 y += 8;
6828 }
6829 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6830 const __m128 va0123 = _mm_loadu_ps(a);
6831 a += 4;
6832
6833 const __m128 vb0123 = _mm_loadu_ps(b);
6834 b += 4;
6835
6836 __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6837 vy0123 = _mm_max_ps(vy0123, vy_min);
6838 vy0123 = _mm_min_ps(vy0123, vy_max);
6839 _mm_storeu_ps(y, vy0123);
6840 y += 4;
6841 }
6842 if XNN_UNLIKELY(n != 0) {
6843 const __m128 va0123 = _mm_loadu_ps(a);
6844 const __m128 vb0123 = _mm_loadu_ps(b);
6845
6846 __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6847 vy0123 = _mm_max_ps(vy0123, vy_min);
6848 vy0123 = _mm_min_ps(vy0123, vy_max);
6849 if (n & (2 * sizeof(float))) {
6850 _mm_storel_pi((__m64*) y, vy0123);
6851 vy0123 = _mm_movehl_ps(vy0123, vy0123);
6852 y += 2;
6853 }
6854 if (n & (1 * sizeof(float))) {
6855 _mm_store_ss(y, vy0123);
6856 }
6857 }
6858 }
6859
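// Element-wise addition of a broadcast scalar (second operand) with output
// clamping.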
6860 void xnn_f32_vaddc_minmax_ukernel__sse_x8(
6861 size_t n,
6862 const float* a,
6863 const float* b,
6864 float* y,
6865 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6866 {
6867 assert(n != 0);
6868 assert(n % sizeof(float) == 0);
6869 assert(a != NULL);
6870 assert(b != NULL);
6871 assert(y != NULL);
6872
6873 const __m128 vy_min = _mm_load_ps(params->sse.min);
6874 const __m128 vy_max = _mm_load_ps(params->sse.max);
6875
6876 const __m128 vb = _mm_load1_ps(b);
6877 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6878 const __m128 va0123 = _mm_loadu_ps(a);
6879 const __m128 va4567 = _mm_loadu_ps(a + 4);
6880 a += 8;
6881
6882 __m128 vy0123 = _mm_add_ps(va0123, vb);
6883 __m128 vy4567 = _mm_add_ps(va4567, vb);
6884
6885
6886 vy0123 = _mm_max_ps(vy0123, vy_min);
6887 vy4567 = _mm_max_ps(vy4567, vy_min);
6888
6889 vy0123 = _mm_min_ps(vy0123, vy_max);
6890 vy4567 = _mm_min_ps(vy4567, vy_max);
6891
6892 _mm_storeu_ps(y, vy0123);
6893 _mm_storeu_ps(y + 4, vy4567);
6894 y += 8;
6895 }
6896 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6897 const __m128 va0123 = _mm_loadu_ps(a);
6898 a += 4;
6899
6900 __m128 vy0123 = _mm_add_ps(va0123, vb);
6901 vy0123 = _mm_max_ps(vy0123, vy_min);
6902 vy0123 = _mm_min_ps(vy0123, vy_max);
6903 _mm_storeu_ps(y, vy0123);
6904 y += 4;
6905 }
6906 if XNN_UNLIKELY(n != 0) {
6907 const __m128 va0123 = _mm_loadu_ps(a);
6908
6909 __m128 vy0123 = _mm_add_ps(va0123, vb);
6910 vy0123 = _mm_max_ps(vy0123, vy_min);
6911 vy0123 = _mm_min_ps(vy0123, vy_max);
6912 if (n & (2 * sizeof(float))) {
6913 _mm_storel_pi((__m64*) y, vy0123);
6914 vy0123 = _mm_movehl_ps(vy0123, vy0123);
6915 y += 2;
6916 }
6917 if (n & (1 * sizeof(float))) {
6918 _mm_store_ss(y, vy0123);
6919 }
6920 }
6921 }
6922
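// Element-wise division with output clamping: 8 floats per main iteration.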
6923 void xnn_f32_vdiv_minmax_ukernel__sse_x8(
6924 size_t n,
6925 const float* a,
6926 const float* b,
6927 float* y,
6928 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6929 {
6930 assert(n != 0);
6931 assert(n % sizeof(float) == 0);
6932 assert(a != NULL);
6933 assert(b != NULL);
6934 assert(y != NULL);
6935
6936 const __m128 vy_min = _mm_load_ps(params->sse.min);
6937 const __m128 vy_max = _mm_load_ps(params->sse.max);
6938
6939 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6940 const __m128 va0123 = _mm_loadu_ps(a);
6941 const __m128 va4567 = _mm_loadu_ps(a + 4);
6942 a += 8;
6943
6944 const __m128 vb0123 = _mm_loadu_ps(b);
6945 const __m128 vb4567 = _mm_loadu_ps(b + 4);
6946 b += 8;
6947
6948 __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6949 __m128 vy4567 = _mm_div_ps(va4567, vb4567);
6950
6951
6952 vy0123 = _mm_max_ps(vy0123, vy_min);
6953 vy4567 = _mm_max_ps(vy4567, vy_min);
6954
6955 vy0123 = _mm_min_ps(vy0123, vy_max);
6956 vy4567 = _mm_min_ps(vy4567, vy_max);
6957
6958 _mm_storeu_ps(y, vy0123);
6959 _mm_storeu_ps(y + 4, vy4567);
6960 y += 8;
6961 }
6962 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6963 const __m128 va0123 = _mm_loadu_ps(a);
6964 a += 4;
6965
6966 const __m128 vb0123 = _mm_loadu_ps(b);
6967 b += 4;
6968
6969 __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6970 vy0123 = _mm_max_ps(vy0123, vy_min);
6971 vy0123 = _mm_min_ps(vy0123, vy_max);
6972 _mm_storeu_ps(y, vy0123);
6973 y += 4;
6974 }
6975 if XNN_UNLIKELY(n != 0) {
6976 const __m128 va0123 = _mm_loadu_ps(a);
6977 const __m128 vb0123 = _mm_loadu_ps(b);
6978
6979 __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6980 vy0123 = _mm_max_ps(vy0123, vy_min);
6981 vy0123 = _mm_min_ps(vy0123, vy_max);
6982 if (n & (2 * sizeof(float))) {
6983 _mm_storel_pi((__m64*) y, vy0123);
6984 vy0123 = _mm_movehl_ps(vy0123, vy0123);
6985 y += 2;
6986 }
6987 if (n & (1 * sizeof(float))) {
6988 _mm_store_ss(y, vy0123);
6989 }
6990 }
6991 }
6992
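// Element-wise division by a broadcast scalar (second operand) with output
// clamping.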
6993 void xnn_f32_vdivc_minmax_ukernel__sse_x8(
6994 size_t n,
6995 const float* a,
6996 const float* b,
6997 float* y,
6998 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6999 {
7000 assert(n != 0);
7001 assert(n % sizeof(float) == 0);
7002 assert(a != NULL);
7003 assert(b != NULL);
7004 assert(y != NULL);
7005
7006 const __m128 vy_min = _mm_load_ps(params->sse.min);
7007 const __m128 vy_max = _mm_load_ps(params->sse.max);
7008
7009 const __m128 vb = _mm_load1_ps(b);
7010 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7011 const __m128 va0123 = _mm_loadu_ps(a);
7012 const __m128 va4567 = _mm_loadu_ps(a + 4);
7013 a += 8;
7014
7015 __m128 vy0123 = _mm_div_ps(va0123, vb);
7016 __m128 vy4567 = _mm_div_ps(va4567, vb);
7017
7018
7019 vy0123 = _mm_max_ps(vy0123, vy_min);
7020 vy4567 = _mm_max_ps(vy4567, vy_min);
7021
7022 vy0123 = _mm_min_ps(vy0123, vy_max);
7023 vy4567 = _mm_min_ps(vy4567, vy_max);
7024
7025 _mm_storeu_ps(y, vy0123);
7026 _mm_storeu_ps(y + 4, vy4567);
7027 y += 8;
7028 }
7029 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7030 const __m128 va0123 = _mm_loadu_ps(a);
7031 a += 4;
7032
7033 __m128 vy0123 = _mm_div_ps(va0123, vb);
7034 vy0123 = _mm_max_ps(vy0123, vy_min);
7035 vy0123 = _mm_min_ps(vy0123, vy_max);
7036 _mm_storeu_ps(y, vy0123);
7037 y += 4;
7038 }
7039 if XNN_UNLIKELY(n != 0) {
7040 const __m128 va0123 = _mm_loadu_ps(a);
7041
7042 __m128 vy0123 = _mm_div_ps(va0123, vb);
7043 vy0123 = _mm_max_ps(vy0123, vy_min);
7044 vy0123 = _mm_min_ps(vy0123, vy_max);
7045 if (n & (2 * sizeof(float))) {
7046 _mm_storel_pi((__m64*) y, vy0123);
7047 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7048 y += 2;
7049 }
7050 if (n & (1 * sizeof(float))) {
7051 _mm_store_ss(y, vy0123);
7052 }
7053 }
7054 }
7055
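// Element-wise maximum of two vectors; the default params are unused.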
7056 void xnn_f32_vmax_ukernel__sse_x8(
7057 size_t n,
7058 const float* a,
7059 const float* b,
7060 float* y,
7061 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7062 {
7063 assert(n != 0);
7064 assert(n % sizeof(float) == 0);
7065 assert(a != NULL);
7066 assert(b != NULL);
7067 assert(y != NULL);
7068
7069
7070 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7071 const __m128 va0123 = _mm_loadu_ps(a);
7072 const __m128 va4567 = _mm_loadu_ps(a + 4);
7073 a += 8;
7074
7075 const __m128 vb0123 = _mm_loadu_ps(b);
7076 const __m128 vb4567 = _mm_loadu_ps(b + 4);
7077 b += 8;
7078
7079 __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7080 __m128 vy4567 = _mm_max_ps(va4567, vb4567);
7081
7082
7083
7084 _mm_storeu_ps(y, vy0123);
7085 _mm_storeu_ps(y + 4, vy4567);
7086 y += 8;
7087 }
7088 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7089 const __m128 va0123 = _mm_loadu_ps(a);
7090 a += 4;
7091
7092 const __m128 vb0123 = _mm_loadu_ps(b);
7093 b += 4;
7094
7095 __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7096 _mm_storeu_ps(y, vy0123);
7097 y += 4;
7098 }
7099 if XNN_UNLIKELY(n != 0) {
7100 const __m128 va0123 = _mm_loadu_ps(a);
7101 const __m128 vb0123 = _mm_loadu_ps(b);
7102
7103 __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7104 if (n & (2 * sizeof(float))) {
7105 _mm_storel_pi((__m64*) y, vy0123);
7106 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7107 y += 2;
7108 }
7109 if (n & (1 * sizeof(float))) {
7110 _mm_store_ss(y, vy0123);
7111 }
7112 }
7113 }
7114
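// Element-wise maximum against a broadcast scalar (second operand).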
7115 void xnn_f32_vmaxc_ukernel__sse_x8(
7116 size_t n,
7117 const float* a,
7118 const float* b,
7119 float* y,
7120 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7121 {
7122 assert(n != 0);
7123 assert(n % sizeof(float) == 0);
7124 assert(a != NULL);
7125 assert(b != NULL);
7126 assert(y != NULL);
7127
7128
7129 const __m128 vb = _mm_load1_ps(b);
7130 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7131 const __m128 va0123 = _mm_loadu_ps(a);
7132 const __m128 va4567 = _mm_loadu_ps(a + 4);
7133 a += 8;
7134
7135 __m128 vy0123 = _mm_max_ps(va0123, vb);
7136 __m128 vy4567 = _mm_max_ps(va4567, vb);
7137
7138
7139
7140 _mm_storeu_ps(y, vy0123);
7141 _mm_storeu_ps(y + 4, vy4567);
7142 y += 8;
7143 }
7144 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7145 const __m128 va0123 = _mm_loadu_ps(a);
7146 a += 4;
7147
7148 __m128 vy0123 = _mm_max_ps(va0123, vb);
7149 _mm_storeu_ps(y, vy0123);
7150 y += 4;
7151 }
7152 if XNN_UNLIKELY(n != 0) {
7153 const __m128 va0123 = _mm_loadu_ps(a);
7154
7155 __m128 vy0123 = _mm_max_ps(va0123, vb);
7156 if (n & (2 * sizeof(float))) {
7157 _mm_storel_pi((__m64*) y, vy0123);
7158 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7159 y += 2;
7160 }
7161 if (n & (1 * sizeof(float))) {
7162 _mm_store_ss(y, vy0123);
7163 }
7164 }
7165 }
7166
7167 void xnn_f32_vmin_ukernel__sse_x8(
7168 size_t n,
7169 const float* a,
7170 const float* b,
7171 float* y,
7172 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7173 {
7174 assert(n != 0);
7175 assert(n % sizeof(float) == 0);
7176 assert(a != NULL);
7177 assert(b != NULL);
7178 assert(y != NULL);
7179
7180
7181 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7182 const __m128 va0123 = _mm_loadu_ps(a);
7183 const __m128 va4567 = _mm_loadu_ps(a + 4);
7184 a += 8;
7185
7186 const __m128 vb0123 = _mm_loadu_ps(b);
7187 const __m128 vb4567 = _mm_loadu_ps(b + 4);
7188 b += 8;
7189
7190 __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7191 __m128 vy4567 = _mm_min_ps(va4567, vb4567);
7192
7193
7194
7195 _mm_storeu_ps(y, vy0123);
7196 _mm_storeu_ps(y + 4, vy4567);
7197 y += 8;
7198 }
7199 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7200 const __m128 va0123 = _mm_loadu_ps(a);
7201 a += 4;
7202
7203 const __m128 vb0123 = _mm_loadu_ps(b);
7204 b += 4;
7205
7206 __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7207 _mm_storeu_ps(y, vy0123);
7208 y += 4;
7209 }
7210 if XNN_UNLIKELY(n != 0) {
7211 const __m128 va0123 = _mm_loadu_ps(a);
7212 const __m128 vb0123 = _mm_loadu_ps(b);
7213
7214 __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7215 if (n & (2 * sizeof(float))) {
7216 _mm_storel_pi((__m64*) y, vy0123);
7217 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7218 y += 2;
7219 }
7220 if (n & (1 * sizeof(float))) {
7221 _mm_store_ss(y, vy0123);
7222 }
7223 }
7224 }
7225
7226 void xnn_f32_vminc_ukernel__sse_x8(
7227 size_t n,
7228 const float* a,
7229 const float* b,
7230 float* y,
7231 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7232 {
7233 assert(n != 0);
7234 assert(n % sizeof(float) == 0);
7235 assert(a != NULL);
7236 assert(b != NULL);
7237 assert(y != NULL);
7238
7239
7240 const __m128 vb = _mm_load1_ps(b);
7241 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7242 const __m128 va0123 = _mm_loadu_ps(a);
7243 const __m128 va4567 = _mm_loadu_ps(a + 4);
7244 a += 8;
7245
7246 __m128 vy0123 = _mm_min_ps(va0123, vb);
7247 __m128 vy4567 = _mm_min_ps(va4567, vb);
7248
7249
7250
7251 _mm_storeu_ps(y, vy0123);
7252 _mm_storeu_ps(y + 4, vy4567);
7253 y += 8;
7254 }
7255 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7256 const __m128 va0123 = _mm_loadu_ps(a);
7257 a += 4;
7258
7259 __m128 vy0123 = _mm_min_ps(va0123, vb);
7260 _mm_storeu_ps(y, vy0123);
7261 y += 4;
7262 }
7263 if XNN_UNLIKELY(n != 0) {
7264 const __m128 va0123 = _mm_loadu_ps(a);
7265
7266 __m128 vy0123 = _mm_min_ps(va0123, vb);
7267 if (n & (2 * sizeof(float))) {
7268 _mm_storel_pi((__m64*) y, vy0123);
7269 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7270 y += 2;
7271 }
7272 if (n & (1 * sizeof(float))) {
7273 _mm_store_ss(y, vy0123);
7274 }
7275 }
7276 }
7277
7278 void xnn_f32_vmul_minmax_ukernel__sse_x8(
7279 size_t n,
7280 const float* a,
7281 const float* b,
7282 float* y,
7283 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7284 {
7285 assert(n != 0);
7286 assert(n % sizeof(float) == 0);
7287 assert(a != NULL);
7288 assert(b != NULL);
7289 assert(y != NULL);
7290
7291 const __m128 vy_min = _mm_load_ps(params->sse.min);
7292 const __m128 vy_max = _mm_load_ps(params->sse.max);
7293
7294 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7295 const __m128 va0123 = _mm_loadu_ps(a);
7296 const __m128 va4567 = _mm_loadu_ps(a + 4);
7297 a += 8;
7298
7299 const __m128 vb0123 = _mm_loadu_ps(b);
7300 const __m128 vb4567 = _mm_loadu_ps(b + 4);
7301 b += 8;
7302
7303 __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7304 __m128 vy4567 = _mm_mul_ps(va4567, vb4567);
7305
7306
7307 vy0123 = _mm_max_ps(vy0123, vy_min);
7308 vy4567 = _mm_max_ps(vy4567, vy_min);
7309
7310 vy0123 = _mm_min_ps(vy0123, vy_max);
7311 vy4567 = _mm_min_ps(vy4567, vy_max);
7312
7313 _mm_storeu_ps(y, vy0123);
7314 _mm_storeu_ps(y + 4, vy4567);
7315 y += 8;
7316 }
7317 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7318 const __m128 va0123 = _mm_loadu_ps(a);
7319 a += 4;
7320
7321 const __m128 vb0123 = _mm_loadu_ps(b);
7322 b += 4;
7323
7324 __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7325 vy0123 = _mm_max_ps(vy0123, vy_min);
7326 vy0123 = _mm_min_ps(vy0123, vy_max);
7327 _mm_storeu_ps(y, vy0123);
7328 y += 4;
7329 }
7330 if XNN_UNLIKELY(n != 0) {
7331 const __m128 va0123 = _mm_loadu_ps(a);
7332 const __m128 vb0123 = _mm_loadu_ps(b);
7333
7334 __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7335 vy0123 = _mm_max_ps(vy0123, vy_min);
7336 vy0123 = _mm_min_ps(vy0123, vy_max);
7337 if (n & (2 * sizeof(float))) {
7338 _mm_storel_pi((__m64*) y, vy0123);
7339 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7340 y += 2;
7341 }
7342 if (n & (1 * sizeof(float))) {
7343 _mm_store_ss(y, vy0123);
7344 }
7345 }
7346 }
7347
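// The _minmax binary kernels (vmul, vsub, vrdiv/vrsub and their *c variants) fold the
// output clamp into the same pass: apply the operator, then max with the lower bound,
// then min with the upper bound. Judging from the _mm_load_ps(params->sse.min/max)
// usage, the sse params variant stores each bound replicated across four lanes; that
// layout is an inference from this file, not something it spells out. A scalar sketch
// of the vmul_minmax contract (illustrative helper, not XNNPACK API):
static void example_f32_vmul_minmax_scalar(size_t n, const float* a, const float* b,
                                           float* y, float ymin, float ymax) {
  for (size_t i = 0; i < n / sizeof(float); i++) {
    float v = a[i] * b[i];
    if (v < ymin) { v = ymin; }
    if (v > ymax) { v = ymax; }
    y[i] = v;
  }
}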
7348 void xnn_f32_vmulc_minmax_ukernel__sse_x8(
7349 size_t n,
7350 const float* a,
7351 const float* b,
7352 float* y,
7353 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7354 {
7355 assert(n != 0);
7356 assert(n % sizeof(float) == 0);
7357 assert(a != NULL);
7358 assert(b != NULL);
7359 assert(y != NULL);
7360
7361 const __m128 vy_min = _mm_load_ps(params->sse.min);
7362 const __m128 vy_max = _mm_load_ps(params->sse.max);
7363
7364 const __m128 vb = _mm_load1_ps(b);
7365 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7366 const __m128 va0123 = _mm_loadu_ps(a);
7367 const __m128 va4567 = _mm_loadu_ps(a + 4);
7368 a += 8;
7369
7370 __m128 vy0123 = _mm_mul_ps(va0123, vb);
7371 __m128 vy4567 = _mm_mul_ps(va4567, vb);
7372
7373
7374 vy0123 = _mm_max_ps(vy0123, vy_min);
7375 vy4567 = _mm_max_ps(vy4567, vy_min);
7376
7377 vy0123 = _mm_min_ps(vy0123, vy_max);
7378 vy4567 = _mm_min_ps(vy4567, vy_max);
7379
7380 _mm_storeu_ps(y, vy0123);
7381 _mm_storeu_ps(y + 4, vy4567);
7382 y += 8;
7383 }
7384 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7385 const __m128 va0123 = _mm_loadu_ps(a);
7386 a += 4;
7387
7388 __m128 vy0123 = _mm_mul_ps(va0123, vb);
7389 vy0123 = _mm_max_ps(vy0123, vy_min);
7390 vy0123 = _mm_min_ps(vy0123, vy_max);
7391 _mm_storeu_ps(y, vy0123);
7392 y += 4;
7393 }
7394 if XNN_UNLIKELY(n != 0) {
7395 const __m128 va0123 = _mm_loadu_ps(a);
7396
7397 __m128 vy0123 = _mm_mul_ps(va0123, vb);
7398 vy0123 = _mm_max_ps(vy0123, vy_min);
7399 vy0123 = _mm_min_ps(vy0123, vy_max);
7400 if (n & (2 * sizeof(float))) {
7401 _mm_storel_pi((__m64*) y, vy0123);
7402 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7403 y += 2;
7404 }
7405 if (n & (1 * sizeof(float))) {
7406 _mm_store_ss(y, vy0123);
7407 }
7408 }
7409 }
7410
7411 void xnn_f32_vrdivc_minmax_ukernel__sse_x8(
7412 size_t n,
7413 const float* a,
7414 const float* b,
7415 float* y,
7416 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7417 {
7418 assert(n != 0);
7419 assert(n % sizeof(float) == 0);
7420 assert(a != NULL);
7421 assert(b != NULL);
7422 assert(y != NULL);
7423
7424 const __m128 vy_min = _mm_load_ps(params->sse.min);
7425 const __m128 vy_max = _mm_load_ps(params->sse.max);
7426
7427 const __m128 vb = _mm_load1_ps(b);
7428 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7429 const __m128 va0123 = _mm_loadu_ps(a);
7430 const __m128 va4567 = _mm_loadu_ps(a + 4);
7431 a += 8;
7432
7433 __m128 vy0123 = _mm_div_ps(vb, va0123);
7434 __m128 vy4567 = _mm_div_ps(vb, va4567);
7435
7436
7437 vy0123 = _mm_max_ps(vy0123, vy_min);
7438 vy4567 = _mm_max_ps(vy4567, vy_min);
7439
7440 vy0123 = _mm_min_ps(vy0123, vy_max);
7441 vy4567 = _mm_min_ps(vy4567, vy_max);
7442
7443 _mm_storeu_ps(y, vy0123);
7444 _mm_storeu_ps(y + 4, vy4567);
7445 y += 8;
7446 }
7447 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7448 const __m128 va0123 = _mm_loadu_ps(a);
7449 a += 4;
7450
7451 __m128 vy0123 = _mm_div_ps(vb, va0123);
7452 vy0123 = _mm_max_ps(vy0123, vy_min);
7453 vy0123 = _mm_min_ps(vy0123, vy_max);
7454 _mm_storeu_ps(y, vy0123);
7455 y += 4;
7456 }
7457 if XNN_UNLIKELY(n != 0) {
7458 const __m128 va0123 = _mm_loadu_ps(a);
7459
7460 __m128 vy0123 = _mm_div_ps(vb, va0123);
7461 vy0123 = _mm_max_ps(vy0123, vy_min);
7462 vy0123 = _mm_min_ps(vy0123, vy_max);
7463 if (n & (2 * sizeof(float))) {
7464 _mm_storel_pi((__m64*) y, vy0123);
7465 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7466 y += 2;
7467 }
7468 if (n & (1 * sizeof(float))) {
7469 _mm_store_ss(y, vy0123);
7470 }
7471 }
7472 }
7473
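// vrdivc is the "reversed" broadcast divide: the scalar *b is the numerator, so
// y[i] = clamp(b[0] / a[i], min, max), whereas the non-reversed divc form divides
// a[i] by the broadcast scalar. Scalar sketch (illustrative only):
static void example_f32_vrdivc_scalar(size_t n, const float* a, float b,
                                      float* y, float ymin, float ymax) {
  for (size_t i = 0; i < n / sizeof(float); i++) {
    float v = b / a[i];
    y[i] = v < ymin ? ymin : (v > ymax ? ymax : v);
  }
}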
7474 void xnn_f32_vrsubc_minmax_ukernel__sse_x8(
7475 size_t n,
7476 const float* a,
7477 const float* b,
7478 float* y,
7479 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7480 {
7481 assert(n != 0);
7482 assert(n % sizeof(float) == 0);
7483 assert(a != NULL);
7484 assert(b != NULL);
7485 assert(y != NULL);
7486
7487 const __m128 vy_min = _mm_load_ps(params->sse.min);
7488 const __m128 vy_max = _mm_load_ps(params->sse.max);
7489
7490 const __m128 vb = _mm_load1_ps(b);
7491 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7492 const __m128 va0123 = _mm_loadu_ps(a);
7493 const __m128 va4567 = _mm_loadu_ps(a + 4);
7494 a += 8;
7495
7496 __m128 vy0123 = _mm_sub_ps(vb, va0123);
7497 __m128 vy4567 = _mm_sub_ps(vb, va4567);
7498
7499
7500 vy0123 = _mm_max_ps(vy0123, vy_min);
7501 vy4567 = _mm_max_ps(vy4567, vy_min);
7502
7503 vy0123 = _mm_min_ps(vy0123, vy_max);
7504 vy4567 = _mm_min_ps(vy4567, vy_max);
7505
7506 _mm_storeu_ps(y, vy0123);
7507 _mm_storeu_ps(y + 4, vy4567);
7508 y += 8;
7509 }
7510 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7511 const __m128 va0123 = _mm_loadu_ps(a);
7512 a += 4;
7513
7514 __m128 vy0123 = _mm_sub_ps(vb, va0123);
7515 vy0123 = _mm_max_ps(vy0123, vy_min);
7516 vy0123 = _mm_min_ps(vy0123, vy_max);
7517 _mm_storeu_ps(y, vy0123);
7518 y += 4;
7519 }
7520 if XNN_UNLIKELY(n != 0) {
7521 const __m128 va0123 = _mm_loadu_ps(a);
7522
7523 __m128 vy0123 = _mm_sub_ps(vb, va0123);
7524 vy0123 = _mm_max_ps(vy0123, vy_min);
7525 vy0123 = _mm_min_ps(vy0123, vy_max);
7526 if (n & (2 * sizeof(float))) {
7527 _mm_storel_pi((__m64*) y, vy0123);
7528 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7529 y += 2;
7530 }
7531 if (n & (1 * sizeof(float))) {
7532 _mm_store_ss(y, vy0123);
7533 }
7534 }
7535 }
7536
7537 void xnn_f32_vsqrdiff_ukernel__sse_x8(
7538 size_t n,
7539 const float* a,
7540 const float* b,
7541 float* y,
7542 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7543 {
7544 assert(n != 0);
7545 assert(n % sizeof(float) == 0);
7546 assert(a != NULL);
7547 assert(b != NULL);
7548 assert(y != NULL);
7549
7550
7551 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7552 const __m128 va0123 = _mm_loadu_ps(a);
7553 const __m128 va4567 = _mm_loadu_ps(a + 4);
7554 a += 8;
7555
7556 const __m128 vb0123 = _mm_loadu_ps(b);
7557 const __m128 vb4567 = _mm_loadu_ps(b + 4);
7558 b += 8;
7559
7560 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7561 __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7562
7563 vy0123 = _mm_mul_ps(vy0123, vy0123);
7564 vy4567 = _mm_mul_ps(vy4567, vy4567);
7565
7566
7567 _mm_storeu_ps(y, vy0123);
7568 _mm_storeu_ps(y + 4, vy4567);
7569 y += 8;
7570 }
7571 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7572 const __m128 va0123 = _mm_loadu_ps(a);
7573 a += 4;
7574
7575 const __m128 vb0123 = _mm_loadu_ps(b);
7576 b += 4;
7577
7578 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7579 vy0123 = _mm_mul_ps(vy0123, vy0123);
7580 _mm_storeu_ps(y, vy0123);
7581 y += 4;
7582 }
7583 if XNN_UNLIKELY(n != 0) {
7584 const __m128 va0123 = _mm_loadu_ps(a);
7585 const __m128 vb0123 = _mm_loadu_ps(b);
7586
7587 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7588 vy0123 = _mm_mul_ps(vy0123, vy0123);
7589 if (n & (2 * sizeof(float))) {
7590 _mm_storel_pi((__m64*) y, vy0123);
7591 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7592 y += 2;
7593 }
7594 if (n & (1 * sizeof(float))) {
7595 _mm_store_ss(y, vy0123);
7596 }
7597 }
7598 }
7599
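// vsqrdiff computes the elementwise squared difference; there is no clamp step here,
// and the op takes xnn_f32_default_params. Scalar sketch (illustrative only):
static void example_f32_vsqrdiff_scalar(size_t n, const float* a, const float* b, float* y) {
  for (size_t i = 0; i < n / sizeof(float); i++) {
    const float d = a[i] - b[i];
    y[i] = d * d;
  }
}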
7600 void xnn_f32_vsqrdiffc_ukernel__sse_x8(
7601 size_t n,
7602 const float* a,
7603 const float* b,
7604 float* y,
7605 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7606 {
7607 assert(n != 0);
7608 assert(n % sizeof(float) == 0);
7609 assert(a != NULL);
7610 assert(b != NULL);
7611 assert(y != NULL);
7612
7613
7614 const __m128 vb = _mm_load1_ps(b);
7615 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7616 const __m128 va0123 = _mm_loadu_ps(a);
7617 const __m128 va4567 = _mm_loadu_ps(a + 4);
7618 a += 8;
7619
7620 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7621 __m128 vy4567 = _mm_sub_ps(va4567, vb);
7622
7623 vy0123 = _mm_mul_ps(vy0123, vy0123);
7624 vy4567 = _mm_mul_ps(vy4567, vy4567);
7625
7626
7627 _mm_storeu_ps(y, vy0123);
7628 _mm_storeu_ps(y + 4, vy4567);
7629 y += 8;
7630 }
7631 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7632 const __m128 va0123 = _mm_loadu_ps(a);
7633 a += 4;
7634
7635 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7636 vy0123 = _mm_mul_ps(vy0123, vy0123);
7637 _mm_storeu_ps(y, vy0123);
7638 y += 4;
7639 }
7640 if XNN_UNLIKELY(n != 0) {
7641 const __m128 va0123 = _mm_loadu_ps(a);
7642
7643 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7644 vy0123 = _mm_mul_ps(vy0123, vy0123);
7645 if (n & (2 * sizeof(float))) {
7646 _mm_storel_pi((__m64*) y, vy0123);
7647 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7648 y += 2;
7649 }
7650 if (n & (1 * sizeof(float))) {
7651 _mm_store_ss(y, vy0123);
7652 }
7653 }
7654 }
7655
7656 void xnn_f32_vsub_minmax_ukernel__sse_x8(
7657 size_t n,
7658 const float* a,
7659 const float* b,
7660 float* y,
7661 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7662 {
7663 assert(n != 0);
7664 assert(n % sizeof(float) == 0);
7665 assert(a != NULL);
7666 assert(b != NULL);
7667 assert(y != NULL);
7668
7669 const __m128 vy_min = _mm_load_ps(params->sse.min);
7670 const __m128 vy_max = _mm_load_ps(params->sse.max);
7671
7672 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7673 const __m128 va0123 = _mm_loadu_ps(a);
7674 const __m128 va4567 = _mm_loadu_ps(a + 4);
7675 a += 8;
7676
7677 const __m128 vb0123 = _mm_loadu_ps(b);
7678 const __m128 vb4567 = _mm_loadu_ps(b + 4);
7679 b += 8;
7680
7681 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7682 __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7683
7684
7685 vy0123 = _mm_max_ps(vy0123, vy_min);
7686 vy4567 = _mm_max_ps(vy4567, vy_min);
7687
7688 vy0123 = _mm_min_ps(vy0123, vy_max);
7689 vy4567 = _mm_min_ps(vy4567, vy_max);
7690
7691 _mm_storeu_ps(y, vy0123);
7692 _mm_storeu_ps(y + 4, vy4567);
7693 y += 8;
7694 }
7695 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7696 const __m128 va0123 = _mm_loadu_ps(a);
7697 a += 4;
7698
7699 const __m128 vb0123 = _mm_loadu_ps(b);
7700 b += 4;
7701
7702 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7703 vy0123 = _mm_max_ps(vy0123, vy_min);
7704 vy0123 = _mm_min_ps(vy0123, vy_max);
7705 _mm_storeu_ps(y, vy0123);
7706 y += 4;
7707 }
7708 if XNN_UNLIKELY(n != 0) {
7709 const __m128 va0123 = _mm_loadu_ps(a);
7710 const __m128 vb0123 = _mm_loadu_ps(b);
7711
7712 __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7713 vy0123 = _mm_max_ps(vy0123, vy_min);
7714 vy0123 = _mm_min_ps(vy0123, vy_max);
7715 if (n & (2 * sizeof(float))) {
7716 _mm_storel_pi((__m64*) y, vy0123);
7717 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7718 y += 2;
7719 }
7720 if (n & (1 * sizeof(float))) {
7721 _mm_store_ss(y, vy0123);
7722 }
7723 }
7724 }
7725
7726 void xnn_f32_vsubc_minmax_ukernel__sse_x8(
7727 size_t n,
7728 const float* a,
7729 const float* b,
7730 float* y,
7731 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7732 {
7733 assert(n != 0);
7734 assert(n % sizeof(float) == 0);
7735 assert(a != NULL);
7736 assert(b != NULL);
7737 assert(y != NULL);
7738
7739 const __m128 vy_min = _mm_load_ps(params->sse.min);
7740 const __m128 vy_max = _mm_load_ps(params->sse.max);
7741
7742 const __m128 vb = _mm_load1_ps(b);
7743 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7744 const __m128 va0123 = _mm_loadu_ps(a);
7745 const __m128 va4567 = _mm_loadu_ps(a + 4);
7746 a += 8;
7747
7748 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7749 __m128 vy4567 = _mm_sub_ps(va4567, vb);
7750
7751
7752 vy0123 = _mm_max_ps(vy0123, vy_min);
7753 vy4567 = _mm_max_ps(vy4567, vy_min);
7754
7755 vy0123 = _mm_min_ps(vy0123, vy_max);
7756 vy4567 = _mm_min_ps(vy4567, vy_max);
7757
7758 _mm_storeu_ps(y, vy0123);
7759 _mm_storeu_ps(y + 4, vy4567);
7760 y += 8;
7761 }
7762 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7763 const __m128 va0123 = _mm_loadu_ps(a);
7764 a += 4;
7765
7766 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7767 vy0123 = _mm_max_ps(vy0123, vy_min);
7768 vy0123 = _mm_min_ps(vy0123, vy_max);
7769 _mm_storeu_ps(y, vy0123);
7770 y += 4;
7771 }
7772 if XNN_UNLIKELY(n != 0) {
7773 const __m128 va0123 = _mm_loadu_ps(a);
7774
7775 __m128 vy0123 = _mm_sub_ps(va0123, vb);
7776 vy0123 = _mm_max_ps(vy0123, vy_min);
7777 vy0123 = _mm_min_ps(vy0123, vy_max);
7778 if (n & (2 * sizeof(float))) {
7779 _mm_storel_pi((__m64*) y, vy0123);
7780 vy0123 = _mm_movehl_ps(vy0123, vy0123);
7781 y += 2;
7782 }
7783 if (n & (1 * sizeof(float))) {
7784 _mm_store_ss(y, vy0123);
7785 }
7786 }
7787 }
7788
7789 void xnn_f32_vclamp_ukernel__sse_x8(
7790 size_t n,
7791 const float* x,
7792 float* y,
7793 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7794 {
7795 assert(n != 0);
7796 assert(n % sizeof(float) == 0);
7797 assert(x != NULL);
7798 assert(y != NULL);
7799
7800 const __m128 vy_min = _mm_load_ps(params->sse.min);
7801 const __m128 vy_max = _mm_load_ps(params->sse.max);
7802
7803 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7804 __m128 vacc0123 = _mm_loadu_ps(x);
7805 __m128 vacc4567 = _mm_loadu_ps(x + 4);
7806 x += 8;
7807
7808 vacc0123 = _mm_max_ps(vacc0123, vy_min);
7809 vacc4567 = _mm_max_ps(vacc4567, vy_min);
7810
7811 vacc0123 = _mm_min_ps(vacc0123, vy_max);
7812 vacc4567 = _mm_min_ps(vacc4567, vy_max);
7813
7814 _mm_storeu_ps(y, vacc0123);
7815 _mm_storeu_ps(y + 4, vacc4567);
7816 y += 8;
7817 }
7818 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7819 __m128 vacc = _mm_loadu_ps(x);
7820 x += 4;
7821
7822 vacc = _mm_max_ps(vacc, vy_min);
7823 vacc = _mm_min_ps(vacc, vy_max);
7824
7825 _mm_storeu_ps(y, vacc);
7826 y += 4;
7827 }
7828 if XNN_UNLIKELY(n != 0) {
7829 __m128 vacc = _mm_loadu_ps(x);
7830 vacc = _mm_max_ps(vacc, vy_min);
7831 vacc = _mm_min_ps(vacc, vy_max);
7832
7833 if (n & (2 * sizeof(float))) {
7834 _mm_storel_pi((__m64*) y, vacc);
7835 vacc = _mm_movehl_ps(vacc, vacc);
7836 y += 2;
7837 }
7838 if (n & (1 * sizeof(float))) {
7839 _mm_store_ss(y, vacc);
7840 }
7841 }
7842 }
7843
7844 void xnn_f32_vhswish_ukernel__sse_x8(
7845 size_t n,
7846 const float* x,
7847 float* y,
7848 const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7849 {
7850 assert(n != 0);
7851 assert(n % sizeof(float) == 0);
7852
7853 const __m128 vsixth = _mm_load_ps(params->sse.sixth);
7854 const __m128 vhalf = _mm_load_ps(params->sse.half);
7855 const __m128 vone = _mm_load_ps(params->sse.one);
7856 const __m128 vzero = _mm_setzero_ps();
7857
7858 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7859 const __m128 vx0123 = _mm_loadu_ps(x);
7860 const __m128 vx4567 = _mm_loadu_ps(x + 4);
7861 x += 8;
7862
7863 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7864 __m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);
7865
7866 vacc0123 = _mm_add_ps(vacc0123, vhalf);
7867 vacc4567 = _mm_add_ps(vacc4567, vhalf);
7868
7869 vacc0123 = _mm_max_ps(vacc0123, vzero);
7870 vacc4567 = _mm_max_ps(vacc4567, vzero);
7871
7872 vacc0123 = _mm_min_ps(vacc0123, vone);
7873 vacc4567 = _mm_min_ps(vacc4567, vone);
7874
7875 vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7876 vacc4567 = _mm_mul_ps(vacc4567, vx4567);
7877
7878 _mm_storeu_ps(y, vacc0123);
7879 _mm_storeu_ps(y + 4, vacc4567);
7880 y += 8;
7881 }
7882 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7883 const __m128 vx0123 = _mm_loadu_ps(x);
7884 x += 4;
7885 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7886 vacc0123 = _mm_add_ps(vacc0123, vhalf);
7887 vacc0123 = _mm_max_ps(vacc0123, vzero);
7888 vacc0123 = _mm_min_ps(vacc0123, vone);
7889 vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7890 _mm_storeu_ps(y, vacc0123);
7891 y += 4;
7892 }
7893 if XNN_UNLIKELY(n != 0) {
7894 const __m128 vx0123 = _mm_loadu_ps(x);
7895 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7896 vacc0123 = _mm_add_ps(vacc0123, vhalf);
7897 vacc0123 = _mm_max_ps(vacc0123, vzero);
7898 vacc0123 = _mm_min_ps(vacc0123, vone);
7899 vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7900
7901 if (n & (2 * sizeof(float))) {
7902 _mm_storel_pi((__m64*) y, vacc0123);
7903 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
7904 y += 2;
7905 }
7906 if (n & (1 * sizeof(float))) {
7907 _mm_store_ss(y, vacc0123);
7908 }
7909 }
7910 }
7911
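// The hswish kernel above evaluates y = x * clamp(x/6 + 1/2, 0, 1), with the 1/6, 1/2
// and 1 constants taken from params->sse. Scalar sketch of the same formula
// (illustrative helper, not XNNPACK API):
static void example_f32_hswish_scalar(size_t n, const float* x, float* y) {
  for (size_t i = 0; i < n / sizeof(float); i++) {
    float t = x[i] * (1.0f / 6.0f) + 0.5f;
    if (t < 0.0f) { t = 0.0f; }
    if (t > 1.0f) { t = 1.0f; }
    y[i] = t * x[i];
  }
}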
7912 void xnn_f32_vlrelu_ukernel__sse_x8(
7913 size_t n,
7914 const float* x,
7915 float* y,
7916 const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7917 {
7918 assert(n != 0);
7919 assert(n % sizeof(float) == 0);
7920
7921 const __m128 vslope = _mm_load_ps(params->sse.slope);
7922 const __m128 vzero = _mm_setzero_ps();
7923 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7924 __m128 vx0123 = _mm_loadu_ps(x);
7925 __m128 vx4567 = _mm_loadu_ps(x + 4);
7926 x += 8;
7927
7928 __m128 vacc0123 = _mm_max_ps(_mm_setzero_ps(), vx0123);
7929 vx0123 = _mm_min_ps(vx0123, vzero);
7930 __m128 vacc4567 = _mm_max_ps(_mm_setzero_ps(), vx4567);
7931 vx4567 = _mm_min_ps(vx4567, vzero);
7932
7933 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vx0123, vslope));
7934 vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vx4567, vslope));
7935
7936 _mm_storeu_ps(y, vacc0123);
7937 _mm_storeu_ps(y + 4, vacc4567);
7938 y += 8;
7939 }
7940 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7941 __m128 vx = _mm_loadu_ps(x);
7942 x += 4;
7943
7944 __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7945 vx = _mm_min_ps(vx, vzero);
7946 vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7947
7948 _mm_storeu_ps(y, vacc);
7949 y += 4;
7950 }
7951 if XNN_UNLIKELY(n != 0) {
7952 __m128 vx = _mm_loadu_ps(x);
7953
7954 __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7955 vx = _mm_min_ps(vx, vzero);
7956 vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7957
7958 if (n & (2 * sizeof(float))) {
7959 _mm_storel_pi((__m64*) y, vacc);
7960 vacc = _mm_movehl_ps(vacc, vacc);
7961 y += 2;
7962 }
7963 if (n & (1 * sizeof(float))) {
7964 _mm_store_ss(y, vacc);
7965 }
7966 }
7967 }
7968
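// The leaky-ReLU kernel above is branch-free: it keeps the positive part with
// max(0, x), isolates the negative part with min(x, 0), scales the latter by
// params->sse.slope, and adds the two. Scalar equivalent (illustrative only):
static void example_f32_lrelu_scalar(size_t n, const float* x, float* y, float slope) {
  for (size_t i = 0; i < n / sizeof(float); i++) {
    y[i] = x[i] >= 0.0f ? x[i] : x[i] * slope;
  }
}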
7969 void xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(
7970 size_t rows,
7971 size_t channels,
7972 const float*restrict input,
7973 size_t input_stride,
7974 const float*restrict weights,
7975 float*restrict output,
7976 size_t output_stride,
7977 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7978 {
7979 assert(rows != 0);
7980 assert(channels != 0);
7981 assert(channels % sizeof(float) == 0);
7982
7983 const float* i0 = input;
7984 float* o0 = output;
7985 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
7986 float* o1 = (float*) ((uintptr_t) o0 + output_stride);
7987
7988 const size_t input_increment = input_stride * 2 - channels;
7989 const size_t output_increment = output_stride * 2 - channels;
7990
7991 const __m128 vmin = _mm_load_ps(params->sse.min);
7992 const __m128 vmax = _mm_load_ps(params->sse.max);
7993 do {
7994 if XNN_UNPREDICTABLE(rows < 2) {
7995 i1 = i0;
7996 o1 = o0;
7997 }
7998
7999 const float* w = weights;
8000 size_t c = channels;
8001 for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
8002 const __m128 vscale0123 = _mm_load_ps(w);
8003
8004 __m128 vacc0x0123 = _mm_loadu_ps(i0);
8005 i0 += 4;
8006 __m128 vacc1x0123 = _mm_loadu_ps(i1);
8007 i1 += 4;
8008
8009 vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8010 vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8011
8012 const __m128 vbias0123 = _mm_load_ps(w + 4);
8013
8014 vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8015 vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8016
8017 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8018 vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8019
8020 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8021 vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8022
8023 _mm_storeu_ps(o0, vacc0x0123);
8024 o0 += 4;
8025 _mm_storeu_ps(o1, vacc1x0123);
8026 o1 += 4;
8027
8028 w += 8;
8029 }
8030 if XNN_UNLIKELY(c != 0) {
8031 const __m128 vscale0123 = _mm_load_ps(w);
8032
8033 __m128 vacc0x0123 = _mm_loadu_ps(i0);
8034 i0 = (const float*) ((uintptr_t) i0 + c);
8035 __m128 vacc1x0123 = _mm_loadu_ps(i1);
8036 i1 = (const float*) ((uintptr_t) i1 + c);
8037
8038 vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8039 vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8040
8041 const __m128 vbias0123 = _mm_load_ps(w + 4);
8042
8043 vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8044 vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8045
8046 vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8047 vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8048
8049 vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8050 vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8051
8052 if (c & (2 * sizeof(float))) {
8053 _mm_storel_pi((__m64*) o0, vacc0x0123);
8054 _mm_storel_pi((__m64*) o1, vacc1x0123);
8055
8056 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
8057 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
8058
8059 o0 += 2;
8060 o1 += 2;
8061 }
8062 if (c & (1 * sizeof(float))) {
8063 _mm_store_ss(o0, vacc0x0123);
8064 _mm_store_ss(o1, vacc1x0123);
8065
8066 o0 += 1;
8067 o1 += 1;
8068 }
8069 }
8070 i0 = (const float*) ((uintptr_t) i0 + input_increment);
8071 o0 = (float*) ((uintptr_t) o0 + output_increment);
8072 i1 = (const float*) ((uintptr_t) i1 + input_increment);
8073 o1 = (float*) ((uintptr_t) o1 + output_increment);
8074 rows = doz(rows, 2);
8075 } while (rows != 0);
8076 }
8077
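// vmulcaddc applies a per-channel affine transform y = clamp(x * scale + bias, min, max)
// to two rows of input at a time. The packed weights interleave scales and biases in
// groups of four (w[0..3] = scales, w[4..7] = biases, then w += 8), matching the c4
// channel tile; that layout is read off the loads in the kernel, not from a header.
// Scalar sketch for a single row (illustrative helper, not XNNPACK API):
static void example_f32_mulcaddc_row_scalar(size_t channels, const float* x,
                                            const float* scale, const float* bias,
                                            float* y, float ymin, float ymax) {
  // channels counts floats here; the kernel above takes it as a byte count.
  for (size_t c = 0; c < channels; c++) {
    float v = x[c] * scale[c] + bias[c];
    v = v < ymin ? ymin : v;
    v = v > ymax ? ymax : v;
    y[c] = v;
  }
}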
8078 void xnn_f32_vsqrt_ukernel__sse_sqrt_x4(
8079 size_t n,
8080 const float* x,
8081 float* y,
8082 const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8083 {
8084 assert(n != 0);
8085 assert(n % sizeof(float) == 0);
8086
8087 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8088 const __m128 vx = _mm_loadu_ps(x);
8089 x += 4;
8090 const __m128 vy = _mm_sqrt_ps(vx);
8091 _mm_storeu_ps(y, vy);
8092 y += 4;
8093 }
8094 if XNN_UNLIKELY(n != 0) {
8095 const __m128 vx = _mm_loadu_ps(x);
8096 __m128 vy = _mm_sqrt_ps(vx);
8097 if (n & (2 * sizeof(float))) {
8098 _mm_storel_pi((__m64*) y, vy);
8099 vy = _mm_movehl_ps(vy, vy);
8100 y += 2;
8101 }
8102 if (n & (1 * sizeof(float))) {
8103 _mm_store_ss(y, vy);
8104 }
8105 }
8106 }
8107
8108 void xnn_f32_vabs_ukernel__sse_x8(
8109 size_t n,
8110 const float* x,
8111 float* y,
8112 const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8113 {
8114 assert(n != 0);
8115 assert(n % sizeof(float) == 0);
8116 assert(x != NULL);
8117 assert(y != NULL);
8118
8119 const __m128 vnonsign_mask = _mm_load_ps(params->sse.nonsign_mask);
8120 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8121 const __m128 vx0123 = _mm_loadu_ps(x);
8122 const __m128 vx4567 = _mm_loadu_ps(x + 4);
8123 x += 8;
8124
8125 const __m128 vy0123 = _mm_and_ps(vx0123, vnonsign_mask);
8126 const __m128 vy4567 = _mm_and_ps(vx4567, vnonsign_mask);
8127
8128 _mm_storeu_ps(y, vy0123);
8129 _mm_storeu_ps(y + 4, vy4567);
8130 y += 8;
8131 }
8132 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8133 const __m128 vx = _mm_loadu_ps(x);
8134 x += 4;
8135 const __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8136 _mm_storeu_ps(y, vy);
8137 y += 4;
8138 }
8139 if XNN_UNLIKELY(n != 0) {
8140 const __m128 vx = _mm_loadu_ps(x);
8141 __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8142 if (n & (2 * sizeof(float))) {
8143 _mm_storel_pi((__m64*) y, vy);
8144 vy = _mm_movehl_ps(vy, vy);
8145 y += 2;
8146 }
8147 if (n & (1 * sizeof(float))) {
8148 _mm_store_ss(y, vy);
8149 }
8150 }
8151 }
8152
8153 void xnn_f32_vneg_ukernel__sse_x8(
8154 size_t n,
8155 const float* x,
8156 float* y,
8157 const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8158 {
8159 assert(n != 0);
8160 assert(n % sizeof(float) == 0);
8161 assert(x != NULL);
8162 assert(y != NULL);
8163
8164 const __m128 vsign_mask = _mm_load_ps(params->sse.sign_mask);
8165 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8166 const __m128 vx0123 = _mm_loadu_ps(x);
8167 const __m128 vx4567 = _mm_loadu_ps(x + 4);
8168 x += 8;
8169
8170 const __m128 vy0123 = _mm_xor_ps(vx0123, vsign_mask);
8171 const __m128 vy4567 = _mm_xor_ps(vx4567, vsign_mask);
8172
8173 _mm_storeu_ps(y, vy0123);
8174 _mm_storeu_ps(y + 4, vy4567);
8175 y += 8;
8176 }
8177 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8178 const __m128 vx = _mm_loadu_ps(x);
8179 x += 4;
8180 const __m128 vy = _mm_xor_ps(vx, vsign_mask);
8181 _mm_storeu_ps(y, vy);
8182 y += 4;
8183 }
8184 if XNN_UNLIKELY(n != 0) {
8185 const __m128 vx = _mm_loadu_ps(x);
8186 __m128 vy = _mm_xor_ps(vx, vsign_mask);
8187 if (n & (2 * sizeof(float))) {
8188 _mm_storel_pi((__m64*) y, vy);
8189 vy = _mm_movehl_ps(vy, vy);
8190 y += 2;
8191 }
8192 if (n & (1 * sizeof(float))) {
8193 _mm_store_ss(y, vy);
8194 }
8195 }
8196 }
8197
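// vabs and vneg above are pure bit manipulations of the IEEE-754 sign bit: abs ANDs
// with a "non-sign" mask (sign bit cleared), neg XORs with a sign-bit mask, both
// loaded from params. Scalar sketch of the same bit trick (illustrative only):
#include <stdint.h>
#include <string.h>

static float example_f32_neg_bits(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof(bits));   // type-pun the float to its bit pattern
  bits ^= UINT32_C(0x80000000);      // flip the sign bit, as the SIMD XOR does
  memcpy(&x, &bits, sizeof(x));
  return x;
}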
8198 void xnn_f32_vsqr_ukernel__sse_x8(
8199 size_t n,
8200 const float* x,
8201 float* y,
8202 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8203 {
8204 assert(n != 0);
8205 assert(n % sizeof(float) == 0);
8206 assert(x != NULL);
8207 assert(y != NULL);
8208
8209 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8210 const __m128 vx0123 = _mm_loadu_ps(x);
8211 const __m128 vx4567 = _mm_loadu_ps(x + 4);
8212 x += 8;
8213
8214 const __m128 vy0123 = _mm_mul_ps(vx0123, vx0123);
8215 const __m128 vy4567 = _mm_mul_ps(vx4567, vx4567);
8216
8217 _mm_storeu_ps(y, vy0123);
8218 _mm_storeu_ps(y + 4, vy4567);
8219 y += 8;
8220 }
8221 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8222 const __m128 vx = _mm_loadu_ps(x);
8223 x += 4;
8224 const __m128 vy = _mm_mul_ps(vx, vx);
8225 _mm_storeu_ps(y, vy);
8226 y += 4;
8227 }
8228 if XNN_UNLIKELY(n != 0) {
8229 const __m128 vx = _mm_loadu_ps(x);
8230 __m128 vy = _mm_mul_ps(vx, vx);
8231 if (n & (2 * sizeof(float))) {
8232 _mm_storel_pi((__m64*) y, vy);
8233 vy = _mm_movehl_ps(vy, vy);
8234 y += 2;
8235 }
8236 if (n & (1 * sizeof(float))) {
8237 _mm_store_ss(y, vy);
8238 }
8239 }
8240 }
8241
8242 void xnn_x32_packx_ukernel_4x__sse(
8243 size_t m,
8244 size_t k,
8245 const uint32_t* restrict x,
8246 size_t x_stride,
8247 uint32_t* restrict y)
8248 {
8249 assert(m != 0);
8250 assert(k != 0);
8251
8252 const float* x0 = (const float*) x;
8253 const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
8254 if (m < 2) {
8255 x1 = x0;
8256 }
8257 const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
8258 if (m <= 2) {
8259 x2 = x1;
8260 }
8261 const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
8262 if (m != 4) {
8263 x3 = x2;
8264 }
8265
8266 float*restrict y_f32 = (float*) y;
8267
8268 for (; k >= 4; k -= 4) {
8269 const __m128 vx0 = _mm_loadu_ps(x0);
8270 x0 += 4;
8271 const __m128 vx1 = _mm_loadu_ps(x1);
8272 x1 += 4;
8273 const __m128 vx2 = _mm_loadu_ps(x2);
8274 x2 += 4;
8275 const __m128 vx3 = _mm_loadu_ps(x3);
8276 x3 += 4;
8277
8278 const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
8279 const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
8280 const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
8281 const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
8282
8283 const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
8284 _mm_store_ps(y_f32, vy0);
8285
8286 const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
8287 _mm_store_ps(y_f32 + 4, vy1);
8288
8289 const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
8290 _mm_store_ps(y_f32 + 8, vy2);
8291
8292 const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
8293 _mm_store_ps(y_f32 + 12, vy3);
8294
8295 y_f32 += 16;
8296 }
8297 if XNN_UNLIKELY(k != 0) {
8298 do {
8299 const __m128 vx0 = _mm_load_ss(x0);
8300 x0 += 1;
8301 const __m128 vx1 = _mm_load_ss(x1);
8302 x1 += 1;
8303 const __m128 vx2 = _mm_load_ss(x2);
8304 x2 += 1;
8305 const __m128 vx3 = _mm_load_ss(x3);
8306 x3 += 1;
8307
8308 const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
8309 const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
8310 const __m128 vy = _mm_movelh_ps(vx01, vx23);
8311
8312 _mm_store_ps(y_f32, vy);
8313 y_f32 += 4;
8314 } while (--k != 0);
8315 }
8316 }
8317
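// packx interleaves up to four rows of x into panels of four values per k step, so the
// output is k-major with the four row values adjacent: y[k*4 + m] = x_m[k], where rows
// beyond the actual row count m duplicate the last valid row. Scalar sketch of that
// layout (illustrative only; note the stride here is in elements, not bytes):
static void example_x32_packx_4x_scalar(size_t m, size_t k, const uint32_t* x,
                                        size_t x_stride_elements, uint32_t* y) {
  for (size_t kk = 0; kk < k; kk++) {
    for (size_t mm = 0; mm < 4; mm++) {
      const size_t row = mm < m ? mm : m - 1;  // clamp to the last valid row
      y[kk * 4 + mm] = x[row * x_stride_elements + kk];
    }
  }
}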
8318 void xnn_x32_transposec_ukernel__4x4_sse(
8319 const uint32_t* input,
8320 uint32_t* output,
8321 size_t input_stride,
8322 size_t output_stride,
8323 size_t block_width,
8324 size_t block_height) XNN_OOB_READS
8325 {
8326 assert(output_stride >= block_height * sizeof(uint32_t));
8327 assert(input_stride >= block_width * sizeof(uint32_t));
8328
8329 const size_t tile_height = 4;
8330 const size_t tile_width = 4;
8331 const size_t tile_wbytes = tile_width * sizeof(float);
8332 const size_t input_vreset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
8333 const size_t output_vreset = tile_height * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
8334 const size_t input_offset = tile_height * input_stride;
8335
8336 const float* i0 = (const float*) input;
8337 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
8338 const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
8339 const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
8340
8341 float* o0 = (float*) output;
8342 float* o1 = (float*) ((uintptr_t) o0 + output_stride);
8343 float* o2 = (float*) ((uintptr_t) o1 + output_stride);
8344 float* o3 = (float*) ((uintptr_t) o2 + output_stride);
8345
8346 do {
8347 if XNN_UNPREDICTABLE(block_width < 2) {
8348 o1 = o0;
8349 }
8350 if XNN_UNPREDICTABLE(block_width <= 2) {
8351 o2 = o0;
8352 }
8353 if XNN_UNPREDICTABLE(block_width < 4) {
8354 o3 = o0;
8355 }
8356 size_t bh = block_height;
8357 for (; bh >= 4; bh -= 4) {
8358 __m128 v0 = _mm_loadu_ps(i0);
8359 i0 = (const float*) ((uintptr_t) i0 + input_offset);
8360 __m128 v1 = _mm_loadu_ps(i1);
8361 i1 = (const float*) ((uintptr_t) i1 + input_offset);
8362 __m128 v2 = _mm_loadu_ps(i2);
8363 i2 = (const float*) ((uintptr_t) i2 + input_offset);
8364 __m128 v3 = _mm_loadu_ps(i3);
8365 i3 = (const float*) ((uintptr_t) i3 + input_offset);
8366
8367 _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8368
8369 _mm_storeu_ps(o3, v3);
8370 o3 = (float*) ((uintptr_t) o3 + tile_wbytes);
8371 _mm_storeu_ps(o2, v2);
8372 o2 = (float*) ((uintptr_t) o2 + tile_wbytes);
8373 _mm_storeu_ps(o1, v1);
8374 o1 = (float*) ((uintptr_t) o1 + tile_wbytes);
8375 _mm_storeu_ps(o0, v0);
8376 o0 = (float*) ((uintptr_t) o0 + tile_wbytes);
8377 }
8378
8379 if (bh != 0) {
8380 if XNN_UNPREDICTABLE(bh <= 2) {
8381 i2 = i0;
8382 }
8383 if XNN_UNPREDICTABLE(bh < 2) {
8384 i1 = i0;
8385 }
8386 __m128 v0 = _mm_loadu_ps(i0);
8387 __m128 v1 = _mm_loadu_ps(i1);
8388 __m128 v2 = _mm_loadu_ps(i2);
8389 __m128 v3 = _mm_setzero_ps();
8390
8391 _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8392
8393 if (bh & 2) {
8394 _mm_storel_pi((__m64*) o3, v3);
8395 o3 += 2;
8396 _mm_storel_pi((__m64*) o2, v2);
8397 o2 += 2;
8398 _mm_storel_pi((__m64*) o1, v1);
8399 o1 += 2;
8400 _mm_storel_pi((__m64*) o0, v0);
8401 o0 += 2;
8402 v0 = _mm_movehl_ps(v0, v0);
8403 v1 = _mm_movehl_ps(v1, v1);
8404 v2 = _mm_movehl_ps(v2, v2);
8405 v3 = _mm_movehl_ps(v3, v3);
8406 }
8407 if (bh & 1) {
8408 _mm_store_ss(o3, v3);
8409 _mm_store_ss(o2, v2);
8410 _mm_store_ss(o1, v1);
8411 _mm_store_ss(o0, v0);
8412 }
8413 }
8414 i0 = (const float*) ((uintptr_t) i0 + input_vreset);
8415 i1 = (const float*) ((uintptr_t) i0 + input_stride);
8416 i2 = (const float*) ((uintptr_t) i1 + input_stride);
8417 i3 = (const float*) ((uintptr_t) i2 + input_stride);
8418 o0 = (float*) ((uintptr_t) o0 + output_vreset);
8419 o1 = (float*) ((uintptr_t) o1 + output_vreset);
8420 o2 = (float*) ((uintptr_t) o2 + output_vreset);
8421 o3 = (float*) ((uintptr_t) o3 + output_vreset);
8422 block_width = doz(block_width, tile_width);
8423 } while (block_width != 0);
8424 }
8425
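// The transpose kernel above works on 4x4 tiles of 32-bit elements via
// _MM_TRANSPOSE4_PS, with byte strides for both input and output rows and explicit
// handling of ragged right/bottom edges. A plain scalar reference with the same
// byte-stride convention (illustrative only, not XNNPACK API):
static void example_x32_transpose_scalar(const uint32_t* input, uint32_t* output,
                                         size_t input_stride, size_t output_stride,
                                         size_t block_width, size_t block_height) {
  // input_stride and output_stride are byte strides, as in the kernel above.
  for (size_t i = 0; i < block_height; i++) {
    const uint32_t* in_row = (const uint32_t*) ((uintptr_t) input + i * input_stride);
    for (size_t j = 0; j < block_width; j++) {
      uint32_t* out_row = (uint32_t*) ((uintptr_t) output + j * output_stride);
      out_row[i] = in_row[j];
    }
  }
}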