xref: /aosp_15_r20/external/XNNPACK/src/amalgam/sse.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <immintrin.h>
9 
10 #include <xnnpack/avgpool.h>
11 #include <xnnpack/common.h>
12 #include <xnnpack/conv.h>
13 #include <xnnpack/dwconv.h>
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/gemm.h>
16 #include <xnnpack/ibilinear.h>
17 #include <xnnpack/igemm.h>
18 #include <xnnpack/intrinsics-polyfill.h>
19 #include <xnnpack/math.h>
20 #include <xnnpack/maxpool.h>
21 #include <xnnpack/packx.h>
22 #include <xnnpack/pavgpool.h>
23 #include <xnnpack/rmax.h>
24 #include <xnnpack/spmm.h>
25 #include <xnnpack/transpose.h>
26 #include <xnnpack/vbinary.h>
27 #include <xnnpack/vmulcaddc.h>
28 #include <xnnpack/vunary.h>
29 
30 
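// Multi-pass F32 average pooling with fused min/max clamping for pooling windows
// larger than 9 elements: the first pass sums 9 pooling elements into `buffer`, each
// intermediate pass accumulates 8 more, and the final pass applies the scale and
// clamping while writing the output, 4 channels per SSE vector.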
31 void xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(
32     size_t output_pixels,
33     size_t kernel_elements,
34     size_t channels,
35     const float** input,
36     size_t input_offset,
37     const float* zero,
38     float* buffer,
39     float* output,
40     size_t input_increment,
41     size_t output_increment,
42     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
43 {
44   assert(output_pixels != 0);
45   assert(kernel_elements > 9);
46   assert(channels != 0);
47 
48   const __m128 vscale = _mm_load_ps(params->sse.scale);
49   const __m128 vmin = _mm_load_ps(params->sse.min);
50   const __m128 vmax = _mm_load_ps(params->sse.max);
51 
52   do {
53     {
54       const float* i0 = *input++;
55       assert(i0 != NULL);
56       if XNN_UNPREDICTABLE(i0 != zero) {
57         i0 = (const float*) ((uintptr_t) i0 + input_offset);
58       }
59       const float* i1 = *input++;
60       assert(i1 != NULL);
61       if XNN_UNPREDICTABLE(i1 != zero) {
62         i1 = (const float*) ((uintptr_t) i1 + input_offset);
63       }
64       const float* i2 = *input++;
65       assert(i2 != NULL);
66       if XNN_UNPREDICTABLE(i2 != zero) {
67         i2 = (const float*) ((uintptr_t) i2 + input_offset);
68       }
69       const float* i3 = *input++;
70       assert(i3 != NULL);
71       if XNN_UNPREDICTABLE(i3 != zero) {
72         i3 = (const float*) ((uintptr_t) i3 + input_offset);
73       }
74       const float* i4 = *input++;
75       assert(i4 != NULL);
76       if XNN_UNPREDICTABLE(i4 != zero) {
77         i4 = (const float*) ((uintptr_t) i4 + input_offset);
78       }
79       const float* i5 = *input++;
80       assert(i5 != NULL);
81       if XNN_UNPREDICTABLE(i5 != zero) {
82         i5 = (const float*) ((uintptr_t) i5 + input_offset);
83       }
84       const float* i6 = *input++;
85       assert(i6 != NULL);
86       if XNN_UNPREDICTABLE(i6 != zero) {
87         i6 = (const float*) ((uintptr_t) i6 + input_offset);
88       }
89       const float* i7 = *input++;
90       assert(i7 != NULL);
91       if XNN_UNPREDICTABLE(i7 != zero) {
92         i7 = (const float*) ((uintptr_t) i7 + input_offset);
93       }
94       const float* i8 = *input++;
95       assert(i8 != NULL);
96       if XNN_UNPREDICTABLE(i8 != zero) {
97         i8 = (const float*) ((uintptr_t) i8 + input_offset);
98       }
99 
100       float* b = buffer;
101       for (size_t c = 0; c < channels; c += 4) {
102         const __m128 vi0 = _mm_loadu_ps(i0);
103         i0 += 4;
104         const __m128 vi1 = _mm_loadu_ps(i1);
105         i1 += 4;
106         const __m128 vi2 = _mm_loadu_ps(i2);
107         i2 += 4;
108         const __m128 vi3 = _mm_loadu_ps(i3);
109         i3 += 4;
110         const __m128 vi4 = _mm_loadu_ps(i4);
111         i4 += 4;
112         const __m128 vi5 = _mm_loadu_ps(i5);
113         i5 += 4;
114         const __m128 vi6 = _mm_loadu_ps(i6);
115         i6 += 4;
116         const __m128 vi7 = _mm_loadu_ps(i7);
117         i7 += 4;
118         const __m128 vi8 = _mm_loadu_ps(i8);
119         i8 += 4;
120 
121         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
122         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
123         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
124         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
125         const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
126         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
127         const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
128         const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
129 
130         _mm_store_ps(b, vsum); b += 4;
131       }
132     }
133 
134     size_t k = kernel_elements;
135     for (k -= 9; k > 8; k -= 8) {
136       const float* i0 = *input++;
137       assert(i0 != NULL);
138       if XNN_UNPREDICTABLE(i0 != zero) {
139         i0 = (const float*) ((uintptr_t) i0 + input_offset);
140       }
141       const float* i1 = *input++;
142       assert(i1 != NULL);
143       if XNN_UNPREDICTABLE(i1 != zero) {
144         i1 = (const float*) ((uintptr_t) i1 + input_offset);
145       }
146       const float* i2 = *input++;
147       assert(i2 != NULL);
148       if XNN_UNPREDICTABLE(i2 != zero) {
149         i2 = (const float*) ((uintptr_t) i2 + input_offset);
150       }
151       const float* i3 = *input++;
152       assert(i3 != NULL);
153       if XNN_UNPREDICTABLE(i3 != zero) {
154         i3 = (const float*) ((uintptr_t) i3 + input_offset);
155       }
156       const float* i4 = *input++;
157       assert(i4 != NULL);
158       if XNN_UNPREDICTABLE(i4 != zero) {
159         i4 = (const float*) ((uintptr_t) i4 + input_offset);
160       }
161       const float* i5 = *input++;
162       assert(i5 != NULL);
163       if XNN_UNPREDICTABLE(i5 != zero) {
164         i5 = (const float*) ((uintptr_t) i5 + input_offset);
165       }
166       const float* i6 = *input++;
167       assert(i6 != NULL);
168       if XNN_UNPREDICTABLE(i6 != zero) {
169         i6 = (const float*) ((uintptr_t) i6 + input_offset);
170       }
171       const float* i7 = *input++;
172       assert(i7 != NULL);
173       if XNN_UNPREDICTABLE(i7 != zero) {
174         i7 = (const float*) ((uintptr_t) i7 + input_offset);
175       }
176 
177       float* b = buffer;
178       for (size_t c = 0; c < channels; c += 4) {
179         const __m128 vi0 = _mm_loadu_ps(i0);
180         i0 += 4;
181         const __m128 vi1 = _mm_loadu_ps(i1);
182         i1 += 4;
183         const __m128 vi2 = _mm_loadu_ps(i2);
184         i2 += 4;
185         const __m128 vi3 = _mm_loadu_ps(i3);
186         i3 += 4;
187         const __m128 vi4 = _mm_loadu_ps(i4);
188         i4 += 4;
189         const __m128 vi5 = _mm_loadu_ps(i5);
190         i5 += 4;
191         const __m128 vi6 = _mm_loadu_ps(i6);
192         i6 += 4;
193         const __m128 vi7 = _mm_loadu_ps(i7);
194         i7 += 4;
195         const __m128 vacc = _mm_load_ps(b);
196 
197         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
198         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
199         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
200         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
201         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
202         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
203         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
204         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
205 
206         _mm_store_ps(b, vsum); b += 4;
207       }
208     }
209 
210     {
211       const float* i0 = input[0];
212       assert(i0 != NULL);
213       const float* i1 = input[1];
214       const float* i2 = input[2];
215       const float* i3 = input[3];
216       const float* i4 = input[4];
217       const float* i5 = input[5];
218       const float* i6 = input[6];
219       const float* i7 = input[7];
220       input = (const float**) ((uintptr_t) input + input_increment);
221       if (k < 2) {
222         i1 = zero;
223       }
224       assert(i1 != NULL);
225       if (k <= 2) {
226         i2 = zero;
227       }
228       assert(i2 != NULL);
229       if (k < 4) {
230         i3 = zero;
231       }
232       assert(i3 != NULL);
233       if (k <= 4) {
234         i4 = zero;
235       }
236       assert(i4 != NULL);
237       if (k < 6) {
238         i5 = zero;
239       }
240       assert(i5 != NULL);
241       if (k <= 6) {
242         i6 = zero;
243       }
244       assert(i6 != NULL);
245       if (k < 8) {
246         i7 = zero;
247       }
248       assert(i7 != NULL);
249       if XNN_UNPREDICTABLE(i0 != zero) {
250         i0 = (const float*) ((uintptr_t) i0 + input_offset);
251       }
252       if XNN_UNPREDICTABLE(i1 != zero) {
253         i1 = (const float*) ((uintptr_t) i1 + input_offset);
254       }
255       if XNN_UNPREDICTABLE(i2 != zero) {
256         i2 = (const float*) ((uintptr_t) i2 + input_offset);
257       }
258       if XNN_UNPREDICTABLE(i3 != zero) {
259         i3 = (const float*) ((uintptr_t) i3 + input_offset);
260       }
261       if XNN_UNPREDICTABLE(i4 != zero) {
262         i4 = (const float*) ((uintptr_t) i4 + input_offset);
263       }
264       if XNN_UNPREDICTABLE(i5 != zero) {
265         i5 = (const float*) ((uintptr_t) i5 + input_offset);
266       }
267       if XNN_UNPREDICTABLE(i6 != zero) {
268         i6 = (const float*) ((uintptr_t) i6 + input_offset);
269       }
270       if XNN_UNPREDICTABLE(i7 != zero) {
271         i7 = (const float*) ((uintptr_t) i7 + input_offset);
272       }
273 
274       size_t c = channels;
275       float* b = buffer;
276       while (c >= 4) {
277         const __m128 vi0 = _mm_loadu_ps(i0);
278         i0 += 4;
279         const __m128 vi1 = _mm_loadu_ps(i1);
280         i1 += 4;
281         const __m128 vi2 = _mm_loadu_ps(i2);
282         i2 += 4;
283         const __m128 vi3 = _mm_loadu_ps(i3);
284         i3 += 4;
285         const __m128 vi4 = _mm_loadu_ps(i4);
286         i4 += 4;
287         const __m128 vi5 = _mm_loadu_ps(i5);
288         i5 += 4;
289         const __m128 vi6 = _mm_loadu_ps(i6);
290         i6 += 4;
291         const __m128 vi7 = _mm_loadu_ps(i7);
292         i7 += 4;
293         const __m128 vacc = _mm_load_ps(b);
294         b += 4;
295 
296         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
297         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
298         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
299         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
300         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
301         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
302         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
303         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
304 
305         __m128 vout = _mm_mul_ps(vsum, vscale);
306         vout = _mm_max_ps(vout, vmin);
307         vout = _mm_min_ps(vout, vmax);
308 
309         _mm_storeu_ps(output, vout);
310         output += 4;
311 
312         c -= 4;
313       }
314       if (c != 0) {
315         const __m128 vi0 = _mm_loadu_ps(i0);
316         const __m128 vi1 = _mm_loadu_ps(i1);
317         const __m128 vi2 = _mm_loadu_ps(i2);
318         const __m128 vi3 = _mm_loadu_ps(i3);
319         const __m128 vi4 = _mm_loadu_ps(i4);
320         const __m128 vi5 = _mm_loadu_ps(i5);
321         const __m128 vi6 = _mm_loadu_ps(i6);
322         const __m128 vi7 = _mm_loadu_ps(i7);
323         const __m128 vacc = _mm_load_ps(b);
324 
325         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
326         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
327         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
328         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
329         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
330         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
331         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
332         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
333 
334         __m128 vout = _mm_mul_ps(vsum, vscale);
335         vout = _mm_max_ps(vout, vmin);
336         vout = _mm_min_ps(vout, vmax);
337 
338         if (c & 2) {
339           _mm_storel_pi((__m64*) output, vout);
340           vout = _mm_movehl_ps(vout, vout);
341           output += 2;
342         }
343         if (c & 1) {
344           _mm_store_ss(output, vout);
345           output += 1;
346         }
347       }
348     }
349     output = (float*) ((uintptr_t) output + output_increment);
350   } while (--output_pixels != 0);
351 }
352 
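// Single-pass F32 average pooling with fused min/max clamping for pooling windows
// of up to 9 elements: unused element pointers are redirected to `zero`, the sum is
// scaled and clamped, and 4 channels are written per SSE vector.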
353 void xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(
354     size_t output_pixels,
355     size_t kernel_elements,
356     size_t channels,
357     const float** input,
358     size_t input_offset,
359     const float* zero,
360     float* output,
361     size_t input_increment,
362     size_t output_increment,
363     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
364 {
365   assert(output_pixels != 0);
366   assert(kernel_elements != 0);
367   assert(kernel_elements <= 9);
368   assert(channels != 0);
369 
370   const __m128 vscale = _mm_load_ps(params->sse.scale);
371   const __m128 vmin = _mm_load_ps(params->sse.min);
372   const __m128 vmax = _mm_load_ps(params->sse.max);
373 
374   do {
375     const float* i0 = input[0];
376     assert(i0 != NULL);
377     const float* i1 = input[1];
378     const float* i2 = input[2];
379     const float* i3 = input[3];
380     const float* i4 = input[4];
381     const float* i5 = input[5];
382     const float* i6 = input[6];
383     const float* i7 = input[7];
384     const float* i8 = input[8];
385     input = (const float**) ((uintptr_t) input + input_increment);
386     if (kernel_elements < 2) {
387       i1 = zero;
388     }
389     assert(i1 != NULL);
390     if (kernel_elements <= 2) {
391       i2 = zero;
392     }
393     assert(i2 != NULL);
394     if (kernel_elements < 4) {
395       i3 = zero;
396     }
397     assert(i3 != NULL);
398     if (kernel_elements <= 4) {
399       i4 = zero;
400     }
401     assert(i4 != NULL);
402     if (kernel_elements < 6) {
403       i5 = zero;
404     }
405     assert(i5 != NULL);
406     if (kernel_elements <= 6) {
407       i6 = zero;
408     }
409     assert(i6 != NULL);
410     if (kernel_elements < 8) {
411       i7 = zero;
412     }
413     assert(i7 != NULL);
414     if (kernel_elements <= 8) {
415       i8 = zero;
416     }
417     assert(i8 != NULL);
418     if XNN_UNPREDICTABLE(i0 != zero) {
419       i0 = (const float*) ((uintptr_t) i0 + input_offset);
420     }
421     if XNN_UNPREDICTABLE(i1 != zero) {
422       i1 = (const float*) ((uintptr_t) i1 + input_offset);
423     }
424     if XNN_UNPREDICTABLE(i2 != zero) {
425       i2 = (const float*) ((uintptr_t) i2 + input_offset);
426     }
427     if XNN_UNPREDICTABLE(i3 != zero) {
428       i3 = (const float*) ((uintptr_t) i3 + input_offset);
429     }
430     if XNN_UNPREDICTABLE(i4 != zero) {
431       i4 = (const float*) ((uintptr_t) i4 + input_offset);
432     }
433     if XNN_UNPREDICTABLE(i5 != zero) {
434       i5 = (const float*) ((uintptr_t) i5 + input_offset);
435     }
436     if XNN_UNPREDICTABLE(i6 != zero) {
437       i6 = (const float*) ((uintptr_t) i6 + input_offset);
438     }
439     if XNN_UNPREDICTABLE(i7 != zero) {
440       i7 = (const float*) ((uintptr_t) i7 + input_offset);
441     }
442     if XNN_UNPREDICTABLE(i8 != zero) {
443       i8 = (const float*) ((uintptr_t) i8 + input_offset);
444     }
445 
446     size_t c = channels;
447     while (c >= 4) {
448       const __m128 vi0 = _mm_loadu_ps(i0);
449       i0 += 4;
450       const __m128 vi1 = _mm_loadu_ps(i1);
451       i1 += 4;
452       const __m128 vi2 = _mm_loadu_ps(i2);
453       i2 += 4;
454       const __m128 vi3 = _mm_loadu_ps(i3);
455       i3 += 4;
456       const __m128 vi4 = _mm_loadu_ps(i4);
457       i4 += 4;
458       const __m128 vi5 = _mm_loadu_ps(i5);
459       i5 += 4;
460       const __m128 vi6 = _mm_loadu_ps(i6);
461       i6 += 4;
462       const __m128 vi7 = _mm_loadu_ps(i7);
463       i7 += 4;
464       const __m128 vi8 = _mm_loadu_ps(i8);
465       i8 += 4;
466 
467       const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
468       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
469       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
470       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
471 
472       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
473       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
474       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
475 
476       __m128 vout = _mm_mul_ps(vsum, vscale);
477       vout = _mm_max_ps(vout, vmin);
478       vout = _mm_min_ps(vout, vmax);
479 
480       _mm_storeu_ps(output, vout); output += 4;
481 
482       c -= 4;
483     }
484     if (c != 0) {
485       const __m128 vi0 = _mm_loadu_ps(i0);
486       const __m128 vi1 = _mm_loadu_ps(i1);
487       const __m128 vi2 = _mm_loadu_ps(i2);
488       const __m128 vi3 = _mm_loadu_ps(i3);
489       const __m128 vi4 = _mm_loadu_ps(i4);
490       const __m128 vi5 = _mm_loadu_ps(i5);
491       const __m128 vi6 = _mm_loadu_ps(i6);
492       const __m128 vi7 = _mm_loadu_ps(i7);
493       const __m128 vi8 = _mm_loadu_ps(i8);
494 
495       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
496       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
497       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
498       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
499       const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
500       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
501       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
502       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
503 
504       __m128 vout = _mm_mul_ps(vsum, vscale);
505       vout = _mm_max_ps(vout, vmin);
506       vout = _mm_min_ps(vout, vmax);
507 
508       if (c & 2) {
509         _mm_storel_pi((__m64*) output, vout);
510         vout = _mm_movehl_ps(vout, vout);
511         output += 2;
512       }
513       if (c & 1) {
514         _mm_store_ss(output, vout);
515         output += 1;
516       }
517     }
518     output = (float*) ((uintptr_t) output + output_increment);
519   } while (--output_pixels != 0);
520 }
521 
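// F32 direct convolution from HWC input to CHW output: 3x3 kernel, stride 2,
// padding 1, 3 input channels, computing a 2x2 output tile (2 rows x 2 columns)
// for a block of 4 output channels per iteration.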
522 void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
523     size_t input_height,
524     size_t input_width,
525     size_t output_y_start,
526     size_t output_y_end,
527     const float* input,
528     const float* zero,
529     const float* weights,
530     float* output,
531     size_t input_padding_top,
532     size_t output_channels,
533     size_t output_height_stride,
534     size_t output_channel_stride,
535     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
536 {
537   assert(input_width != 0);
538   assert(output_y_end > output_y_start);
539   assert(input_padding_top <= 1);
540   assert(output_channels != 0);
541 
542   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
543   const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
544   const size_t output_width = (input_width + 1) / 2;
545   const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
546 
547   // Adjustment for padding processed below
548   const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
549   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
550   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
551   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
552   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
553   float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
554   float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);
555 
556   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
557     i0 = zero;
558   }
559 
560   const __m128 vmin = _mm_load_ps(params->sse.min);
561   const __m128 vmax = _mm_load_ps(params->sse.max);
562 
563   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
564     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
565     const size_t input_y4 = input_y2 + 2;
566     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
567       i2 = zero;
568     }
569     if XNN_UNPREDICTABLE(input_y4 > input_height) {
570       i3 = zero;
571     }
572     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
573       i4 = zero;
574     }
575     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
576       output1 = output0;
577     }
578 
579     const float* w = weights;
580     size_t c = output_channels;
581     float* o0c0 = output0;
582     float* o1c0 = output1;
583     float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
584     float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
585     float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
586     float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
587     float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
588     float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
589     do {
590       if XNN_UNPREDICTABLE(c < 2) {
591         o0c1 = o0c0;
592         o1c1 = o1c0;
593       }
594       if XNN_UNPREDICTABLE(c <= 2) {
595         o0c2 = o0c1;
596         o1c2 = o1c1;
597       }
598       if XNN_UNPREDICTABLE(c < 4) {
599         o0c3 = o0c2;
600         o1c3 = o1c2;
601       }
602 
603       // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
604       __m128 vi0x0 = _mm_setzero_ps();
605       __m128 vi1x0 = _mm_setzero_ps();
606       __m128 vi2x0 = _mm_setzero_ps();
607       __m128 vi3x0 = _mm_setzero_ps();
608       __m128 vi4x0 = _mm_setzero_ps();
609 
610       size_t iw = input_width;
611       for (; iw >= 4; iw -= 4) {
612         __m128 vo0x0 = _mm_load_ps(w);
613         __m128 vo1x0 = vo0x0;
614         __m128 vo0x1 = vo0x0;
615         __m128 vo1x1 = vo0x0;
616 
617         const __m128 vk00c0 = _mm_load_ps(w + 4);
618 
619         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
620         const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
621         const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
622         const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
623         const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
624         const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;
625 
626         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
627         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
628         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
629         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
630 
631         const __m128 vk10c0 = _mm_load_ps(w + 8);
632 
633         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
634         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
635         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
636         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
637 
638         const __m128 vk20c0 = _mm_load_ps(w + 12);
639 
640         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
641         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
642         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
643         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
644 
645         const __m128 vk00c1 = _mm_load_ps(w + 16);
646 
647         // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
648         const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
649         const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
650         const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
651         const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
652         const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;
653 
654         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
655         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
656         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
657         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
658 
659         const __m128 vk10c1 = _mm_load_ps(w + 20);
660 
661         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
662         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
663         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
664         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
665 
666         const __m128 vk20c1 = _mm_load_ps(w + 24);
667 
668         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
669         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
670         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
671         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
672 
673         const __m128 vk00c2 = _mm_load_ps(w + 28);
674 
675         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
676         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
677         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
678         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
679 
680         const __m128 vk10c2 = _mm_load_ps(w + 32);
681 
682         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
683         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
684         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
685         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
686 
687         const __m128 vk20c2 = _mm_load_ps(w + 36);
688 
689         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
690         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
691         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
692         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
693 
694         const __m128 vk01c0 = _mm_load_ps(w + 40);
695 
696         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
697         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
698         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
699         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
700 
701         const __m128 vk11c0 = _mm_load_ps(w + 44);
702 
703         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
704         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
705         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
706         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
707 
708         const __m128 vk21c0 = _mm_load_ps(w + 48);
709 
710         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
711         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
712         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
713         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
714 
715         const __m128 vk01c1 = _mm_load_ps(w + 52);
716 
717         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
718         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
719         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
720         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
721 
722         const __m128 vk11c1 = _mm_load_ps(w + 56);
723 
724         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
725         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
726         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
727         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
728 
729         const __m128 vk21c1 = _mm_load_ps(w + 60);
730 
731         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
732         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
733         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
734         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
735 
736         const __m128 vk01c2 = _mm_load_ps(w + 64);
737 
738         // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
739         const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
740         const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
741         const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
742         const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
743         const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;
744 
745         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
746         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
747         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
748         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
749 
750         const __m128 vk11c2 = _mm_load_ps(w + 68);
751 
752         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
753         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
754         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
755         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
756 
757         const __m128 vk21c2 = _mm_load_ps(w + 72);
758 
759         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
760         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
761         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
762         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
763 
764         const __m128 vk02c0 = _mm_load_ps(w + 76);
765 
766         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
767         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
768         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
769         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
770 
771         const __m128 vk12c0 = _mm_load_ps(w + 80);
772 
773         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
774         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
775         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
776         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));
777 
778         const __m128 vk22c0 = _mm_load_ps(w + 84);
779 
780         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
781         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
782         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
783         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));
784 
785         const __m128 vk02c1 = _mm_load_ps(w + 88);
786 
787         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
788         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
789         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
790         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
791 
792         const __m128 vk12c1 = _mm_load_ps(w + 92);
793 
794         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
795         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
796         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
797         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));
798 
799         const __m128 vk22c1 = _mm_load_ps(w + 96);
800 
801         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
802         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
803         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
804         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));
805 
806         const __m128 vk02c2 = _mm_load_ps(w + 100);
807 
808         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
809         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
810         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
811         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
812 
813         const __m128 vk12c2 = _mm_load_ps(w + 104);
814 
815         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
816         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
817         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
818         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));
819 
820         const __m128 vk22c2 = _mm_load_ps(w + 108);
821 
822         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
823         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
824         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
825         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));
826 
827         vi0x0 = vi0x3;
828         vi1x0 = vi1x3;
829         vi2x0 = vi2x3;
830         vi3x0 = vi3x3;
831         vi4x0 = vi4x3;
832 
833         vo0x0 = _mm_max_ps(vo0x0, vmin);
834         vo1x0 = _mm_max_ps(vo1x0, vmin);
835         vo0x1 = _mm_max_ps(vo0x1, vmin);
836         vo1x1 = _mm_max_ps(vo1x1, vmin);
837 
838         vo0x0 = _mm_min_ps(vo0x0, vmax);
839         vo1x0 = _mm_min_ps(vo1x0, vmax);
840         vo0x1 = _mm_min_ps(vo0x1, vmax);
841         vo1x1 = _mm_min_ps(vo1x1, vmax);
842 
843         const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
844         const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
845         const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
846         const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
847 
848         // Always 2+ output width elements remaining
849         _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
850         _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
851         _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
852         _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
853 
854         _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
855         _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
856         _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
857         _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
858       }
859       assert(iw < 4);
860       if XNN_UNLIKELY(iw != 0) {
861         __m128 vo0x0 = _mm_load_ps(w);
862         __m128 vo1x0 = vo0x0;
863         __m128 vo0x1 = vo0x0;
864         __m128 vo1x1 = vo0x0;
865 
866         const __m128 vk00c0 = _mm_load_ps(w + 4);
867 
868         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
869         __m128 vi0x1 = _mm_loadu_ps(i0);
870         __m128 vi1x1 = _mm_loadu_ps(i1);
871         __m128 vi2x1 = _mm_loadu_ps(i2);
872         __m128 vi3x1 = _mm_loadu_ps(i3);
873         __m128 vi4x1 = _mm_loadu_ps(i4);
874 
875         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
876         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
877         if (iw > 2) {
878           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
879           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
880         }
881 
882         const __m128 vk10c0 = _mm_load_ps(w + 8);
883 
884         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
885         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
886         if (iw > 2) {
887           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
888           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
889         }
890 
891         const __m128 vk20c0 = _mm_load_ps(w + 12);
892 
893         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
894         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
895         if (iw > 2) {
896           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
897           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
898         }
899 
900         const __m128 vk00c1 = _mm_load_ps(w + 16);
901 
902         __m128 vi0x2 = _mm_setzero_ps();
903         __m128 vi1x2 = _mm_setzero_ps();
904         __m128 vi2x2 = _mm_setzero_ps();
905         __m128 vi3x2 = _mm_setzero_ps();
906         __m128 vi4x2 = _mm_setzero_ps();
907         if (iw >= 2) {
908           // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
909           vi0x2 = _mm_loadu_ps(i0 + 4);
910           vi1x2 = _mm_loadu_ps(i1 + 4);
911           vi2x2 = _mm_loadu_ps(i2 + 4);
912           vi3x2 = _mm_loadu_ps(i3 + 4);
913           vi4x2 = _mm_loadu_ps(i4 + 4);
914         }
915 
916         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
917         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
918         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
919         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
920 
921         const __m128 vk10c1 = _mm_load_ps(w + 20);
922 
923         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
924         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
925         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
926         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
927 
928         const __m128 vk20c1 = _mm_load_ps(w + 24);
929 
930         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
931         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
932         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
933         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
934 
935         const __m128 vk00c2 = _mm_load_ps(w + 28);
936 
937         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
938         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
939         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
940         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
941 
942         const __m128 vk10c2 = _mm_load_ps(w + 32);
943 
944         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
945         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
946         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
947         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
948 
949         const __m128 vk20c2 = _mm_load_ps(w + 36);
950 
951         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
952         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
953         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
954         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
955 
956         const __m128 vk01c0 = _mm_load_ps(w + 40);
957 
958         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
959         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
960         if (iw > 2) {
961           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
962           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
963         }
964 
965         const __m128 vk11c0 = _mm_load_ps(w + 44);
966 
967         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
968         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
969         if (iw > 2) {
970           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
971           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
972         }
973 
974         const __m128 vk21c0 = _mm_load_ps(w + 48);
975 
976         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
977         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
978         if (iw > 2) {
979           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
980           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
981         }
982 
983         const __m128 vk01c1 = _mm_load_ps(w + 52);
984 
985         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
986         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
987         if (iw > 2) {
988           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
989           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
990         }
991 
992         const __m128 vk11c1 = _mm_load_ps(w + 56);
993 
994         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
995         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
996         if (iw > 2) {
997           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
998           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
999         }
1000 
1001         const __m128 vk21c1 = _mm_load_ps(w + 60);
1002 
1003         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
1004         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
1005         if (iw > 2) {
1006           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
1007           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
1008         }
1009 
1010         const __m128 vk01c2 = _mm_load_ps(w + 64);
1011 
1012         __m128 vi0x3 = _mm_setzero_ps();
1013         __m128 vi1x3 = _mm_setzero_ps();
1014         __m128 vi2x3 = _mm_setzero_ps();
1015         __m128 vi3x3 = _mm_setzero_ps();
1016         __m128 vi4x3 = _mm_setzero_ps();
1017         if (iw > 2) {
1018           // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
1019           vi0x3 = _mm_load_ss(i0 + 8);
1020           vi1x3 = _mm_load_ss(i1 + 8);
1021           vi2x3 = _mm_load_ss(i2 + 8);
1022           vi3x3 = _mm_load_ss(i3 + 8);
1023           vi4x3 = _mm_load_ss(i4 + 8);
1024         }
1025 
1026         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
1027         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1028         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
1029         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1030 
1031         const __m128 vk11c2 = _mm_load_ps(w + 68);
1032 
1033         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
1034         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
1035         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
1036         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
1037 
1038         const __m128 vk21c2 = _mm_load_ps(w + 72);
1039 
1040         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1041         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
1042         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1043         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
1044 
1045         if (iw >= 2) {
1046           const __m128 vk02c0 = _mm_load_ps(w + 76);
1047 
1048           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
1049           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1050 
1051           const __m128 vk12c0 = _mm_load_ps(w + 80);
1052 
1053           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
1054           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
1055 
1056           const __m128 vk22c0 = _mm_load_ps(w + 84);
1057 
1058           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1059           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
1060 
1061           const __m128 vk02c1 = _mm_load_ps(w + 88);
1062 
1063           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
1064           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1065 
1066           const __m128 vk12c1 = _mm_load_ps(w + 92);
1067 
1068           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
1069           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
1070 
1071           const __m128 vk22c1 = _mm_load_ps(w + 96);
1072 
1073           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1074           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
1075 
1076           const __m128 vk02c2 = _mm_load_ps(w + 100);
1077 
1078           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
1079           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1080 
1081           const __m128 vk12c2 = _mm_load_ps(w + 104);
1082 
1083           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
1084           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
1085 
1086           const __m128 vk22c2 = _mm_load_ps(w + 108);
1087 
1088           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1089           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
1090         }
1091 
1092         vo0x0 = _mm_max_ps(vo0x0, vmin);
1093         vo1x0 = _mm_max_ps(vo1x0, vmin);
1094         vo0x1 = _mm_max_ps(vo0x1, vmin);
1095         vo1x1 = _mm_max_ps(vo1x1, vmin);
1096 
1097         vo0x0 = _mm_min_ps(vo0x0, vmax);
1098         vo1x0 = _mm_min_ps(vo1x0, vmax);
1099         vo0x1 = _mm_min_ps(vo0x1, vmax);
1100         vo1x1 = _mm_min_ps(vo1x1, vmax);
1101 
1102         if (iw == 3) {
1103           // Exactly 2 output width elements remaining
1104           const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
1105           const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
1106           const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
1107           const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
1108 
1109           _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
1110           _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
1111           _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
1112           _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
1113 
1114           _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
1115           _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
1116           _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
1117           _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
1118         } else {
1119           // Exactly 1 output width element remaining
1120 
1121           _mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
1122           _mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
1123           _mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
1124           _mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;
1125 
1126           _mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
1127           _mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
1128           _mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
1129           _mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
1130         }
1131       }
1132       // Move output pointers back to the position of the first pixel in a row,
1133       // and forward to the next block of output channels.
1134       o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
1135       o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
1136       o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
1137       o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
1138       o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
1139       o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
1140       o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
1141       o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
1142       // Revert input pointers to the position of the first pixel in a row
1143       i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
1144       i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
1145       i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
1146       i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
1147       i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
1148       // Move to the block of weights for the next 4 output channels
1149       w += 112;
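      // doz() is a saturating "difference or zero" subtract: c drops to 0
      // instead of wrapping when fewer than 4 output channels remain.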
1150       c = doz(c, 4);
1151     } while (c != 0);
1152     // Move output pointers forward to the next two rows
1153     output0 = (float*) ((uintptr_t) output1 + output_height_stride);
1154     output1 = (float*) ((uintptr_t) output0 + output_height_stride);
1155     // Move input pointers forward to the next four rows
1156     i0 = i4;
1157     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1158     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1159     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
1160     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
1161   }
1162 }
1163 
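// 25-tap depthwise-convolution microkernel (e.g. a 5x5 filter window):
// processes up to 8 channels per output pixel using SSE multiply and add.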
1164 void xnn_f32_dwconv_minmax_ukernel_up8x25__sse(
1165     size_t channels,
1166     size_t output_width,
1167     const float** input,
1168     const float* weights,
1169     float* output,
1170     size_t input_stride,
1171     size_t output_increment,
1172     size_t input_offset,
1173     const float* zero,
1174     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1175 {
1176   assert(channels != 0);
1177   assert(output_width != 0);
1178 
1179   const __m128 vmax = _mm_load_ps(params->sse.max);
1180   const __m128 vmin = _mm_load_ps(params->sse.min);
1181   do {
1182     const float* i0 = input[0];
1183     assert(i0 != NULL);
1184     if XNN_UNPREDICTABLE(i0 != zero) {
1185       i0 = (const float*) ((uintptr_t) i0 + input_offset);
1186     }
1187     const float* i1 = input[1];
1188     assert(i1 != NULL);
1189     if XNN_UNPREDICTABLE(i1 != zero) {
1190       i1 = (const float*) ((uintptr_t) i1 + input_offset);
1191     }
1192     const float* i2 = input[2];
1193     assert(i2 != NULL);
1194     if XNN_UNPREDICTABLE(i2 != zero) {
1195       i2 = (const float*) ((uintptr_t) i2 + input_offset);
1196     }
1197     const float* i3 = input[3];
1198     assert(i3 != NULL);
1199     if XNN_UNPREDICTABLE(i3 != zero) {
1200       i3 = (const float*) ((uintptr_t) i3 + input_offset);
1201     }
1202     const float* i4 = input[4];
1203     assert(i4 != NULL);
1204     if XNN_UNPREDICTABLE(i4 != zero) {
1205       i4 = (const float*) ((uintptr_t) i4 + input_offset);
1206     }
1207     const float* i5 = input[5];
1208     assert(i5 != NULL);
1209     if XNN_UNPREDICTABLE(i5 != zero) {
1210       i5 = (const float*) ((uintptr_t) i5 + input_offset);
1211     }
1212     const float* i6 = input[6];
1213     assert(i6 != NULL);
1214     if XNN_UNPREDICTABLE(i6 != zero) {
1215       i6 = (const float*) ((uintptr_t) i6 + input_offset);
1216     }
1217     const float* i7 = input[7];
1218     assert(i7 != NULL);
1219     if XNN_UNPREDICTABLE(i7 != zero) {
1220       i7 = (const float*) ((uintptr_t) i7 + input_offset);
1221     }
1222     const float* i8 = input[8];
1223     assert(i8 != NULL);
1224     if XNN_UNPREDICTABLE(i8 != zero) {
1225       i8 = (const float*) ((uintptr_t) i8 + input_offset);
1226     }
1227     const float* i9 = input[9];
1228     assert(i9 != NULL);
1229     if XNN_UNPREDICTABLE(i9 != zero) {
1230       i9 = (const float*) ((uintptr_t) i9 + input_offset);
1231     }
1232     const float* i10 = input[10];
1233     assert(i10 != NULL);
1234     if XNN_UNPREDICTABLE(i10 != zero) {
1235       i10 = (const float*) ((uintptr_t) i10 + input_offset);
1236     }
1237     const float* i11 = input[11];
1238     assert(i11 != NULL);
1239     if XNN_UNPREDICTABLE(i11 != zero) {
1240       i11 = (const float*) ((uintptr_t) i11 + input_offset);
1241     }
1242     const float* i12 = input[12];
1243     assert(i12 != NULL);
1244     if XNN_UNPREDICTABLE(i12 != zero) {
1245       i12 = (const float*) ((uintptr_t) i12 + input_offset);
1246     }
1247     const float* i13 = input[13];
1248     assert(i13 != NULL);
1249     if XNN_UNPREDICTABLE(i13 != zero) {
1250       i13 = (const float*) ((uintptr_t) i13 + input_offset);
1251     }
1252     const float* i14 = input[14];
1253     assert(i14 != NULL);
1254     if XNN_UNPREDICTABLE(i14 != zero) {
1255       i14 = (const float*) ((uintptr_t) i14 + input_offset);
1256     }
1257     const float* i15 = input[15];
1258     assert(i15 != NULL);
1259     if XNN_UNPREDICTABLE(i15 != zero) {
1260       i15 = (const float*) ((uintptr_t) i15 + input_offset);
1261     }
1262     const float* i16 = input[16];
1263     assert(i16 != NULL);
1264     if XNN_UNPREDICTABLE(i16 != zero) {
1265       i16 = (const float*) ((uintptr_t) i16 + input_offset);
1266     }
1267     const float* i17 = input[17];
1268     assert(i17 != NULL);
1269     if XNN_UNPREDICTABLE(i17 != zero) {
1270       i17 = (const float*) ((uintptr_t) i17 + input_offset);
1271     }
1272     const float* i18 = input[18];
1273     assert(i18 != NULL);
1274     if XNN_UNPREDICTABLE(i18 != zero) {
1275       i18 = (const float*) ((uintptr_t) i18 + input_offset);
1276     }
1277     const float* i19 = input[19];
1278     assert(i19 != NULL);
1279     if XNN_UNPREDICTABLE(i19 != zero) {
1280       i19 = (const float*) ((uintptr_t) i19 + input_offset);
1281     }
1282     const float* i20 = input[20];
1283     assert(i20 != NULL);
1284     if XNN_UNPREDICTABLE(i20 != zero) {
1285       i20 = (const float*) ((uintptr_t) i20 + input_offset);
1286     }
1287     const float* i21 = input[21];
1288     assert(i21 != NULL);
1289     if XNN_UNPREDICTABLE(i21 != zero) {
1290       i21 = (const float*) ((uintptr_t) i21 + input_offset);
1291     }
1292     const float* i22 = input[22];
1293     assert(i22 != NULL);
1294     if XNN_UNPREDICTABLE(i22 != zero) {
1295       i22 = (const float*) ((uintptr_t) i22 + input_offset);
1296     }
1297     const float* i23 = input[23];
1298     assert(i23 != NULL);
1299     if XNN_UNPREDICTABLE(i23 != zero) {
1300       i23 = (const float*) ((uintptr_t) i23 + input_offset);
1301     }
1302     const float* i24 = input[24];
1303     assert(i24 != NULL);
1304     if XNN_UNPREDICTABLE(i24 != zero) {
1305       i24 = (const float*) ((uintptr_t) i24 + input_offset);
1306     }
1307     input = (const float**) ((uintptr_t) input + input_stride);
1308 
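    // Packed weights for each group of 8 channels: 8 bias values followed by
    // 25 taps x 8 channels, i.e. 208 floats per group (matching w += 208 below).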
1309     size_t c = channels;
1310     const float* w = weights;
1311     for (; c >= 8; c -= 8) {
1312       __m128 vacc0123p0 = _mm_load_ps(w);
1313       __m128 vacc4567p0 = _mm_load_ps(w + 4);
1314 
1315 
1316       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1317       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1318       i0 += 8;
1319 
1320       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1321       const __m128 vk0x4567 = _mm_load_ps(w + 12);
1322       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1323       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1324 
1325       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1326       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1327       i1 += 8;
1328 
1329       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1330       const __m128 vk1x4567 = _mm_load_ps(w + 20);
1331       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1332       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1333 
1334       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1335       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1336       i2 += 8;
1337 
1338       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1339       const __m128 vk2x4567 = _mm_load_ps(w + 28);
1340       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1341       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1342 
1343       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1344       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
1345       i3 += 8;
1346 
1347       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1348       const __m128 vk3x4567 = _mm_load_ps(w + 36);
1349       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1350       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
1351 
1352       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1353       const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
1354       i4 += 8;
1355 
1356       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1357       const __m128 vk4x4567 = _mm_load_ps(w + 44);
1358       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1359       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
1360 
1361       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1362       const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
1363       i5 += 8;
1364 
1365       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1366       const __m128 vk5x4567 = _mm_load_ps(w + 52);
1367       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1368       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
1369 
1370       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1371       const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
1372       i6 += 8;
1373 
1374       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1375       const __m128 vk6x4567 = _mm_load_ps(w + 60);
1376       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1377       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
1378 
1379       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1380       const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
1381       i7 += 8;
1382 
1383       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1384       const __m128 vk7x4567 = _mm_load_ps(w + 68);
1385       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1386       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
1387 
1388       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1389       const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
1390       i8 += 8;
1391 
1392       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1393       const __m128 vk8x4567 = _mm_load_ps(w + 76);
1394       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1395       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
1396 
1397       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1398       const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
1399       i9 += 8;
1400 
1401       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1402       const __m128 vk9x4567 = _mm_load_ps(w + 84);
1403       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1404       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi9x4567, vk9x4567));
1405 
1406       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1407       const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
1408       i10 += 8;
1409 
1410       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1411       const __m128 vk10x4567 = _mm_load_ps(w + 92);
1412       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1413       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
1414 
1415       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1416       const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
1417       i11 += 8;
1418 
1419       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1420       const __m128 vk11x4567 = _mm_load_ps(w + 100);
1421       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1422       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi11x4567, vk11x4567));
1423 
1424       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1425       const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
1426       i12 += 8;
1427 
1428       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1429       const __m128 vk12x4567 = _mm_load_ps(w + 108);
1430       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1431       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
1432 
1433       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1434       const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
1435       i13 += 8;
1436 
1437       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1438       const __m128 vk13x4567 = _mm_load_ps(w + 116);
1439       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1440       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi13x4567, vk13x4567));
1441 
1442       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1443       const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
1444       i14 += 8;
1445 
1446       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1447       const __m128 vk14x4567 = _mm_load_ps(w + 124);
1448       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1449       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
1450 
1451       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1452       const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
1453       i15 += 8;
1454 
1455       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1456       const __m128 vk15x4567 = _mm_load_ps(w + 132);
1457       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1458       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi15x4567, vk15x4567));
1459 
1460       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1461       const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
1462       i16 += 8;
1463 
1464       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1465       const __m128 vk16x4567 = _mm_load_ps(w + 140);
1466       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1467       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
1468 
1469       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1470       const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
1471       i17 += 8;
1472 
1473       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1474       const __m128 vk17x4567 = _mm_load_ps(w + 148);
1475       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1476       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi17x4567, vk17x4567));
1477 
1478       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1479       const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
1480       i18 += 8;
1481 
1482       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1483       const __m128 vk18x4567 = _mm_load_ps(w + 156);
1484       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1485       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
1486 
1487       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1488       const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
1489       i19 += 8;
1490 
1491       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1492       const __m128 vk19x4567 = _mm_load_ps(w + 164);
1493       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1494       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi19x4567, vk19x4567));
1495 
1496       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1497       const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
1498       i20 += 8;
1499 
1500       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1501       const __m128 vk20x4567 = _mm_load_ps(w + 172);
1502       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1503       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
1504 
1505       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1506       const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
1507       i21 += 8;
1508 
1509       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1510       const __m128 vk21x4567 = _mm_load_ps(w + 180);
1511       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1512       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi21x4567, vk21x4567));
1513 
1514       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1515       const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
1516       i22 += 8;
1517 
1518       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1519       const __m128 vk22x4567 = _mm_load_ps(w + 188);
1520       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1521       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
1522 
1523       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1524       const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
1525       i23 += 8;
1526 
1527       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1528       const __m128 vk23x4567 = _mm_load_ps(w + 196);
1529       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1530       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi23x4567, vk23x4567));
1531 
1532       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1533       const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
1534       i24 += 8;
1535 
1536       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1537       const __m128 vk24x4567 = _mm_load_ps(w + 204);
1538       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1539       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
1540 
1541       w += 208;
1542 
1543 
1544       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1545       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1546       vacc0123 = _mm_min_ps(vacc0123, vmax);
1547       vacc4567 = _mm_min_ps(vacc4567, vmax);
1548 
1549       _mm_storeu_ps(output, vacc0123);
1550       _mm_storeu_ps(output + 4, vacc4567);
1551       output += 8;
1552     }
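    // Channel remainder: the tail channels are assumed to share the last
    // zero-padded 8-channel weight block, so taps are still read at the same
    // w + 8*k offsets and w advances only by the 4 channels consumed here.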
1553     for (; c >= 4; c -= 4) {
1554       __m128 vacc0123p0 = _mm_load_ps(w);
1555 
1556       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1557       i0 += 4;
1558 
1559       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1560       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1561 
1562       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1563       i1 += 4;
1564 
1565       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1566       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1567 
1568       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1569       i2 += 4;
1570 
1571       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1572       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1573 
1574       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1575       i3 += 4;
1576 
1577       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1578       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1579 
1580       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1581       i4 += 4;
1582 
1583       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1584       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1585 
1586       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1587       i5 += 4;
1588 
1589       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1590       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1591 
1592       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1593       i6 += 4;
1594 
1595       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1596       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1597 
1598       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1599       i7 += 4;
1600 
1601       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1602       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1603 
1604       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1605       i8 += 4;
1606 
1607       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1608       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1609 
1610       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1611       i9 += 4;
1612 
1613       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1614       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1615 
1616       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1617       i10 += 4;
1618 
1619       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1620       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1621 
1622       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1623       i11 += 4;
1624 
1625       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1626       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1627 
1628       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1629       i12 += 4;
1630 
1631       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1632       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1633 
1634       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1635       i13 += 4;
1636 
1637       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1638       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1639 
1640       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1641       i14 += 4;
1642 
1643       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1644       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1645 
1646       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1647       i15 += 4;
1648 
1649       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1650       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1651 
1652       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1653       i16 += 4;
1654 
1655       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1656       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1657 
1658       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1659       i17 += 4;
1660 
1661       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1662       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1663 
1664       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1665       i18 += 4;
1666 
1667       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1668       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1669 
1670       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1671       i19 += 4;
1672 
1673       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1674       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1675 
1676       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1677       i20 += 4;
1678 
1679       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1680       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1681 
1682       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1683       i21 += 4;
1684 
1685       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1686       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1687 
1688       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1689       i22 += 4;
1690 
1691       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1692       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1693 
1694       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1695       i23 += 4;
1696 
1697       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1698       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1699 
1700       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1701       i24 += 4;
1702 
1703       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1704       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1705 
1706       w += 4;
1707 
1708 
1709       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1710       vacc0123 = _mm_min_ps(vacc0123, vmax);
1711 
1712       _mm_storeu_ps(output, vacc0123);
1713       output += 4;
1714     }
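    // Final 1-3 channels: full 4-lane vectors are loaded (the function is
    // marked XNN_OOB_READS, so reading a few floats past the last channel is
    // tolerated) and only the valid lanes are stored below.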
1715     if XNN_UNLIKELY(c != 0) {
1716       __m128 vacc0123p0 = _mm_load_ps(w);
1717 
1718       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1719       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1720       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1721 
1722       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1723       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1724       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1725 
1726       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1727       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1728       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1729 
1730       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1731       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1732       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1733 
1734       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1735       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1736       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1737 
1738       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1739       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1740       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1741 
1742       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1743       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1744       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1745 
1746       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1747       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1748       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1749 
1750       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1751       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1752       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1753 
1754       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1755       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1756       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1757 
1758       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1759       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1760       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1761 
1762       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1763       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1764       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1765 
1766       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1767       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1768       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1769 
1770       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1771       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1772       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1773 
1774       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1775       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1776       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1777 
1778       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1779       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1780       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1781 
1782       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1783       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1784       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1785 
1786       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1787       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1788       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1789 
1790       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1791       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1792       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1793 
1794       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1795       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1796       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1797 
1798       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1799       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1800       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1801 
1802       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1803       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1804       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1805 
1806       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1807       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1808       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1809 
1810       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1811       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1812       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1813 
1814       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1815       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1816       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1817 
1818 
1819       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1820       vacc0123 = _mm_min_ps(vacc0123, vmax);
1821 
1822       if (c & 2) {
1823         _mm_storel_pi((__m64*) output, vacc0123);
1824         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1825         output += 2;
1826       }
1827       if (c & 1) {
1828         _mm_store_ss(output, vacc0123);
1829         output += 1;
1830       }
1831     }
1832 
1833     output = (float*) ((uintptr_t) output + output_increment);
1834   } while (--output_width != 0);
1835 }
1836 
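// 3-tap depthwise-convolution microkernel, up to 8 channels per iteration.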
1837 void xnn_f32_dwconv_minmax_ukernel_up8x3__sse(
1838     size_t channels,
1839     size_t output_width,
1840     const float** input,
1841     const float* weights,
1842     float* output,
1843     size_t input_stride,
1844     size_t output_increment,
1845     size_t input_offset,
1846     const float* zero,
1847     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1848 {
1849   assert(channels != 0);
1850   assert(output_width != 0);
1851 
1852   const __m128 vmax = _mm_load_ps(params->sse.max);
1853   const __m128 vmin = _mm_load_ps(params->sse.min);
1854   do {
1855     const float* i0 = input[0];
1856     assert(i0 != NULL);
1857     if XNN_UNPREDICTABLE(i0 != zero) {
1858       i0 = (const float*) ((uintptr_t) i0 + input_offset);
1859     }
1860     const float* i1 = input[1];
1861     assert(i1 != NULL);
1862     if XNN_UNPREDICTABLE(i1 != zero) {
1863       i1 = (const float*) ((uintptr_t) i1 + input_offset);
1864     }
1865     const float* i2 = input[2];
1866     assert(i2 != NULL);
1867     if XNN_UNPREDICTABLE(i2 != zero) {
1868       i2 = (const float*) ((uintptr_t) i2 + input_offset);
1869     }
1870     input = (const float**) ((uintptr_t) input + input_stride);
1871 
1872     size_t c = channels;
1873     const float* w = weights;
1874     for (; c >= 8; c -= 8) {
1875       __m128 vacc0123p0 = _mm_load_ps(w);
1876       __m128 vacc4567p0 = _mm_load_ps(w + 4);
1877 
1878 
1879       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1880       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1881       i0 += 8;
1882 
1883       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1884       const __m128 vk0x4567 = _mm_load_ps(w + 12);
1885       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1886       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1887 
1888       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1889       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1890       i1 += 8;
1891 
1892       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1893       const __m128 vk1x4567 = _mm_load_ps(w + 20);
1894       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1895       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1896 
1897       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1898       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1899       i2 += 8;
1900 
1901       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1902       const __m128 vk2x4567 = _mm_load_ps(w + 28);
1903       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1904       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1905 
1906       w += 32;
1907 
1908 
1909       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1910       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1911       vacc0123 = _mm_min_ps(vacc0123, vmax);
1912       vacc4567 = _mm_min_ps(vacc4567, vmax);
1913 
1914       _mm_storeu_ps(output, vacc0123);
1915       _mm_storeu_ps(output + 4, vacc4567);
1916       output += 8;
1917     }
1918     for (; c >= 4; c -= 4) {
1919       __m128 vacc0123p0 = _mm_load_ps(w);
1920 
1921       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1922       i0 += 4;
1923 
1924       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1925       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1926 
1927       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1928       i1 += 4;
1929 
1930       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1931       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1932 
1933       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1934       i2 += 4;
1935 
1936       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1937       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1938 
1939       w += 4;
1940 
1941 
1942       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1943       vacc0123 = _mm_min_ps(vacc0123, vmax);
1944 
1945       _mm_storeu_ps(output, vacc0123);
1946       output += 4;
1947     }
1948     if XNN_UNLIKELY(c != 0) {
1949       __m128 vacc0123p0 = _mm_load_ps(w);
1950 
1951       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1952       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1953       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1954 
1955       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1956       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1957       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1958 
1959       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1960       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1961       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1962 
1963 
1964       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1965       vacc0123 = _mm_min_ps(vacc0123, vmax);
1966 
1967       if (c & 2) {
1968         _mm_storel_pi((__m64*) output, vacc0123);
1969         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1970         output += 2;
1971       }
1972       if (c & 1) {
1973         _mm_store_ss(output, vacc0123);
1974         output += 1;
1975       }
1976     }
1977 
1978     output = (float*) ((uintptr_t) output + output_increment);
1979   } while (--output_width != 0);
1980 }
1981 
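// 4-tap depthwise-convolution microkernel, up to 8 channels per iteration.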
1982 void xnn_f32_dwconv_minmax_ukernel_up8x4__sse(
1983     size_t channels,
1984     size_t output_width,
1985     const float** input,
1986     const float* weights,
1987     float* output,
1988     size_t input_stride,
1989     size_t output_increment,
1990     size_t input_offset,
1991     const float* zero,
1992     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1993 {
1994   assert(channels != 0);
1995   assert(output_width != 0);
1996 
1997   const __m128 vmax = _mm_load_ps(params->sse.max);
1998   const __m128 vmin = _mm_load_ps(params->sse.min);
1999   do {
2000     const float* i0 = input[0];
2001     assert(i0 != NULL);
2002     if XNN_UNPREDICTABLE(i0 != zero) {
2003       i0 = (const float*) ((uintptr_t) i0 + input_offset);
2004     }
2005     const float* i1 = input[1];
2006     assert(i1 != NULL);
2007     if XNN_UNPREDICTABLE(i1 != zero) {
2008       i1 = (const float*) ((uintptr_t) i1 + input_offset);
2009     }
2010     const float* i2 = input[2];
2011     assert(i2 != NULL);
2012     if XNN_UNPREDICTABLE(i2 != zero) {
2013       i2 = (const float*) ((uintptr_t) i2 + input_offset);
2014     }
2015     const float* i3 = input[3];
2016     assert(i3 != NULL);
2017     if XNN_UNPREDICTABLE(i3 != zero) {
2018       i3 = (const float*) ((uintptr_t) i3 + input_offset);
2019     }
2020     input = (const float**) ((uintptr_t) input + input_stride);
2021 
2022     size_t c = channels;
2023     const float* w = weights;
2024     for (; c >= 8; c -= 8) {
2025       __m128 vacc0123p0 = _mm_load_ps(w);
2026       __m128 vacc4567p0 = _mm_load_ps(w + 4);
2027 
2028 
2029       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2030       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2031       i0 += 8;
2032 
2033       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2034       const __m128 vk0x4567 = _mm_load_ps(w + 12);
2035       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2036       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2037 
2038       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2039       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2040       i1 += 8;
2041 
2042       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2043       const __m128 vk1x4567 = _mm_load_ps(w + 20);
2044       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2045       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2046 
2047       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2048       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2049       i2 += 8;
2050 
2051       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2052       const __m128 vk2x4567 = _mm_load_ps(w + 28);
2053       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2054       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2055 
2056       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2057       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2058       i3 += 8;
2059 
2060       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2061       const __m128 vk3x4567 = _mm_load_ps(w + 36);
2062       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2063       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2064 
2065       w += 40;
2066 
2067 
2068       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2069       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2070       vacc0123 = _mm_min_ps(vacc0123, vmax);
2071       vacc4567 = _mm_min_ps(vacc4567, vmax);
2072 
2073       _mm_storeu_ps(output, vacc0123);
2074       _mm_storeu_ps(output + 4, vacc4567);
2075       output += 8;
2076     }
2077     for (; c >= 4; c -= 4) {
2078       __m128 vacc0123p0 = _mm_load_ps(w);
2079 
2080       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2081       i0 += 4;
2082 
2083       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2084       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2085 
2086       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2087       i1 += 4;
2088 
2089       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2090       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2091 
2092       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2093       i2 += 4;
2094 
2095       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2096       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2097 
2098       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2099       i3 += 4;
2100 
2101       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2102       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2103 
2104       w += 4;
2105 
2106 
2107       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2108       vacc0123 = _mm_min_ps(vacc0123, vmax);
2109 
2110       _mm_storeu_ps(output, vacc0123);
2111       output += 4;
2112     }
2113     if XNN_UNLIKELY(c != 0) {
2114       __m128 vacc0123p0 = _mm_load_ps(w);
2115 
2116       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2117       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2118       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2119 
2120       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2121       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2122       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2123 
2124       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2125       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2126       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2127 
2128       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2129       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2130       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2131 
2132 
2133       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2134       vacc0123 = _mm_min_ps(vacc0123, vmax);
2135 
2136       if (c & 2) {
2137         _mm_storel_pi((__m64*) output, vacc0123);
2138         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2139         output += 2;
2140       }
2141       if (c & 1) {
2142         _mm_store_ss(output, vacc0123);
2143         output += 1;
2144       }
2145     }
2146 
2147     output = (float*) ((uintptr_t) output + output_increment);
2148   } while (--output_width != 0);
2149 }
2150 
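// 9-tap depthwise-convolution microkernel (e.g. a 3x3 filter window), up to
// 8 channels per iteration.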
2151 void xnn_f32_dwconv_minmax_ukernel_up8x9__sse(
2152     size_t channels,
2153     size_t output_width,
2154     const float** input,
2155     const float* weights,
2156     float* output,
2157     size_t input_stride,
2158     size_t output_increment,
2159     size_t input_offset,
2160     const float* zero,
2161     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2162 {
2163   assert(channels != 0);
2164   assert(output_width != 0);
2165 
2166   const __m128 vmax = _mm_load_ps(params->sse.max);
2167   const __m128 vmin = _mm_load_ps(params->sse.min);
2168   do {
2169     const float* i0 = input[0];
2170     assert(i0 != NULL);
2171     if XNN_UNPREDICTABLE(i0 != zero) {
2172       i0 = (const float*) ((uintptr_t) i0 + input_offset);
2173     }
2174     const float* i1 = input[1];
2175     assert(i1 != NULL);
2176     if XNN_UNPREDICTABLE(i1 != zero) {
2177       i1 = (const float*) ((uintptr_t) i1 + input_offset);
2178     }
2179     const float* i2 = input[2];
2180     assert(i2 != NULL);
2181     if XNN_UNPREDICTABLE(i2 != zero) {
2182       i2 = (const float*) ((uintptr_t) i2 + input_offset);
2183     }
2184     const float* i3 = input[3];
2185     assert(i3 != NULL);
2186     if XNN_UNPREDICTABLE(i3 != zero) {
2187       i3 = (const float*) ((uintptr_t) i3 + input_offset);
2188     }
2189     const float* i4 = input[4];
2190     assert(i4 != NULL);
2191     if XNN_UNPREDICTABLE(i4 != zero) {
2192       i4 = (const float*) ((uintptr_t) i4 + input_offset);
2193     }
2194     const float* i5 = input[5];
2195     assert(i5 != NULL);
2196     if XNN_UNPREDICTABLE(i5 != zero) {
2197       i5 = (const float*) ((uintptr_t) i5 + input_offset);
2198     }
2199     const float* i6 = input[6];
2200     assert(i6 != NULL);
2201     if XNN_UNPREDICTABLE(i6 != zero) {
2202       i6 = (const float*) ((uintptr_t) i6 + input_offset);
2203     }
2204     const float* i7 = input[7];
2205     assert(i7 != NULL);
2206     if XNN_UNPREDICTABLE(i7 != zero) {
2207       i7 = (const float*) ((uintptr_t) i7 + input_offset);
2208     }
2209     const float* i8 = input[8];
2210     assert(i8 != NULL);
2211     if XNN_UNPREDICTABLE(i8 != zero) {
2212       i8 = (const float*) ((uintptr_t) i8 + input_offset);
2213     }
2214     input = (const float**) ((uintptr_t) input + input_stride);
2215 
2216     size_t c = channels;
2217     const float* w = weights;
2218     for (; c >= 8; c -= 8) {
2219       __m128 vacc0123p0 = _mm_load_ps(w);
2220       __m128 vacc4567p0 = _mm_load_ps(w + 4);
2221 
2222 
2223       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2224       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2225       i0 += 8;
2226 
2227       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2228       const __m128 vk0x4567 = _mm_load_ps(w + 12);
2229       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2230       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2231 
2232       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2233       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2234       i1 += 8;
2235 
2236       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2237       const __m128 vk1x4567 = _mm_load_ps(w + 20);
2238       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2239       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2240 
2241       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2242       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2243       i2 += 8;
2244 
2245       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2246       const __m128 vk2x4567 = _mm_load_ps(w + 28);
2247       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2248       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2249 
2250       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2251       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2252       i3 += 8;
2253 
2254       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2255       const __m128 vk3x4567 = _mm_load_ps(w + 36);
2256       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2257       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2258 
2259       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2260       const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
2261       i4 += 8;
2262 
2263       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2264       const __m128 vk4x4567 = _mm_load_ps(w + 44);
2265       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2266       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
2267 
2268       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2269       const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
2270       i5 += 8;
2271 
2272       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2273       const __m128 vk5x4567 = _mm_load_ps(w + 52);
2274       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2275       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
2276 
2277       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2278       const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
2279       i6 += 8;
2280 
2281       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2282       const __m128 vk6x4567 = _mm_load_ps(w + 60);
2283       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2284       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
2285 
2286       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2287       const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
2288       i7 += 8;
2289 
2290       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2291       const __m128 vk7x4567 = _mm_load_ps(w + 68);
2292       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2293       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
2294 
2295       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2296       const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
2297       i8 += 8;
2298 
2299       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2300       const __m128 vk8x4567 = _mm_load_ps(w + 76);
2301       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2302       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
2303 
2304       w += 80;
2305 
2306 
2307       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2308       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2309       vacc0123 = _mm_min_ps(vacc0123, vmax);
2310       vacc4567 = _mm_min_ps(vacc4567, vmax);
2311 
2312       _mm_storeu_ps(output, vacc0123);
2313       _mm_storeu_ps(output + 4, vacc4567);
2314       output += 8;
2315     }
2316     for (; c >= 4; c -= 4) {
2317       __m128 vacc0123p0 = _mm_load_ps(w);
2318 
2319       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2320       i0 += 4;
2321 
2322       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2323       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2324 
2325       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2326       i1 += 4;
2327 
2328       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2329       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2330 
2331       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2332       i2 += 4;
2333 
2334       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2335       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2336 
2337       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2338       i3 += 4;
2339 
2340       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2341       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2342 
2343       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2344       i4 += 4;
2345 
2346       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2347       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2348 
2349       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2350       i5 += 4;
2351 
2352       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2353       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2354 
2355       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2356       i6 += 4;
2357 
2358       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2359       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2360 
2361       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2362       i7 += 4;
2363 
2364       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2365       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2366 
2367       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2368       i8 += 4;
2369 
2370       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2371       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2372 
2373       w += 4;
2374 
2375 
2376       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2377       vacc0123 = _mm_min_ps(vacc0123, vmax);
2378 
2379       _mm_storeu_ps(output, vacc0123);
2380       output += 4;
2381     }
2382     if XNN_UNLIKELY(c != 0) {
2383       __m128 vacc0123p0 = _mm_load_ps(w);
2384 
2385       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2386       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2387       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2388 
2389       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2390       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2391       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2392 
2393       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2394       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2395       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2396 
2397       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2398       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2399       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2400 
2401       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2402       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2403       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2404 
2405       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2406       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2407       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2408 
2409       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2410       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2411       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2412 
2413       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2414       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2415       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2416 
2417       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2418       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2419       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2420 
2421 
2422       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2423       vacc0123 = _mm_min_ps(vacc0123, vmax);
2424 
2425       if (c & 2) {
2426         _mm_storel_pi((__m64*) output, vacc0123);
2427         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2428         output += 2;
2429       }
2430       if (c & 1) {
2431         _mm_store_ss(output, vacc0123);
2432         output += 1;
2433       }
2434     }
2435 
2436     output = (float*) ((uintptr_t) output + output_increment);
2437   } while (--output_width != 0);
2438 }
2439 
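// Depthwise 2D convolution in CHW layout: 3x3 kernel, stride 1, unit padding
// (padding_top == 1 is asserted below), producing 2 output rows by 4 output
// columns per iteration with 2 partial accumulators per output ("2x4_acc2").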
2440 void xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2(
2441     size_t input_height,
2442     size_t input_width,
2443     const float* input,
2444     const float* weights,
2445     const float* zero,
2446     float* output,
2447     uint32_t padding_top,
2448     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2449 {
2450   assert(input_height != 0);
2451   assert(input_width != 0);
2452   assert(input_width % sizeof(float) == 0);
2453   assert(padding_top == 1);
2454 
2455   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2456   const __m128 vmax = _mm_load_ps(params->sse.max);
2457   const __m128 vmin = _mm_load_ps(params->sse.min);
2458 
2459   const __m128 vbias = _mm_load1_ps(weights);
2460   const __m128 vk00 = _mm_load1_ps(weights + 1);
2461   const __m128 vk01 = _mm_load1_ps(weights + 2);
2462   const __m128 vk02 = _mm_load1_ps(weights + 3);
2463   const __m128 vk10 = _mm_load1_ps(weights + 4);
2464   const __m128 vk11 = _mm_load1_ps(weights + 5);
2465   const __m128 vk12 = _mm_load1_ps(weights + 6);
2466   const __m128 vk20 = _mm_load1_ps(weights + 7);
2467   const __m128 vk21 = _mm_load1_ps(weights + 8);
2468   const __m128 vk22 = _mm_load1_ps(weights + 9);
2469 
2470   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2471 
2472   const float* i0 = zero;
2473   const float* i1 = input;
2474   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2475   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2476 
2477   float* o0 = output;
2478   float* o1 = (float*) ((uintptr_t) o0 + input_width);
2479 
2480   size_t output_height = input_height;
2481   do {
2482     if XNN_UNPREDICTABLE(output_height < 2) {
2483       i2 = zero;
2484       o1 = o0;
2485     }
2486     if XNN_UNPREDICTABLE(output_height < 3) {
2487       i3 = zero;
2488     }
2489 
2490     // vi0x3012 = ( vi02, vi01, vi{M}0, vi{M}3 )
2491     __m128 vi0x3012 = _mm_setzero_ps();
2492     // vi1x3012 = ( vi12, vi11, vi{M}0, vi{M}3 )
2493     __m128 vi1x3012 = _mm_setzero_ps();
2494     // vi2x3012 = ( vi22, vi21, vi{M}0, vi{M}3 )
2495     __m128 vi2x3012 = _mm_setzero_ps();
2496     // vi3x3012 = ( vi32, vi31, vi{M}0, vi{M}3 )
2497     __m128 vi3x3012 = _mm_setzero_ps();
2498 
2499     __m128 vi0x4567 = _mm_loadu_ps(i0);
2500     i0 += 4;
2501     __m128 vi1x4567 = _mm_loadu_ps(i1);
2502     i1 += 4;
2503     __m128 vi2x4567 = _mm_loadu_ps(i2);
2504     i2 += 4;
2505     __m128 vi3x4567 = _mm_loadu_ps(i3);
2506     i3 += 4;
2507 
2508     size_t w = input_width;
2509     for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
2510       // vi0x89AB = ( vi0B, vi0A, vi09, vi08 )
2511       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2512       i0 += 4;
2513       // vi1x89AB = ( vi1B, vi1A, vi19, vi18 )
2514       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2515       i1 += 4;
2516       // vi2x89AB = ( vi2B, vi2A, vi29, vi28 )
2517       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2518       i2 += 4;
2519       // vi3x89AB = ( vi3B, vi3A, vi39, vi38 )
2520       const __m128 vi3x89AB = _mm_loadu_ps(i3);
2521       i3 += 4;
2522 
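      // Rotate each row right by one lane; merging in the carried vi*x3012
      // lane below yields the window shifted one column to the left
      // (columns 3-6), used with the k*0 taps.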
2523       // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2524       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2525       // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2526       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2527       // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2528       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2529       // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2530       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2531 
2532       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2533       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2534       __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2535       __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2536       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2537       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2538 
2539       // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2540       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2541       // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2542       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2543       // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2544       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2545       // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2546       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2547 
2548       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2549       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2550       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2551       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2552       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2553       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2554 
2555       vi0x3012 = vi0x7456;
2556       vi1x3012 = vi1x7456;
2557       vi2x3012 = vi2x7456;
2558       vi3x3012 = vi3x7456;
2559 
2560       // vi0x8567 = ( vi07, vi06, vi05, vi08 )
2561       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
2562       // vi1x8567 = ( vi17, vi16, vi15, vi18 )
2563       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
2564       // vi2x8567 = ( vi27, vi26, vi25, vi28 )
2565       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
2566       // vi3x8567 = ( vi37, vi36, vi35, vi38 )
2567       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
2568 
2569       // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2570       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2571       // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2572       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2573       // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2574       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2575       // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2576       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2577 
2578       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2579       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2580       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2581       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2582       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2583       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2584 
2585       vi0x4567 = vi0x89AB;
2586       vi1x4567 = vi1x89AB;
2587       vi2x4567 = vi2x89AB;
2588       vi3x4567 = vi3x89AB;
2589 
2590       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2591       vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2592 
2593       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2594       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2595 
2596       vo0 = _mm_min_ps(vo0, vmax);
2597       vo1 = _mm_min_ps(vo1, vmax);
2598 
2599       _mm_storeu_ps(o1, vo1);
2600       o1 += 4;
2601       _mm_storeu_ps(o0, vo0);
2602       o0 += 4;
2603     }
2604     // Always process the last block of 1..4 pixels.
2605     assert(w >= 1 * sizeof(float));
2606     assert(w <= 4 * sizeof(float));
2607     {
2608       vi0x4567 = _mm_and_ps(vmask, vi0x4567);
2609       vi1x4567 = _mm_and_ps(vmask, vi1x4567);
2610       vi2x4567 = _mm_and_ps(vmask, vi2x4567);
2611       vi3x4567 = _mm_and_ps(vmask, vi3x4567);
2612 
2613       // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2614       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2615       // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2616       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2617       // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2618       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2619       // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2620       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2621 
2622       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2623       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2624       __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2625       __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2626       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2627       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2628 
2629       // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2630       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2631       // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2632       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2633       // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2634       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2635       // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2636       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2637 
2638       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2639       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2640       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2641       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2642       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2643       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2644 
2645       const __m128 vzero = _mm_setzero_ps();
2646       // vi0x8567 = ( vi07, vi06, vi05, 0.0 )
2647       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
2648       // vi1x8567 = ( vi17, vi16, vi15, 0.0 )
2649       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
2650       // vi2x8567 = ( vi27, vi26, vi25, 0.0 )
2651       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
2652       // vi3x8567 = ( vi37, vi36, vi35, 0.0 )
2653       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
2654 
2655       // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2656       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2657       // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2658       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2659       // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2660       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2661       // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2662       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2663 
2664       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2665       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2666       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2667       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2668       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2669       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2670 
2671       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2672       vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2673 
2674       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2675       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2676 
2677       vo0 = _mm_min_ps(vo0, vmax);
2678       vo1 = _mm_min_ps(vo1, vmax);
2679 
2680       if XNN_LIKELY(w == 4 * sizeof(float)) {
2681         _mm_storeu_ps(o1, vo1);
2682         o1 += 4;
2683         _mm_storeu_ps(o0, vo0);
2684         o0 += 4;
2685       } else {
2686         if (w & (2 * sizeof(float))) {
2687           _mm_storel_pi((__m64*) o1, vo1);
2688           o1 += 2;
2689           _mm_storel_pi((__m64*) o0, vo0);
2690           o0 += 2;
2691 
2692           vo0 = _mm_movehl_ps(vo0, vo0);
2693           vo1 = _mm_movehl_ps(vo1, vo1);
2694         }
2695         if (w & (1 * sizeof(float))) {
2696           _mm_store_ss(o1, vo1);
2697           o1 += 1;
2698           _mm_store_ss(o0, vo0);
2699           o0 += 1;
2700         }
2701       }
2702     }
2703 
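    // Advance to the next pair of output rows: input pointers move down two
    // rows (rewound by the columns consumed above) and o0/o1 continue where
    // the previous output rows ended.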
2704     i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2705     i1 = (const float*) ((uintptr_t) i3 - input_decrement);
2706     i2 = (const float*) ((uintptr_t) i1 + input_width);
2707     i3 = (const float*) ((uintptr_t) i2 + input_width);
2708 
2709     o0 = o1;
2710     o1 = (float*) ((uintptr_t) o0 + input_width);
2711 
2712     output_height = doz(output_height, 2);
2713   } while (output_height != 0);
2714 }
2715 
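// Depthwise 3x3 convolution, stride 2, 1-pixel padding ("3x3s2p1"), on CHW
// data. Each iteration produces 1 output row of 4 pixels ("1x4") using 3
// partial accumulators ("acc3"); even and odd input columns are de-interleaved
// with shuffles so every tap operates on a contiguous vector.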
2716 void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
2717     size_t input_height,
2718     size_t input_width,
2719     const float* input,
2720     const float* weights,
2721     const float* zero,
2722     float* output,
2723     uint32_t padding_top,
2724     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2725 {
2726   assert(input_height != 0);
2727   assert(input_width != 0);
2728   assert(input_width % sizeof(float) == 0);
2729   assert(padding_top >= 0);
2730   assert(padding_top <= 1);
2731 
2732   const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
2733   const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
2734   const __m128 vmax = _mm_load_ps(params->sse.max);
2735   const __m128 vmin = _mm_load_ps(params->sse.min);
2736 
2737   const __m128 vbias = _mm_load1_ps(weights);
2738   const __m128 vk00 = _mm_load1_ps(weights + 1);
2739   const __m128 vk01 = _mm_load1_ps(weights + 2);
2740   const __m128 vk02 = _mm_load1_ps(weights + 3);
2741   const __m128 vk10 = _mm_load1_ps(weights + 4);
2742   const __m128 vk11 = _mm_load1_ps(weights + 5);
2743   const __m128 vk12 = _mm_load1_ps(weights + 6);
2744   const __m128 vk20 = _mm_load1_ps(weights + 7);
2745   const __m128 vk21 = _mm_load1_ps(weights + 8);
2746   const __m128 vk22 = _mm_load1_ps(weights + 9);
2747 
2748   const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
2749 
2750   const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
2751   const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
2752   if XNN_UNPREDICTABLE(padding_top != 0) {
2753     i0 = zero;
2754   }
2755   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2756 
2757   float* o0 = output;
2758 
2759   size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
2760   size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
2761   do {
2762     if XNN_UNPREDICTABLE(padded_input_height < 4) {
2763       i2 = zero;
2764     }
2765 
2766     __m128 vi0x7531 = _mm_setzero_ps();
2767     __m128 vi1x7531 = _mm_setzero_ps();
2768     __m128 vi2x7531 = _mm_setzero_ps();
2769 
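    // Main loop: load 8 input columns per row, split them into even (x8ACE)
    // and odd (x9BDF) halves, and emit 4 stride-2 outputs; vi*x7531 keeps the
    // previous block's odd columns so the left-neighbor tap needs no reload.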
2770     size_t w = input_width;
2771     for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
2772       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2773       const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2774       i0 += 8;
2775       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2776       const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2777       i1 += 8;
2778       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2779       const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2780       i2 += 8;
2781 
2782       const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2783       const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2784       const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2785       const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2786       const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2787       const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2788 
2789       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2790       __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2791       __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2792 
2793       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2794       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2795       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2796 
2797       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2798       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2799       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2800 
2801       const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2802       const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2803       const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2804 
2805       vi0x7531 = vi0xF9BD;
2806       vi1x7531 = vi1xF9BD;
2807       vi2x7531 = vi2xF9BD;
2808 
2809       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2810       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2811       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2812 
2813       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2814       vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2815 
2816       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2817 
2818       vo0 = _mm_min_ps(vo0, vmax);
2819 
2820       _mm_storeu_ps(o0, vo0);
2821       o0 += 4;
2822     }
2823     // Potentially process the last block of 0..7 pixels.
2824     assert(w < 8 * sizeof(float));
2825     if XNN_LIKELY(w != 0) {
2826       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2827       const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2828       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2829       const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2830       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2831       const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2832 
2833       const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2834       const __m128 vi0x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2835       const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2836       const __m128 vi1x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2837       const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2838       const __m128 vi2x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2839 
2840       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2841       __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2842       __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2843 
2844       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2845       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2846       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2847 
2848       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2849       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2850       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2851 
2852       const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2853       const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2854       const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2855 
2856       vi0x7531 = vi0xF9BD;
2857       vi1x7531 = vi1xF9BD;
2858       vi2x7531 = vi2xF9BD;
2859 
2860       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2861       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2862       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2863 
2864       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2865       vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2866 
2867       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2868 
2869       vo0 = _mm_min_ps(vo0, vmax);
2870 
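      // Store ceil(w / 2) outputs: 7 remaining columns fill a whole vector;
      // otherwise bump w by one column so the bit tests below select the
      // correct 1..3 stores.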
2871       if (w == 7 * sizeof(float)) {
2872         _mm_storeu_ps(o0, vo0);
2873         o0 += 4;
2874       } else {
2875         w += 1 * sizeof(float);
2876         if (w & (4 * sizeof(float))) {
2877           _mm_storel_pi((__m64*) o0, vo0);
2878           o0 += 2;
2879 
2880           vo0 = _mm_movehl_ps(vo0, vo0);
2881         }
2882         if (w & (2 * sizeof(float))) {
2883           _mm_store_ss(o0, vo0);
2884           o0 += 1;
2885         }
2886       }
2887     }
2888 
2889     i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2890     i1 = (const float*) ((uintptr_t) i0 + input_width);
2891     i2 = (const float*) ((uintptr_t) i1 + input_width);
2892 
2893 
2894     output_height -= 1;
2895     padded_input_height -= 2;
2896   } while (output_height != 0);
2897 }
2898 
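// Depthwise 5x5 convolution, stride 1, 2-pixel padding ("5x5p2"), on CHW data.
// Each iteration computes 4 output rows of 4 pixels ("4x4"), reading 8 input
// rows (i0..i7) and keeping a single accumulator per output row.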
2899 void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
2900     size_t input_height,
2901     size_t input_width,
2902     const float* input,
2903     const float* weights,
2904     const float* zero,
2905     float* output,
2906     uint32_t padding_top,
2907     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2908 {
2909   assert(input_height != 0);
2910   assert(input_width != 0);
2911   assert(input_width % sizeof(float) == 0);
2912   assert(padding_top == 2);
2913 
2914   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2915   const __m128 vmax = _mm_load_ps(params->sse.max);
2916   const __m128 vmin = _mm_load_ps(params->sse.min);
2917 
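  // Weights layout: the bias comes first, followed by the 25 kernel taps in
  // row-major order (k00..k44); each value is broadcast to all 4 lanes.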
2918   const __m128 vbias = _mm_load1_ps(weights);
2919   const __m128 vk00 = _mm_load1_ps(weights + 1);
2920   const __m128 vk01 = _mm_load1_ps(weights + 2);
2921   const __m128 vk02 = _mm_load1_ps(weights + 3);
2922   const __m128 vk03 = _mm_load1_ps(weights + 4);
2923   const __m128 vk04 = _mm_load1_ps(weights + 5);
2924   const __m128 vk10 = _mm_load1_ps(weights + 6);
2925   const __m128 vk11 = _mm_load1_ps(weights + 7);
2926   const __m128 vk12 = _mm_load1_ps(weights + 8);
2927   const __m128 vk13 = _mm_load1_ps(weights + 9);
2928   const __m128 vk14 = _mm_load1_ps(weights + 10);
2929   const __m128 vk20 = _mm_load1_ps(weights + 11);
2930   const __m128 vk21 = _mm_load1_ps(weights + 12);
2931   const __m128 vk22 = _mm_load1_ps(weights + 13);
2932   const __m128 vk23 = _mm_load1_ps(weights + 14);
2933   const __m128 vk24 = _mm_load1_ps(weights + 15);
2934   const __m128 vk30 = _mm_load1_ps(weights + 16);
2935   const __m128 vk31 = _mm_load1_ps(weights + 17);
2936   const __m128 vk32 = _mm_load1_ps(weights + 18);
2937   const __m128 vk33 = _mm_load1_ps(weights + 19);
2938   const __m128 vk34 = _mm_load1_ps(weights + 20);
2939   const __m128 vk40 = _mm_load1_ps(weights + 21);
2940   const __m128 vk41 = _mm_load1_ps(weights + 22);
2941   const __m128 vk42 = _mm_load1_ps(weights + 23);
2942   const __m128 vk43 = _mm_load1_ps(weights + 24);
2943   const __m128 vk44 = _mm_load1_ps(weights + 25);
2944 
2945   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2946 
2947   const float* i0 = zero;
2948   const float* i1 = zero;
2949   const float* i2 = input;
2950   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2951   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
2952   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
2953   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
2954   const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
2955 
2956   float* o0 = output;
2957   float* o1 = (float*) ((uintptr_t) o0 + input_width);
2958   float* o2 = (float*) ((uintptr_t) o1 + input_width);
2959   float* o3 = (float*) ((uintptr_t) o2 + input_width);
2960 
2961   size_t output_height = input_height;
2962   do {
2963     if XNN_UNPREDICTABLE(output_height < 2) {
2964       i3 = zero;
2965       o1 = o0;
2966     }
2967     if XNN_UNPREDICTABLE(output_height < 3) {
2968       i4 = zero;
2969       o2 = o1;
2970     }
2971     if XNN_UNPREDICTABLE(output_height < 4) {
2972       i5 = zero;
2973       o3 = o2;
2974     }
2975     if XNN_UNPREDICTABLE(output_height < 5) {
2976       i6 = zero;
2977     }
2978     if XNN_UNPREDICTABLE(output_height < 6) {
2979       i7 = zero;
2980     }
2981 
2982     __m128 vi0x3012 = _mm_setzero_ps();
2983     __m128 vi1x3012 = _mm_setzero_ps();
2984     __m128 vi2x3012 = _mm_setzero_ps();
2985     __m128 vi3x3012 = _mm_setzero_ps();
2986     __m128 vi4x3012 = _mm_setzero_ps();
2987     __m128 vi5x3012 = _mm_setzero_ps();
2988     __m128 vi6x3012 = _mm_setzero_ps();
2989     __m128 vi7x3012 = _mm_setzero_ps();
2990 
2991     __m128 vi0x4567 = _mm_loadu_ps(i0);
2992     i0 += 4;
2993     __m128 vi1x4567 = _mm_loadu_ps(i1);
2994     i1 += 4;
2995     __m128 vi2x4567 = _mm_loadu_ps(i2);
2996     i2 += 4;
2997     __m128 vi3x4567 = _mm_loadu_ps(i3);
2998     i3 += 4;
2999     __m128 vi4x4567 = _mm_loadu_ps(i4);
3000     i4 += 4;
3001     __m128 vi5x4567 = _mm_loadu_ps(i5);
3002     i5 += 4;
3003     __m128 vi6x4567 = _mm_loadu_ps(i6);
3004     i6 += 4;
3005     __m128 vi7x4567 = _mm_loadu_ps(i7);
3006     i7 += 4;
3007 
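    // Main loop: runs while more than 8 columns remain, since each iteration
    // also loads a 4-column lookahead block; the final 5..8 and 1..4 column
    // cases are handled by the two tail blocks below.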
3008     size_t w = input_width;
3009     for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
3010       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3011       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3012       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3013       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3014       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3015       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3016       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3017       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3018       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3019       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3020       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3021       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3022       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3023       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3024       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3025       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3026       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3027       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3028       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3029       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3030 
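      // Build the shifted column windows (x2345, x3456, x5678, x6789) for the
      // 5 horizontal taps from the current block (x4567), the carried block
      // (x3012), and the lookahead block (x89AB) using shuffles and move_ss.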
3031       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3032       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3033       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3034       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3035       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3036       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3037       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3038       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3039 
3040       const __m128 vi0x89AB = _mm_loadu_ps(i0);
3041       i0 += 4;
3042       const __m128 vi1x89AB = _mm_loadu_ps(i1);
3043       i1 += 4;
3044       const __m128 vi2x89AB = _mm_loadu_ps(i2);
3045       i2 += 4;
3046       const __m128 vi3x89AB = _mm_loadu_ps(i3);
3047       i3 += 4;
3048       const __m128 vi4x89AB = _mm_loadu_ps(i4);
3049       i4 += 4;
3050       const __m128 vi5x89AB = _mm_loadu_ps(i5);
3051       i5 += 4;
3052       const __m128 vi6x89AB = _mm_loadu_ps(i6);
3053       i6 += 4;
3054       const __m128 vi7x89AB = _mm_loadu_ps(i7);
3055       i7 += 4;
3056 
3057       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3058       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3059       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3060       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3061       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3062       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3063       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3064       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3065 
3066       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3067       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3068       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3069       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3070       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3071       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3072       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3073       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3074       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3075       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3076       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3077       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3078       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3079       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3080       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3081       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3082       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3083       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3084       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3085       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3086 
3087       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3088       vi0x3012 = vi0x7456;
3089       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3090       vi1x3012 = vi1x7456;
3091       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3092       vi2x3012 = vi2x7456;
3093       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3094       vi3x3012 = vi3x7456;
3095       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3096       vi4x3012 = vi4x7456;
3097       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3098       vi5x3012 = vi5x7456;
3099       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3100       vi6x3012 = vi6x7456;
3101       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3102       vi7x3012 = vi7x7456;
3103 
3104       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3105       vi0x4567 = vi0x89AB;
3106       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3107       vi1x4567 = vi1x89AB;
3108       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3109       vi2x4567 = vi2x89AB;
3110       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3111       vi3x4567 = vi3x89AB;
3112       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3113       vi4x4567 = vi4x89AB;
3114       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3115       vi5x4567 = vi5x89AB;
3116       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3117       vi6x4567 = vi6x89AB;
3118       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3119       vi7x4567 = vi7x89AB;
3120 
3121       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3122       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3123       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3124       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3125       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3126       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3127       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3128       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3129       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3130       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3131       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3132       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3133       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3134       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3135       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3136       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3137       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3138       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3139       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3140       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3141 
3142       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3143       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3144       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3145       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3146       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3147       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3148       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3149       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3150 
3151       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3152       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3153       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3154       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3155       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3156       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3157       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3158       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3159       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3160       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3161       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3162       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3163       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3164       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3165       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3166       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3167       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3168       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3169       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3170       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3171 
3172       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3173       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3174       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3175       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3176       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3177       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3178       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3179       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3180 
3181       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3182       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3183       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3184       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3185       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3186       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3187       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3188       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3189       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3190       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3191       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3192       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3193       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3194       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3195       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3196       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3197       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3198       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3199       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3200       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3201 
3202 
3203       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3204       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3205       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3206       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3207 
3208       vo0 = _mm_min_ps(vo0, vmax);
3209       vo1 = _mm_min_ps(vo1, vmax);
3210       vo2 = _mm_min_ps(vo2, vmax);
3211       vo3 = _mm_min_ps(vo3, vmax);
3212 
3213       _mm_storeu_ps(o3, vo3);
3214       o3 += 4;
3215       _mm_storeu_ps(o2, vo2);
3216       o2 += 4;
3217       _mm_storeu_ps(o1, vo1);
3218       o1 += 4;
3219       _mm_storeu_ps(o0, vo0);
3220       o0 += 4;
3221     }
3222     // Always process the last block of 5..8 pixels.
3223     if XNN_LIKELY(w > 4 * sizeof(float)) {
3224       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3225       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3226       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3227       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3228       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3229       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3230       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3231       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3232       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3233       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3234       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3235       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3236       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3237       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3238       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3239       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3240       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3241       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3242       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3243       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3244 
3245       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3246       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3247       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3248       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3249       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3250       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3251       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3252       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3253 
3254       const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
3255       i0 += 4;
3256       const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
3257       i1 += 4;
3258       const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
3259       i2 += 4;
3260       const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
3261       i3 += 4;
3262       const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
3263       i4 += 4;
3264       const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
3265       i5 += 4;
3266       const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
3267       i6 += 4;
3268       const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
3269       i7 += 4;
3270 
3271       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3272       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3273       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3274       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3275       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3276       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3277       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3278       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3279 
3280       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3281       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3282       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3283       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3284       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3285       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3286       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3287       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3288       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3289       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3290       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3291       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3292       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3293       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3294       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3295       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3296       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3297       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3298       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3299       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3300 
3301       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3302       vi0x3012 = vi0x7456;
3303       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3304       vi1x3012 = vi1x7456;
3305       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3306       vi2x3012 = vi2x7456;
3307       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3308       vi3x3012 = vi3x7456;
3309       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3310       vi4x3012 = vi4x7456;
3311       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3312       vi5x3012 = vi5x7456;
3313       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3314       vi6x3012 = vi6x7456;
3315       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3316       vi7x3012 = vi7x7456;
3317 
3318       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3319       vi0x4567 = vi0x89AB;
3320       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3321       vi1x4567 = vi1x89AB;
3322       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3323       vi2x4567 = vi2x89AB;
3324       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3325       vi3x4567 = vi3x89AB;
3326       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3327       vi4x4567 = vi4x89AB;
3328       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3329       vi5x4567 = vi5x89AB;
3330       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3331       vi6x4567 = vi6x89AB;
3332       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3333       vi7x4567 = vi7x89AB;
3334 
3335       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3336       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3337       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3338       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3339       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3340       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3341       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3342       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3343       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3344       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3345       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3346       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3347       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3348       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3349       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3350       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3351       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3352       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3353       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3354       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3355 
3356       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3357       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3358       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3359       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3360       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3361       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3362       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3363       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3364 
3365       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3366       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3367       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3368       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3369       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3370       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3371       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3372       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3373       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3374       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3375       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3376       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3377       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3378       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3379       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3380       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3381       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3382       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3383       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3384       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3385 
3386       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3387       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3388       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3389       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3390       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3391       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3392       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3393       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3394 
3395       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3396       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3397       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3398       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3399       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3400       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3401       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3402       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3403       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3404       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3405       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3406       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3407       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3408       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3409       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3410       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3411       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3412       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3413       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3414       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3415 
3416 
3417       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3418       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3419       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3420       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3421 
3422       vo0 = _mm_min_ps(vo0, vmax);
3423       vo1 = _mm_min_ps(vo1, vmax);
3424       vo2 = _mm_min_ps(vo2, vmax);
3425       vo3 = _mm_min_ps(vo3, vmax);
3426 
3427       _mm_storeu_ps(o3, vo3);
3428       o3 += 4;
3429       _mm_storeu_ps(o2, vo2);
3430       o2 += 4;
3431       _mm_storeu_ps(o1, vo1);
3432       o1 += 4;
3433       _mm_storeu_ps(o0, vo0);
3434       o0 += 4;
3435 
3436       w -= 4 * sizeof(float);
3437     }
3438     assert(w >= 1 * sizeof(float));
3439     assert(w <= 4 * sizeof(float));
3440     {
3441       vi0x4567 = _mm_and_ps(vi0x4567, vmask);
3442       vi1x4567 = _mm_and_ps(vi1x4567, vmask);
3443       vi2x4567 = _mm_and_ps(vi2x4567, vmask);
3444       vi3x4567 = _mm_and_ps(vi3x4567, vmask);
3445       vi4x4567 = _mm_and_ps(vi4x4567, vmask);
3446       vi5x4567 = _mm_and_ps(vi5x4567, vmask);
3447       vi6x4567 = _mm_and_ps(vi6x4567, vmask);
3448       vi7x4567 = _mm_and_ps(vi7x4567, vmask);
3449 
3450       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3451       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3452       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3453       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3454       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3455       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3456       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3457       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3458       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3459       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3460       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3461       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3462       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3463       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3464       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3465       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3466       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3467       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3468       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3469       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3470 
3471       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3472       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3473       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3474       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3475       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3476       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3477       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3478       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3479 
3480       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3481       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3482       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3483       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3484       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3485       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3486       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3487       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3488 
3489       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3490       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3491       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3492       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3493       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3494       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3495       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3496       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3497       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3498       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3499       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3500       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3501       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3502       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3503       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3504       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3505       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3506       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3507       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3508       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3509 
3510       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3511       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3512       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3513       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3514       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3515       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3516       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3517       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3518 
3519       const __m128 vzero = _mm_setzero_ps();
3520       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
3521       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
3522       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
3523       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
3524       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
3525       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
3526       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
3527       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);
3528 
3529       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3530       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3531       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3532       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3533       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3534       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3535       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3536       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3537       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3538       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3539       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3540       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3541       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3542       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3543       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3544       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3545       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3546       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3547       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3548       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3549 
3550       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3551       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3552       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3553       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3554       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3555       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3556       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3557       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3558 
3559       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3560       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3561       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3562       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3563       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3564       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3565       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3566       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3567       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3568       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3569       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3570       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3571       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3572       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3573       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3574       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3575       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3576       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3577       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3578       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3579 
3580       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3581       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3582       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3583       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3584       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3585       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3586       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3587       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3588 
3589       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3590       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3591       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3592       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3593       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3594       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3595       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3596       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3597       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3598       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3599       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3600       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3601       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3602       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3603       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3604       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3605       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3606       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3607       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3608       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3609 
3610 
3611       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3612       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3613       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3614       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3615 
3616       vo0 = _mm_min_ps(vo0, vmax);
3617       vo1 = _mm_min_ps(vo1, vmax);
3618       vo2 = _mm_min_ps(vo2, vmax);
3619       vo3 = _mm_min_ps(vo3, vmax);
3620 
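      // Partial stores for the remaining output pixels of each of the 4 rows: a 4-wide store when
      // possible, otherwise a 2-wide and/or single-element store, shifting the upper lanes down in between.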
3621       if XNN_LIKELY(w & (4 * sizeof(float))) {
3622         _mm_storeu_ps(o3, vo3);
3623         o3 += 4;
3624         _mm_storeu_ps(o2, vo2);
3625         o2 += 4;
3626         _mm_storeu_ps(o1, vo1);
3627         o1 += 4;
3628         _mm_storeu_ps(o0, vo0);
3629         o0 += 4;
3630       } else {
3631         if (w & (2 * sizeof(float))) {
3632           _mm_storel_pi((__m64*) o3, vo3);
3633           o3 += 2;
3634           _mm_storel_pi((__m64*) o2, vo2);
3635           o2 += 2;
3636           _mm_storel_pi((__m64*) o1, vo1);
3637           o1 += 2;
3638           _mm_storel_pi((__m64*) o0, vo0);
3639           o0 += 2;
3640 
3641           vo0 = _mm_movehl_ps(vo0, vo0);
3642           vo1 = _mm_movehl_ps(vo1, vo1);
3643           vo2 = _mm_movehl_ps(vo2, vo2);
3644           vo3 = _mm_movehl_ps(vo3, vo3);
3645         }
3646         if (w & (1 * sizeof(float))) {
3647           _mm_store_ss(o3, vo3);
3648           o3 += 1;
3649           _mm_store_ss(o2, vo2);
3650           o2 += 1;
3651           _mm_store_ss(o1, vo1);
3652           o1 += 1;
3653           _mm_store_ss(o0, vo0);
3654           o0 += 1;
3655         }
3656       }
3657     }
3658 
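    // Rebase the input row pointers 4 rows down for the next group of 4 output rows, and advance the
    // output pointers past the rows just written (output width equals input width for this stride-1 kernel).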
3659     i0 = (const float*) ((uintptr_t) i4 - input_decrement);
3660     i1 = (const float*) ((uintptr_t) i5 - input_decrement);
3661     i2 = (const float*) ((uintptr_t) i1 + input_width);
3662     i3 = (const float*) ((uintptr_t) i2 + input_width);
3663     i4 = (const float*) ((uintptr_t) i3 + input_width);
3664     i5 = (const float*) ((uintptr_t) i4 + input_width);
3665     i6 = (const float*) ((uintptr_t) i5 + input_width);
3666     i7 = (const float*) ((uintptr_t) i6 + input_width);
3667 
3668     o0 = o3;
3669     o1 = (float*) ((uintptr_t) o0 + input_width);
3670     o2 = (float*) ((uintptr_t) o1 + input_width);
3671     o3 = (float*) ((uintptr_t) o2 + input_width);
3672 
3673     output_height = doz(output_height, 4);
3674   } while (output_height != 0);
3675 }
3676 
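// 5x5 depthwise convolution in CHW layout, stride 2, padding 2 (top padding of 1 or 2 handled explicitly);
// each outer-loop iteration produces 2 output rows, 4 output pixels at a time.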
3677 void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4(
3678     size_t input_height,
3679     size_t input_width,
3680     const float* input,
3681     const float* weights,
3682     const float* zero,
3683     float* output,
3684     uint32_t padding_top,
3685     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3686 {
3687   assert(input_height != 0);
3688   assert(input_width != 0);
3689   assert(input_width % sizeof(float) == 0);
3690   assert(padding_top >= 1);
3691   assert(padding_top <= 2);
3692 
3693   const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
3694   const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
3695   const __m128 vmax = _mm_load_ps(params->sse.max);
3696   const __m128 vmin = _mm_load_ps(params->sse.min);
3697 
3698   const __m128 vbias = _mm_load1_ps(weights);
3699   const __m128 vk00 = _mm_load1_ps(weights + 1);
3700   const __m128 vk01 = _mm_load1_ps(weights + 2);
3701   const __m128 vk02 = _mm_load1_ps(weights + 3);
3702   const __m128 vk03 = _mm_load1_ps(weights + 4);
3703   const __m128 vk04 = _mm_load1_ps(weights + 5);
3704   const __m128 vk10 = _mm_load1_ps(weights + 6);
3705   const __m128 vk11 = _mm_load1_ps(weights + 7);
3706   const __m128 vk12 = _mm_load1_ps(weights + 8);
3707   const __m128 vk13 = _mm_load1_ps(weights + 9);
3708   const __m128 vk14 = _mm_load1_ps(weights + 10);
3709   const __m128 vk20 = _mm_load1_ps(weights + 11);
3710   const __m128 vk21 = _mm_load1_ps(weights + 12);
3711   const __m128 vk22 = _mm_load1_ps(weights + 13);
3712   const __m128 vk23 = _mm_load1_ps(weights + 14);
3713   const __m128 vk24 = _mm_load1_ps(weights + 15);
3714   const __m128 vk30 = _mm_load1_ps(weights + 16);
3715   const __m128 vk31 = _mm_load1_ps(weights + 17);
3716   const __m128 vk32 = _mm_load1_ps(weights + 18);
3717   const __m128 vk33 = _mm_load1_ps(weights + 19);
3718   const __m128 vk34 = _mm_load1_ps(weights + 20);
3719   const __m128 vk40 = _mm_load1_ps(weights + 21);
3720   const __m128 vk41 = _mm_load1_ps(weights + 22);
3721   const __m128 vk42 = _mm_load1_ps(weights + 23);
3722   const __m128 vk43 = _mm_load1_ps(weights + 24);
3723   const __m128 vk44 = _mm_load1_ps(weights + 25);
3724 
3725   const uint32_t padding_top_less_1 = padding_top - 1;
3726   const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));
3727 
3728   const float* i0 = zero;
3729   const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
3730   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3731   if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
3732     i1 = zero;
3733   }
3734   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3735   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3736   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3737   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
3738 
3739   const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
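  // With stride 2 and padding 2, each output row has ceil(input_width / 2) pixels; output_width holds
  // that count in bytes.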
3740 
3741   float* o0 = output;
3742   float* o1 = (float*) ((uintptr_t) o0 + output_width);
3743 
3744   size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
3745   size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
3746   do {
3747     if XNN_UNPREDICTABLE(padded_input_height < 6) {
3748       i3 = zero;
3749     }
3750     if XNN_UNPREDICTABLE(padded_input_height < 7) {
3751       i4 = zero;
3752       o1 = o0;
3753     }
3754     if XNN_UNPREDICTABLE(padded_input_height < 8) {
3755       i5 = zero;
3756     }
3757     if XNN_UNPREDICTABLE(padded_input_height < 9) {
3758       i6 = zero;
3759     }
3760 
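    // Carry registers: only lane 0 is used. It holds the rightmost even (x6024) or odd (x7135) column of
    // the previous 8-column block; zero-initialized to supply the implicit left padding.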
3761     __m128 vi0x6024 = _mm_setzero_ps();
3762     __m128 vi1x6024 = _mm_setzero_ps();
3763     __m128 vi2x6024 = _mm_setzero_ps();
3764     __m128 vi3x6024 = _mm_setzero_ps();
3765     __m128 vi4x6024 = _mm_setzero_ps();
3766     __m128 vi5x6024 = _mm_setzero_ps();
3767     __m128 vi6x6024 = _mm_setzero_ps();
3768 
3769     __m128 vi0x7135 = _mm_setzero_ps();
3770     __m128 vi1x7135 = _mm_setzero_ps();
3771     __m128 vi2x7135 = _mm_setzero_ps();
3772     __m128 vi3x7135 = _mm_setzero_ps();
3773     __m128 vi4x7135 = _mm_setzero_ps();
3774     __m128 vi5x7135 = _mm_setzero_ps();
3775     __m128 vi6x7135 = _mm_setzero_ps();
3776 
3777     const __m128 vi0x89AB = _mm_loadu_ps(i0);
3778     const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
3779     i0 += 8;
3780     const __m128 vi1x89AB = _mm_loadu_ps(i1);
3781     const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
3782     i1 += 8;
3783     const __m128 vi2x89AB = _mm_loadu_ps(i2);
3784     const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
3785     i2 += 8;
3786     const __m128 vi3x89AB = _mm_loadu_ps(i3);
3787     const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
3788     i3 += 8;
3789     const __m128 vi4x89AB = _mm_loadu_ps(i4);
3790     const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
3791     i4 += 8;
3792     const __m128 vi5x89AB = _mm_loadu_ps(i5);
3793     const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
3794     i5 += 8;
3795     const __m128 vi6x89AB = _mm_loadu_ps(i6);
3796     const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
3797     i6 += 8;
3798 
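    // De-interleave each row into even-indexed (8, A, C, E) and odd-indexed (9, B, D, F) columns; with
    // stride 2 the even columns are the convolution centers.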
3799     __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3800     __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3801     __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3802     __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3803     __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3804     __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3805     __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3806     __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3807     __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3808     __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3809     __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3810     __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3811     __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3812     __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3813 
3814     size_t w = input_width;
3815     for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
3816       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
3817       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
3818       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
3819       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
3820       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
3821       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
3822       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
3823       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
3824       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
3825       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
3826 
3827       const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3828       const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3829       const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3830       const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3831       const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3832       const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3833       const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3834 
3835       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
3836       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
3837       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
3838       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
3839       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
3840       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
3841       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
3842       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
3843       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
3844       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
3845 
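      // Rotate E8AC and splice in the carried column with _mm_move_ss to form 68AC (the even columns two
      // to the left of the centers, used by the kx = 0 taps); the carry register then takes this block's
      // rightmost even column. The odd columns get the same treatment (79BD) further down.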
3846       const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
3847       vi0x6024 = vi0xE8AC;
3848       const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
3849       vi1x6024 = vi1xE8AC;
3850       const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
3851       vi2x6024 = vi2xE8AC;
3852       const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
3853       vi3x6024 = vi3xE8AC;
3854       const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
3855       vi4x6024 = vi4xE8AC;
3856       const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
3857       vi5x6024 = vi5xE8AC;
3858       const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
3859       vi6x6024 = vi6xE8AC;
3860 
3861       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3862       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3863       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3864       const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3865       const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3866       const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3867       const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3868 
3869       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
3870       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
3871       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
3872       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
3873       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
3874       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
3875       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
3876       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
3877       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
3878       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
3879 
3880       const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
3881       const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
3882       i0 += 8;
3883       const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
3884       const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
3885       i1 += 8;
3886       const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
3887       const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
3888       i2 += 8;
3889       const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
3890       const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
3891       i3 += 8;
3892       const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
3893       const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
3894       i4 += 8;
3895       const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
3896       const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
3897       i5 += 8;
3898       const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
3899       const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
3900       i6 += 8;
3901 
3902       const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
3903       vi0x7135 = vi0xF9BD;
3904       const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
3905       vi1x7135 = vi1xF9BD;
3906       const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
3907       vi2x7135 = vi2xF9BD;
3908       const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
3909       vi3x7135 = vi3xF9BD;
3910       const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
3911       vi4x7135 = vi4xF9BD;
3912       const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
3913       vi5x7135 = vi5xF9BD;
3914       const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
3915       vi6x7135 = vi6xF9BD;
3916 
3917       const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3918       const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3919       vi0x9BDF = vi0xHJLN;
3920       const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3921       const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3922       vi1x9BDF = vi1xHJLN;
3923       const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3924       const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3925       vi2x9BDF = vi2xHJLN;
3926       const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3927       const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3928       vi3x9BDF = vi3xHJLN;
3929       const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3930       const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3931       vi4x9BDF = vi4xHJLN;
3932       const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3933       const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3934       vi5x9BDF = vi5xHJLN;
3935       const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3936       const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3937       vi6x9BDF = vi6xHJLN;
3938 
3939       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
3940       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
3941       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
3942       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
3943       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
3944       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
3945       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
3946       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
3947       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
3948       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
3949 
3950       const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
3951       vi0x8ACE = vi0xGIKM;
3952       const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
3953       vi1x8ACE = vi1xGIKM;
3954       const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
3955       vi2x8ACE = vi2xGIKM;
3956       const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
3957       vi3x8ACE = vi3xGIKM;
3958       const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
3959       vi4x8ACE = vi4xGIKM;
3960       const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
3961       vi5x8ACE = vi5xGIKM;
3962       const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
3963       vi6x8ACE = vi6xGIKM;
3964 
3965       const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3966       const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3967       const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3968       const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3969       const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3970       const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3971       const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3972 
3973       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
3974       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
3975       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
3976       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
3977       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
3978       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
3979       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
3980       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
3981       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
3982       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
3983 
3984 
3985       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3986       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3987 
3988       vo0 = _mm_min_ps(vo0, vmax);
3989       vo1 = _mm_min_ps(vo1, vmax);
3990 
3991       _mm_storeu_ps(o1, vo1);
3992       o1 += 4;
3993       _mm_storeu_ps(o0, vo0);
3994       o0 += 4;
3995     }
3996     // Last block has 1-8 pixels to process.
3997     assert(w <= 8 * sizeof(float));
3998     assert(w >= 1 * sizeof(float));
3999     {
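      // Mask off columns beyond the valid remaining width: the loads may have read past the end of each
      // row (XNN_OOB_READS), and the masked lanes must not contribute to the outputs.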
4000       vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
4001       vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
4002       vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
4003       vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
4004       vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
4005       vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
4006       vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
4007       vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
4008       vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
4009       vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
4010       vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
4011       vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
4012       vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
4013       vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
4014 
4015       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
4016       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
4017       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
4018       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
4019       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
4020       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
4021       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
4022       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
4023       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
4024       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
4025 
4026       const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4027       const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4028       const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4029       const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4030       const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4031       const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4032       const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4033 
4034       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
4035       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
4036       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
4037       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
4038       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
4039       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
4040       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
4041       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
4042       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
4043       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
4044 
4045       const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
4046       const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
4047       const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
4048       const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
4049       const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
4050       const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
4051       const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
4052 
4053       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4054       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4055       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4056       const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4057       const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4058       const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4059       const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4060 
4061       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
4062       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
4063       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
4064       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
4065       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
4066       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
4067       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
4068       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
4069       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
4070       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
4071 
4072       const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
4073       const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
4074       const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
4075       const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
4076       const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
4077       const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
4078       const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
4079 
4080       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
4081       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
4082       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
4083       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
4084       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
4085       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
4086       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
4087       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
4088       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
4089       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
4090 
4091       const __m128 vzero = _mm_setzero_ps();
4092       const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
4093       const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
4094       const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
4095       const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
4096       const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
4097       const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
4098       const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
4099 
4100       const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4101       const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4102       const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4103       const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4104       const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4105       const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4106       const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4107 
4108       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
4109       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
4110       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
4111       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
4112       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
4113       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
4114       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
4115       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
4116       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
4117       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
4118 
4119 
4120       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
4121       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
4122 
4123       vo0 = _mm_min_ps(vo0, vmax);
4124       vo1 = _mm_min_ps(vo1, vmax);
4125 
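      // w is the remaining input width in bytes (1-8 floats); with stride 2 that corresponds to
      // ceil(pixels / 2) output pixels, which is what w_tmp computes.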
4126       size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
4127       if XNN_LIKELY(w_tmp >= 4) {
4128         _mm_storeu_ps(o1, vo1);
4129         o1 += 4;
4130         _mm_storeu_ps(o0, vo0);
4131         o0 += 4;
4132       } else {
4133         if (w_tmp & 2) {
4134           _mm_storel_pi((__m64*) o1, vo1);
4135           o1 += 2;
4136           _mm_storel_pi((__m64*) o0, vo0);
4137           o0 += 2;
4138 
4139           vo0 = _mm_movehl_ps(vo0, vo0);
4140           vo1 = _mm_movehl_ps(vo1, vo1);
4141         }
4142         if (w_tmp & 1) {
4143           _mm_store_ss(o1, vo1);
4144           o1 += 1;
4145           _mm_store_ss(o0, vo0);
4146           o0 += 1;
4147         }
4148       }
4149     }
4150 
4151     i0 = (const float*) ((uintptr_t) i4 - input_decrement);
4152     i1 = (const float*) ((uintptr_t) i5 - input_decrement);
4153     i2 = (const float*) ((uintptr_t) i6 - input_decrement);
4154     i3 = (const float*) ((uintptr_t) i2 + input_width);
4155     i4 = (const float*) ((uintptr_t) i3 + input_width);
4156     i5 = (const float*) ((uintptr_t) i4 + input_width);
4157     i6 = (const float*) ((uintptr_t) i5 + input_width);
4158 
4159     o0 = o1;
4160     o1 = (float*) ((uintptr_t) o0 + output_width);
4161 
4162     output_height = doz(output_height, 2);
4163     padded_input_height = doz(padded_input_height, 4);
4164   } while (output_height != 0);
4165 }
4166 
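// Channel-wise global average pooling: each channel's `elements` values are summed and scaled, 4 channels
// per iteration in the main loop.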
4167 void xnn_f32_gavgpool_cw_ukernel__sse_x4(
4168     size_t elements,
4169     size_t channels,
4170     const float* input,
4171     float* output,
4172     const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4173 {
4174   assert(elements != 0);
4175   assert(elements % sizeof(float) == 0);
4176   assert(channels != 0);
4177 
4178   const float* i0 = input;
4179   const float* i1 = (const float*) ((uintptr_t) i0 + elements);
4180   const float* i2 = (const float*) ((uintptr_t) i1 + elements);
4181   const float* i3 = (const float*) ((uintptr_t) i2 + elements);
4182 
4183   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
4184   const __m128 vmultiplier = _mm_load_ps(params->sse.multiplier);
4185   const __m128 voutput_min = _mm_load_ps(params->sse.output_min);
4186   const __m128 voutput_max = _mm_load_ps(params->sse.output_max);
4187 
4188   while (channels >= 4) {
4189     __m128 vsum0 = _mm_setzero_ps();
4190     __m128 vsum1 = _mm_setzero_ps();
4191     __m128 vsum2 = _mm_setzero_ps();
4192     __m128 vsum3 = _mm_setzero_ps();
4193     size_t n = elements;
4194     while (n >= 4 * sizeof(float)) {
4195       const __m128 vi0 = _mm_loadu_ps(i0);
4196       i0 += 4;
4197       const __m128 vi1 = _mm_loadu_ps(i1);
4198       i1 += 4;
4199       const __m128 vi2 = _mm_loadu_ps(i2);
4200       i2 += 4;
4201       const __m128 vi3 = _mm_loadu_ps(i3);
4202       i3 += 4;
4203 
4204       vsum0 = _mm_add_ps(vsum0, vi0);
4205       vsum1 = _mm_add_ps(vsum1, vi1);
4206       vsum2 = _mm_add_ps(vsum2, vi2);
4207       vsum3 = _mm_add_ps(vsum3, vi3);
4208       n -= 4 * sizeof(float);
4209     }
4210 
4211     if XNN_UNLIKELY(n != 0) {
4212       const __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4213       i0 = (const float*) ((uintptr_t) i0 + n);
4214       const __m128 vi1 = _mm_and_ps(_mm_loadu_ps(i1), vmask);
4215       i1 = (const float*) ((uintptr_t) i1 + n);
4216       const __m128 vi2 = _mm_and_ps(_mm_loadu_ps(i2), vmask);
4217       i2 = (const float*) ((uintptr_t) i2 + n);
4218       const __m128 vi3 = _mm_and_ps(_mm_loadu_ps(i3), vmask);
4219       i3 = (const float*) ((uintptr_t) i3 + n);
4220 
4221       vsum0 = _mm_add_ps(vsum0, vi0);
4222       vsum1 = _mm_add_ps(vsum1, vi1);
4223       vsum2 = _mm_add_ps(vsum2, vi2);
4224       vsum3 = _mm_add_ps(vsum3, vi3);
4225     }
4226 
4227     // Having exactly 4 rows makes this work out nicely as we end up with
4228     // the 4 totals in 4 different lanes of the same vector.
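    // (The unpacklo/unpackhi and movelh/movehl pairs amount to a 4x4 transpose, so each add leaves
    // channel c's running total in lane c of vsum.)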
4229     const __m128 vsum01 = _mm_add_ps(_mm_unpacklo_ps(vsum0, vsum1), _mm_unpackhi_ps(vsum0, vsum1));
4230     const __m128 vsum23 = _mm_add_ps(_mm_unpacklo_ps(vsum2, vsum3), _mm_unpackhi_ps(vsum2, vsum3));
4231     const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));
4232     __m128 vout = _mm_mul_ps(vsum, vmultiplier);
4233 
4234     vout = _mm_max_ps(vout, voutput_min);
4235     vout = _mm_min_ps(vout, voutput_max);
4236 
4237     _mm_storeu_ps(output, vout);
4238     output += 4;
4239     i0 = i3;
4240     i1 = (const float*) ((uintptr_t) i0 + elements);
4241     i2 = (const float*) ((uintptr_t) i1 + elements);
4242     i3 = (const float*) ((uintptr_t) i2 + elements);
4243     channels -= 4;
4244   }
4245 
4246   while (channels != 0) {
4247     __m128 vsum = _mm_setzero_ps();
4248     size_t n = elements;
4249     while (n >= 4 * sizeof(float)) {
4250       const __m128 vi0 = _mm_loadu_ps(i0);
4251       i0 += 4;
4252       vsum = _mm_add_ps(vsum, vi0);
4253       n -= 4 * sizeof(float);
4254     }
4255 
4256     if XNN_UNLIKELY(n != 0) {
4257       __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4258       i0 = (const float*) ((uintptr_t) i0 + n);
4259       vsum = _mm_add_ps(vsum, vi0);
4260     }
4261 
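    // Horizontal sum for a single channel: fold the upper half onto the lower half, then add lane 1 into
    // lane 0, leaving the channel total in lane 0.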
4262     vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
4263     vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
4264 
4265     __m128 vout = _mm_mul_ss(vsum, vmultiplier);
4266 
4267     vout = _mm_max_ss(vout, voutput_min);
4268     vout = _mm_min_ss(vout, voutput_max);
4269 
4270     _mm_store_ss(output, vout);
4271     output += 1;
4272     channels -= 1;
4273   }
4274 }
4275 
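// Multi-pass global average pooling: the first pass sums 7 rows per channel into `buffer`, each middle
// pass adds 7 more rows to the buffer, and the final pass (1-7 remaining rows, absent rows redirected to
// `zero`) adds the buffer, applies the precomputed scale, clamps to [min, max], and stores.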
4276 void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(
4277     size_t rows,
4278     size_t channels,
4279     const float* input,
4280     size_t input_stride,
4281     const float* zero,
4282     float* buffer,
4283     float* output,
4284     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4285 {
4286   assert(rows > 7);
4287   assert(channels != 0);
4288 
4289   const float* i0 = input;
4290   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4291   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4292   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4293   const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4294   const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4295   const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4296   const size_t packed_channels = round_up_po2(channels, 4);
4297   const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);
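  // input_increment undoes the packed_channels * sizeof(float) bytes each row pointer advances during a
  // pass and then moves it forward 7 rows to the start of the next group of rows.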
4298 
4299   float* b = buffer;
4300   for (size_t c = 0; c < channels; c += 4) {
4301     const __m128 vi0 = _mm_loadu_ps(i0);
4302     i0 += 4;
4303     const __m128 vi1 = _mm_loadu_ps(i1);
4304     i1 += 4;
4305     const __m128 vi2 = _mm_loadu_ps(i2);
4306     i2 += 4;
4307     const __m128 vi3 = _mm_loadu_ps(i3);
4308     i3 += 4;
4309     const __m128 vi4 = _mm_loadu_ps(i4);
4310     i4 += 4;
4311     const __m128 vi5 = _mm_loadu_ps(i5);
4312     i5 += 4;
4313     const __m128 vi6 = _mm_loadu_ps(i6);
4314     i6 += 4;
4315 
4316     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4317     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4318     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4319 
4320     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4321     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4322 
4323     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4324 
4325     _mm_store_ps(b, vsum); b += 4;
4326   }
4327   for (rows -= 7; rows > 7; rows -= 7) {
4328     b = buffer;
4329 
4330     i0 = (const float*) ((uintptr_t) i0 + input_increment);
4331     i1 = (const float*) ((uintptr_t) i1 + input_increment);
4332     i2 = (const float*) ((uintptr_t) i2 + input_increment);
4333     i3 = (const float*) ((uintptr_t) i3 + input_increment);
4334     i4 = (const float*) ((uintptr_t) i4 + input_increment);
4335     i5 = (const float*) ((uintptr_t) i5 + input_increment);
4336     i6 = (const float*) ((uintptr_t) i6 + input_increment);
4337 
4338     for (size_t c = 0; c < channels; c += 4) {
4339       const __m128 vi0 = _mm_loadu_ps(i0);
4340       i0 += 4;
4341       const __m128 vi1 = _mm_loadu_ps(i1);
4342       i1 += 4;
4343       const __m128 vi2 = _mm_loadu_ps(i2);
4344       i2 += 4;
4345       const __m128 vi3 = _mm_loadu_ps(i3);
4346       i3 += 4;
4347       const __m128 vi4 = _mm_loadu_ps(i4);
4348       i4 += 4;
4349       const __m128 vi5 = _mm_loadu_ps(i5);
4350       i5 += 4;
4351       const __m128 vi6 = _mm_loadu_ps(i6);
4352       i6 += 4;
4353       const __m128 vacc = _mm_load_ps(b);
4354 
4355       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4356       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4357       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4358       const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4359 
4360       const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4361       const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4362 
4363       const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4364 
4365       _mm_store_ps(b, vsum); b += 4;
4366     }
4367   }
4368 
4369   i0 = (const float*) ((uintptr_t) i0 + input_increment);
4370   i1 = (const float*) ((uintptr_t) i1 + input_increment);
4371   if (rows < 2) {
4372     i1 = zero;
4373   }
4374   i2 = (const float*) ((uintptr_t) i2 + input_increment);
4375   if (rows <= 2) {
4376     i2 = zero;
4377   }
4378   i3 = (const float*) ((uintptr_t) i3 + input_increment);
4379   if (rows < 4) {
4380     i3 = zero;
4381   }
4382   i4 = (const float*) ((uintptr_t) i4 + input_increment);
4383   if (rows <= 4) {
4384     i4 = zero;
4385   }
4386   i5 = (const float*) ((uintptr_t) i5 + input_increment);
4387   if (rows < 6) {
4388     i5 = zero;
4389   }
4390   i6 = (const float*) ((uintptr_t) i6 + input_increment);
4391   if (rows <= 6) {
4392     i6 = zero;
4393   }
4394   const __m128 vscale = _mm_load_ps(params->sse.scale);
4395   const __m128 vmin = _mm_load_ps(params->sse.min);
4396   const __m128 vmax = _mm_load_ps(params->sse.max);
4397 
4398   b = buffer;
4399   while (channels >= 4) {
4400     const __m128 vi0 = _mm_loadu_ps(i0);
4401     i0 += 4;
4402     const __m128 vi1 = _mm_loadu_ps(i1);
4403     i1 += 4;
4404     const __m128 vi2 = _mm_loadu_ps(i2);
4405     i2 += 4;
4406     const __m128 vi3 = _mm_loadu_ps(i3);
4407     i3 += 4;
4408     const __m128 vi4 = _mm_loadu_ps(i4);
4409     i4 += 4;
4410     const __m128 vi5 = _mm_loadu_ps(i5);
4411     i5 += 4;
4412     const __m128 vi6 = _mm_loadu_ps(i6);
4413     i6 += 4;
4414     const __m128 vacc = _mm_load_ps(b);
4415     b += 4;
4416 
4417     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4418     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4419     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4420     const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4421 
4422     const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4423     const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4424 
4425     const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4426 
4427     __m128 vout = _mm_mul_ps(vsum, vscale);
4428     vout = _mm_max_ps(vout, vmin);
4429     vout = _mm_min_ps(vout, vmax);
4430 
4431     _mm_storeu_ps(output, vout);
4432     output += 4;
4433 
4434     channels -= 4;
4435   }
4436   if (channels != 0) {
4437     const __m128 vi0 = _mm_loadu_ps(i0);
4438     const __m128 vi1 = _mm_loadu_ps(i1);
4439     const __m128 vi2 = _mm_loadu_ps(i2);
4440     const __m128 vi3 = _mm_loadu_ps(i3);
4441     const __m128 vi4 = _mm_loadu_ps(i4);
4442     const __m128 vi5 = _mm_loadu_ps(i5);
4443     const __m128 vi6 = _mm_loadu_ps(i6);
4444     const __m128 vacc = _mm_loadu_ps(b);
4445 
4446     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4447     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4448     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4449     const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4450 
4451     const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4452     const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4453 
4454     const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4455 
4456     __m128 vout = _mm_mul_ps(vsum, vscale);
4457     vout = _mm_max_ps(vout, vmin);
4458     vout = _mm_min_ps(vout, vmax);
4459 
4460     if (channels & 2) {
4461       _mm_storel_pi((__m64*) output, vout);
4462       vout = _mm_movehl_ps(vout, vout);
4463       output += 2;
4464     }
4465     if (channels & 1) {
4466       _mm_store_ss(output, vout);
4467     }
4468   }
4469 }
4470 
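// Single-pass variant for up to 7 rows: pointers for rows beyond `rows` are redirected to the `zero`
// buffer so the same 7-way sum can be used unconditionally.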
4471 void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(
4472     size_t rows,
4473     size_t channels,
4474     const float* input,
4475     size_t input_stride,
4476     const float* zero,
4477     float* output,
4478     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4479 {
4480   assert(rows != 0);
4481   assert(rows <= 7);
4482   assert(channels != 0);
4483 
4484   const float* i0 = input;
4485   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4486   if (rows < 2) {
4487     i1 = zero;
4488   }
4489   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4490   if (rows <= 2) {
4491     i2 = zero;
4492   }
4493   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4494   if (rows < 4) {
4495     i3 = zero;
4496   }
4497   const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4498   if (rows <= 4) {
4499     i4 = zero;
4500   }
4501   const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4502   if (rows < 6) {
4503     i5 = zero;
4504   }
4505   const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4506   if (rows <= 6) {
4507     i6 = zero;
4508   }
4509   const __m128 vscale = _mm_load_ps(params->sse.scale);
4510   const __m128 vmin = _mm_load_ps(params->sse.min);
4511   const __m128 vmax = _mm_load_ps(params->sse.max);
4512 
4513   while (channels >= 4) {
4514     const __m128 vi0 = _mm_loadu_ps(i0);
4515     i0 += 4;
4516     const __m128 vi1 = _mm_loadu_ps(i1);
4517     i1 += 4;
4518     const __m128 vi2 = _mm_loadu_ps(i2);
4519     i2 += 4;
4520     const __m128 vi3 = _mm_loadu_ps(i3);
4521     i3 += 4;
4522     const __m128 vi4 = _mm_loadu_ps(i4);
4523     i4 += 4;
4524     const __m128 vi5 = _mm_loadu_ps(i5);
4525     i5 += 4;
4526     const __m128 vi6 = _mm_loadu_ps(i6);
4527     i6 += 4;
4528 
4529     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4530     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4531     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4532 
4533     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4534     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4535 
4536     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4537 
4538     __m128 vout = _mm_mul_ps(vsum, vscale);
4539     vout = _mm_max_ps(vout, vmin);
4540     vout = _mm_min_ps(vout, vmax);
4541 
4542     _mm_storeu_ps(output, vout);
4543     output += 4;
4544 
4545     channels -= 4;
4546   }
4547   if (channels != 0) {
4548     const __m128 vi0 = _mm_loadu_ps(i0);
4549     const __m128 vi1 = _mm_loadu_ps(i1);
4550     const __m128 vi2 = _mm_loadu_ps(i2);
4551     const __m128 vi3 = _mm_loadu_ps(i3);
4552     const __m128 vi4 = _mm_loadu_ps(i4);
4553     const __m128 vi5 = _mm_loadu_ps(i5);
4554     const __m128 vi6 = _mm_loadu_ps(i6);
4555 
4556     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4557     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4558     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4559 
4560     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4561     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4562 
4563     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4564 
4565     __m128 vout = _mm_mul_ps(vsum, vscale);
4566     vout = _mm_max_ps(vout, vmin);
4567     vout = _mm_min_ps(vout, vmax);
4568 
4569     if (channels & 2) {
4570       _mm_storel_pi((__m64*) output, vout);
4571       vout = _mm_movehl_ps(vout, vout);
4572       output += 2;
4573     }
4574     if (channels & 1) {
4575       _mm_store_ss(output, vout);
4576     }
4577   }
4578 }
4579 
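// GEMM microkernel computing a 1x8 tile of C. "load1" refers to broadcasting a single element of A per
// K iteration (_mm_load1_ps) against 8 packed B values, accumulated in two 4-lane registers.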
4580 void xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(
4581     size_t mr,
4582     size_t nc,
4583     size_t kc,
4584     const float*restrict a,
4585     size_t a_stride,
4586     const float*restrict w,
4587     float*restrict c,
4588     size_t cm_stride,
4589     size_t cn_stride,
4590     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4591 {
4592   assert(mr != 0);
4593   assert(mr <= 1);
4594   assert(nc != 0);
4595   assert(kc != 0);
4596   assert(kc % sizeof(float) == 0);
4597   assert(a != NULL);
4598   assert(w != NULL);
4599   assert(c != NULL);
4600 
4601   const float* a0 = a;
4602   float* c0 = c;
4603 
4604   do {
4605     __m128 vacc0x0123 = _mm_load_ps(w + 0);
4606     __m128 vacc0x4567 = _mm_load_ps(w + 4);
4607     w += 8;
4608 
4609     size_t k = kc;
4610     do {
4611       const __m128 va0 = _mm_load1_ps(a0);
4612       a0 += 1;
4613 
4614       const __m128 vb0123 = _mm_load_ps(w);
4615       const __m128 vb4567 = _mm_load_ps(w + 4);
4616       w += 8;
4617 
4618       vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4619       vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4620 
4621       k -= sizeof(float);
4622     } while (k != 0);
4623 
4624     const __m128 vmax = _mm_load_ps(params->sse.max);
4625     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4626     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4627 
4628     const __m128 vmin = _mm_load_ps(params->sse.min);
4629     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4630     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4631 
4632     if XNN_LIKELY(nc >= 8) {
4633       _mm_storeu_ps(c0, vacc0x0123);
4634       _mm_storeu_ps(c0 + 4, vacc0x4567);
4635       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4636 
4637       a0 = (const float*) ((uintptr_t) a0 - kc);
4638 
4639       nc -= 8;
4640     } else {
4641       if (nc & 4) {
4642         _mm_storeu_ps(c0, vacc0x0123);
4643 
4644         vacc0x0123 = vacc0x4567;
4645 
4646         c0 += 4;
4647       }
4648       if (nc & 2) {
4649         _mm_storel_pi((__m64*) c0, vacc0x0123);
4650 
4651         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4652 
4653         c0 += 2;
4654       }
4655       if (nc & 1) {
4656         _mm_store_ss(c0, vacc0x0123);
4657       }
4658 
4659       nc = 0;
4660     }
4661   } while (nc != 0);
4662 }
4663 
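// GEMM microkernel computing a 4x2 tile of C with K processed in blocks of 4 ("c4"): each of the 8
// accumulators keeps 4 partial products per (row, column) pair, which are reduced to scalars after the
// K loop.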
4664 void xnn_f32_gemm_minmax_ukernel_4x2c4__sse(
4665     size_t mr,
4666     size_t nc,
4667     size_t kc,
4668     const float* restrict a,
4669     size_t a_stride,
4670     const float* restrict w,
4671     float* restrict c,
4672     size_t cm_stride,
4673     size_t cn_stride,
4674     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4675 {
4676   assert(mr != 0);
4677   assert(mr <= 4);
4678   assert(nc != 0);
4679   assert(kc != 0);
4680   assert(kc % sizeof(float) == 0);
4681   assert(a != NULL);
4682   assert(w != NULL);
4683   assert(c != NULL);
4684 
4685   const float* a0 = a;
4686   float* c0 = c;
4687   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4688   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4689   if XNN_UNPREDICTABLE(mr < 2) {
4690     a1 = a0;
4691     c1 = c0;
4692   }
4693   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4694   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4695   if XNN_UNPREDICTABLE(mr <= 2) {
4696     a2 = a1;
4697     c2 = c1;
4698   }
4699   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4700   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4701   if XNN_UNPREDICTABLE(mr != 4) {
4702     a3 = a2;
4703     c3 = c2;
4704   }
4705 
4706   do {
4707     __m128 vacc0x0c4 = _mm_load_ss(w);
4708     __m128 vacc0x1c4 = _mm_load_ss(w + 1);
4709     __m128 vacc1x0c4 = vacc0x0c4;
4710     __m128 vacc1x1c4 = vacc0x1c4;
4711     __m128 vacc2x0c4 = vacc0x0c4;
4712     __m128 vacc2x1c4 = vacc0x1c4;
4713     __m128 vacc3x0c4 = vacc0x0c4;
4714     __m128 vacc3x1c4 = vacc0x1c4;
4715     w += 2;
4716 
4717     size_t k = kc;
4718     for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
4719       const __m128 va0 = _mm_loadu_ps(a0);
4720       a0 += 4;
4721       const __m128 va1 = _mm_loadu_ps(a1);
4722       a1 += 4;
4723       const __m128 va2 = _mm_loadu_ps(a2);
4724       a2 += 4;
4725       const __m128 va3 = _mm_loadu_ps(a3);
4726       a3 += 4;
4727 
4728       const __m128 vb0 = _mm_loadu_ps(w);
4729       const __m128 vb1 = _mm_loadu_ps(w + 4);
4730       w += 8;
4731 
4732       vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
4733       vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
4734       vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
4735       vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
4736       vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
4737       vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
4738       vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
4739       vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
4740     }
4741     if XNN_UNLIKELY(k != 0) {
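      // kc is not a multiple of 4: the loads below may read past the end of each A row (XNN_OOB_READS).
      // The packed B tail is presumably zero-padded, so lanes where B == 0 are masked out of A to keep
      // any out-of-bounds values (even Inf/NaN) from contributing to the accumulators.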
4742       const __m128 va0 = _mm_loadu_ps(a0);
4743       a0 = (const float*) ((uintptr_t) a0 + k);
4744       const __m128 va1 = _mm_loadu_ps(a1);
4745       a1 = (const float*) ((uintptr_t) a1 + k);
4746       const __m128 va2 = _mm_loadu_ps(a2);
4747       a2 = (const float*) ((uintptr_t) a2 + k);
4748       const __m128 va3 = _mm_loadu_ps(a3);
4749       a3 = (const float*) ((uintptr_t) a3 + k);
4750 
4751       const __m128 vb0 = _mm_loadu_ps(w);
4752       const __m128 vb1 = _mm_loadu_ps(w + 4);
4753       w += 8;
4754 
4755       const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
4756       const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
4757 
4758       vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
4759       vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
4760       vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
4761       vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
4762       vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
4763       vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
4764       vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
4765       vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
4766     }
4767 
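    // Reduce the c4 accumulators: unpacklo/unpackhi followed by movelh/movehl transpose and sum the 4
    // partial products, leaving vacc01x01 = {c00, c01, c10, c11} and vacc23x01 = {c20, c21, c30, c31}.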
4768     const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
4769     const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
4770     const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
4771     const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
4772 
4773     __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
4774     __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
4775 
4776     const __m128 vmax = _mm_load_ps(params->sse.max);
4777     vacc01x01 = _mm_min_ps(vacc01x01, vmax);
4778     vacc23x01 = _mm_min_ps(vacc23x01, vmax);
4779 
4780     const __m128 vmin = _mm_load_ps(params->sse.min);
4781     vacc01x01 = _mm_max_ps(vacc01x01, vmin);
4782     vacc23x01 = _mm_max_ps(vacc23x01, vmin);
4783 
4784     if XNN_LIKELY(nc >= 2) {
4785       _mm_storel_pi((__m64*) c2, vacc23x01);
4786       c2 = (float*) ((uintptr_t) c2 + cn_stride);
4787       a2 = (const float*) ((uintptr_t) a2 - kc);
4788       _mm_storeh_pi((__m64*) c3, vacc23x01);
4789       c3 = (float*) ((uintptr_t) c3 + cn_stride);
4790       a3 = (const float*) ((uintptr_t) a3 - kc);
4791       _mm_storel_pi((__m64*) c0, vacc01x01);
4792       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4793       a0 = (const float*) ((uintptr_t) a0 - kc);
4794       _mm_storeh_pi((__m64*) c1, vacc01x01);
4795       c1 = (float*) ((uintptr_t) c1 + cn_stride);
4796       a1 = (const float*) ((uintptr_t) a1 - kc);
4797 
4798       nc -= 2;
4799     } else {
4800       assert(nc == 1);
4801       _mm_store_ss(c2, vacc23x01);
4802       _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
4803       _mm_store_ss(c0, vacc01x01);
4804       _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
4805 
4806       nc = 0;
4807     }
4808   } while (nc != 0);
4809 }
4810 
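// GEMM microkernel computing a 4x8 tile of C; like the 1x8 "load1" variant above, but broadcasting one
// A element from each of 4 rows per K iteration.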
4811 void xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(
4812     size_t mr,
4813     size_t nc,
4814     size_t kc,
4815     const float*restrict a,
4816     size_t a_stride,
4817     const float*restrict w,
4818     float*restrict c,
4819     size_t cm_stride,
4820     size_t cn_stride,
4821     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4822 {
4823   assert(mr != 0);
4824   assert(mr <= 4);
4825   assert(nc != 0);
4826   assert(kc != 0);
4827   assert(kc % sizeof(float) == 0);
4828   assert(a != NULL);
4829   assert(w != NULL);
4830   assert(c != NULL);
4831 
4832   const float* a0 = a;
4833   float* c0 = c;
4834   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4835   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4836   if XNN_UNPREDICTABLE(mr < 2) {
4837     a1 = a0;
4838     c1 = c0;
4839   }
4840   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4841   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4842   if XNN_UNPREDICTABLE(mr <= 2) {
4843     a2 = a1;
4844     c2 = c1;
4845   }
4846   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4847   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4848   if XNN_UNPREDICTABLE(mr != 4) {
4849     a3 = a2;
4850     c3 = c2;
4851   }
4852 
4853   do {
4854     __m128 vacc0x0123 = _mm_load_ps(w + 0);
4855     __m128 vacc0x4567 = _mm_load_ps(w + 4);
4856     __m128 vacc1x0123 = vacc0x0123;
4857     __m128 vacc1x4567 = vacc0x4567;
4858     __m128 vacc2x0123 = vacc0x0123;
4859     __m128 vacc2x4567 = vacc0x4567;
4860     __m128 vacc3x0123 = vacc0x0123;
4861     __m128 vacc3x4567 = vacc0x4567;
4862     w += 8;
4863 
4864     size_t k = kc;
4865     do {
4866       const __m128 va0 = _mm_load1_ps(a0);
4867       a0 += 1;
4868       const __m128 va1 = _mm_load1_ps(a1);
4869       a1 += 1;
4870       const __m128 va2 = _mm_load1_ps(a2);
4871       a2 += 1;
4872       const __m128 va3 = _mm_load1_ps(a3);
4873       a3 += 1;
4874 
4875       const __m128 vb0123 = _mm_load_ps(w);
4876       const __m128 vb4567 = _mm_load_ps(w + 4);
4877       w += 8;
4878 
4879       vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4880       vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
4881       vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
4882       vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
4883       vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4884       vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
4885       vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
4886       vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
4887 
4888       k -= sizeof(float);
4889     } while (k != 0);
4890 
4891     const __m128 vmax = _mm_load_ps(params->sse.max);
4892     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4893     vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
4894     vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
4895     vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
4896     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4897     vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
4898     vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
4899     vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
4900 
4901     const __m128 vmin = _mm_load_ps(params->sse.min);
4902     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4903     vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
4904     vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
4905     vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
4906     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4907     vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
4908     vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
4909     vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
4910 
4911     if XNN_LIKELY(nc >= 8) {
4912       _mm_storeu_ps(c3, vacc3x0123);
4913       _mm_storeu_ps(c3 + 4, vacc3x4567);
4914       c3 = (float*) ((uintptr_t) c3 + cn_stride);
4915       _mm_storeu_ps(c2, vacc2x0123);
4916       _mm_storeu_ps(c2 + 4, vacc2x4567);
4917       c2 = (float*) ((uintptr_t) c2 + cn_stride);
4918       _mm_storeu_ps(c1, vacc1x0123);
4919       _mm_storeu_ps(c1 + 4, vacc1x4567);
4920       c1 = (float*) ((uintptr_t) c1 + cn_stride);
4921       _mm_storeu_ps(c0, vacc0x0123);
4922       _mm_storeu_ps(c0 + 4, vacc0x4567);
4923       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4924 
4925       a3 = (const float*) ((uintptr_t) a3 - kc);
4926       a2 = (const float*) ((uintptr_t) a2 - kc);
4927       a1 = (const float*) ((uintptr_t) a1 - kc);
4928       a0 = (const float*) ((uintptr_t) a0 - kc);
4929 
4930       nc -= 8;
4931     } else {
4932       if (nc & 4) {
4933         _mm_storeu_ps(c3, vacc3x0123);
4934         _mm_storeu_ps(c2, vacc2x0123);
4935         _mm_storeu_ps(c1, vacc1x0123);
4936         _mm_storeu_ps(c0, vacc0x0123);
4937 
4938         vacc3x0123 = vacc3x4567;
4939         vacc2x0123 = vacc2x4567;
4940         vacc1x0123 = vacc1x4567;
4941         vacc0x0123 = vacc0x4567;
4942 
4943         c3 += 4;
4944         c2 += 4;
4945         c1 += 4;
4946         c0 += 4;
4947       }
4948       if (nc & 2) {
4949         _mm_storel_pi((__m64*) c3, vacc3x0123);
4950         _mm_storel_pi((__m64*) c2, vacc2x0123);
4951         _mm_storel_pi((__m64*) c1, vacc1x0123);
4952         _mm_storel_pi((__m64*) c0, vacc0x0123);
4953 
4954         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
4955         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
4956         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
4957         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4958 
4959         c3 += 2;
4960         c2 += 2;
4961         c1 += 2;
4962         c0 += 2;
4963       }
4964       if (nc & 1) {
4965         _mm_store_ss(c3, vacc3x0123);
4966         _mm_store_ss(c2, vacc2x0123);
4967         _mm_store_ss(c1, vacc1x0123);
4968         _mm_store_ss(c0, vacc0x0123);
4969       }
4970 
4971       nc = 0;
4972     }
4973   } while (nc != 0);
4974 }
4975 
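// Bilinear interpolation (CHW layout): for each channel, interpolates up to 8 output
// pixels per iteration from top-row/bottom-row pointer pairs and packed
// (alpha_h, alpha_v) weights, with 4-, 2-, and 1-pixel tails.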
4976 void xnn_f32_ibilinear_chw_ukernel__sse_p8(
4977     size_t output_pixels,
4978     size_t channels,
4979     const float**restrict input,
4980     size_t input_offset,
4981     const float*restrict weights,
4982     float*restrict output,
4983     size_t input_increment) XNN_OOB_READS
4984 {
4985   assert(output_pixels != 0);
4986   assert(channels != 0);
4987   assert(input_increment % sizeof(float) == 0);
4988 
4989   do {
4990     const float** i = input;
4991     const float* w = weights;
4992     size_t p = output_pixels;
4993     for (; p >= 8; p -= 8) {
4994       const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
4995       const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
4996       const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
4997       const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
4998       const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
4999       const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5000       const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5001       const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5002       const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
5003       const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
5004       const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
5005       const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
5006       const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
5007       const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
5008       const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
5009       const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
5010       i += 2 * 8;
5011 
5012       const __m128 vw0123p0 = _mm_loadu_ps(w + 0);
5013       const __m128 vw0123p1 = _mm_loadu_ps(w + 4);
5014       const __m128 vw4567p0 = _mm_loadu_ps(w + 8);
5015       const __m128 vw4567p1 = _mm_loadu_ps(w + 12);
5016       w += 2 * 8;
5017 
5018       const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5019       const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5020       const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5021       const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5022       const __m128 vtltr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl4);
5023       const __m128 vblbr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl4);
5024       const __m128 vtltr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl6);
5025       const __m128 vblbr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl6);
5026 
5027       const __m128 valphah0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(2, 0, 2, 0));
5028       const __m128 valphav0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(3, 1, 3, 1));
5029       const __m128 valphah4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(2, 0, 2, 0));
5030       const __m128 valphav4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(3, 1, 3, 1));
5031 
5032       const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5033       const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5034       const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5035       const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5036       const __m128 vtltr45 = _mm_loadh_pi(vtltr4, (const __m64*) itl5);
5037       const __m128 vblbr45 = _mm_loadh_pi(vblbr4, (const __m64*) ibl5);
5038       const __m128 vtltr67 = _mm_loadh_pi(vtltr6, (const __m64*) itl7);
5039       const __m128 vblbr67 = _mm_loadh_pi(vblbr6, (const __m64*) ibl7);
5040 
5041       const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5042       const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5043       const __m128 vldrd45 = _mm_sub_ps(vblbr45, vtltr45);
5044       const __m128 vldrd67 = _mm_sub_ps(vblbr67, vtltr67);
5045 
5046       const __m128 vld0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5047       const __m128 vrd0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5048       const __m128 vld4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(2, 0, 2, 0));
5049       const __m128 vrd4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(3, 1, 3, 1));
5050 
5051       const __m128 vtl0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5052       const __m128 vtr0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5053       const __m128 vtl4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(2, 0, 2, 0));
5054       const __m128 vtr4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(3, 1, 3, 1));
5055 
5056       const __m128 vl0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vld0123, valphav0123));
5057       const __m128 vr0123 = _mm_add_ps(vtr0123, _mm_mul_ps(vrd0123, valphav0123));
5058       const __m128 vl4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vld4567, valphav4567));
5059       const __m128 vr4567 = _mm_add_ps(vtr4567, _mm_mul_ps(vrd4567, valphav4567));
5060 
5061       const __m128 vd0123 = _mm_sub_ps(vr0123, vl0123);
5062       const __m128 vd4567 = _mm_sub_ps(vr4567, vl4567);
5063 
5064       const __m128 vo0123 = _mm_add_ps(vl0123, _mm_mul_ps(vd0123, valphah0123));
5065       const __m128 vo4567 = _mm_add_ps(vl4567, _mm_mul_ps(vd4567, valphah4567));
5066 
5067       _mm_storeu_ps(output + 0, vo0123);
5068       _mm_storeu_ps(output + 4, vo4567);
5069       output += 8;
5070     }
5071 
5072     for (; p >= 4; p -= 4) {
5073       const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5074       const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5075       const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5076       const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5077       const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
5078       const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5079       const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5080       const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5081       i += 8;
5082 
5083       const __m128 vw0 = _mm_loadu_ps(w);
5084       const __m128 vw1 = _mm_loadu_ps(w + 4);
5085       w += 8;
5086 
5087       const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5088       const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5089       const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5090       const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5091 
5092       const __m128 valphah = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(2, 0, 2, 0));
5093       const __m128 valphav = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(3, 1, 3, 1));
5094 
5095       const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5096       const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5097       const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5098       const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5099 
5100       const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5101       const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5102 
5103       const __m128 vld = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5104       const __m128 vrd = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5105 
5106       const __m128 vtl = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5107       const __m128 vtr = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5108 
5109       const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5110       const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5111 
5112       const __m128 vd = _mm_sub_ps(vr, vl);
5113       const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5114 
5115       _mm_storeu_ps(output, vo);
5116       output += 4;
5117     }
5118 
5119     if XNN_UNLIKELY(p != 0) {
5120       if (p & 2) {
5121         const __m128 vw = _mm_loadu_ps(w);
5122         w += 4;
5123 
5124         const __m128 valphah = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(2, 0, 2, 0));
5125         const __m128 valphav = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(3, 1, 3, 1));
5126 
5127         const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5128         const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5129         const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5130         const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5131         i += 4;
5132 
5133         const __m128 vtltr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0), (const __m64*) itl1);
5134         const __m128 vblbr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0), (const __m64*) ibl1);
5135 
5136         const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5137         const __m128 vld = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(2, 0, 2, 0));
5138         const __m128 vrd = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(3, 1, 3, 1));
5139 
5140         const __m128 vtl = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(2, 0, 2, 0));
5141         const __m128 vtr = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(3, 1, 3, 1));
5142 
5143         const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5144         const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5145 
5146         const __m128 vd = _mm_sub_ps(vr, vl);
5147         const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5148 
5149         _mm_storel_pi((__m64*) output, vo);
5150         output += 2;
5151       }
5152 
5153       if (p & 1) {
5154         // We are computing the following formula:
5155         //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
5156         //                 alpha_h  * (1 - alpha_v) * top_right +
5157         //            (1 - alpha_h) *      alpha_v  * bottom_left +
5158         //                 alpha_h  *      alpha_v  * bottom_right.
5159         //
5160         // Rearranging gives
5161         //   result =    left + alpha_h * (right        - left),
5162         // where
5163         //   left =  top_left + alpha_v * (bottom_left  - top_left),
5164         //  right = top_right + alpha_v * (bottom_right - top_right).
5165 
5166         const float alphah = *w;
5167         const __m128 valphav = _mm_load_ps1(w + 1);
5168         w += 2;
5169 
5170         const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
5171         const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
5172         i += 2;
5173 
5174         const __m128 vtltr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl);
5175         const __m128 vblbr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl);
5176 
5177         // Compute both differences at once:
5178         //    left_diff = bottom_left  - top_left
5179         //   right_diff = bottom_right - top_right
5180         const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5181         const __m128 vlr = _mm_add_ps(vtltr, _mm_mul_ps(vldrd, valphav));
5182 
5183         // Extract them and compute the result.
5184         const float l = _mm_cvtss_f32(vlr);
5185         const float r = _mm_cvtss_f32(_mm_shuffle_ps(vlr, vlr, 1));
5186 
5187         *output++ = l + alphah * (r - l);
5188       }
5189     }
5190 
5191     input_offset += input_increment;
5192   } while (--channels != 0);
5193 }
5194 
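// Bilinear interpolation (channels-last): one output pixel per outer iteration,
// blending the four corner pointers (top-left, top-right, bottom-left, bottom-right)
// across channels 8 at a time, with 4/2/1-channel tails.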
5195 void xnn_f32_ibilinear_ukernel__sse_c8(
5196     size_t output_pixels,
5197     size_t channels,
5198     const float**restrict input,
5199     size_t input_offset,
5200     const float*restrict weights,
5201     float*restrict output,
5202     size_t output_increment) XNN_OOB_READS
5203 {
5204   assert(output_pixels != 0);
5205   assert(channels != 0);
5206   assert(channels % sizeof(float) == 0);
5207 
5208   do {
5209     const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset);
5210     const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset);
5211     const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset);
5212     const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset);
5213     input += 4;
5214 
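    // Broadcast this pixel's horizontal (alpha_h) and vertical (alpha_v) interpolation
    // weights from the two packed weight values.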
5215     __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights);
5216     valphahv = _mm_unpacklo_ps(valphahv, valphahv);
5217     const __m128 valphah = _mm_movelh_ps(valphahv, valphahv);
5218     const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
5219     weights += 2;
5220 
5221     size_t c = channels;
5222     for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
5223       const __m128 vtl0123 = _mm_loadu_ps(i0);
5224       const __m128 vtr0123 = _mm_loadu_ps(i1);
5225       const __m128 vbl0123 = _mm_loadu_ps(i2);
5226       const __m128 vbr0123 = _mm_loadu_ps(i3);
5227       const __m128 vtl4567 = _mm_loadu_ps(i0 + 4);
5228       const __m128 vtr4567 = _mm_loadu_ps(i1 + 4);
5229       const __m128 vbl4567 = _mm_loadu_ps(i2 + 4);
5230       const __m128 vbr4567 = _mm_loadu_ps(i3 + 4);
5231       i0 += 8;
5232       i1 += 8;
5233       i2 += 8;
5234       i3 += 8;
5235 
5236       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5237       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5238       const __m128 vtd4567 = _mm_sub_ps(vtr4567, vtl4567);
5239       const __m128 vbd4567 = _mm_sub_ps(vbr4567, vbl4567);
5240 
5241       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5242       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5243       const __m128 vt4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vtd4567, valphah));
5244       const __m128 vb4567 = _mm_add_ps(vbl4567, _mm_mul_ps(vbd4567, valphah));
5245 
5246       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5247       const __m128 vd4567 = _mm_sub_ps(vb4567, vt4567);
5248 
5249       const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5250       const __m128 vo4567 = _mm_add_ps(vt4567, _mm_mul_ps(vd4567, valphav));
5251 
5252       _mm_storeu_ps(output, vo0123);
5253       _mm_storeu_ps(output + 4, vo4567);
5254       output += 8;
5255     }
5256     for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
5257       const __m128 vtl0123 = _mm_loadu_ps(i0);
5258       const __m128 vtr0123 = _mm_loadu_ps(i1);
5259       const __m128 vbl0123 = _mm_loadu_ps(i2);
5260       const __m128 vbr0123 = _mm_loadu_ps(i3);
5261       i0 += 4;
5262       i1 += 4;
5263       i2 += 4;
5264       i3 += 4;
5265 
5266       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5267       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5268 
5269       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5270       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5271 
5272       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5273 
5274       const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5275 
5276       _mm_storeu_ps(output, vo0123);
5277       output += 4;
5278     }
5279     if XNN_UNLIKELY(c != 0) {
5280       const __m128 vtl0123 = _mm_loadu_ps(i0);
5281       const __m128 vtr0123 = _mm_loadu_ps(i1);
5282       const __m128 vbl0123 = _mm_loadu_ps(i2);
5283       const __m128 vbr0123 = _mm_loadu_ps(i3);
5284 
5285       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5286       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5287 
5288       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5289       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5290 
5291       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5292 
5293       __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5294 
5295       if (c & (2 * sizeof(float))) {
5296         _mm_storel_pi((__m64*) output, vo0123);
5297         vo0123 = _mm_movehl_ps(vo0123, vo0123);
5298         output += 2;
5299       }
5300       if (c & (1 * sizeof(float))) {
5301         _mm_store_ss(output, vo0123);
5302         output += 1;
5303       }
5304     }
5305 
5306     output = (float*) ((uintptr_t) output + output_increment);
5307   } while (--output_pixels != 0);
5308 }
5309 
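// Indirect GEMM microkernel: 1x8 output tile. The A row is fetched through the
// indirection pointer array `a` (entries equal to `zero` are not offset), and the
// result is clamped to [min, max].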
5310 void xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(
5311     size_t mr,
5312     size_t nc,
5313     size_t kc,
5314     size_t ks,
5315     const float**restrict a,
5316     const float*restrict w,
5317     float*restrict c,
5318     size_t cm_stride,
5319     size_t cn_stride,
5320     size_t a_offset,
5321     const float* zero,
5322     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5323 {
5324   assert(mr != 0);
5325   assert(mr <= 1);
5326   assert(nc != 0);
5327   assert(kc != 0);
5328   assert(kc % sizeof(float) == 0);
5329   assert(ks != 0);
5330   assert(ks % (1 * sizeof(void*)) == 0);
5331   assert(a_offset % sizeof(float) == 0);
5332   assert(a != NULL);
5333   assert(w != NULL);
5334   assert(c != NULL);
5335 
5336   float* c0 = c;
5337 
5338   do {
5339     __m128 vacc0x0123 = _mm_load_ps(w);
5340     __m128 vacc0x4567 = _mm_load_ps(w + 4);
5341     w += 8;
5342 
5343     size_t p = ks;
5344     do {
5345       const float* restrict a0 = a[0];
5346       assert(a0 != NULL);
5347       if XNN_UNPREDICTABLE(a0 != zero) {
5348         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5349       }
5350       a += 1;
5351 
5352       size_t k = kc;
5353       do {
5354         const __m128 vb0123 = _mm_load_ps(w);
5355         const __m128 vb4567 = _mm_load_ps(w + 4);
5356         w += 8;
5357 
5358         const __m128 va0 = _mm_load1_ps(a0);
5359         a0 += 1;
5360 
5361         vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5362         vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5363         k -= sizeof(float);
5364       } while (k != 0);
5365       p -= 1 * sizeof(void*);
5366     } while (p != 0);
5367 
5368     const __m128 vmax = _mm_load_ps(params->sse.max);
5369     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5370     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5371 
5372     const __m128 vmin = _mm_load_ps(params->sse.min);
5373     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5374     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5375 
5376     if XNN_LIKELY(nc >= 8) {
5377       _mm_storeu_ps(c0, vacc0x0123);
5378       _mm_storeu_ps(c0 + 4, vacc0x4567);
5379       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5380 
5381       a = (const float**restrict) ((uintptr_t) a - ks);
5382       nc -= 8;
5383     } else {
5384       if (nc & 4) {
5385         _mm_storeu_ps(c0, vacc0x0123);
5386 
5387         vacc0x0123 = vacc0x4567;
5388 
5389         c0 += 4;
5390       }
5391       if (nc & 2) {
5392         _mm_storel_pi((__m64*) c0, vacc0x0123);
5393 
5394         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5395 
5396         c0 += 2;
5397       }
5398       if (nc & 1) {
5399         _mm_store_ss(c0, vacc0x0123);
5400       }
5401 
5402       nc = 0;
5403     }
5404   } while (nc != 0);
5405 }
5406 
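// Indirect GEMM microkernel: 4x2 output tile, accumulating 4 k-elements per SIMD
// multiply-add ("c4"); the partial sums are reduced with shuffles before clamping.
// The k remainder zeroes A lanes wherever the packed B value is zero, so values past
// the end of the row cannot contribute.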
5407 void xnn_f32_igemm_minmax_ukernel_4x2c4__sse(
5408     size_t mr,
5409     size_t nc,
5410     size_t kc,
5411     size_t ks,
5412     const float**restrict a,
5413     const float*restrict w,
5414     float*restrict c,
5415     size_t cm_stride,
5416     size_t cn_stride,
5417     size_t a_offset,
5418     const float* zero,
5419     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5420 {
5421   assert(mr != 0);
5422   assert(mr <= 4);
5423   assert(nc != 0);
5424   assert(kc != 0);
5425   assert(kc % sizeof(float) == 0);
5426   assert(ks != 0);
5427   assert(ks % (4 * sizeof(void*)) == 0);
5428   assert(a_offset % sizeof(float) == 0);
5429   assert(a != NULL);
5430   assert(w != NULL);
5431   assert(c != NULL);
5432 
5433   float* c0 = c;
5434   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5435   if XNN_UNPREDICTABLE(mr < 2) {
5436     c1 = c0;
5437   }
5438   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5439   if XNN_UNPREDICTABLE(mr <= 2) {
5440     c2 = c1;
5441   }
5442   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5443   if XNN_UNPREDICTABLE(mr != 4) {
5444     c3 = c2;
5445   }
5446 
5447   do {
5448     __m128 vacc0x0c4 = _mm_load_ss(w);
5449     __m128 vacc0x1c4 = _mm_load_ss(w + 1);
5450     __m128 vacc1x0c4 = vacc0x0c4;
5451     __m128 vacc1x1c4 = vacc0x1c4;
5452     __m128 vacc2x0c4 = vacc0x0c4;
5453     __m128 vacc2x1c4 = vacc0x1c4;
5454     __m128 vacc3x0c4 = vacc0x0c4;
5455     __m128 vacc3x1c4 = vacc0x1c4;
5456     w += 2;
5457 
5458     size_t p = ks;
5459     do {
5460       const float* restrict a0 = a[0];
5461       assert(a0 != NULL);
5462       if XNN_UNPREDICTABLE(a0 != zero) {
5463         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5464       }
5465       const float* restrict a1 = a[1];
5466       assert(a1 != NULL);
5467       if XNN_UNPREDICTABLE(a1 != zero) {
5468         a1 = (const float*) ((uintptr_t) a1 + a_offset);
5469       }
5470       const float* restrict a2 = a[2];
5471       assert(a2 != NULL);
5472       if XNN_UNPREDICTABLE(a2 != zero) {
5473         a2 = (const float*) ((uintptr_t) a2 + a_offset);
5474       }
5475       const float* restrict a3 = a[3];
5476       assert(a3 != NULL);
5477       if XNN_UNPREDICTABLE(a3 != zero) {
5478         a3 = (const float*) ((uintptr_t) a3 + a_offset);
5479       }
5480       a += 4;
5481 
5482       size_t k = kc;
5483       for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
5484         const __m128 va0 = _mm_loadu_ps(a0);
5485         a0 += 4;
5486         const __m128 va1 = _mm_loadu_ps(a1);
5487         a1 += 4;
5488         const __m128 va2 = _mm_loadu_ps(a2);
5489         a2 += 4;
5490         const __m128 va3 = _mm_loadu_ps(a3);
5491         a3 += 4;
5492 
5493         const __m128 vb0 = _mm_loadu_ps(w);
5494         const __m128 vb1 = _mm_loadu_ps(w + 4);
5495         w += 8;
5496 
5497         vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
5498         vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
5499         vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
5500         vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
5501         vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
5502         vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
5503         vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
5504         vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
5505       }
5506       if XNN_UNLIKELY(k != 0) {
5507         const __m128 va0 = _mm_loadu_ps(a0);
5508         const __m128 va1 = _mm_loadu_ps(a1);
5509         const __m128 va2 = _mm_loadu_ps(a2);
5510         const __m128 va3 = _mm_loadu_ps(a3);
5511 
5512         const __m128 vb0 = _mm_loadu_ps(w);
5513         const __m128 vb1 = _mm_loadu_ps(w + 4);
5514         w += 8;
5515 
5516         const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
5517         const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
5518 
5519         vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
5520         vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
5521         vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
5522         vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
5523         vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
5524         vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
5525         vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
5526         vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
5527       }
5528       p -= 4 * sizeof(void*);
5529     } while (p != 0);
5530 
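    // Reduce the c4 partial sums as in the GEMM 4x2c4 kernel above: after the horizontal
    // adds, vacc01x01 holds rows 0-1 and vacc23x01 holds rows 2-3, two columns each.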
5531     const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
5532     const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
5533     const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
5534     const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
5535 
5536     __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
5537     __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
5538 
5539     const __m128 vmax = _mm_load_ps(params->sse.max);
5540     vacc01x01 = _mm_min_ps(vacc01x01, vmax);
5541     vacc23x01 = _mm_min_ps(vacc23x01, vmax);
5542 
5543     const __m128 vmin = _mm_load_ps(params->sse.min);
5544     vacc01x01 = _mm_max_ps(vacc01x01, vmin);
5545     vacc23x01 = _mm_max_ps(vacc23x01, vmin);
5546 
5547     if XNN_LIKELY(nc >= 2) {
5548       _mm_storeh_pi((__m64*) c3, vacc23x01);
5549       c3 = (float*) ((uintptr_t) c3 + cn_stride);
5550       _mm_storel_pi((__m64*) c2, vacc23x01);
5551       c2 = (float*) ((uintptr_t) c2 + cn_stride);
5552       _mm_storeh_pi((__m64*) c1, vacc01x01);
5553       c1 = (float*) ((uintptr_t) c1 + cn_stride);
5554       _mm_storel_pi((__m64*) c0, vacc01x01);
5555       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5556 
5557       a = (const float**restrict) ((uintptr_t) a - ks);
5558       nc -= 2;
5559     } else {
5560       assert(nc == 1);
5561       _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
5562       _mm_store_ss(c2, vacc23x01);
5563       _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
5564       _mm_store_ss(c0, vacc01x01);
5565 
5566       nc = 0;
5567     }
5568   } while (nc != 0);
5569 }
5570 
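// Indirect GEMM microkernel: 4x8 output tile, "load1" (broadcast-A) variant of the
// indirect GEMM, with min/max clamping.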
5571 void xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(
5572     size_t mr,
5573     size_t nc,
5574     size_t kc,
5575     size_t ks,
5576     const float**restrict a,
5577     const float*restrict w,
5578     float*restrict c,
5579     size_t cm_stride,
5580     size_t cn_stride,
5581     size_t a_offset,
5582     const float* zero,
5583     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5584 {
5585   assert(mr != 0);
5586   assert(mr <= 4);
5587   assert(nc != 0);
5588   assert(kc != 0);
5589   assert(kc % sizeof(float) == 0);
5590   assert(ks != 0);
5591   assert(ks % (4 * sizeof(void*)) == 0);
5592   assert(a_offset % sizeof(float) == 0);
5593   assert(a != NULL);
5594   assert(w != NULL);
5595   assert(c != NULL);
5596 
5597   float* c0 = c;
5598   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5599   if XNN_UNPREDICTABLE(mr < 2) {
5600     c1 = c0;
5601   }
5602   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5603   if XNN_UNPREDICTABLE(mr <= 2) {
5604     c2 = c1;
5605   }
5606   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5607   if XNN_UNPREDICTABLE(mr != 4) {
5608     c3 = c2;
5609   }
5610 
5611   do {
5612     __m128 vacc0x0123 = _mm_load_ps(w);
5613     __m128 vacc0x4567 = _mm_load_ps(w + 4);
5614     __m128 vacc1x0123 = vacc0x0123;
5615     __m128 vacc1x4567 = vacc0x4567;
5616     __m128 vacc2x0123 = vacc0x0123;
5617     __m128 vacc2x4567 = vacc0x4567;
5618     __m128 vacc3x0123 = vacc0x0123;
5619     __m128 vacc3x4567 = vacc0x4567;
5620     w += 8;
5621 
5622     size_t p = ks;
5623     do {
5624       const float* restrict a0 = a[0];
5625       assert(a0 != NULL);
5626       if XNN_UNPREDICTABLE(a0 != zero) {
5627         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5628       }
5629       const float* restrict a1 = a[1];
5630       assert(a1 != NULL);
5631       if XNN_UNPREDICTABLE(a1 != zero) {
5632         a1 = (const float*) ((uintptr_t) a1 + a_offset);
5633       }
5634       const float* restrict a2 = a[2];
5635       assert(a2 != NULL);
5636       if XNN_UNPREDICTABLE(a2 != zero) {
5637         a2 = (const float*) ((uintptr_t) a2 + a_offset);
5638       }
5639       const float* restrict a3 = a[3];
5640       assert(a3 != NULL);
5641       if XNN_UNPREDICTABLE(a3 != zero) {
5642         a3 = (const float*) ((uintptr_t) a3 + a_offset);
5643       }
5644       a += 4;
5645 
5646       size_t k = kc;
5647       do {
5648         const __m128 vb0123 = _mm_load_ps(w);
5649         const __m128 vb4567 = _mm_load_ps(w + 4);
5650         w += 8;
5651 
5652         const __m128 va0 = _mm_load1_ps(a0);
5653         a0 += 1;
5654         const __m128 va1 = _mm_load1_ps(a1);
5655         a1 += 1;
5656         const __m128 va2 = _mm_load1_ps(a2);
5657         a2 += 1;
5658         const __m128 va3 = _mm_load1_ps(a3);
5659         a3 += 1;
5660 
5661         vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5662         vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5663         vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
5664         vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
5665         vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
5666         vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
5667         vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
5668         vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
5669         k -= sizeof(float);
5670       } while (k != 0);
5671       p -= 4 * sizeof(void*);
5672     } while (p != 0);
5673 
5674     const __m128 vmax = _mm_load_ps(params->sse.max);
5675     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5676     vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
5677     vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
5678     vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
5679     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5680     vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
5681     vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
5682     vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
5683 
5684     const __m128 vmin = _mm_load_ps(params->sse.min);
5685     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5686     vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
5687     vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
5688     vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
5689     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5690     vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
5691     vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
5692     vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
5693 
5694     if XNN_LIKELY(nc >= 8) {
5695       _mm_storeu_ps(c3, vacc3x0123);
5696       _mm_storeu_ps(c3 + 4, vacc3x4567);
5697       c3 = (float*) ((uintptr_t) c3 + cn_stride);
5698       _mm_storeu_ps(c2, vacc2x0123);
5699       _mm_storeu_ps(c2 + 4, vacc2x4567);
5700       c2 = (float*) ((uintptr_t) c2 + cn_stride);
5701       _mm_storeu_ps(c1, vacc1x0123);
5702       _mm_storeu_ps(c1 + 4, vacc1x4567);
5703       c1 = (float*) ((uintptr_t) c1 + cn_stride);
5704       _mm_storeu_ps(c0, vacc0x0123);
5705       _mm_storeu_ps(c0 + 4, vacc0x4567);
5706       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5707 
5708       a = (const float**restrict) ((uintptr_t) a - ks);
5709       nc -= 8;
5710     } else {
5711       if (nc & 4) {
5712         _mm_storeu_ps(c3, vacc3x0123);
5713         _mm_storeu_ps(c2, vacc2x0123);
5714         _mm_storeu_ps(c1, vacc1x0123);
5715         _mm_storeu_ps(c0, vacc0x0123);
5716 
5717         vacc3x0123 = vacc3x4567;
5718         vacc2x0123 = vacc2x4567;
5719         vacc1x0123 = vacc1x4567;
5720         vacc0x0123 = vacc0x4567;
5721 
5722         c3 += 4;
5723         c2 += 4;
5724         c1 += 4;
5725         c0 += 4;
5726       }
5727       if (nc & 2) {
5728         _mm_storel_pi((__m64*) c3, vacc3x0123);
5729         _mm_storel_pi((__m64*) c2, vacc2x0123);
5730         _mm_storel_pi((__m64*) c1, vacc1x0123);
5731         _mm_storel_pi((__m64*) c0, vacc0x0123);
5732 
5733         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
5734         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
5735         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
5736         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5737 
5738         c3 += 2;
5739         c2 += 2;
5740         c1 += 2;
5741         c0 += 2;
5742       }
5743       if (nc & 1) {
5744         _mm_store_ss(c3, vacc3x0123);
5745         _mm_store_ss(c2, vacc2x0123);
5746         _mm_store_ss(c1, vacc1x0123);
5747         _mm_store_ss(c0, vacc0x0123);
5748       }
5749 
5750       nc = 0;
5751     }
5752   } while (nc != 0);
5753 }
5754 
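// Max pooling: up to 9 kernel elements in the first pass and 8 per subsequent pass,
// 4 channels per SIMD iteration, result clamped to [min, max].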
5755 void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
5756     size_t output_pixels,
5757     size_t kernel_elements,
5758     size_t channels,
5759     const float** input,
5760     size_t input_offset,
5761     float* output,
5762     size_t input_increment,
5763     size_t output_increment,
5764     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5765 {
5766   assert(output_pixels != 0);
5767   assert(kernel_elements != 0);
5768   assert(channels != 0);
5769 
5770   const __m128 voutput_max = _mm_load_ps(params->sse.max);
5771   const __m128 voutput_min = _mm_load_ps(params->sse.min);
5772   do {
5773     float* o = output;
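    // First pass: reduce the first (up to) 9 kernel elements directly into the output row.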
5774     {
5775       const float* i0 = *input++;
5776       const float* i1 = *input++;
5777       const float* i2 = *input++;
5778       const float* i3 = *input++;
5779       const float* i4 = *input++;
5780       const float* i5 = *input++;
5781       const float* i6 = *input++;
5782       const float* i7 = *input++;
5783       const float* i8 = *input++;
5784       i0 = (const float*) ((uintptr_t) i0 + input_offset);
5785       i1 = (const float*) ((uintptr_t) i1 + input_offset);
5786       i2 = (const float*) ((uintptr_t) i2 + input_offset);
5787       i3 = (const float*) ((uintptr_t) i3 + input_offset);
5788       i4 = (const float*) ((uintptr_t) i4 + input_offset);
5789       i5 = (const float*) ((uintptr_t) i5 + input_offset);
5790       i6 = (const float*) ((uintptr_t) i6 + input_offset);
5791       i7 = (const float*) ((uintptr_t) i7 + input_offset);
5792       i8 = (const float*) ((uintptr_t) i8 + input_offset);
5793       if (kernel_elements < 2) {
5794         i1 = i0;
5795       }
5796       if (kernel_elements <= 2) {
5797         i2 = i0;
5798       }
5799       if (kernel_elements < 4) {
5800         i3 = i0;
5801       }
5802       if (kernel_elements <= 4) {
5803         i4 = i0;
5804       }
5805       if (kernel_elements < 6) {
5806         i5 = i0;
5807       }
5808       if (kernel_elements <= 6) {
5809         i6 = i0;
5810       }
5811       if (kernel_elements < 8) {
5812         i7 = i0;
5813       }
5814       if (kernel_elements <= 8) {
5815         i8 = i0;
5816       }
5817 
5818       size_t c = channels;
5819       for (; c >= 4; c -= 4) {
5820         const __m128 vi0 = _mm_loadu_ps(i0);
5821         i0 += 4;
5822         const __m128 vi1 = _mm_loadu_ps(i1);
5823         i1 += 4;
5824         const __m128 vi2 = _mm_loadu_ps(i2);
5825         i2 += 4;
5826         const __m128 vi3 = _mm_loadu_ps(i3);
5827         i3 += 4;
5828         const __m128 vi4 = _mm_loadu_ps(i4);
5829         i4 += 4;
5830         const __m128 vi5 = _mm_loadu_ps(i5);
5831         i5 += 4;
5832         const __m128 vi6 = _mm_loadu_ps(i6);
5833         i6 += 4;
5834         const __m128 vi7 = _mm_loadu_ps(i7);
5835         i7 += 4;
5836         const __m128 vi8 = _mm_loadu_ps(i8);
5837         i8 += 4;
5838 
5839         const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5840         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5841         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5842         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5843 
5844         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5845         const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5846         const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5847         const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5848 
5849         _mm_storeu_ps(o, vout);
5850         o += 4;
5851       }
5852       if (c != 0) {
5853         const __m128 vi0 = _mm_loadu_ps(i0);
5854         i0 += 4;
5855         const __m128 vi1 = _mm_loadu_ps(i1);
5856         i1 += 4;
5857         const __m128 vi2 = _mm_loadu_ps(i2);
5858         i2 += 4;
5859         const __m128 vi3 = _mm_loadu_ps(i3);
5860         i3 += 4;
5861         const __m128 vi4 = _mm_loadu_ps(i4);
5862         i4 += 4;
5863         const __m128 vi5 = _mm_loadu_ps(i5);
5864         i5 += 4;
5865         const __m128 vi6 = _mm_loadu_ps(i6);
5866         i6 += 4;
5867         const __m128 vi7 = _mm_loadu_ps(i7);
5868         i7 += 4;
5869         const __m128 vi8 = _mm_loadu_ps(i8);
5870         i8 += 4;
5871 
5872         const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5873         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5874         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5875         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5876 
5877         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5878         const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5879         const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5880         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5881 
5882         if (c & 2) {
5883           _mm_storel_pi((__m64*) o, vout);
5884           o += 2;
5885           vout = _mm_movehl_ps(vout, vout);
5886         }
5887         if (c & 1) {
5888           _mm_store_ss(o, vout);
5889           o += 1;
5890         }
5891       }
5892     }
5893 
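    // Subsequent passes: fold up to 8 more kernel elements into the partial maxima
    // already stored in the output row.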
5894     for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
5895       const float* i0 = *input++;
5896       const float* i1 = *input++;
5897       const float* i2 = *input++;
5898       const float* i3 = *input++;
5899       const float* i4 = *input++;
5900       const float* i5 = *input++;
5901       const float* i6 = *input++;
5902       const float* i7 = *input++;
5903       i0 = (const float*) ((uintptr_t) i0 + input_offset);
5904       i1 = (const float*) ((uintptr_t) i1 + input_offset);
5905       i2 = (const float*) ((uintptr_t) i2 + input_offset);
5906       i3 = (const float*) ((uintptr_t) i3 + input_offset);
5907       i4 = (const float*) ((uintptr_t) i4 + input_offset);
5908       i5 = (const float*) ((uintptr_t) i5 + input_offset);
5909       i6 = (const float*) ((uintptr_t) i6 + input_offset);
5910       i7 = (const float*) ((uintptr_t) i7 + input_offset);
5911       if (k < 2) {
5912         i1 = i0;
5913       }
5914       if (k <= 2) {
5915         i2 = i0;
5916       }
5917       if (k < 4) {
5918         i3 = i0;
5919       }
5920       if (k <= 4) {
5921         i4 = i0;
5922       }
5923       if (k < 6) {
5924         i5 = i0;
5925       }
5926       if (k <= 6) {
5927         i6 = i0;
5928       }
5929       if (k < 8) {
5930         i7 = i0;
5931       }
5932 
5933       o = output;
5934       size_t c = channels;
5935       for (; c >= 4; c -= 4) {
5936         const __m128 vi0 = _mm_loadu_ps(i0);
5937         i0 += 4;
5938         const __m128 vi1 = _mm_loadu_ps(i1);
5939         i1 += 4;
5940         const __m128 vi2 = _mm_loadu_ps(i2);
5941         i2 += 4;
5942         const __m128 vi3 = _mm_loadu_ps(i3);
5943         i3 += 4;
5944         const __m128 vi4 = _mm_loadu_ps(i4);
5945         i4 += 4;
5946         const __m128 vi5 = _mm_loadu_ps(i5);
5947         i5 += 4;
5948         const __m128 vi6 = _mm_loadu_ps(i6);
5949         i6 += 4;
5950         const __m128 vi7 = _mm_loadu_ps(i7);
5951         i7 += 4;
5952         const __m128 vo = _mm_loadu_ps(o);
5953 
5954         const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5955         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5956         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5957         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5958 
5959         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5960         const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5961         const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5962         const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5963 
5964         _mm_storeu_ps(o, vout);
5965         o += 4;
5966       }
5967       if (c != 0) {
5968         const __m128 vi0 = _mm_loadu_ps(i0);
5969         const __m128 vi1 = _mm_loadu_ps(i1);
5970         const __m128 vi2 = _mm_loadu_ps(i2);
5971         const __m128 vi3 = _mm_loadu_ps(i3);
5972         const __m128 vi4 = _mm_loadu_ps(i4);
5973         const __m128 vi5 = _mm_loadu_ps(i5);
5974         const __m128 vi6 = _mm_loadu_ps(i6);
5975         const __m128 vi7 = _mm_loadu_ps(i7);
5976         const __m128 vo = _mm_loadu_ps(o);
5977 
5978         const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5979         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5980         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5981         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5982 
5983         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5984         const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5985         const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5986         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5987 
5988         if (c & 2) {
5989           _mm_storel_pi((__m64*) o, vout);
5990           o += 2;
5991           vout = _mm_movehl_ps(vout, vout);
5992         }
5993         if (c & 1) {
5994           _mm_store_ss(o, vout);
5995           o += 1;
5996         }
5997       }
5998     }
5999     input = (const float**) ((uintptr_t) input + input_increment);
6000     output = (float*) ((uintptr_t) o + output_increment);
6001   } while (--output_pixels != 0);
6002 }
6003 
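// Average pooling with a per-output-pixel multiplier: kernel-element sums are
// accumulated into `buffer` (9 elements in the first pass, 8 per intermediate pass),
// then the final pass scales by `multiplier` and clamps to [min, max].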
6004 void xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4(
6005     size_t output_pixels,
6006     size_t kernel_elements,
6007     size_t channels,
6008     const float** input,
6009     size_t input_offset,
6010     const float* zero,
6011     const float* multiplier,
6012     float* buffer,
6013     float* output,
6014     size_t input_increment,
6015     size_t output_increment,
6016     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6017 {
6018   assert(output_pixels != 0);
6019   assert(kernel_elements > 9);
6020   assert(channels != 0);
6021 
6022   const __m128 voutput_min = _mm_load_ps(params->sse.min);
6023   const __m128 voutput_max = _mm_load_ps(params->sse.max);
6024 
6025   do {
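    // First pass: sum the first 9 kernel elements into the scratch buffer.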
6026     {
6027       const float* i0 = *input++;
6028       assert(i0 != NULL);
6029       if XNN_UNPREDICTABLE(i0 != zero) {
6030         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6031       }
6032       const float* i1 = *input++;
6033       assert(i1 != NULL);
6034       if XNN_UNPREDICTABLE(i1 != zero) {
6035         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6036       }
6037       const float* i2 = *input++;
6038       assert(i2 != NULL);
6039       if XNN_UNPREDICTABLE(i2 != zero) {
6040         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6041       }
6042       const float* i3 = *input++;
6043       assert(i3 != NULL);
6044       if XNN_UNPREDICTABLE(i3 != zero) {
6045         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6046       }
6047       const float* i4 = *input++;
6048       assert(i4 != NULL);
6049       if XNN_UNPREDICTABLE(i4 != zero) {
6050         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6051       }
6052       const float* i5 = *input++;
6053       assert(i5 != NULL);
6054       if XNN_UNPREDICTABLE(i5 != zero) {
6055         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6056       }
6057       const float* i6 = *input++;
6058       assert(i6 != NULL);
6059       if XNN_UNPREDICTABLE(i6 != zero) {
6060         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6061       }
6062       const float* i7 = *input++;
6063       assert(i7 != NULL);
6064       if XNN_UNPREDICTABLE(i7 != zero) {
6065         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6066       }
6067       const float* i8 = *input++;
6068       assert(i8 != NULL);
6069       if XNN_UNPREDICTABLE(i8 != zero) {
6070         i8 = (const float*) ((uintptr_t) i8 + input_offset);
6071       }
6072 
6073       float* b = buffer;
6074       for (size_t c = 0; c < channels; c += 4) {
6075         const __m128 vi0 = _mm_loadu_ps(i0);
6076         i0 += 4;
6077         const __m128 vi1 = _mm_loadu_ps(i1);
6078         i1 += 4;
6079         const __m128 vi2 = _mm_loadu_ps(i2);
6080         i2 += 4;
6081         const __m128 vi3 = _mm_loadu_ps(i3);
6082         i3 += 4;
6083         const __m128 vi4 = _mm_loadu_ps(i4);
6084         i4 += 4;
6085         const __m128 vi5 = _mm_loadu_ps(i5);
6086         i5 += 4;
6087         const __m128 vi6 = _mm_loadu_ps(i6);
6088         i6 += 4;
6089         const __m128 vi7 = _mm_loadu_ps(i7);
6090         i7 += 4;
6091         const __m128 vi8 = _mm_loadu_ps(i8);
6092         i8 += 4;
6093 
6094         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6095         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6096         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6097         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6098         const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6099         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6100         const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6101         const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6102 
6103         _mm_store_ps(b, vsum); b += 4;
6104       }
6105     }
6106 
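    // Intermediate passes: accumulate up to 8 more kernel elements into the buffer.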
6107     size_t k = kernel_elements;
6108     for (k -= 9; k > 8; k -= 8) {
6109       const float* i0 = *input++;
6110       assert(i0 != NULL);
6111       if XNN_UNPREDICTABLE(i0 != zero) {
6112         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6113       }
6114       const float* i1 = *input++;
6115       assert(i1 != NULL);
6116       if XNN_UNPREDICTABLE(i1 != zero) {
6117         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6118       }
6119       const float* i2 = *input++;
6120       assert(i2 != NULL);
6121       if XNN_UNPREDICTABLE(i2 != zero) {
6122         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6123       }
6124       const float* i3 = *input++;
6125       assert(i3 != NULL);
6126       if XNN_UNPREDICTABLE(i3 != zero) {
6127         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6128       }
6129       const float* i4 = *input++;
6130       assert(i4 != NULL);
6131       if XNN_UNPREDICTABLE(i4 != zero) {
6132         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6133       }
6134       const float* i5 = *input++;
6135       assert(i5 != NULL);
6136       if XNN_UNPREDICTABLE(i5 != zero) {
6137         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6138       }
6139       const float* i6 = *input++;
6140       assert(i6 != NULL);
6141       if XNN_UNPREDICTABLE(i6 != zero) {
6142         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6143       }
6144       const float* i7 = *input++;
6145       assert(i7 != NULL);
6146       if XNN_UNPREDICTABLE(i7 != zero) {
6147         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6148       }
6149 
6150       float* b = buffer;
6151       for (size_t c = 0; c < channels; c += 4) {
6152         const __m128 vi0 = _mm_loadu_ps(i0);
6153         i0 += 4;
6154         const __m128 vi1 = _mm_loadu_ps(i1);
6155         i1 += 4;
6156         const __m128 vi2 = _mm_loadu_ps(i2);
6157         i2 += 4;
6158         const __m128 vi3 = _mm_loadu_ps(i3);
6159         i3 += 4;
6160         const __m128 vi4 = _mm_loadu_ps(i4);
6161         i4 += 4;
6162         const __m128 vi5 = _mm_loadu_ps(i5);
6163         i5 += 4;
6164         const __m128 vi6 = _mm_loadu_ps(i6);
6165         i6 += 4;
6166         const __m128 vi7 = _mm_loadu_ps(i7);
6167         i7 += 4;
6168         const __m128 vacc = _mm_load_ps(b);
6169 
6170         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6171         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6172         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6173         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6174         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6175         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6176         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6177         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6178 
6179         _mm_store_ps(b, vsum); b += 4;
6180       }
6181     }
6182 
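    // Final pass: add the remaining (up to 8) kernel elements, scale the sums by the
    // per-output-pixel multiplier, clamp to [min, max], and store.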
6183     {
6184       const float* i0 = input[0];
6185       assert(i0 != NULL);
6186       const float* i1 = input[1];
6187       const float* i2 = input[2];
6188       const float* i3 = input[3];
6189       const float* i4 = input[4];
6190       const float* i5 = input[5];
6191       const float* i6 = input[6];
6192       const float* i7 = input[7];
6193       input = (const float**) ((uintptr_t) input + input_increment);
6194       if (k < 2) {
6195         i1 = zero;
6196       }
6197       assert(i1 != NULL);
6198       if (k <= 2) {
6199         i2 = zero;
6200       }
6201       assert(i2 != NULL);
6202       if (k < 4) {
6203         i3 = zero;
6204       }
6205       assert(i3 != NULL);
6206       if (k <= 4) {
6207         i4 = zero;
6208       }
6209       assert(i4 != NULL);
6210       if (k < 6) {
6211         i5 = zero;
6212       }
6213       assert(i5 != NULL);
6214       if (k <= 6) {
6215         i6 = zero;
6216       }
6217       assert(i6 != NULL);
6218       if (k < 8) {
6219         i7 = zero;
6220       }
6221       assert(i7 != NULL);
6222       if XNN_UNPREDICTABLE(i0 != zero) {
6223         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6224       }
6225       if XNN_UNPREDICTABLE(i1 != zero) {
6226         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6227       }
6228       if XNN_UNPREDICTABLE(i2 != zero) {
6229         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6230       }
6231       if XNN_UNPREDICTABLE(i3 != zero) {
6232         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6233       }
6234       if XNN_UNPREDICTABLE(i4 != zero) {
6235         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6236       }
6237       if XNN_UNPREDICTABLE(i5 != zero) {
6238         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6239       }
6240       if XNN_UNPREDICTABLE(i6 != zero) {
6241         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6242       }
6243       if XNN_UNPREDICTABLE(i7 != zero) {
6244         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6245       }
6246 
6247       const __m128 vmultiplier = _mm_load1_ps(multiplier);
6248       multiplier += 1;
6249 
6250       size_t c = channels;
6251       float* b = buffer;
6252       while (c >= 4) {
6253         const __m128 vi0 = _mm_loadu_ps(i0);
6254         i0 += 4;
6255         const __m128 vi1 = _mm_loadu_ps(i1);
6256         i1 += 4;
6257         const __m128 vi2 = _mm_loadu_ps(i2);
6258         i2 += 4;
6259         const __m128 vi3 = _mm_loadu_ps(i3);
6260         i3 += 4;
6261         const __m128 vi4 = _mm_loadu_ps(i4);
6262         i4 += 4;
6263         const __m128 vi5 = _mm_loadu_ps(i5);
6264         i5 += 4;
6265         const __m128 vi6 = _mm_loadu_ps(i6);
6266         i6 += 4;
6267         const __m128 vi7 = _mm_loadu_ps(i7);
6268         i7 += 4;
6269         const __m128 vacc = _mm_load_ps(b);
6270         b += 4;
6271 
6272         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6273         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6274         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6275         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6276         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6277         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6278         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6279         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6280 
6281         __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6282         vout = _mm_max_ps(vout, voutput_min);
6283         vout = _mm_min_ps(vout, voutput_max);
6284 
6285         _mm_storeu_ps(output, vout);
6286         output += 4;
6287 
6288         c -= 4;
6289       }
6290       if (c != 0) {
6291         const __m128 vi0 = _mm_loadu_ps(i0);
6292         const __m128 vi1 = _mm_loadu_ps(i1);
6293         const __m128 vi2 = _mm_loadu_ps(i2);
6294         const __m128 vi3 = _mm_loadu_ps(i3);
6295         const __m128 vi4 = _mm_loadu_ps(i4);
6296         const __m128 vi5 = _mm_loadu_ps(i5);
6297         const __m128 vi6 = _mm_loadu_ps(i6);
6298         const __m128 vi7 = _mm_loadu_ps(i7);
6299         const __m128 vacc = _mm_load_ps(b);
6300 
6301         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6302         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6303         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6304         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6305         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6306         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6307         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6308         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6309 
6310         __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6311         vout = _mm_max_ps(vout, voutput_min);
6312         vout = _mm_min_ps(vout, voutput_max);
6313 
6314         if (c & 2) {
6315           _mm_storel_pi((__m64*) output, vout);
6316           vout = _mm_movehl_ps(vout, vout);
6317           output += 2;
6318         }
6319         if (c & 1) {
6320           _mm_store_ss(output, vout);
6321           output += 1;
6322         }
6323       }
6324     }
6325     output = (float*) ((uintptr_t) output + output_increment);
6326   } while (--output_pixels != 0);
6327 }
6328 
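// Single-pass pixel-average pooling: up to 9 pooling elements per output pixel. Pointers for
// elements beyond kernel_elements are redirected to the zero buffer; the element sum is scaled
// by the per-pixel multiplier and clamped to [output_min, output_max], 4 channels at a time
// with a 1-3 channel tail.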
6329 void xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(
6330     size_t output_pixels,
6331     size_t kernel_elements,
6332     size_t channels,
6333     const float** input,
6334     size_t input_offset,
6335     const float* zero,
6336     const float* multiplier,
6337     float* output,
6338     size_t input_increment,
6339     size_t output_increment,
6340     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6341 {
6342   assert(output_pixels != 0);
6343   assert(kernel_elements != 0);
6344   assert(kernel_elements <= 9);
6345   assert(channels != 0);
6346 
6347   const __m128 voutput_min = _mm_load_ps(params->sse.min);
6348   const __m128 voutput_max = _mm_load_ps(params->sse.max);
6349 
6350   do {
6351     const float* i0 = input[0];
6352     assert(i0 != NULL);
6353     const float* i1 = input[1];
6354     const float* i2 = input[2];
6355     const float* i3 = input[3];
6356     const float* i4 = input[4];
6357     const float* i5 = input[5];
6358     const float* i6 = input[6];
6359     const float* i7 = input[7];
6360     const float* i8 = input[8];
6361     input = (const float**) ((uintptr_t) input + input_increment);
6362     if (kernel_elements < 2) {
6363       i1 = zero;
6364     }
6365     assert(i1 != NULL);
6366     if (kernel_elements <= 2) {
6367       i2 = zero;
6368     }
6369     assert(i2 != NULL);
6370     if (kernel_elements < 4) {
6371       i3 = zero;
6372     }
6373     assert(i3 != NULL);
6374     if (kernel_elements <= 4) {
6375       i4 = zero;
6376     }
6377     assert(i4 != NULL);
6378     if (kernel_elements < 6) {
6379       i5 = zero;
6380     }
6381     assert(i5 != NULL);
6382     if (kernel_elements <= 6) {
6383       i6 = zero;
6384     }
6385     assert(i6 != NULL);
6386     if (kernel_elements < 8) {
6387       i7 = zero;
6388     }
6389     assert(i7 != NULL);
6390     if (kernel_elements <= 8) {
6391       i8 = zero;
6392     }
6393     assert(i8 != NULL);
6394     if XNN_UNPREDICTABLE(i0 != zero) {
6395       i0 = (const float*) ((uintptr_t) i0 + input_offset);
6396     }
6397     if XNN_UNPREDICTABLE(i1 != zero) {
6398       i1 = (const float*) ((uintptr_t) i1 + input_offset);
6399     }
6400     if XNN_UNPREDICTABLE(i2 != zero) {
6401       i2 = (const float*) ((uintptr_t) i2 + input_offset);
6402     }
6403     if XNN_UNPREDICTABLE(i3 != zero) {
6404       i3 = (const float*) ((uintptr_t) i3 + input_offset);
6405     }
6406     if XNN_UNPREDICTABLE(i4 != zero) {
6407       i4 = (const float*) ((uintptr_t) i4 + input_offset);
6408     }
6409     if XNN_UNPREDICTABLE(i5 != zero) {
6410       i5 = (const float*) ((uintptr_t) i5 + input_offset);
6411     }
6412     if XNN_UNPREDICTABLE(i6 != zero) {
6413       i6 = (const float*) ((uintptr_t) i6 + input_offset);
6414     }
6415     if XNN_UNPREDICTABLE(i7 != zero) {
6416       i7 = (const float*) ((uintptr_t) i7 + input_offset);
6417     }
6418     if XNN_UNPREDICTABLE(i8 != zero) {
6419       i8 = (const float*) ((uintptr_t) i8 + input_offset);
6420     }
6421 
6422     const __m128 vmultiplier = _mm_load1_ps(multiplier);
6423     multiplier += 1;
6424 
6425     size_t c = channels;
6426     while (c >= 4) {
6427       const __m128 vi0 = _mm_loadu_ps(i0);
6428       i0 += 4;
6429       const __m128 vi1 = _mm_loadu_ps(i1);
6430       i1 += 4;
6431       const __m128 vi2 = _mm_loadu_ps(i2);
6432       i2 += 4;
6433       const __m128 vi3 = _mm_loadu_ps(i3);
6434       i3 += 4;
6435       const __m128 vi4 = _mm_loadu_ps(i4);
6436       i4 += 4;
6437       const __m128 vi5 = _mm_loadu_ps(i5);
6438       i5 += 4;
6439       const __m128 vi6 = _mm_loadu_ps(i6);
6440       i6 += 4;
6441       const __m128 vi7 = _mm_loadu_ps(i7);
6442       i7 += 4;
6443       const __m128 vi8 = _mm_loadu_ps(i8);
6444       i8 += 4;
6445 
6446       const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
6447       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6448       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6449       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6450 
6451       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6452       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6453       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6454 
6455       __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6456       vout = _mm_max_ps(vout, voutput_min);
6457       vout = _mm_min_ps(vout, voutput_max);
6458 
6459       _mm_storeu_ps(output, vout); output += 4;
6460 
6461       c -= 4;
6462     }
6463     if (c != 0) {
6464       const __m128 vi0 = _mm_loadu_ps(i0);
6465       const __m128 vi1 = _mm_loadu_ps(i1);
6466       const __m128 vi2 = _mm_loadu_ps(i2);
6467       const __m128 vi3 = _mm_loadu_ps(i3);
6468       const __m128 vi4 = _mm_loadu_ps(i4);
6469       const __m128 vi5 = _mm_loadu_ps(i5);
6470       const __m128 vi6 = _mm_loadu_ps(i6);
6471       const __m128 vi7 = _mm_loadu_ps(i7);
6472       const __m128 vi8 = _mm_loadu_ps(i8);
6473 
6474       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6475       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6476       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6477       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6478       const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6479       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6480       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6481       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6482 
6483       __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6484       vout = _mm_max_ps(vout, voutput_min);
6485       vout = _mm_min_ps(vout, voutput_max);
6486 
6487       if (c & 2) {
6488         _mm_storel_pi((__m64*) output, vout);
6489         vout = _mm_movehl_ps(vout, vout);
6490         output += 2;
6491       }
6492       if (c & 1) {
6493         _mm_store_ss(output, vout);
6494         output += 1;
6495       }
6496     }
6497     output = (float*) ((uintptr_t) output + output_increment);
6498   } while (--output_pixels != 0);
6499 }
6500 
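// Max reduction over n bytes of floats: four 4-lane accumulators consume 16 floats per main
// iteration, then 4 at a time, followed by a horizontal reduction and a scalar tail.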
6501 void xnn_f32_rmax_ukernel__sse(
6502     size_t n,
6503     const float* x,
6504     float* y)
6505 {
6506   assert(n != 0);
6507   assert(n % sizeof(float) == 0);
6508 
6509   __m128 vmax0 = _mm_load_ss(x);
6510   vmax0 = _mm_shuffle_ps(vmax0, vmax0, _MM_SHUFFLE(0, 0, 0, 0));
6511   __m128 vmax1 = vmax0;
6512   __m128 vmax2 = vmax0;
6513   __m128 vmax3 = vmax0;
6514   for (; n >= 64; n -= 64) {
6515     const __m128 vx0 = _mm_loadu_ps(x);
6516     const __m128 vx1 = _mm_loadu_ps(x + 4);
6517     const __m128 vx2 = _mm_loadu_ps(x + 8);
6518     const __m128 vx3 = _mm_loadu_ps(x + 12);
6519     x += 16;
6520 
6521     vmax0 = _mm_max_ps(vmax0, vx0);
6522     vmax1 = _mm_max_ps(vmax1, vx1);
6523     vmax2 = _mm_max_ps(vmax2, vx2);
6524     vmax3 = _mm_max_ps(vmax3, vx3);
6525   }
6526   __m128 vmax = _mm_max_ps(_mm_max_ps(vmax0, vmax1), _mm_max_ps(vmax2, vmax3));
6527   for (; n >= 16; n -= 16) {
6528     const __m128 vx = _mm_loadu_ps(x);
6529     vmax = _mm_max_ps(vmax, vx);
6530     x += 4;
6531   }
6532   __m128 vmax_lo = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
6533   vmax_lo = _mm_max_ss(vmax_lo, _mm_shuffle_ps(vmax_lo, vmax_lo, _MM_SHUFFLE(3, 3, 1, 1)));
6534   if XNN_UNLIKELY(n != 0) {
6535     do {
6536       vmax_lo = _mm_max_ss(vmax_lo, _mm_load_ss(x));
6537       x += 1;
6538       n -= 4;
6539     } while (n != 0);
6540   }
6541   _mm_store_ss(y, vmax_lo);
6542 }
6543 
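// Sparse-weights x dense-input multiplication, 32x1 tile: for each of the nc output channels the
// accumulator starts from the leading packed weight value (assumed here to be the per-channel
// bias), each nonzero weight is then multiplied against 32 input values, and the result is
// clamped to [min, max]. Remainders of the mc dimension are handled in 16/8/4/2/1-element blocks.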
6544 void xnn_f32_spmm_minmax_ukernel_32x1__sse(
6545     size_t mc,
6546     size_t nc,
6547     const float*restrict input,
6548     const float*restrict weights,
6549     const int32_t*restrict widx_dmap,
6550     const uint32_t*restrict nidx_nnzmap,
6551     float*restrict output,
6552     size_t output_stride,
6553     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
6554 {
6555   assert(mc != 0);
6556   assert(mc % sizeof(float) == 0);
6557   assert(nc != 0);
6558 
6559   const __m128 vmin = _mm_load_ps(params->sse.min);
6560   const __m128 vmax = _mm_load_ps(params->sse.max);
6561   size_t output_decrement = output_stride * nc - 32 * sizeof(float);
6562   while XNN_LIKELY(mc >= 32 * sizeof(float)) {
6563     const float*restrict w = weights;
6564     const int32_t* dmap = widx_dmap;
6565     const uint32_t* nnzmap = nidx_nnzmap;
6566     size_t n = nc;
6567     do {
6568       uint32_t nnz = *nnzmap++;
6569       __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6570       __m128 vacc4567 = vacc0123;
6571       __m128 vacc89AB = vacc0123;
6572       __m128 vaccCDEF = vacc0123;
6573       __m128 vaccGHIJ = vacc0123;
6574       __m128 vaccKLMN = vacc0123;
6575       __m128 vaccOPQR = vacc0123;
6576       __m128 vaccSTUV = vacc0123;
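      // For each nonzero: broadcast the weight, multiply-accumulate it into all 32 lanes, and
      // advance the input pointer by the signed byte offset taken from the difference map.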
6577       if XNN_LIKELY(nnz != 0) {
6578         do {
6579           const intptr_t diff = *dmap++;
6580           const __m128 vi0123 = _mm_loadu_ps(input);
6581           const __m128 vi4567 = _mm_loadu_ps(input + 4);
6582           const __m128 vi89AB = _mm_loadu_ps(input + 8);
6583           const __m128 viCDEF = _mm_loadu_ps(input + 12);
6584           const __m128 viGHIJ = _mm_loadu_ps(input + 16);
6585           const __m128 viKLMN = _mm_loadu_ps(input + 20);
6586           const __m128 viOPQR = _mm_loadu_ps(input + 24);
6587           const __m128 viSTUV = _mm_loadu_ps(input + 28);
6588           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6589           const __m128 vw = _mm_load1_ps(w); w += 1;
6590           vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6591           vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6592           vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6593           vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6594           vaccGHIJ = _mm_add_ps(vaccGHIJ, _mm_mul_ps(viGHIJ, vw));
6595           vaccKLMN = _mm_add_ps(vaccKLMN, _mm_mul_ps(viKLMN, vw));
6596           vaccOPQR = _mm_add_ps(vaccOPQR, _mm_mul_ps(viOPQR, vw));
6597           vaccSTUV = _mm_add_ps(vaccSTUV, _mm_mul_ps(viSTUV, vw));
6598         } while (--nnz != 0);
6599       }
6600       __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6601       __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6602       __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6603       __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6604       __m128 voutGHIJ = _mm_min_ps(vaccGHIJ, vmax);
6605       __m128 voutKLMN = _mm_min_ps(vaccKLMN, vmax);
6606       __m128 voutOPQR = _mm_min_ps(vaccOPQR, vmax);
6607       __m128 voutSTUV = _mm_min_ps(vaccSTUV, vmax);
6608       vout0123 = _mm_max_ps(vout0123, vmin);
6609       vout4567 = _mm_max_ps(vout4567, vmin);
6610       vout89AB = _mm_max_ps(vout89AB, vmin);
6611       voutCDEF = _mm_max_ps(voutCDEF, vmin);
6612       voutGHIJ = _mm_max_ps(voutGHIJ, vmin);
6613       voutKLMN = _mm_max_ps(voutKLMN, vmin);
6614       voutOPQR = _mm_max_ps(voutOPQR, vmin);
6615       voutSTUV = _mm_max_ps(voutSTUV, vmin);
6616       _mm_storeu_ps(output, vout0123);
6617       _mm_storeu_ps(output + 4, vout4567);
6618       _mm_storeu_ps(output + 8, vout89AB);
6619       _mm_storeu_ps(output + 12, voutCDEF);
6620       _mm_storeu_ps(output + 16, voutGHIJ);
6621       _mm_storeu_ps(output + 20, voutKLMN);
6622       _mm_storeu_ps(output + 24, voutOPQR);
6623       _mm_storeu_ps(output + 28, voutSTUV);
6624       output = (float*restrict) ((uintptr_t) output + output_stride);
6625     } while (--n != 0);
6626     output = (float*restrict) ((uintptr_t) output - output_decrement);
6627     input += 32;
6628     mc -= 32 * sizeof(float);
6629   }
6630   if XNN_UNLIKELY(mc != 0) {
6631     output_decrement += 16 * sizeof(float);
6632     if (mc & (16 * sizeof(float))) {
6633       const float*restrict w = weights;
6634       const int32_t* dmap = widx_dmap;
6635       const uint32_t* nnzmap = nidx_nnzmap;
6636       size_t n = nc;
6637       do {
6638         uint32_t nnz = *nnzmap++;
6639         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6640         __m128 vacc4567 = vacc0123;
6641         __m128 vacc89AB = vacc0123;
6642         __m128 vaccCDEF = vacc0123;
6643         if XNN_LIKELY(nnz != 0) {
6644           do {
6645             const intptr_t diff = *dmap++;
6646             const __m128 vi0123 = _mm_loadu_ps(input);
6647             const __m128 vi4567 = _mm_loadu_ps(input + 4);
6648             const __m128 vi89AB = _mm_loadu_ps(input + 8);
6649             const __m128 viCDEF = _mm_loadu_ps(input + 12);
6650             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6651             const __m128 vw = _mm_load1_ps(w); w += 1;
6652             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6653             vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6654             vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6655             vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6656           } while (--nnz != 0);
6657         }
6658         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6659         __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6660         __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6661         __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6662         vout0123 = _mm_max_ps(vout0123, vmin);
6663         vout4567 = _mm_max_ps(vout4567, vmin);
6664         vout89AB = _mm_max_ps(vout89AB, vmin);
6665         voutCDEF = _mm_max_ps(voutCDEF, vmin);
6666         _mm_storeu_ps(output, vout0123);
6667         _mm_storeu_ps(output + 4, vout4567);
6668         _mm_storeu_ps(output + 8, vout89AB);
6669         _mm_storeu_ps(output + 12, voutCDEF);
6670         output = (float*restrict) ((uintptr_t) output + output_stride);
6671       } while (--n != 0);
6672       output = (float*restrict) ((uintptr_t) output - output_decrement);
6673       input += 16;
6674     }
6675     output_decrement += 8 * sizeof(float);
6676     if (mc & (8 * sizeof(float))) {
6677       const float*restrict w = weights;
6678       const int32_t* dmap = widx_dmap;
6679       const uint32_t* nnzmap = nidx_nnzmap;
6680       size_t n = nc;
6681       do {
6682         uint32_t nnz = *nnzmap++;
6683         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6684         __m128 vacc4567 = vacc0123;
6685         if XNN_LIKELY(nnz != 0) {
6686           do {
6687             const intptr_t diff = *dmap++;
6688             const __m128 vi0123 = _mm_loadu_ps(input);
6689             const __m128 vi4567 = _mm_loadu_ps(input + 4);
6690             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6691             const __m128 vw = _mm_load1_ps(w); w += 1;
6692             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6693             vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6694           } while (--nnz != 0);
6695         }
6696         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6697         __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6698         vout0123 = _mm_max_ps(vout0123, vmin);
6699         vout4567 = _mm_max_ps(vout4567, vmin);
6700         _mm_storeu_ps(output, vout0123);
6701         _mm_storeu_ps(output + 4, vout4567);
6702         output = (float*restrict) ((uintptr_t) output + output_stride);
6703       } while (--n != 0);
6704       output = (float*restrict) ((uintptr_t) output - output_decrement);
6705       input += 8;
6706     }
6707     output_decrement += 4 * sizeof(float);
6708     if (mc & (4 * sizeof(float))) {
6709       const float*restrict w = weights;
6710       const int32_t* dmap = widx_dmap;
6711       const uint32_t* nnzmap = nidx_nnzmap;
6712       size_t n = nc;
6713       do {
6714         uint32_t nnz = *nnzmap++;
6715         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6716         if XNN_LIKELY(nnz != 0) {
6717           do {
6718             const intptr_t diff = *dmap++;
6719             const __m128 vi0123 = _mm_loadu_ps(input);
6720             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6721             const __m128 vw = _mm_load1_ps(w); w += 1;
6722             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6723           } while (--nnz != 0);
6724         }
6725         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6726         vout0123 = _mm_max_ps(vout0123, vmin);
6727         _mm_storeu_ps(output, vout0123);
6728         output = (float*restrict) ((uintptr_t) output + output_stride);
6729       } while (--n != 0);
6730       output = (float*restrict) ((uintptr_t) output - output_decrement);
6731       input += 4;
6732     }
6733     output_decrement += 2 * sizeof(float);
6734     if (mc & (2 * sizeof(float))) {
6735       const float*restrict w = weights;
6736       const int32_t* dmap = widx_dmap;
6737       const uint32_t* nnzmap = nidx_nnzmap;
6738       size_t n = nc;
6739       do {
6740         uint32_t nnz = *nnzmap++;
6741         __m128 vacc01 = _mm_load_ss(w); w += 1;
6742         vacc01 = _mm_unpacklo_ps(vacc01, vacc01);
6743         if XNN_LIKELY(nnz != 0) {
6744           do {
6745             const intptr_t diff = *dmap++;
6746             const __m128 vi01 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) input);
6747             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6748             __m128 vw = _mm_load_ss(w); w += 1;
6749             vw = _mm_unpacklo_ps(vw, vw);
6750             vacc01 = _mm_add_ps(vacc01, _mm_mul_ps(vi01, vw));
6751           } while (--nnz != 0);
6752         }
6753         __m128 vout01 = _mm_min_ps(vacc01, vmax);
6754         vout01 = _mm_max_ps(vout01, vmin);
6755         _mm_storel_pi((__m64*) output, vout01);
6756         output = (float*restrict) ((uintptr_t) output + output_stride);
6757       } while (--n != 0);
6758       output = (float*restrict) ((uintptr_t) output - output_decrement);
6759       input += 2;
6760     }
6761     output_decrement += 1 * sizeof(float);
6762     if (mc & (1 * sizeof(float))) {
6763       const float*restrict w = weights;
6764       const int32_t* dmap = widx_dmap;
6765       const uint32_t* nnzmap = nidx_nnzmap;
6766       size_t n = nc;
6767       do {
6768         uint32_t nnz = *nnzmap++;
6769         __m128 vacc0 = _mm_load_ss(w); w += 1;
6770         if XNN_LIKELY(nnz != 0) {
6771           do {
6772             const intptr_t diff = *dmap++;
6773             const __m128 vi0 = _mm_load_ss(input);
6774             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6775             const __m128 vw = _mm_load_ss(w); w += 1;
6776             vacc0 = _mm_add_ss(vacc0, _mm_mul_ss(vi0, vw));
6777           } while (--nnz != 0);
6778         }
6779         __m128 vout0 = _mm_min_ss(vacc0, vmax);
6780         vout0 = _mm_max_ss(vout0, vmin);
6781         _mm_store_ss(output, vout0);
6782         output = (float*restrict) ((uintptr_t) output + output_stride);
6783       } while (--n != 0);
6784       output = (float*restrict) ((uintptr_t) output - output_decrement);
6785       input += 1;
6786     }
6787   }
6788 }
6789 
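// The element-wise binary kernels below share one structure: 8 floats per main iteration, then
// 4, then a 1-3 element remainder that loads a full vector (permitted by XNN_OOB_READS) and
// stores only the valid low lanes; the *_minmax variants also clamp to params->sse.min/max.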
6790 void xnn_f32_vadd_minmax_ukernel__sse_x8(
6791     size_t n,
6792     const float* a,
6793     const float* b,
6794     float* y,
6795     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6796 {
6797   assert(n != 0);
6798   assert(n % sizeof(float) == 0);
6799   assert(a != NULL);
6800   assert(b != NULL);
6801   assert(y != NULL);
6802 
6803   const __m128 vy_min = _mm_load_ps(params->sse.min);
6804   const __m128 vy_max = _mm_load_ps(params->sse.max);
6805 
6806   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6807     const __m128 va0123 = _mm_loadu_ps(a);
6808     const __m128 va4567 = _mm_loadu_ps(a + 4);
6809     a += 8;
6810 
6811     const __m128 vb0123 = _mm_loadu_ps(b);
6812     const __m128 vb4567 = _mm_loadu_ps(b + 4);
6813     b += 8;
6814 
6815     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6816     __m128 vy4567 = _mm_add_ps(va4567, vb4567);
6817 
6818 
6819     vy0123 = _mm_max_ps(vy0123, vy_min);
6820     vy4567 = _mm_max_ps(vy4567, vy_min);
6821 
6822     vy0123 = _mm_min_ps(vy0123, vy_max);
6823     vy4567 = _mm_min_ps(vy4567, vy_max);
6824 
6825     _mm_storeu_ps(y, vy0123);
6826     _mm_storeu_ps(y + 4, vy4567);
6827     y += 8;
6828   }
6829   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6830     const __m128 va0123 = _mm_loadu_ps(a);
6831     a += 4;
6832 
6833     const __m128 vb0123 = _mm_loadu_ps(b);
6834     b += 4;
6835 
6836     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6837     vy0123 = _mm_max_ps(vy0123, vy_min);
6838     vy0123 = _mm_min_ps(vy0123, vy_max);
6839     _mm_storeu_ps(y, vy0123);
6840     y += 4;
6841   }
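  // 1-3 element remainder: the full-width loads may read past the end of the arrays
  // (XNN_OOB_READS), but only the valid low lanes are written back.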
6842   if XNN_UNLIKELY(n != 0) {
6843     const __m128 va0123 = _mm_loadu_ps(a);
6844     const __m128 vb0123 = _mm_loadu_ps(b);
6845 
6846     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6847     vy0123 = _mm_max_ps(vy0123, vy_min);
6848     vy0123 = _mm_min_ps(vy0123, vy_max);
6849     if (n & (2 * sizeof(float))) {
6850       _mm_storel_pi((__m64*) y, vy0123);
6851       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6852       y += 2;
6853     }
6854     if (n & (1 * sizeof(float))) {
6855       _mm_store_ss(y, vy0123);
6856     }
6857   }
6858 }
6859 
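// The *c ("constant") variants broadcast the single value at *b and apply it to every element of a.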
6860 void xnn_f32_vaddc_minmax_ukernel__sse_x8(
6861     size_t n,
6862     const float* a,
6863     const float* b,
6864     float* y,
6865     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6866 {
6867   assert(n != 0);
6868   assert(n % sizeof(float) == 0);
6869   assert(a != NULL);
6870   assert(b != NULL);
6871   assert(y != NULL);
6872 
6873   const __m128 vy_min = _mm_load_ps(params->sse.min);
6874   const __m128 vy_max = _mm_load_ps(params->sse.max);
6875 
6876   const __m128 vb = _mm_load1_ps(b);
6877   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6878     const __m128 va0123 = _mm_loadu_ps(a);
6879     const __m128 va4567 = _mm_loadu_ps(a + 4);
6880     a += 8;
6881 
6882     __m128 vy0123 = _mm_add_ps(va0123, vb);
6883     __m128 vy4567 = _mm_add_ps(va4567, vb);
6884 
6885 
6886     vy0123 = _mm_max_ps(vy0123, vy_min);
6887     vy4567 = _mm_max_ps(vy4567, vy_min);
6888 
6889     vy0123 = _mm_min_ps(vy0123, vy_max);
6890     vy4567 = _mm_min_ps(vy4567, vy_max);
6891 
6892     _mm_storeu_ps(y, vy0123);
6893     _mm_storeu_ps(y + 4, vy4567);
6894     y += 8;
6895   }
6896   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6897     const __m128 va0123 = _mm_loadu_ps(a);
6898     a += 4;
6899 
6900     __m128 vy0123 = _mm_add_ps(va0123, vb);
6901     vy0123 = _mm_max_ps(vy0123, vy_min);
6902     vy0123 = _mm_min_ps(vy0123, vy_max);
6903     _mm_storeu_ps(y, vy0123);
6904     y += 4;
6905   }
6906   if XNN_UNLIKELY(n != 0) {
6907     const __m128 va0123 = _mm_loadu_ps(a);
6908 
6909     __m128 vy0123 = _mm_add_ps(va0123, vb);
6910     vy0123 = _mm_max_ps(vy0123, vy_min);
6911     vy0123 = _mm_min_ps(vy0123, vy_max);
6912     if (n & (2 * sizeof(float))) {
6913       _mm_storel_pi((__m64*) y, vy0123);
6914       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6915       y += 2;
6916     }
6917     if (n & (1 * sizeof(float))) {
6918       _mm_store_ss(y, vy0123);
6919     }
6920   }
6921 }
6922 
6923 void xnn_f32_vdiv_minmax_ukernel__sse_x8(
6924     size_t n,
6925     const float* a,
6926     const float* b,
6927     float* y,
6928     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6929 {
6930   assert(n != 0);
6931   assert(n % sizeof(float) == 0);
6932   assert(a != NULL);
6933   assert(b != NULL);
6934   assert(y != NULL);
6935 
6936   const __m128 vy_min = _mm_load_ps(params->sse.min);
6937   const __m128 vy_max = _mm_load_ps(params->sse.max);
6938 
6939   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6940     const __m128 va0123 = _mm_loadu_ps(a);
6941     const __m128 va4567 = _mm_loadu_ps(a + 4);
6942     a += 8;
6943 
6944     const __m128 vb0123 = _mm_loadu_ps(b);
6945     const __m128 vb4567 = _mm_loadu_ps(b + 4);
6946     b += 8;
6947 
6948     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6949     __m128 vy4567 = _mm_div_ps(va4567, vb4567);
6950 
6951 
6952     vy0123 = _mm_max_ps(vy0123, vy_min);
6953     vy4567 = _mm_max_ps(vy4567, vy_min);
6954 
6955     vy0123 = _mm_min_ps(vy0123, vy_max);
6956     vy4567 = _mm_min_ps(vy4567, vy_max);
6957 
6958     _mm_storeu_ps(y, vy0123);
6959     _mm_storeu_ps(y + 4, vy4567);
6960     y += 8;
6961   }
6962   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6963     const __m128 va0123 = _mm_loadu_ps(a);
6964     a += 4;
6965 
6966     const __m128 vb0123 = _mm_loadu_ps(b);
6967     b += 4;
6968 
6969     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6970     vy0123 = _mm_max_ps(vy0123, vy_min);
6971     vy0123 = _mm_min_ps(vy0123, vy_max);
6972     _mm_storeu_ps(y, vy0123);
6973     y += 4;
6974   }
6975   if XNN_UNLIKELY(n != 0) {
6976     const __m128 va0123 = _mm_loadu_ps(a);
6977     const __m128 vb0123 = _mm_loadu_ps(b);
6978 
6979     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6980     vy0123 = _mm_max_ps(vy0123, vy_min);
6981     vy0123 = _mm_min_ps(vy0123, vy_max);
6982     if (n & (2 * sizeof(float))) {
6983       _mm_storel_pi((__m64*) y, vy0123);
6984       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6985       y += 2;
6986     }
6987     if (n & (1 * sizeof(float))) {
6988       _mm_store_ss(y, vy0123);
6989     }
6990   }
6991 }
6992 
6993 void xnn_f32_vdivc_minmax_ukernel__sse_x8(
6994     size_t n,
6995     const float* a,
6996     const float* b,
6997     float* y,
6998     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6999 {
7000   assert(n != 0);
7001   assert(n % sizeof(float) == 0);
7002   assert(a != NULL);
7003   assert(b != NULL);
7004   assert(y != NULL);
7005 
7006   const __m128 vy_min = _mm_load_ps(params->sse.min);
7007   const __m128 vy_max = _mm_load_ps(params->sse.max);
7008 
7009   const __m128 vb = _mm_load1_ps(b);
7010   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7011     const __m128 va0123 = _mm_loadu_ps(a);
7012     const __m128 va4567 = _mm_loadu_ps(a + 4);
7013     a += 8;
7014 
7015     __m128 vy0123 = _mm_div_ps(va0123, vb);
7016     __m128 vy4567 = _mm_div_ps(va4567, vb);
7017 
7018 
7019     vy0123 = _mm_max_ps(vy0123, vy_min);
7020     vy4567 = _mm_max_ps(vy4567, vy_min);
7021 
7022     vy0123 = _mm_min_ps(vy0123, vy_max);
7023     vy4567 = _mm_min_ps(vy4567, vy_max);
7024 
7025     _mm_storeu_ps(y, vy0123);
7026     _mm_storeu_ps(y + 4, vy4567);
7027     y += 8;
7028   }
7029   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7030     const __m128 va0123 = _mm_loadu_ps(a);
7031     a += 4;
7032 
7033     __m128 vy0123 = _mm_div_ps(va0123, vb);
7034     vy0123 = _mm_max_ps(vy0123, vy_min);
7035     vy0123 = _mm_min_ps(vy0123, vy_max);
7036     _mm_storeu_ps(y, vy0123);
7037     y += 4;
7038   }
7039   if XNN_UNLIKELY(n != 0) {
7040     const __m128 va0123 = _mm_loadu_ps(a);
7041 
7042     __m128 vy0123 = _mm_div_ps(va0123, vb);
7043     vy0123 = _mm_max_ps(vy0123, vy_min);
7044     vy0123 = _mm_min_ps(vy0123, vy_max);
7045     if (n & (2 * sizeof(float))) {
7046       _mm_storel_pi((__m64*) y, vy0123);
7047       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7048       y += 2;
7049     }
7050     if (n & (1 * sizeof(float))) {
7051       _mm_store_ss(y, vy0123);
7052     }
7053   }
7054 }
7055 
7056 void xnn_f32_vmax_ukernel__sse_x8(
7057     size_t n,
7058     const float* a,
7059     const float* b,
7060     float* y,
7061     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7062 {
7063   assert(n != 0);
7064   assert(n % sizeof(float) == 0);
7065   assert(a != NULL);
7066   assert(b != NULL);
7067   assert(y != NULL);
7068 
7069 
7070   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7071     const __m128 va0123 = _mm_loadu_ps(a);
7072     const __m128 va4567 = _mm_loadu_ps(a + 4);
7073     a += 8;
7074 
7075     const __m128 vb0123 = _mm_loadu_ps(b);
7076     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7077     b += 8;
7078 
7079     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7080     __m128 vy4567 = _mm_max_ps(va4567, vb4567);
7081 
7082 
7083 
7084     _mm_storeu_ps(y, vy0123);
7085     _mm_storeu_ps(y + 4, vy4567);
7086     y += 8;
7087   }
7088   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7089     const __m128 va0123 = _mm_loadu_ps(a);
7090     a += 4;
7091 
7092     const __m128 vb0123 = _mm_loadu_ps(b);
7093     b += 4;
7094 
7095     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7096     _mm_storeu_ps(y, vy0123);
7097     y += 4;
7098   }
7099   if XNN_UNLIKELY(n != 0) {
7100     const __m128 va0123 = _mm_loadu_ps(a);
7101     const __m128 vb0123 = _mm_loadu_ps(b);
7102 
7103     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7104     if (n & (2 * sizeof(float))) {
7105       _mm_storel_pi((__m64*) y, vy0123);
7106       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7107       y += 2;
7108     }
7109     if (n & (1 * sizeof(float))) {
7110       _mm_store_ss(y, vy0123);
7111     }
7112   }
7113 }
7114 
7115 void xnn_f32_vmaxc_ukernel__sse_x8(
7116     size_t n,
7117     const float* a,
7118     const float* b,
7119     float* y,
7120     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7121 {
7122   assert(n != 0);
7123   assert(n % sizeof(float) == 0);
7124   assert(a != NULL);
7125   assert(b != NULL);
7126   assert(y != NULL);
7127 
7128 
7129   const __m128 vb = _mm_load1_ps(b);
7130   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7131     const __m128 va0123 = _mm_loadu_ps(a);
7132     const __m128 va4567 = _mm_loadu_ps(a + 4);
7133     a += 8;
7134 
7135     __m128 vy0123 = _mm_max_ps(va0123, vb);
7136     __m128 vy4567 = _mm_max_ps(va4567, vb);
7137 
7138 
7139 
7140     _mm_storeu_ps(y, vy0123);
7141     _mm_storeu_ps(y + 4, vy4567);
7142     y += 8;
7143   }
7144   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7145     const __m128 va0123 = _mm_loadu_ps(a);
7146     a += 4;
7147 
7148     __m128 vy0123 = _mm_max_ps(va0123, vb);
7149     _mm_storeu_ps(y, vy0123);
7150     y += 4;
7151   }
7152   if XNN_UNLIKELY(n != 0) {
7153     const __m128 va0123 = _mm_loadu_ps(a);
7154 
7155     __m128 vy0123 = _mm_max_ps(va0123, vb);
7156     if (n & (2 * sizeof(float))) {
7157       _mm_storel_pi((__m64*) y, vy0123);
7158       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7159       y += 2;
7160     }
7161     if (n & (1 * sizeof(float))) {
7162       _mm_store_ss(y, vy0123);
7163     }
7164   }
7165 }
7166 
7167 void xnn_f32_vmin_ukernel__sse_x8(
7168     size_t n,
7169     const float* a,
7170     const float* b,
7171     float* y,
7172     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7173 {
7174   assert(n != 0);
7175   assert(n % sizeof(float) == 0);
7176   assert(a != NULL);
7177   assert(b != NULL);
7178   assert(y != NULL);
7179 
7180 
7181   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7182     const __m128 va0123 = _mm_loadu_ps(a);
7183     const __m128 va4567 = _mm_loadu_ps(a + 4);
7184     a += 8;
7185 
7186     const __m128 vb0123 = _mm_loadu_ps(b);
7187     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7188     b += 8;
7189 
7190     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7191     __m128 vy4567 = _mm_min_ps(va4567, vb4567);
7192 
7193 
7194 
7195     _mm_storeu_ps(y, vy0123);
7196     _mm_storeu_ps(y + 4, vy4567);
7197     y += 8;
7198   }
7199   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7200     const __m128 va0123 = _mm_loadu_ps(a);
7201     a += 4;
7202 
7203     const __m128 vb0123 = _mm_loadu_ps(b);
7204     b += 4;
7205 
7206     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7207     _mm_storeu_ps(y, vy0123);
7208     y += 4;
7209   }
7210   if XNN_UNLIKELY(n != 0) {
7211     const __m128 va0123 = _mm_loadu_ps(a);
7212     const __m128 vb0123 = _mm_loadu_ps(b);
7213 
7214     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7215     if (n & (2 * sizeof(float))) {
7216       _mm_storel_pi((__m64*) y, vy0123);
7217       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7218       y += 2;
7219     }
7220     if (n & (1 * sizeof(float))) {
7221       _mm_store_ss(y, vy0123);
7222     }
7223   }
7224 }
7225 
7226 void xnn_f32_vminc_ukernel__sse_x8(
7227     size_t n,
7228     const float* a,
7229     const float* b,
7230     float* y,
7231     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7232 {
7233   assert(n != 0);
7234   assert(n % sizeof(float) == 0);
7235   assert(a != NULL);
7236   assert(b != NULL);
7237   assert(y != NULL);
7238 
7239 
7240   const __m128 vb = _mm_load1_ps(b);
7241   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7242     const __m128 va0123 = _mm_loadu_ps(a);
7243     const __m128 va4567 = _mm_loadu_ps(a + 4);
7244     a += 8;
7245 
7246     __m128 vy0123 = _mm_min_ps(va0123, vb);
7247     __m128 vy4567 = _mm_min_ps(va4567, vb);
7248 
7249 
7250 
7251     _mm_storeu_ps(y, vy0123);
7252     _mm_storeu_ps(y + 4, vy4567);
7253     y += 8;
7254   }
7255   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7256     const __m128 va0123 = _mm_loadu_ps(a);
7257     a += 4;
7258 
7259     __m128 vy0123 = _mm_min_ps(va0123, vb);
7260     _mm_storeu_ps(y, vy0123);
7261     y += 4;
7262   }
7263   if XNN_UNLIKELY(n != 0) {
7264     const __m128 va0123 = _mm_loadu_ps(a);
7265 
7266     __m128 vy0123 = _mm_min_ps(va0123, vb);
7267     if (n & (2 * sizeof(float))) {
7268       _mm_storel_pi((__m64*) y, vy0123);
7269       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7270       y += 2;
7271     }
7272     if (n & (1 * sizeof(float))) {
7273       _mm_store_ss(y, vy0123);
7274     }
7275   }
7276 }
7277 
7278 void xnn_f32_vmul_minmax_ukernel__sse_x8(
7279     size_t n,
7280     const float* a,
7281     const float* b,
7282     float* y,
7283     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7284 {
7285   assert(n != 0);
7286   assert(n % sizeof(float) == 0);
7287   assert(a != NULL);
7288   assert(b != NULL);
7289   assert(y != NULL);
7290 
7291   const __m128 vy_min = _mm_load_ps(params->sse.min);
7292   const __m128 vy_max = _mm_load_ps(params->sse.max);
7293 
7294   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7295     const __m128 va0123 = _mm_loadu_ps(a);
7296     const __m128 va4567 = _mm_loadu_ps(a + 4);
7297     a += 8;
7298 
7299     const __m128 vb0123 = _mm_loadu_ps(b);
7300     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7301     b += 8;
7302 
7303     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7304     __m128 vy4567 = _mm_mul_ps(va4567, vb4567);
7305 
7306 
7307     vy0123 = _mm_max_ps(vy0123, vy_min);
7308     vy4567 = _mm_max_ps(vy4567, vy_min);
7309 
7310     vy0123 = _mm_min_ps(vy0123, vy_max);
7311     vy4567 = _mm_min_ps(vy4567, vy_max);
7312 
7313     _mm_storeu_ps(y, vy0123);
7314     _mm_storeu_ps(y + 4, vy4567);
7315     y += 8;
7316   }
7317   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7318     const __m128 va0123 = _mm_loadu_ps(a);
7319     a += 4;
7320 
7321     const __m128 vb0123 = _mm_loadu_ps(b);
7322     b += 4;
7323 
7324     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7325     vy0123 = _mm_max_ps(vy0123, vy_min);
7326     vy0123 = _mm_min_ps(vy0123, vy_max);
7327     _mm_storeu_ps(y, vy0123);
7328     y += 4;
7329   }
7330   if XNN_UNLIKELY(n != 0) {
7331     const __m128 va0123 = _mm_loadu_ps(a);
7332     const __m128 vb0123 = _mm_loadu_ps(b);
7333 
7334     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7335     vy0123 = _mm_max_ps(vy0123, vy_min);
7336     vy0123 = _mm_min_ps(vy0123, vy_max);
7337     if (n & (2 * sizeof(float))) {
7338       _mm_storel_pi((__m64*) y, vy0123);
7339       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7340       y += 2;
7341     }
7342     if (n & (1 * sizeof(float))) {
7343       _mm_store_ss(y, vy0123);
7344     }
7345   }
7346 }
7347 
7348 void xnn_f32_vmulc_minmax_ukernel__sse_x8(
7349     size_t n,
7350     const float* a,
7351     const float* b,
7352     float* y,
7353     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7354 {
7355   assert(n != 0);
7356   assert(n % sizeof(float) == 0);
7357   assert(a != NULL);
7358   assert(b != NULL);
7359   assert(y != NULL);
7360 
7361   const __m128 vy_min = _mm_load_ps(params->sse.min);
7362   const __m128 vy_max = _mm_load_ps(params->sse.max);
7363 
7364   const __m128 vb = _mm_load1_ps(b);
7365   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7366     const __m128 va0123 = _mm_loadu_ps(a);
7367     const __m128 va4567 = _mm_loadu_ps(a + 4);
7368     a += 8;
7369 
7370     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7371     __m128 vy4567 = _mm_mul_ps(va4567, vb);
7372 
7373 
7374     vy0123 = _mm_max_ps(vy0123, vy_min);
7375     vy4567 = _mm_max_ps(vy4567, vy_min);
7376 
7377     vy0123 = _mm_min_ps(vy0123, vy_max);
7378     vy4567 = _mm_min_ps(vy4567, vy_max);
7379 
7380     _mm_storeu_ps(y, vy0123);
7381     _mm_storeu_ps(y + 4, vy4567);
7382     y += 8;
7383   }
7384   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7385     const __m128 va0123 = _mm_loadu_ps(a);
7386     a += 4;
7387 
7388     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7389     vy0123 = _mm_max_ps(vy0123, vy_min);
7390     vy0123 = _mm_min_ps(vy0123, vy_max);
7391     _mm_storeu_ps(y, vy0123);
7392     y += 4;
7393   }
7394   if XNN_UNLIKELY(n != 0) {
7395     const __m128 va0123 = _mm_loadu_ps(a);
7396 
7397     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7398     vy0123 = _mm_max_ps(vy0123, vy_min);
7399     vy0123 = _mm_min_ps(vy0123, vy_max);
7400     if (n & (2 * sizeof(float))) {
7401       _mm_storel_pi((__m64*) y, vy0123);
7402       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7403       y += 2;
7404     }
7405     if (n & (1 * sizeof(float))) {
7406       _mm_store_ss(y, vy0123);
7407     }
7408   }
7409 }
7410 
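// Reversed-operand variant: y = b / a, with the broadcast scalar b as the dividend.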
7411 void xnn_f32_vrdivc_minmax_ukernel__sse_x8(
7412     size_t n,
7413     const float* a,
7414     const float* b,
7415     float* y,
7416     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7417 {
7418   assert(n != 0);
7419   assert(n % sizeof(float) == 0);
7420   assert(a != NULL);
7421   assert(b != NULL);
7422   assert(y != NULL);
7423 
7424   const __m128 vy_min = _mm_load_ps(params->sse.min);
7425   const __m128 vy_max = _mm_load_ps(params->sse.max);
7426 
7427   const __m128 vb = _mm_load1_ps(b);
7428   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7429     const __m128 va0123 = _mm_loadu_ps(a);
7430     const __m128 va4567 = _mm_loadu_ps(a + 4);
7431     a += 8;
7432 
7433     __m128 vy0123 = _mm_div_ps(vb, va0123);
7434     __m128 vy4567 = _mm_div_ps(vb, va4567);
7435 
7436 
7437     vy0123 = _mm_max_ps(vy0123, vy_min);
7438     vy4567 = _mm_max_ps(vy4567, vy_min);
7439 
7440     vy0123 = _mm_min_ps(vy0123, vy_max);
7441     vy4567 = _mm_min_ps(vy4567, vy_max);
7442 
7443     _mm_storeu_ps(y, vy0123);
7444     _mm_storeu_ps(y + 4, vy4567);
7445     y += 8;
7446   }
7447   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7448     const __m128 va0123 = _mm_loadu_ps(a);
7449     a += 4;
7450 
7451     __m128 vy0123 = _mm_div_ps(vb, va0123);
7452     vy0123 = _mm_max_ps(vy0123, vy_min);
7453     vy0123 = _mm_min_ps(vy0123, vy_max);
7454     _mm_storeu_ps(y, vy0123);
7455     y += 4;
7456   }
7457   if XNN_UNLIKELY(n != 0) {
7458     const __m128 va0123 = _mm_loadu_ps(a);
7459 
7460     __m128 vy0123 = _mm_div_ps(vb, va0123);
7461     vy0123 = _mm_max_ps(vy0123, vy_min);
7462     vy0123 = _mm_min_ps(vy0123, vy_max);
7463     if (n & (2 * sizeof(float))) {
7464       _mm_storel_pi((__m64*) y, vy0123);
7465       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7466       y += 2;
7467     }
7468     if (n & (1 * sizeof(float))) {
7469       _mm_store_ss(y, vy0123);
7470     }
7471   }
7472 }
7473 
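// Reversed-operand variant: y = b - a, with the broadcast scalar b as the minuend.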
7474 void xnn_f32_vrsubc_minmax_ukernel__sse_x8(
7475     size_t n,
7476     const float* a,
7477     const float* b,
7478     float* y,
7479     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7480 {
7481   assert(n != 0);
7482   assert(n % sizeof(float) == 0);
7483   assert(a != NULL);
7484   assert(b != NULL);
7485   assert(y != NULL);
7486 
7487   const __m128 vy_min = _mm_load_ps(params->sse.min);
7488   const __m128 vy_max = _mm_load_ps(params->sse.max);
7489 
7490   const __m128 vb = _mm_load1_ps(b);
7491   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7492     const __m128 va0123 = _mm_loadu_ps(a);
7493     const __m128 va4567 = _mm_loadu_ps(a + 4);
7494     a += 8;
7495 
7496     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7497     __m128 vy4567 = _mm_sub_ps(vb, va4567);
7498 
7499 
7500     vy0123 = _mm_max_ps(vy0123, vy_min);
7501     vy4567 = _mm_max_ps(vy4567, vy_min);
7502 
7503     vy0123 = _mm_min_ps(vy0123, vy_max);
7504     vy4567 = _mm_min_ps(vy4567, vy_max);
7505 
7506     _mm_storeu_ps(y, vy0123);
7507     _mm_storeu_ps(y + 4, vy4567);
7508     y += 8;
7509   }
7510   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7511     const __m128 va0123 = _mm_loadu_ps(a);
7512     a += 4;
7513 
7514     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7515     vy0123 = _mm_max_ps(vy0123, vy_min);
7516     vy0123 = _mm_min_ps(vy0123, vy_max);
7517     _mm_storeu_ps(y, vy0123);
7518     y += 4;
7519   }
7520   if XNN_UNLIKELY(n != 0) {
7521     const __m128 va0123 = _mm_loadu_ps(a);
7522 
7523     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7524     vy0123 = _mm_max_ps(vy0123, vy_min);
7525     vy0123 = _mm_min_ps(vy0123, vy_max);
7526     if (n & (2 * sizeof(float))) {
7527       _mm_storel_pi((__m64*) y, vy0123);
7528       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7529       y += 2;
7530     }
7531     if (n & (1 * sizeof(float))) {
7532       _mm_store_ss(y, vy0123);
7533     }
7534   }
7535 }
7536 
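// Squared difference: y = (a - b) * (a - b), with no min/max clamping.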
7537 void xnn_f32_vsqrdiff_ukernel__sse_x8(
7538     size_t n,
7539     const float* a,
7540     const float* b,
7541     float* y,
7542     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7543 {
7544   assert(n != 0);
7545   assert(n % sizeof(float) == 0);
7546   assert(a != NULL);
7547   assert(b != NULL);
7548   assert(y != NULL);
7549 
7550 
7551   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7552     const __m128 va0123 = _mm_loadu_ps(a);
7553     const __m128 va4567 = _mm_loadu_ps(a + 4);
7554     a += 8;
7555 
7556     const __m128 vb0123 = _mm_loadu_ps(b);
7557     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7558     b += 8;
7559 
7560     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7561     __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7562 
7563     vy0123 = _mm_mul_ps(vy0123, vy0123);
7564     vy4567 = _mm_mul_ps(vy4567, vy4567);
7565 
7566 
7567     _mm_storeu_ps(y, vy0123);
7568     _mm_storeu_ps(y + 4, vy4567);
7569     y += 8;
7570   }
7571   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7572     const __m128 va0123 = _mm_loadu_ps(a);
7573     a += 4;
7574 
7575     const __m128 vb0123 = _mm_loadu_ps(b);
7576     b += 4;
7577 
7578     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7579     vy0123 = _mm_mul_ps(vy0123, vy0123);
7580     _mm_storeu_ps(y, vy0123);
7581     y += 4;
7582   }
7583   if XNN_UNLIKELY(n != 0) {
7584     const __m128 va0123 = _mm_loadu_ps(a);
7585     const __m128 vb0123 = _mm_loadu_ps(b);
7586 
7587     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7588     vy0123 = _mm_mul_ps(vy0123, vy0123);
7589     if (n & (2 * sizeof(float))) {
7590       _mm_storel_pi((__m64*) y, vy0123);
7591       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7592       y += 2;
7593     }
7594     if (n & (1 * sizeof(float))) {
7595       _mm_store_ss(y, vy0123);
7596     }
7597   }
7598 }
7599 
7600 void xnn_f32_vsqrdiffc_ukernel__sse_x8(
7601     size_t n,
7602     const float* a,
7603     const float* b,
7604     float* y,
7605     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7606 {
7607   assert(n != 0);
7608   assert(n % sizeof(float) == 0);
7609   assert(a != NULL);
7610   assert(b != NULL);
7611   assert(y != NULL);
7612 
7613 
7614   const __m128 vb = _mm_load1_ps(b);
7615   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7616     const __m128 va0123 = _mm_loadu_ps(a);
7617     const __m128 va4567 = _mm_loadu_ps(a + 4);
7618     a += 8;
7619 
7620     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7621     __m128 vy4567 = _mm_sub_ps(va4567, vb);
7622 
7623     vy0123 = _mm_mul_ps(vy0123, vy0123);
7624     vy4567 = _mm_mul_ps(vy4567, vy4567);
7625 
7626 
7627     _mm_storeu_ps(y, vy0123);
7628     _mm_storeu_ps(y + 4, vy4567);
7629     y += 8;
7630   }
7631   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7632     const __m128 va0123 = _mm_loadu_ps(a);
7633     a += 4;
7634 
7635     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7636     vy0123 = _mm_mul_ps(vy0123, vy0123);
7637     _mm_storeu_ps(y, vy0123);
7638     y += 4;
7639   }
7640   if XNN_UNLIKELY(n != 0) {
7641     const __m128 va0123 = _mm_loadu_ps(a);
7642 
7643     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7644     vy0123 = _mm_mul_ps(vy0123, vy0123);
7645     if (n & (2 * sizeof(float))) {
7646       _mm_storel_pi((__m64*) y, vy0123);
7647       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7648       y += 2;
7649     }
7650     if (n & (1 * sizeof(float))) {
7651       _mm_store_ss(y, vy0123);
7652     }
7653   }
7654 }
7655 
7656 void xnn_f32_vsub_minmax_ukernel__sse_x8(
7657     size_t n,
7658     const float* a,
7659     const float* b,
7660     float* y,
7661     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7662 {
7663   assert(n != 0);
7664   assert(n % sizeof(float) == 0);
7665   assert(a != NULL);
7666   assert(b != NULL);
7667   assert(y != NULL);
7668 
7669   const __m128 vy_min = _mm_load_ps(params->sse.min);
7670   const __m128 vy_max = _mm_load_ps(params->sse.max);
7671 
7672   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7673     const __m128 va0123 = _mm_loadu_ps(a);
7674     const __m128 va4567 = _mm_loadu_ps(a + 4);
7675     a += 8;
7676 
7677     const __m128 vb0123 = _mm_loadu_ps(b);
7678     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7679     b += 8;
7680 
7681     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7682     __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7683 
7684 
7685     vy0123 = _mm_max_ps(vy0123, vy_min);
7686     vy4567 = _mm_max_ps(vy4567, vy_min);
7687 
7688     vy0123 = _mm_min_ps(vy0123, vy_max);
7689     vy4567 = _mm_min_ps(vy4567, vy_max);
7690 
7691     _mm_storeu_ps(y, vy0123);
7692     _mm_storeu_ps(y + 4, vy4567);
7693     y += 8;
7694   }
7695   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7696     const __m128 va0123 = _mm_loadu_ps(a);
7697     a += 4;
7698 
7699     const __m128 vb0123 = _mm_loadu_ps(b);
7700     b += 4;
7701 
7702     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7703     vy0123 = _mm_max_ps(vy0123, vy_min);
7704     vy0123 = _mm_min_ps(vy0123, vy_max);
7705     _mm_storeu_ps(y, vy0123);
7706     y += 4;
7707   }
7708   if XNN_UNLIKELY(n != 0) {
7709     const __m128 va0123 = _mm_loadu_ps(a);
7710     const __m128 vb0123 = _mm_loadu_ps(b);
7711 
7712     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7713     vy0123 = _mm_max_ps(vy0123, vy_min);
7714     vy0123 = _mm_min_ps(vy0123, vy_max);
7715     if (n & (2 * sizeof(float))) {
7716       _mm_storel_pi((__m64*) y, vy0123);
7717       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7718       y += 2;
7719     }
7720     if (n & (1 * sizeof(float))) {
7721       _mm_store_ss(y, vy0123);
7722     }
7723   }
7724 }
7725 
7726 void xnn_f32_vsubc_minmax_ukernel__sse_x8(
7727     size_t n,
7728     const float* a,
7729     const float* b,
7730     float* y,
7731     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7732 {
7733   assert(n != 0);
7734   assert(n % sizeof(float) == 0);
7735   assert(a != NULL);
7736   assert(b != NULL);
7737   assert(y != NULL);
7738 
7739   const __m128 vy_min = _mm_load_ps(params->sse.min);
7740   const __m128 vy_max = _mm_load_ps(params->sse.max);
7741 
7742   const __m128 vb = _mm_load1_ps(b);
7743   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7744     const __m128 va0123 = _mm_loadu_ps(a);
7745     const __m128 va4567 = _mm_loadu_ps(a + 4);
7746     a += 8;
7747 
7748     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7749     __m128 vy4567 = _mm_sub_ps(va4567, vb);
7750 
7751 
7752     vy0123 = _mm_max_ps(vy0123, vy_min);
7753     vy4567 = _mm_max_ps(vy4567, vy_min);
7754 
7755     vy0123 = _mm_min_ps(vy0123, vy_max);
7756     vy4567 = _mm_min_ps(vy4567, vy_max);
7757 
7758     _mm_storeu_ps(y, vy0123);
7759     _mm_storeu_ps(y + 4, vy4567);
7760     y += 8;
7761   }
7762   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7763     const __m128 va0123 = _mm_loadu_ps(a);
7764     a += 4;
7765 
7766     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7767     vy0123 = _mm_max_ps(vy0123, vy_min);
7768     vy0123 = _mm_min_ps(vy0123, vy_max);
7769     _mm_storeu_ps(y, vy0123);
7770     y += 4;
7771   }
7772   if XNN_UNLIKELY(n != 0) {
7773     const __m128 va0123 = _mm_loadu_ps(a);
7774 
7775     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7776     vy0123 = _mm_max_ps(vy0123, vy_min);
7777     vy0123 = _mm_min_ps(vy0123, vy_max);
7778     if (n & (2 * sizeof(float))) {
7779       _mm_storel_pi((__m64*) y, vy0123);
7780       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7781       y += 2;
7782     }
7783     if (n & (1 * sizeof(float))) {
7784       _mm_store_ss(y, vy0123);
7785     }
7786   }
7787 }
7788 
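// Clamp: y = min(max(x, output_min), output_max).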
7789 void xnn_f32_vclamp_ukernel__sse_x8(
7790     size_t n,
7791     const float* x,
7792     float* y,
7793     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7794 {
7795   assert(n != 0);
7796   assert(n % sizeof(float) == 0);
7797   assert(x != NULL);
7798   assert(y != NULL);
7799 
7800   const __m128 vy_min = _mm_load_ps(params->sse.min);
7801   const __m128 vy_max = _mm_load_ps(params->sse.max);
7802 
7803   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7804     __m128 vacc0123 = _mm_loadu_ps(x);
7805     __m128 vacc4567 = _mm_loadu_ps(x + 4);
7806     x += 8;
7807 
7808     vacc0123 = _mm_max_ps(vacc0123, vy_min);
7809     vacc4567 = _mm_max_ps(vacc4567, vy_min);
7810 
7811     vacc0123 = _mm_min_ps(vacc0123, vy_max);
7812     vacc4567 = _mm_min_ps(vacc4567, vy_max);
7813 
7814     _mm_storeu_ps(y, vacc0123);
7815     _mm_storeu_ps(y + 4, vacc4567);
7816     y += 8;
7817   }
7818   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7819     __m128 vacc = _mm_loadu_ps(x);
7820     x += 4;
7821 
7822     vacc = _mm_max_ps(vacc, vy_min);
7823     vacc = _mm_min_ps(vacc, vy_max);
7824 
7825     _mm_storeu_ps(y, vacc);
7826     y += 4;
7827   }
7828   if XNN_UNLIKELY(n != 0) {
7829     __m128 vacc = _mm_loadu_ps(x);
7830     vacc = _mm_max_ps(vacc, vy_min);
7831     vacc = _mm_min_ps(vacc, vy_max);
7832 
7833     if (n & (2 * sizeof(float))) {
7834       _mm_storel_pi((__m64*) y, vacc);
7835       vacc = _mm_movehl_ps(vacc, vacc);
7836       y += 2;
7837     }
7838     if (n & (1 * sizeof(float))) {
7839       _mm_store_ss(y, vacc);
7840     }
7841   }
7842 }
7843 
7844 void xnn_f32_vhswish_ukernel__sse_x8(
7845     size_t n,
7846     const float* x,
7847     float* y,
7848     const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7849 {
7850   assert(n != 0);
7851   assert(n % sizeof(float) == 0);
7852 
7853   const __m128 vsixth = _mm_load_ps(params->sse.sixth);
7854   const __m128 vhalf = _mm_load_ps(params->sse.half);
7855   const __m128 vone = _mm_load_ps(params->sse.one);
7856   const __m128 vzero = _mm_setzero_ps();
7857 
7858   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7859     const __m128 vx0123 = _mm_loadu_ps(x);
7860     const __m128 vx4567 = _mm_loadu_ps(x + 4);
7861     x += 8;
7862 
7863     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7864     __m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);
7865 
7866     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7867     vacc4567 = _mm_add_ps(vacc4567, vhalf);
7868 
7869     vacc0123 = _mm_max_ps(vacc0123, vzero);
7870     vacc4567 = _mm_max_ps(vacc4567, vzero);
7871 
7872     vacc0123 = _mm_min_ps(vacc0123, vone);
7873     vacc4567 = _mm_min_ps(vacc4567, vone);
7874 
7875     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7876     vacc4567 = _mm_mul_ps(vacc4567, vx4567);
7877 
7878     _mm_storeu_ps(y, vacc0123);
7879     _mm_storeu_ps(y + 4, vacc4567);
7880     y += 8;
7881   }
7882   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7883     const __m128 vx0123 = _mm_loadu_ps(x);
7884     x += 4;
7885     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7886     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7887     vacc0123 = _mm_max_ps(vacc0123, vzero);
7888     vacc0123 = _mm_min_ps(vacc0123, vone);
7889     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7890     _mm_storeu_ps(y, vacc0123);
7891     y += 4;
7892   }
7893   if XNN_UNLIKELY(n != 0) {
7894     const __m128 vx0123 = _mm_loadu_ps(x);
7895     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7896     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7897     vacc0123 = _mm_max_ps(vacc0123, vzero);
7898     vacc0123 = _mm_min_ps(vacc0123, vone);
7899     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7900 
7901     if (n & (2 * sizeof(float))) {
7902       _mm_storel_pi((__m64*) y, vacc0123);
7903       vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
7904       y += 2;
7905     }
7906     if (n & (1 * sizeof(float))) {
7907       _mm_store_ss(y, vacc0123);
7908     }
7909   }
7910 }
7911 
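/*
 * Illustrative sketch, not part of the original XNNPACK source: a scalar
 * reference for the hard-swish computed by the SSE kernel above, with literal
 * constants in place of the prepared params->sse.sixth/half/one vectors:
 *   hswish(x) = x * min(max(x/6 + 1/2, 0), 1)
 */
static inline float example_hswish_ref(float x)
{
  float t = x * 0.16666667f + 0.5f;  // x/6 + 1/2
  if (t < 0.0f) t = 0.0f;            // clamp to [0, 1], as the kernel does
  if (t > 1.0f) t = 1.0f;            // with _mm_max_ps/_mm_min_ps
  return x * t;
}
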
7912 void xnn_f32_vlrelu_ukernel__sse_x8(
7913     size_t n,
7914     const float* x,
7915     float* y,
7916     const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7917 {
7918   assert(n != 0);
7919   assert(n % sizeof(float) == 0);
7920 
7921   const __m128 vslope = _mm_load_ps(params->sse.slope);
7922   const __m128 vzero = _mm_setzero_ps();
7923   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7924     __m128 vx0123 = _mm_loadu_ps(x);
7925     __m128 vx4567 = _mm_loadu_ps(x + 4);
7926     x += 8;
7927 
7928     __m128 vacc0123 = _mm_max_ps(_mm_setzero_ps(), vx0123);
7929     vx0123 = _mm_min_ps(vx0123, vzero);
7930     __m128 vacc4567 = _mm_max_ps(_mm_setzero_ps(), vx4567);
7931     vx4567 = _mm_min_ps(vx4567, vzero);
7932 
7933     vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vx0123, vslope));
7934     vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vx4567, vslope));
7935 
7936     _mm_storeu_ps(y, vacc0123);
7937     _mm_storeu_ps(y + 4, vacc4567);
7938     y += 8;
7939   }
7940   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7941     __m128 vx = _mm_loadu_ps(x);
7942     x += 4;
7943 
7944     __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7945     vx = _mm_min_ps(vx, vzero);
7946     vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7947 
7948     _mm_storeu_ps(y, vacc);
7949     y += 4;
7950   }
7951   if XNN_UNLIKELY(n != 0) {
7952     __m128 vx = _mm_loadu_ps(x);
7953 
7954     __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7955     vx = _mm_min_ps(vx, vzero);
7956     vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7957 
7958     if (n & (2 * sizeof(float))) {
7959       _mm_storel_pi((__m64*) y, vacc);
7960       vacc = _mm_movehl_ps(vacc, vacc);
7961       y += 2;
7962     }
7963     if (n & (1 * sizeof(float))) {
7964       _mm_store_ss(y, vacc);
7965     }
7966   }
7967 }
7968 
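/*
 * Illustrative sketch, not part of the original XNNPACK source: the branch-free
 * leaky-ReLU decomposition used by the kernel above. Splitting x into its
 * positive and negative parts avoids a per-lane select:
 *   lrelu(x) = max(x, 0) + slope * min(x, 0)
 */
static inline float example_lrelu_ref(float x, float slope)
{
  const float pos = x > 0.0f ? x : 0.0f;  // _mm_max_ps(0, x)
  const float neg = x < 0.0f ? x : 0.0f;  // _mm_min_ps(x, 0)
  return pos + slope * neg;
}
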
7969 void xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(
7970     size_t rows,
7971     size_t channels,
7972     const float*restrict input,
7973     size_t input_stride,
7974     const float*restrict weights,
7975     float*restrict output,
7976     size_t output_stride,
7977     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7978 {
7979   assert(rows != 0);
7980   assert(channels != 0);
7981   assert(channels % sizeof(float) == 0);
7982 
7983   const float* i0 = input;
7984   float* o0 = output;
7985   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
7986   float* o1 = (float*) ((uintptr_t) o0 + output_stride);
7987 
7988   const size_t input_increment = input_stride * 2 - channels;
7989   const size_t output_increment = output_stride * 2 - channels;
7990 
7991   const __m128 vmin = _mm_load_ps(params->sse.min);
7992   const __m128 vmax = _mm_load_ps(params->sse.max);
7993   do {
7994     if XNN_UNPREDICTABLE(rows < 2) {
7995       i1 = i0;
7996       o1 = o0;
7997     }
7998 
7999     const float* w = weights;
8000     size_t c = channels;
8001     for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
8002       const __m128 vscale0123 = _mm_load_ps(w);
8003 
8004       __m128 vacc0x0123 = _mm_loadu_ps(i0);
8005       i0 += 4;
8006       __m128 vacc1x0123 = _mm_loadu_ps(i1);
8007       i1 += 4;
8008 
8009       vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8010       vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8011 
8012       const __m128 vbias0123 = _mm_load_ps(w + 4);
8013 
8014       vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8015       vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8016 
8017       vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8018       vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8019 
8020       vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8021       vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8022 
8023       _mm_storeu_ps(o0, vacc0x0123);
8024       o0 += 4;
8025       _mm_storeu_ps(o1, vacc1x0123);
8026       o1 += 4;
8027 
8028       w += 8;
8029     }
8030     if XNN_UNLIKELY(c != 0) {
8031       const __m128 vscale0123 = _mm_load_ps(w);
8032 
8033       __m128 vacc0x0123 = _mm_loadu_ps(i0);
8034       i0 = (const float*) ((uintptr_t) i0 + c);
8035       __m128 vacc1x0123 = _mm_loadu_ps(i1);
8036       i1 = (const float*) ((uintptr_t) i1 + c);
8037 
8038       vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8039       vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8040 
8041       const __m128 vbias0123 = _mm_load_ps(w + 4);
8042 
8043       vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8044       vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8045 
8046       vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8047       vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8048 
8049       vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8050       vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8051 
8052       if (c & (2 * sizeof(float))) {
8053         _mm_storel_pi((__m64*) o0, vacc0x0123);
8054         _mm_storel_pi((__m64*) o1, vacc1x0123);
8055 
8056         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
8057         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
8058 
8059         o0 += 2;
8060         o1 += 2;
8061       }
8062       if (c & (1 * sizeof(float))) {
8063         _mm_store_ss(o0, vacc0x0123);
8064         _mm_store_ss(o1, vacc1x0123);
8065 
8066         o0 += 1;
8067         o1 += 1;
8068       }
8069     }
8070     i0 = (const float*) ((uintptr_t) i0 + input_increment);
8071     o0 = (float*) ((uintptr_t) o0 + output_increment);
8072     i1 = (const float*) ((uintptr_t) i1 + input_increment);
8073     o1 = (float*) ((uintptr_t) o1 + output_increment);
8074     rows = doz(rows, 2);
8075   } while (rows != 0);
8076 }
8077 
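/*
 * Illustrative sketch, not part of the original XNNPACK source: the weight
 * layout the vmulcaddc kernel above consumes. For every group of 4 channels it
 * reads 4 scales (w + 0) followed by 4 biases (w + 4) and then advances w by 8,
 * i.e. weights are interleaved as [s0 s1 s2 s3 b0 b1 b2 b3 | s4 s5 s6 s7 ...].
 * The packing helper below is hypothetical and only demonstrates that layout
 * for a channel count that is a multiple of 4.
 */
static inline void example_pack_mulcaddc_weights(
    size_t channels, const float* scale, const float* bias, float* packed)
{
  for (size_t c = 0; c < channels; c += 4) {
    for (size_t i = 0; i < 4; i++) {
      packed[2 * c + i] = scale[c + i];     // 4 scales first
      packed[2 * c + 4 + i] = bias[c + i];  // then the 4 matching biases
    }
  }
}
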
8078 void xnn_f32_vsqrt_ukernel__sse_sqrt_x4(
8079     size_t n,
8080     const float* x,
8081     float* y,
8082     const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8083 {
8084   assert(n != 0);
8085   assert(n % sizeof(float) == 0);
8086 
8087   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8088     const __m128 vx = _mm_loadu_ps(x);
8089     x += 4;
8090     const __m128 vy = _mm_sqrt_ps(vx);
8091     _mm_storeu_ps(y, vy);
8092     y += 4;
8093   }
8094   if XNN_UNLIKELY(n != 0) {
8095     const __m128 vx = _mm_loadu_ps(x);
8096     __m128 vy = _mm_sqrt_ps(vx);
8097     if (n & (2 * sizeof(float))) {
8098       _mm_storel_pi((__m64*) y, vy);
8099       vy = _mm_movehl_ps(vy, vy);
8100       y += 2;
8101     }
8102     if (n & (1 * sizeof(float))) {
8103       _mm_store_ss(y, vy);
8104     }
8105   }
8106 }
8107 
8108 void xnn_f32_vabs_ukernel__sse_x8(
8109     size_t n,
8110     const float* x,
8111     float* y,
8112     const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8113 {
8114   assert(n != 0);
8115   assert(n % sizeof(float) == 0);
8116   assert(x != NULL);
8117   assert(y != NULL);
8118 
8119   const __m128 vnonsign_mask = _mm_load_ps(params->sse.nonsign_mask);
8120   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8121     const __m128 vx0123 = _mm_loadu_ps(x);
8122     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8123     x += 8;
8124 
8125     const __m128 vy0123 = _mm_and_ps(vx0123, vnonsign_mask);
8126     const __m128 vy4567 = _mm_and_ps(vx4567, vnonsign_mask);
8127 
8128     _mm_storeu_ps(y, vy0123);
8129     _mm_storeu_ps(y + 4, vy4567);
8130     y += 8;
8131   }
8132   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8133     const __m128 vx = _mm_loadu_ps(x);
8134     x += 4;
8135     const __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8136     _mm_storeu_ps(y, vy);
8137     y += 4;
8138   }
8139   if XNN_UNLIKELY(n != 0) {
8140     const __m128 vx = _mm_loadu_ps(x);
8141     __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8142     if (n & (2 * sizeof(float))) {
8143       _mm_storel_pi((__m64*) y, vy);
8144       vy = _mm_movehl_ps(vy, vy);
8145       y += 2;
8146     }
8147     if (n & (1 * sizeof(float))) {
8148       _mm_store_ss(y, vy);
8149     }
8150   }
8151 }
8152 
8153 void xnn_f32_vneg_ukernel__sse_x8(
8154     size_t n,
8155     const float* x,
8156     float* y,
8157     const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8158 {
8159   assert(n != 0);
8160   assert(n % sizeof(float) == 0);
8161   assert(x != NULL);
8162   assert(y != NULL);
8163 
8164   const __m128 vsign_mask = _mm_load_ps(params->sse.sign_mask);
8165   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8166     const __m128 vx0123 = _mm_loadu_ps(x);
8167     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8168     x += 8;
8169 
8170     const __m128 vy0123 = _mm_xor_ps(vx0123, vsign_mask);
8171     const __m128 vy4567 = _mm_xor_ps(vx4567, vsign_mask);
8172 
8173     _mm_storeu_ps(y, vy0123);
8174     _mm_storeu_ps(y + 4, vy4567);
8175     y += 8;
8176   }
8177   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8178     const __m128 vx = _mm_loadu_ps(x);
8179     x += 4;
8180     const __m128 vy = _mm_xor_ps(vx, vsign_mask);
8181     _mm_storeu_ps(y, vy);
8182     y += 4;
8183   }
8184   if XNN_UNLIKELY(n != 0) {
8185     const __m128 vx = _mm_loadu_ps(x);
8186     __m128 vy = _mm_xor_ps(vx, vsign_mask);
8187     if (n & (2 * sizeof(float))) {
8188       _mm_storel_pi((__m64*) y, vy);
8189       vy = _mm_movehl_ps(vy, vy);
8190       y += 2;
8191     }
8192     if (n & (1 * sizeof(float))) {
8193       _mm_store_ss(y, vy);
8194     }
8195   }
8196 }
8197 
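/*
 * Illustrative sketch, not part of the original XNNPACK source: the sign-bit
 * masks used by the vabs/vneg kernels above. params->sse.nonsign_mask holds
 * 0x7FFFFFFF per lane and params->sse.sign_mask holds 0x80000000 per lane; the
 * same masks can be derived from -0.0f, as shown here.
 */
static inline __m128 example_abs_ps(__m128 x)
{
  // andnot(sign_mask, x) clears the sign bit and keeps the low 31 bits
  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
}

static inline __m128 example_neg_ps(__m128 x)
{
  // xor with the sign mask flips the sign bit
  return _mm_xor_ps(x, _mm_set1_ps(-0.0f));
}
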
8198 void xnn_f32_vsqr_ukernel__sse_x8(
8199     size_t n,
8200     const float* x,
8201     float* y,
8202     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8203 {
8204   assert(n != 0);
8205   assert(n % sizeof(float) == 0);
8206   assert(x != NULL);
8207   assert(y != NULL);
8208 
8209   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8210     const __m128 vx0123 = _mm_loadu_ps(x);
8211     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8212     x += 8;
8213 
8214     const __m128 vy0123 = _mm_mul_ps(vx0123, vx0123);
8215     const __m128 vy4567 = _mm_mul_ps(vx4567, vx4567);
8216 
8217     _mm_storeu_ps(y, vy0123);
8218     _mm_storeu_ps(y + 4, vy4567);
8219     y += 8;
8220   }
8221   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8222     const __m128 vx = _mm_loadu_ps(x);
8223     x += 4;
8224     const __m128 vy = _mm_mul_ps(vx, vx);
8225     _mm_storeu_ps(y, vy);
8226     y += 4;
8227   }
8228   if XNN_UNLIKELY(n != 0) {
8229     const __m128 vx = _mm_loadu_ps(x);
8230     __m128 vy = _mm_mul_ps(vx, vx);
8231     if (n & (2 * sizeof(float))) {
8232       _mm_storel_pi((__m64*) y, vy);
8233       vy = _mm_movehl_ps(vy, vy);
8234       y += 2;
8235     }
8236     if (n & (1 * sizeof(float))) {
8237       _mm_store_ss(y, vy);
8238     }
8239   }
8240 }
8241 
8242 void xnn_x32_packx_ukernel_4x__sse(
8243     size_t m,
8244     size_t k,
8245     const uint32_t* restrict x,
8246     size_t x_stride,
8247     uint32_t* restrict y)
8248 {
8249   assert(m != 0);
8250   assert(k != 0);
8251 
8252   const float* x0 = (const float*) x;
8253   const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
8254   if (m < 2) {
8255     x1 = x0;
8256   }
8257   const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
8258   if (m <= 2) {
8259     x2 = x1;
8260   }
8261   const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
8262   if (m != 4) {
8263     x3 = x2;
8264   }
8265 
8266   float*restrict y_f32 = (float*) y;
8267 
8268   for (; k >= 4; k -= 4) {
8269     const __m128 vx0 = _mm_loadu_ps(x0);
8270     x0 += 4;
8271     const __m128 vx1 = _mm_loadu_ps(x1);
8272     x1 += 4;
8273     const __m128 vx2 = _mm_loadu_ps(x2);
8274     x2 += 4;
8275     const __m128 vx3 = _mm_loadu_ps(x3);
8276     x3 += 4;
8277 
8278     const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
8279     const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
8280     const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
8281     const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
8282 
8283     const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
8284     _mm_store_ps(y_f32, vy0);
8285 
8286     const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
8287     _mm_store_ps(y_f32 + 4, vy1);
8288 
8289     const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
8290     _mm_store_ps(y_f32 + 8, vy2);
8291 
8292     const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
8293     _mm_store_ps(y_f32 + 12, vy3);
8294 
8295     y_f32 += 16;
8296   }
8297   if XNN_UNLIKELY(k != 0) {
8298     do {
8299       const __m128 vx0 = _mm_load_ss(x0);
8300       x0 += 1;
8301       const __m128 vx1 = _mm_load_ss(x1);
8302       x1 += 1;
8303       const __m128 vx2 = _mm_load_ss(x2);
8304       x2 += 1;
8305       const __m128 vx3 = _mm_load_ss(x3);
8306       x3 += 1;
8307 
8308       const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
8309       const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
8310       const __m128 vy = _mm_movelh_ps(vx01, vx23);
8311 
8312       _mm_store_ps(y_f32, vy);
8313       y_f32 += 4;
8314     } while (--k != 0);
8315   }
8316 }
8317 
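/*
 * Illustrative sketch, not part of the original XNNPACK source: a scalar
 * reference for what the 4x packing kernel above produces. Up to 4 rows are
 * interleaved column by column, so each group of 4 consecutive outputs holds
 * x0[j], x1[j], x2[j], x3[j]; rows beyond m repeat the last valid row,
 * mirroring the pointer aliasing (x1 = x0, etc.) in the kernel. The function
 * name and exact signature are invented for this example.
 */
static inline void example_packx_4x_ref(
    size_t m, size_t k, const float* x, size_t x_stride_bytes, float* y)
{
  for (size_t j = 0; j < k; j++) {
    for (size_t i = 0; i < 4; i++) {
      const size_t row = i < m ? i : m - 1;
      const float* row_ptr = (const float*) ((uintptr_t) x + row * x_stride_bytes);
      *y++ = row_ptr[j];  // column j of rows 0..3, interleaved
    }
  }
}
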
8318 void xnn_x32_transposec_ukernel__4x4_sse(
8319     const uint32_t* input,
8320     uint32_t* output,
8321     size_t input_stride,
8322     size_t output_stride,
8323     size_t block_width,
8324     size_t block_height) XNN_OOB_READS
8325 {
8326   assert(output_stride >= block_height * sizeof(uint32_t));
8327   assert(input_stride >= block_width * sizeof(uint32_t));
8328 
8329   const size_t tile_height = 4;
8330   const size_t tile_width = 4;
8331   const size_t tile_wbytes = tile_width * sizeof(float);
8332   const size_t input_vreset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
8333   const size_t output_vreset = tile_height * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
8334   const size_t input_offset = tile_height * input_stride;
8335 
8336   const float* i0 = (const float*) input;
8337   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
8338   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
8339   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
8340 
8341   float* o0 = (float*) output;
8342   float* o1 = (float*) ((uintptr_t) o0 + output_stride);
8343   float* o2 = (float*) ((uintptr_t) o1 + output_stride);
8344   float* o3 = (float*) ((uintptr_t) o2 + output_stride);
8345 
8346   do {
8347     if XNN_UNPREDICTABLE(block_width < 2) {
8348       o1 = o0;
8349     }
8350     if XNN_UNPREDICTABLE(block_width <= 2) {
8351       o2 = o0;
8352     }
8353     if XNN_UNPREDICTABLE(block_width < 4) {
8354       o3 = o0;
8355     }
8356     size_t bh = block_height;
8357     for (; bh >= 4; bh -= 4) {
8358       __m128 v0 = _mm_loadu_ps(i0);
8359       i0 = (const float*) ((uintptr_t) i0 + input_offset);
8360       __m128 v1 = _mm_loadu_ps(i1);
8361       i1 = (const float*) ((uintptr_t) i1 + input_offset);
8362       __m128 v2 = _mm_loadu_ps(i2);
8363       i2 = (const float*) ((uintptr_t) i2 + input_offset);
8364       __m128 v3 = _mm_loadu_ps(i3);
8365       i3 = (const float*) ((uintptr_t) i3 + input_offset);
8366 
8367       _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8368 
8369       _mm_storeu_ps(o3, v3);
8370       o3 = (float*) ((uintptr_t) o3 + tile_wbytes);
8371       _mm_storeu_ps(o2, v2);
8372       o2 = (float*) ((uintptr_t) o2 + tile_wbytes);
8373       _mm_storeu_ps(o1, v1);
8374       o1 = (float*) ((uintptr_t) o1 + tile_wbytes);
8375       _mm_storeu_ps(o0, v0);
8376       o0 = (float*) ((uintptr_t) o0 + tile_wbytes);
8377     }
8378 
8379     if (bh != 0) {
8380       if XNN_UNPREDICTABLE(bh <= 2) {
8381         i2 = i0;
8382       }
8383       if XNN_UNPREDICTABLE(bh < 2) {
8384         i1 = i0;
8385       }
8386       __m128 v0 = _mm_loadu_ps(i0);
8387       __m128 v1 = _mm_loadu_ps(i1);
8388       __m128 v2 = _mm_loadu_ps(i2);
8389       __m128 v3 = _mm_setzero_ps();
8390 
8391       _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8392 
8393       if (bh & 2) {
8394         _mm_storel_pi((__m64*) o3, v3);
8395         o3 += 2;
8396         _mm_storel_pi((__m64*) o2, v2);
8397         o2 += 2;
8398         _mm_storel_pi((__m64*) o1, v1);
8399         o1 += 2;
8400         _mm_storel_pi((__m64*) o0, v0);
8401         o0 += 2;
8402         v0 = _mm_movehl_ps(v0, v0);
8403         v1 = _mm_movehl_ps(v1, v1);
8404         v2 = _mm_movehl_ps(v2, v2);
8405         v3 = _mm_movehl_ps(v3, v3);
8406       }
8407       if (bh & 1) {
8408         _mm_store_ss(o3, v3);
8409         _mm_store_ss(o2, v2);
8410         _mm_store_ss(o1, v1);
8411         _mm_store_ss(o0, v0);
8412       }
8413     }
8414     i0 = (const float*) ((uintptr_t) i0 + input_vreset);
8415     i1 = (const float*) ((uintptr_t) i0 + input_stride);
8416     i2 = (const float*) ((uintptr_t) i1 + input_stride);
8417     i3 = (const float*) ((uintptr_t) i2 + input_stride);
8418     o0 = (float*) ((uintptr_t) o0 + output_vreset);
8419     o1 = (float*) ((uintptr_t) o1 + output_vreset);
8420     o2 = (float*) ((uintptr_t) o2 + output_vreset);
8421     o3 = (float*) ((uintptr_t) o3 + output_vreset);
8422     block_width = doz(block_width, tile_width);
8423   } while (block_width != 0);
8424 }
8425
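/*
 * Illustrative sketch, not part of the original XNNPACK source: the 4x4
 * in-register transpose at the heart of the kernel above, shown on a single
 * contiguous 4x4 tile. _MM_TRANSPOSE4_PS turns four row vectors into four
 * column vectors; the kernel applies this tile by tile, with the pointer
 * arithmetic above handling strides and the ragged right/bottom edges.
 */
static inline void example_transpose_4x4_tile(const float in[16], float out[16])
{
  __m128 r0 = _mm_loadu_ps(in + 0);
  __m128 r1 = _mm_loadu_ps(in + 4);
  __m128 r2 = _mm_loadu_ps(in + 8);
  __m128 r3 = _mm_loadu_ps(in + 12);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // rows become columns
  _mm_storeu_ps(out + 0, r0);
  _mm_storeu_ps(out + 4, r1);
  _mm_storeu_ps(out + 8, r2);
  _mm_storeu_ps(out + 12, r3);
}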