// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

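// 5x5 depthwise convolution over one CHW-layout channel with 2 pixels of
// implicit zero padding on each edge ("5x5p2"), vectorized with SSE. Each
// outer-loop pass produces up to 4 output rows and each inner-loop pass
// produces 4 output pixels per row ("4x4").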
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

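  // input_width is measured in bytes. params carries a right-edge mask for the
  // final partial block of columns and the output clamping bounds.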
  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

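  // weights[0] is the bias; weights[1..25] are the 5x5 filter taps in
  // row-major order. Each scalar is broadcast to all four SSE lanes.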
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

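  // Input pointers advance through each row in 4-column blocks; input_decrement
  // (the row width rounded up to a whole block) rewinds a pointer back to the
  // start of its row when the next group of output rows is set up.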
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

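  // i0 and i1 point at the zero buffer (two rows of implicit top padding);
  // i2..i7 are the input rows needed for the first 4 output rows.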
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);

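  // Up to four output rows are written per pass of the outer loop.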
  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
  float* o3 = (float*) ((uintptr_t) o2 + input_width);

  size_t output_height = input_height;
  do {
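    // Input rows that would fall below the image are replaced by the zero
    // buffer; surplus output-row pointers alias the last valid row so their
    // stores are harmless.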
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

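    // vi*x3012 holds the rightmost columns carried over from the previous
    // 4-column block; starting from zero supplies the two columns of implicit
    // left padding.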
    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();
    __m128 vi5x3012 = _mm_setzero_ps();
    __m128 vi6x3012 = _mm_setzero_ps();
    __m128 vi7x3012 = _mm_setzero_ps();

    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;
    __m128 vi5x4567 = _mm_loadu_ps(i5);
    i5 += 4;
    __m128 vi6x4567 = _mm_loadu_ps(i6);
    i6 += 4;
    __m128 vi7x4567 = _mm_loadu_ps(i7);
    i7 += 4;

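    // Main loop: 4 output columns per row per iteration. It runs while more
    // than 8 columns of input remain, so the unmasked look-ahead load of the
    // next 4 columns stays inside the row.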
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

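      // Rotate each row to [7,4,5,6], load the next 4 columns, then splice in
      // the previous block's column 3 to form the columns 3..6 window.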
      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      i5 += 4;
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      i6 += 4;
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      i7 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

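      // The look-ahead block may extend past the end of the row, so the loaded
      // columns are masked to zero beyond the last valid pixel.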
      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;
      const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
      i5 += 4;
      const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
      i6 += 4;
      const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
      i7 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
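    // Final block: 1..4 remaining columns. The current registers are masked
    // and the columns to the right of the row are treated as zero.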
    {
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);
      vi5x4567 = _mm_and_ps(vi5x4567, vmask);
      vi6x4567 = _mm_and_ps(vi6x4567, vmask);
      vi7x4567 = _mm_and_ps(vi7x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

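      // Store 4, 2, and/or 1 outputs per row depending on how many columns
      // remain.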
      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o3, vo3);
        o3 += 4;
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o3, vo3);
          o3 += 2;
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
          vo3 = _mm_movehl_ps(vo3, vo3);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o3, vo3);
          o3 += 1;
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

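    // Advance to the next group of 4 output rows: the old i4/i5 rows, rewound
    // to the start of the row, become the new i0/i1, and o0 continues where o3
    // left off.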
    i0 = (const float*) ((uintptr_t) i4 - input_decrement);
    i1 = (const float*) ((uintptr_t) i5 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);
    o3 = (float*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}