// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/3x3s2p1-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


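// 3x3 depthwise convolution over one CHW channel with stride 2 and one pixel
// of padding, computing 4 output rows of 4 output pixels per inner iteration.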
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_4x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 0);
  assert(padding_top <= 1);

  const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
  const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

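  // The packed weights hold the bias followed by the nine 3x3 kernel taps in
  // the order loaded below (vk00..vk22); each scalar is broadcast to all lanes.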
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk10 = _mm_load1_ps(weights + 4);
  const __m128 vk11 = _mm_load1_ps(weights + 5);
  const __m128 vk12 = _mm_load1_ps(weights + 6);
  const __m128 vk20 = _mm_load1_ps(weights + 7);
  const __m128 vk21 = _mm_load1_ps(weights + 8);
  const __m128 vk22 = _mm_load1_ps(weights + 9);

  const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));

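  // Four output rows per pass at vertical stride 2 need nine consecutive input
  // rows (i0..i8). i0 starts one row above the input when padding_top == 1 and
  // is then redirected to the zero row.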
  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
  if XNN_UNPREDICTABLE(padding_top != 0) {
    i0 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
  const float* i8 = (const float*) ((uintptr_t) i7 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
  float* o2 = (float*) ((uintptr_t) o1 + output_width);
  float* o3 = (float*) ((uintptr_t) o2 + output_width);

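  // With one padding row above and one below, a 3x3 kernel at vertical stride 2
  // produces (padded_input_height - 1) / 2 output rows in total.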
  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
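    // Near the bottom of the image, input rows past the end are replaced with
    // the zero row and the output pointers of missing rows are aliased to the
    // last valid row; since stores happen in o3..o0 order, the valid row is
    // written last and wins.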
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 5) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i5 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 8) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 9) {
      i7 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 10) {
      i8 = zero;
    }

    __m128 vi0x7531 = _mm_setzero_ps();
    __m128 vi1x7531 = _mm_setzero_ps();
    __m128 vi2x7531 = _mm_setzero_ps();
    __m128 vi3x7531 = _mm_setzero_ps();
    __m128 vi4x7531 = _mm_setzero_ps();
    __m128 vi5x7531 = _mm_setzero_ps();
    __m128 vi6x7531 = _mm_setzero_ps();
    __m128 vi7x7531 = _mm_setzero_ps();
    __m128 vi8x7531 = _mm_setzero_ps();

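    // Main loop: each iteration reads 8 input columns from every row and
    // produces 4 output columns for each of the 4 output rows. The vi*x7531
    // registers carry the last odd column of the previous block, which becomes
    // the left-tap column of the current block (zero at the left edge).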
    size_t w = input_width;
    for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
      i0 += 8;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
      i1 += 8;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
      i2 += 8;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
      i3 += 8;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
      i4 += 8;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
      i5 += 8;
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
      i6 += 8;
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      const __m128 vi7xCDEF = _mm_loadu_ps(i7 + 4);
      i7 += 8;
      const __m128 vi8x89AB = _mm_loadu_ps(i8);
      const __m128 vi8xCDEF = _mm_loadu_ps(i8 + 4);
      i8 += 8;

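      // Deinterleave each row into its even columns (8,A,C,E) and odd columns
      // (9,B,D,F). At horizontal stride 2 the even columns are the kernel
      // center taps and the odd columns are the right taps.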
      const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi7x8ACE = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi7x9BDF = _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vi8x8ACE = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi8x9BDF = _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(3, 1, 3, 1));

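      // Center-column taps: output row r accumulates input rows 2r, 2r+1, 2r+2.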
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk01));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk01));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi6x8ACE, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x8ACE, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x8ACE, vk21));

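      // Rotate the odd columns so the last one (F) lands in lane 0; it is
      // carried into the next block through vi*x7531.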
      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk02));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk02));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x9BDF, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x9BDF, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x9BDF, vk22));

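      // Splice the column carried over from the previous block into lane 0,
      // forming the left-neighbor columns (7, 9, B, D) of the even columns.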
      const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
      const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
      const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
      const __m128 vi3x7BDF = _mm_move_ss(vi3xF9BD, vi3x7531);
      const __m128 vi4x7BDF = _mm_move_ss(vi4xF9BD, vi4x7531);
      const __m128 vi5x7BDF = _mm_move_ss(vi5xF9BD, vi5x7531);
      const __m128 vi6x7BDF = _mm_move_ss(vi6xF9BD, vi6x7531);
      const __m128 vi7x7BDF = _mm_move_ss(vi7xF9BD, vi7x7531);
      const __m128 vi8x7BDF = _mm_move_ss(vi8xF9BD, vi8x7531);

      vi0x7531 = vi0xF9BD;
      vi1x7531 = vi1xF9BD;
      vi2x7531 = vi2xF9BD;
      vi3x7531 = vi3xF9BD;
      vi4x7531 = vi4xF9BD;
      vi5x7531 = vi5xF9BD;
      vi6x7531 = vi6xF9BD;
      vi7x7531 = vi7xF9BD;
      vi8x7531 = vi8xF9BD;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x7BDF, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x7BDF, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x7BDF, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x7BDF, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x7BDF, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x7BDF, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x7BDF, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x7BDF, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x7BDF, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x7BDF, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x7BDF, vk20));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Potentially process the last block of 0..7 pixels.
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      const __m128 vi7xCDEF = _mm_loadu_ps(i7 + 4);
      const __m128 vi8x89AB = _mm_loadu_ps(i8);
      const __m128 vi8xCDEF = _mm_loadu_ps(i8 + 4);

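      // Zero the lanes that fall past the end of the row so the out-of-bounds
      // columns (reads permitted by XNN_OOB_READS) do not contribute.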
      const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi0x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi1x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi2x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi3x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi3x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi4x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi4x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi5x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi5x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi6x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi6x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi7x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi7x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi7x89AB, vi7xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
      const __m128 vi8x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
      const __m128 vi8x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi8x89AB, vi8xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk01));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x8ACE, vk01));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi6x8ACE, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x8ACE, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x8ACE, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x8ACE, vk21));

      const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7xF9BD = _mm_shuffle_ps(vi7x9BDF, vi7x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8xF9BD = _mm_shuffle_ps(vi8x9BDF, vi8x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk02));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x9BDF, vk02));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x9BDF, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x9BDF, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x9BDF, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x9BDF, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x9BDF, vk22));

      const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
      const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
      const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
      const __m128 vi3x7BDF = _mm_move_ss(vi3xF9BD, vi3x7531);
      const __m128 vi4x7BDF = _mm_move_ss(vi4xF9BD, vi4x7531);
      const __m128 vi5x7BDF = _mm_move_ss(vi5xF9BD, vi5x7531);
      const __m128 vi6x7BDF = _mm_move_ss(vi6xF9BD, vi6x7531);
      const __m128 vi7x7BDF = _mm_move_ss(vi7xF9BD, vi7x7531);
      const __m128 vi8x7BDF = _mm_move_ss(vi8xF9BD, vi8x7531);

      vi0x7531 = vi0xF9BD;
      vi1x7531 = vi1xF9BD;
      vi2x7531 = vi2xF9BD;
      vi3x7531 = vi3xF9BD;
      vi4x7531 = vi4xF9BD;
      vi5x7531 = vi5xF9BD;
      vi6x7531 = vi6xF9BD;
      vi7x7531 = vi7xF9BD;
      vi8x7531 = vi8xF9BD;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x7BDF, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x7BDF, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x7BDF, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x7BDF, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x7BDF, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x7BDF, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x7BDF, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x7BDF, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x7BDF, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x7BDF, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi8x7BDF, vk20));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);

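      // With w remaining input bytes (1..7 floats), the number of valid output
      // pixels is ceil(w / (2 * sizeof(float))): 4 when 7 inputs remain;
      // otherwise the bit tests on w + sizeof(float) select a 2-pixel store
      // and then a 1-pixel store.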
      if (w == 7 * sizeof(float)) {
        _mm_storeu_ps(o3, vo3);
        o3 += 4;
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        w += 1 * sizeof(float);
        if (w & (4 * sizeof(float))) {
          _mm_storel_pi((__m64*) o3, vo3);
          o3 += 2;
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
          vo3 = _mm_movehl_ps(vo3, vo3);
        }
        if (w & (2 * sizeof(float))) {
          _mm_store_ss(o3, vo3);
          o3 += 1;
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

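    // Rewind i8 to the start of its row: input row 8 of this window is input
    // row 0 of the next group of four output rows. Output continues after o3.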
    i0 = (const float*) ((uintptr_t) i8 - input_decrement);
    i1 = (const float*) ((uintptr_t) i0 + input_width);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);
    i8 = (const float*) ((uintptr_t) i7 + input_width);

    o0 = o3;
    o1 = (float*) ((uintptr_t) o0 + output_width);
    o2 = (float*) ((uintptr_t) o1 + output_width);
    o3 = (float*) ((uintptr_t) o2 + output_width);

    output_height = doz(output_height, 4);
    padded_input_height = doz(padded_input_height, 8);
  } while (output_height != 0);
}