xref: /aosp_15_r20/external/XNNPACK/src/amalgam/sse.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker 
6*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
7*4bdc9457SAndroid Build Coastguard Worker 
8*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
9*4bdc9457SAndroid Build Coastguard Worker 
10*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/avgpool.h>
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
12*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/conv.h>
13*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/dwconv.h>
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/gavgpool.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/gemm.h>
16*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/ibilinear.h>
17*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/igemm.h>
18*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/intrinsics-polyfill.h>
19*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
20*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/maxpool.h>
21*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/packx.h>
22*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/pavgpool.h>
23*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/rmax.h>
24*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/spmm.h>
25*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/transpose.h>
26*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vbinary.h>
27*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vmulcaddc.h>
28*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vunary.h>
29*4bdc9457SAndroid Build Coastguard Worker 
30*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(size_t output_pixels,size_t kernel_elements,size_t channels,const float ** input,size_t input_offset,const float * zero,float * buffer,float * output,size_t input_increment,size_t output_increment,const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])31*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(
32*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
33*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
34*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
35*4bdc9457SAndroid Build Coastguard Worker     const float** input,
36*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
37*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
38*4bdc9457SAndroid Build Coastguard Worker     float* buffer,
39*4bdc9457SAndroid Build Coastguard Worker     float* output,
40*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
41*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
42*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
43*4bdc9457SAndroid Build Coastguard Worker {
44*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
45*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements > 9);
46*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
47*4bdc9457SAndroid Build Coastguard Worker 
48*4bdc9457SAndroid Build Coastguard Worker   const __m128 vscale = _mm_load_ps(params->sse.scale);
49*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
50*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
51*4bdc9457SAndroid Build Coastguard Worker 
52*4bdc9457SAndroid Build Coastguard Worker   do {
53*4bdc9457SAndroid Build Coastguard Worker     {
54*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
55*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
56*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
57*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
58*4bdc9457SAndroid Build Coastguard Worker       }
59*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
60*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
61*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
62*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
63*4bdc9457SAndroid Build Coastguard Worker       }
64*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
65*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
66*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
67*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
68*4bdc9457SAndroid Build Coastguard Worker       }
69*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
70*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
71*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
72*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
73*4bdc9457SAndroid Build Coastguard Worker       }
74*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
75*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
76*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
77*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
78*4bdc9457SAndroid Build Coastguard Worker       }
79*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
80*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
81*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
82*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
83*4bdc9457SAndroid Build Coastguard Worker       }
84*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
85*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
86*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
87*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
88*4bdc9457SAndroid Build Coastguard Worker       }
89*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
90*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
91*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
92*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
93*4bdc9457SAndroid Build Coastguard Worker       }
94*4bdc9457SAndroid Build Coastguard Worker       const float* i8 = *input++;
95*4bdc9457SAndroid Build Coastguard Worker       assert(i8 != NULL);
96*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i8 != zero) {
97*4bdc9457SAndroid Build Coastguard Worker         i8 = (const float*) ((uintptr_t) i8 + input_offset);
98*4bdc9457SAndroid Build Coastguard Worker       }
99*4bdc9457SAndroid Build Coastguard Worker 
100*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
101*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 4) {
102*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
103*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
104*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
105*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
106*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
107*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
108*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
109*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
110*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
111*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
112*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
113*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
114*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
115*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
116*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
117*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
118*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi8 = _mm_loadu_ps(i8);
119*4bdc9457SAndroid Build Coastguard Worker         i8 += 4;
120*4bdc9457SAndroid Build Coastguard Worker 
121*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
122*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
123*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
124*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
125*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
126*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
127*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
128*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
129*4bdc9457SAndroid Build Coastguard Worker 
130*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ps(b, vsum); b += 4;
131*4bdc9457SAndroid Build Coastguard Worker       }
132*4bdc9457SAndroid Build Coastguard Worker     }
133*4bdc9457SAndroid Build Coastguard Worker 
134*4bdc9457SAndroid Build Coastguard Worker     size_t k = kernel_elements;
135*4bdc9457SAndroid Build Coastguard Worker     for (k -= 9; k > 8; k -= 8) {
136*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
137*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
138*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
139*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
140*4bdc9457SAndroid Build Coastguard Worker       }
141*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
142*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
143*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
144*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
145*4bdc9457SAndroid Build Coastguard Worker       }
146*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
147*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
148*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
149*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
150*4bdc9457SAndroid Build Coastguard Worker       }
151*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
152*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
153*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
154*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
155*4bdc9457SAndroid Build Coastguard Worker       }
156*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
157*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
158*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
159*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
160*4bdc9457SAndroid Build Coastguard Worker       }
161*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
162*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
163*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
164*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
165*4bdc9457SAndroid Build Coastguard Worker       }
166*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
167*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
168*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
169*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
170*4bdc9457SAndroid Build Coastguard Worker       }
171*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
172*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
173*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
174*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
175*4bdc9457SAndroid Build Coastguard Worker       }
176*4bdc9457SAndroid Build Coastguard Worker 
177*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
178*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 4) {
179*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
180*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
181*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
182*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
183*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
184*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
185*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
186*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
187*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
188*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
189*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
190*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
191*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
192*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
193*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
194*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
195*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
196*4bdc9457SAndroid Build Coastguard Worker 
197*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
198*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
199*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
200*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
201*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
202*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
203*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
204*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
205*4bdc9457SAndroid Build Coastguard Worker 
206*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ps(b, vsum); b += 4;
207*4bdc9457SAndroid Build Coastguard Worker       }
208*4bdc9457SAndroid Build Coastguard Worker     }
209*4bdc9457SAndroid Build Coastguard Worker 
210*4bdc9457SAndroid Build Coastguard Worker     {
211*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = input[0];
212*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
213*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = input[1];
214*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = input[2];
215*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = input[3];
216*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = input[4];
217*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = input[5];
218*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = input[6];
219*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = input[7];
220*4bdc9457SAndroid Build Coastguard Worker       input = (const float**) ((uintptr_t) input + input_increment);
221*4bdc9457SAndroid Build Coastguard Worker       if (k < 2) {
222*4bdc9457SAndroid Build Coastguard Worker         i1 = zero;
223*4bdc9457SAndroid Build Coastguard Worker       }
224*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
225*4bdc9457SAndroid Build Coastguard Worker       if (k <= 2) {
226*4bdc9457SAndroid Build Coastguard Worker         i2 = zero;
227*4bdc9457SAndroid Build Coastguard Worker       }
228*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
229*4bdc9457SAndroid Build Coastguard Worker       if (k < 4) {
230*4bdc9457SAndroid Build Coastguard Worker         i3 = zero;
231*4bdc9457SAndroid Build Coastguard Worker       }
232*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
233*4bdc9457SAndroid Build Coastguard Worker       if (k <= 4) {
234*4bdc9457SAndroid Build Coastguard Worker         i4 = zero;
235*4bdc9457SAndroid Build Coastguard Worker       }
236*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
237*4bdc9457SAndroid Build Coastguard Worker       if (k < 6) {
238*4bdc9457SAndroid Build Coastguard Worker         i5 = zero;
239*4bdc9457SAndroid Build Coastguard Worker       }
240*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
241*4bdc9457SAndroid Build Coastguard Worker       if (k <= 6) {
242*4bdc9457SAndroid Build Coastguard Worker         i6 = zero;
243*4bdc9457SAndroid Build Coastguard Worker       }
244*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
245*4bdc9457SAndroid Build Coastguard Worker       if (k < 8) {
246*4bdc9457SAndroid Build Coastguard Worker         i7 = zero;
247*4bdc9457SAndroid Build Coastguard Worker       }
248*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
249*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
250*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
251*4bdc9457SAndroid Build Coastguard Worker       }
252*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
253*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
254*4bdc9457SAndroid Build Coastguard Worker       }
255*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
256*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
257*4bdc9457SAndroid Build Coastguard Worker       }
258*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
259*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
260*4bdc9457SAndroid Build Coastguard Worker       }
261*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
262*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
263*4bdc9457SAndroid Build Coastguard Worker       }
264*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
265*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
266*4bdc9457SAndroid Build Coastguard Worker       }
267*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
268*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
269*4bdc9457SAndroid Build Coastguard Worker       }
270*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
271*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
272*4bdc9457SAndroid Build Coastguard Worker       }
273*4bdc9457SAndroid Build Coastguard Worker 
274*4bdc9457SAndroid Build Coastguard Worker       size_t c = channels;
275*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
276*4bdc9457SAndroid Build Coastguard Worker       while (c >= 4) {
277*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
278*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
279*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
280*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
281*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
282*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
283*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
284*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
285*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
286*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
287*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
288*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
289*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
290*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
291*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
292*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
293*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
294*4bdc9457SAndroid Build Coastguard Worker         b += 4;
295*4bdc9457SAndroid Build Coastguard Worker 
296*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
297*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
298*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
299*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
300*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
301*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
302*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
303*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
304*4bdc9457SAndroid Build Coastguard Worker 
305*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_mul_ps(vsum, vscale);
306*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_max_ps(vout, vmin);
307*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_min_ps(vout, vmax);
308*4bdc9457SAndroid Build Coastguard Worker 
309*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output, vout);
310*4bdc9457SAndroid Build Coastguard Worker         output += 4;
311*4bdc9457SAndroid Build Coastguard Worker 
312*4bdc9457SAndroid Build Coastguard Worker         c -= 4;
313*4bdc9457SAndroid Build Coastguard Worker       }
314*4bdc9457SAndroid Build Coastguard Worker       if (c != 0) {
315*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
316*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
317*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
318*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
319*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
320*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
321*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
322*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
323*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
324*4bdc9457SAndroid Build Coastguard Worker 
325*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
326*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
327*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
328*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
329*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
330*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
331*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
332*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
333*4bdc9457SAndroid Build Coastguard Worker 
334*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_mul_ps(vsum, vscale);
335*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_max_ps(vout, vmin);
336*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_min_ps(vout, vmax);
337*4bdc9457SAndroid Build Coastguard Worker 
338*4bdc9457SAndroid Build Coastguard Worker         if (c & 2) {
339*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) output, vout);
340*4bdc9457SAndroid Build Coastguard Worker           vout = _mm_movehl_ps(vout, vout);
341*4bdc9457SAndroid Build Coastguard Worker           output += 2;
342*4bdc9457SAndroid Build Coastguard Worker         }
343*4bdc9457SAndroid Build Coastguard Worker         if (c & 1) {
344*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(output, vout);
345*4bdc9457SAndroid Build Coastguard Worker           output += 1;
346*4bdc9457SAndroid Build Coastguard Worker         }
347*4bdc9457SAndroid Build Coastguard Worker       }
348*4bdc9457SAndroid Build Coastguard Worker     }
349*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
350*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
351*4bdc9457SAndroid Build Coastguard Worker }
352*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(size_t output_pixels,size_t kernel_elements,size_t channels,const float ** input,size_t input_offset,const float * zero,float * output,size_t input_increment,size_t output_increment,const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])353*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(
354*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
355*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
356*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
357*4bdc9457SAndroid Build Coastguard Worker     const float** input,
358*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
359*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
360*4bdc9457SAndroid Build Coastguard Worker     float* output,
361*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
362*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
363*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
364*4bdc9457SAndroid Build Coastguard Worker {
365*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
366*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements != 0);
367*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements <= 9);
368*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
369*4bdc9457SAndroid Build Coastguard Worker 
370*4bdc9457SAndroid Build Coastguard Worker   const __m128 vscale = _mm_load_ps(params->sse.scale);
371*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
372*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
373*4bdc9457SAndroid Build Coastguard Worker 
374*4bdc9457SAndroid Build Coastguard Worker   do {
375*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
376*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
377*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
378*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
379*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = input[3];
380*4bdc9457SAndroid Build Coastguard Worker     const float* i4 = input[4];
381*4bdc9457SAndroid Build Coastguard Worker     const float* i5 = input[5];
382*4bdc9457SAndroid Build Coastguard Worker     const float* i6 = input[6];
383*4bdc9457SAndroid Build Coastguard Worker     const float* i7 = input[7];
384*4bdc9457SAndroid Build Coastguard Worker     const float* i8 = input[8];
385*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_increment);
386*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 2) {
387*4bdc9457SAndroid Build Coastguard Worker       i1 = zero;
388*4bdc9457SAndroid Build Coastguard Worker     }
389*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
390*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 2) {
391*4bdc9457SAndroid Build Coastguard Worker       i2 = zero;
392*4bdc9457SAndroid Build Coastguard Worker     }
393*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
394*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 4) {
395*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
396*4bdc9457SAndroid Build Coastguard Worker     }
397*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
398*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 4) {
399*4bdc9457SAndroid Build Coastguard Worker       i4 = zero;
400*4bdc9457SAndroid Build Coastguard Worker     }
401*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
402*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 6) {
403*4bdc9457SAndroid Build Coastguard Worker       i5 = zero;
404*4bdc9457SAndroid Build Coastguard Worker     }
405*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
406*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 6) {
407*4bdc9457SAndroid Build Coastguard Worker       i6 = zero;
408*4bdc9457SAndroid Build Coastguard Worker     }
409*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
410*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 8) {
411*4bdc9457SAndroid Build Coastguard Worker       i7 = zero;
412*4bdc9457SAndroid Build Coastguard Worker     }
413*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
414*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 8) {
415*4bdc9457SAndroid Build Coastguard Worker       i8 = zero;
416*4bdc9457SAndroid Build Coastguard Worker     }
417*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
418*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
419*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
420*4bdc9457SAndroid Build Coastguard Worker     }
421*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
422*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
423*4bdc9457SAndroid Build Coastguard Worker     }
424*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
425*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
426*4bdc9457SAndroid Build Coastguard Worker     }
427*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
428*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
429*4bdc9457SAndroid Build Coastguard Worker     }
430*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
431*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
432*4bdc9457SAndroid Build Coastguard Worker     }
433*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
434*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
435*4bdc9457SAndroid Build Coastguard Worker     }
436*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
437*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
438*4bdc9457SAndroid Build Coastguard Worker     }
439*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
440*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
441*4bdc9457SAndroid Build Coastguard Worker     }
442*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
443*4bdc9457SAndroid Build Coastguard Worker       i8 = (const float*) ((uintptr_t) i8 + input_offset);
444*4bdc9457SAndroid Build Coastguard Worker     }
445*4bdc9457SAndroid Build Coastguard Worker 
446*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
447*4bdc9457SAndroid Build Coastguard Worker     while (c >= 4) {
448*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
449*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
450*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
451*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
452*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
453*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
454*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
455*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
456*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4 = _mm_loadu_ps(i4);
457*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
458*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5 = _mm_loadu_ps(i5);
459*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
460*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6 = _mm_loadu_ps(i6);
461*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
462*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7 = _mm_loadu_ps(i7);
463*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
464*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8 = _mm_loadu_ps(i8);
465*4bdc9457SAndroid Build Coastguard Worker       i8 += 4;
466*4bdc9457SAndroid Build Coastguard Worker 
467*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
468*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
469*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
470*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
471*4bdc9457SAndroid Build Coastguard Worker 
472*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
473*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
474*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
475*4bdc9457SAndroid Build Coastguard Worker 
476*4bdc9457SAndroid Build Coastguard Worker       __m128 vout = _mm_mul_ps(vsum, vscale);
477*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_max_ps(vout, vmin);
478*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_min_ps(vout, vmax);
479*4bdc9457SAndroid Build Coastguard Worker 
480*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vout); output += 4;
481*4bdc9457SAndroid Build Coastguard Worker 
482*4bdc9457SAndroid Build Coastguard Worker       c -= 4;
483*4bdc9457SAndroid Build Coastguard Worker     }
484*4bdc9457SAndroid Build Coastguard Worker     if (c != 0) {
485*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
486*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
487*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
488*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
489*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4 = _mm_loadu_ps(i4);
490*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5 = _mm_loadu_ps(i5);
491*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6 = _mm_loadu_ps(i6);
492*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7 = _mm_loadu_ps(i7);
493*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8 = _mm_loadu_ps(i8);
494*4bdc9457SAndroid Build Coastguard Worker 
495*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
496*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
497*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
498*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
499*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
500*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
501*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
502*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
503*4bdc9457SAndroid Build Coastguard Worker 
504*4bdc9457SAndroid Build Coastguard Worker       __m128 vout = _mm_mul_ps(vsum, vscale);
505*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_max_ps(vout, vmin);
506*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_min_ps(vout, vmax);
507*4bdc9457SAndroid Build Coastguard Worker 
508*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
509*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vout);
510*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_movehl_ps(vout, vout);
511*4bdc9457SAndroid Build Coastguard Worker         output += 2;
512*4bdc9457SAndroid Build Coastguard Worker       }
513*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
514*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vout);
515*4bdc9457SAndroid Build Coastguard Worker         output += 1;
516*4bdc9457SAndroid Build Coastguard Worker       }
517*4bdc9457SAndroid Build Coastguard Worker     }
518*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
519*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
520*4bdc9457SAndroid Build Coastguard Worker }
521*4bdc9457SAndroid Build Coastguard Worker 
522*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
523*4bdc9457SAndroid Build Coastguard Worker     size_t input_height,
524*4bdc9457SAndroid Build Coastguard Worker     size_t input_width,
525*4bdc9457SAndroid Build Coastguard Worker     size_t output_y_start,
526*4bdc9457SAndroid Build Coastguard Worker     size_t output_y_end,
527*4bdc9457SAndroid Build Coastguard Worker     const float* input,
528*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
529*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
530*4bdc9457SAndroid Build Coastguard Worker     float* output,
531*4bdc9457SAndroid Build Coastguard Worker     size_t input_padding_top,
532*4bdc9457SAndroid Build Coastguard Worker     size_t output_channels,
533*4bdc9457SAndroid Build Coastguard Worker     size_t output_height_stride,
534*4bdc9457SAndroid Build Coastguard Worker     size_t output_channel_stride,
535*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
536*4bdc9457SAndroid Build Coastguard Worker {
537*4bdc9457SAndroid Build Coastguard Worker   assert(input_width != 0);
538*4bdc9457SAndroid Build Coastguard Worker   assert(output_y_end > output_y_start);
539*4bdc9457SAndroid Build Coastguard Worker   assert(input_padding_top <= 1);
540*4bdc9457SAndroid Build Coastguard Worker   assert(output_channels != 0);
541*4bdc9457SAndroid Build Coastguard Worker 
542*4bdc9457SAndroid Build Coastguard Worker   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
543*4bdc9457SAndroid Build Coastguard Worker   const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
544*4bdc9457SAndroid Build Coastguard Worker   const size_t output_width = (input_width + 1) / 2;
545*4bdc9457SAndroid Build Coastguard Worker   const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
546*4bdc9457SAndroid Build Coastguard Worker 
547*4bdc9457SAndroid Build Coastguard Worker   // Adjustment for padding processed below
548*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
549*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
550*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
551*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
552*4bdc9457SAndroid Build Coastguard Worker   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
553*4bdc9457SAndroid Build Coastguard Worker   float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
554*4bdc9457SAndroid Build Coastguard Worker   float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);
555*4bdc9457SAndroid Build Coastguard Worker 
556*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
557*4bdc9457SAndroid Build Coastguard Worker     i0 = zero;
558*4bdc9457SAndroid Build Coastguard Worker   }
559*4bdc9457SAndroid Build Coastguard Worker 
560*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
561*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
562*4bdc9457SAndroid Build Coastguard Worker 
563*4bdc9457SAndroid Build Coastguard Worker   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
564*4bdc9457SAndroid Build Coastguard Worker     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
565*4bdc9457SAndroid Build Coastguard Worker     const size_t input_y4 = input_y2 + 2;
566*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
567*4bdc9457SAndroid Build Coastguard Worker       i2 = zero;
568*4bdc9457SAndroid Build Coastguard Worker     }
569*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(input_y4 > input_height) {
570*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
571*4bdc9457SAndroid Build Coastguard Worker     }
572*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
573*4bdc9457SAndroid Build Coastguard Worker       i4 = zero;
574*4bdc9457SAndroid Build Coastguard Worker     }
575*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
576*4bdc9457SAndroid Build Coastguard Worker       output1 = output0;
577*4bdc9457SAndroid Build Coastguard Worker     }
578*4bdc9457SAndroid Build Coastguard Worker 
579*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
580*4bdc9457SAndroid Build Coastguard Worker     size_t c = output_channels;
581*4bdc9457SAndroid Build Coastguard Worker     float* o0c0 = output0;
582*4bdc9457SAndroid Build Coastguard Worker     float* o1c0 = output1;
583*4bdc9457SAndroid Build Coastguard Worker     float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
584*4bdc9457SAndroid Build Coastguard Worker     float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
585*4bdc9457SAndroid Build Coastguard Worker     float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
586*4bdc9457SAndroid Build Coastguard Worker     float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
587*4bdc9457SAndroid Build Coastguard Worker     float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
588*4bdc9457SAndroid Build Coastguard Worker     float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
589*4bdc9457SAndroid Build Coastguard Worker     do {
590*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(c < 2) {
591*4bdc9457SAndroid Build Coastguard Worker         o0c1 = o0c0;
592*4bdc9457SAndroid Build Coastguard Worker         o1c1 = o1c0;
593*4bdc9457SAndroid Build Coastguard Worker       }
594*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(c <= 2) {
595*4bdc9457SAndroid Build Coastguard Worker         o0c2 = o0c1;
596*4bdc9457SAndroid Build Coastguard Worker         o1c2 = o1c1;
597*4bdc9457SAndroid Build Coastguard Worker       }
598*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(c < 4) {
599*4bdc9457SAndroid Build Coastguard Worker         o0c3 = o0c2;
600*4bdc9457SAndroid Build Coastguard Worker         o1c3 = o1c2;
601*4bdc9457SAndroid Build Coastguard Worker       }
602*4bdc9457SAndroid Build Coastguard Worker 
603*4bdc9457SAndroid Build Coastguard Worker       // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
604*4bdc9457SAndroid Build Coastguard Worker       __m128 vi0x0 = _mm_setzero_ps();
605*4bdc9457SAndroid Build Coastguard Worker       __m128 vi1x0 = _mm_setzero_ps();
606*4bdc9457SAndroid Build Coastguard Worker       __m128 vi2x0 = _mm_setzero_ps();
607*4bdc9457SAndroid Build Coastguard Worker       __m128 vi3x0 = _mm_setzero_ps();
608*4bdc9457SAndroid Build Coastguard Worker       __m128 vi4x0 = _mm_setzero_ps();
609*4bdc9457SAndroid Build Coastguard Worker 
610*4bdc9457SAndroid Build Coastguard Worker       size_t iw = input_width;
611*4bdc9457SAndroid Build Coastguard Worker       for (; iw >= 4; iw -= 4) {
612*4bdc9457SAndroid Build Coastguard Worker         __m128 vo0x0 = _mm_load_ps(w);
613*4bdc9457SAndroid Build Coastguard Worker         __m128 vo1x0 = vo0x0;
614*4bdc9457SAndroid Build Coastguard Worker         __m128 vo0x1 = vo0x0;
615*4bdc9457SAndroid Build Coastguard Worker         __m128 vo1x1 = vo0x0;
616*4bdc9457SAndroid Build Coastguard Worker 
617*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c0 = _mm_load_ps(w + 4);
618*4bdc9457SAndroid Build Coastguard Worker 
619*4bdc9457SAndroid Build Coastguard Worker         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
620*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
621*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
622*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
623*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
624*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;
625*4bdc9457SAndroid Build Coastguard Worker 
626*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
627*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
628*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
629*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
630*4bdc9457SAndroid Build Coastguard Worker 
631*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c0 = _mm_load_ps(w + 8);
632*4bdc9457SAndroid Build Coastguard Worker 
633*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
634*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
635*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
636*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
637*4bdc9457SAndroid Build Coastguard Worker 
638*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c0 = _mm_load_ps(w + 12);
639*4bdc9457SAndroid Build Coastguard Worker 
640*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
641*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
642*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
643*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
644*4bdc9457SAndroid Build Coastguard Worker 
645*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c1 = _mm_load_ps(w + 16);
646*4bdc9457SAndroid Build Coastguard Worker 
647*4bdc9457SAndroid Build Coastguard Worker         // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
648*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
649*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
650*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
651*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
652*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;
653*4bdc9457SAndroid Build Coastguard Worker 
654*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
655*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
656*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
657*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
658*4bdc9457SAndroid Build Coastguard Worker 
659*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c1 = _mm_load_ps(w + 20);
660*4bdc9457SAndroid Build Coastguard Worker 
661*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
662*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
663*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
664*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
665*4bdc9457SAndroid Build Coastguard Worker 
666*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c1 = _mm_load_ps(w + 24);
667*4bdc9457SAndroid Build Coastguard Worker 
668*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
669*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
670*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
671*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
672*4bdc9457SAndroid Build Coastguard Worker 
673*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c2 = _mm_load_ps(w + 28);
674*4bdc9457SAndroid Build Coastguard Worker 
675*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
676*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
677*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
678*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
679*4bdc9457SAndroid Build Coastguard Worker 
680*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c2 = _mm_load_ps(w + 32);
681*4bdc9457SAndroid Build Coastguard Worker 
682*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
683*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
684*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
685*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
686*4bdc9457SAndroid Build Coastguard Worker 
687*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c2 = _mm_load_ps(w + 36);
688*4bdc9457SAndroid Build Coastguard Worker 
689*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
690*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
691*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
692*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
693*4bdc9457SAndroid Build Coastguard Worker 
694*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c0 = _mm_load_ps(w + 40);
695*4bdc9457SAndroid Build Coastguard Worker 
696*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
697*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
698*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
699*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
700*4bdc9457SAndroid Build Coastguard Worker 
701*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c0 = _mm_load_ps(w + 44);
702*4bdc9457SAndroid Build Coastguard Worker 
703*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
704*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
705*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
706*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
707*4bdc9457SAndroid Build Coastguard Worker 
708*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c0 = _mm_load_ps(w + 48);
709*4bdc9457SAndroid Build Coastguard Worker 
710*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
711*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
712*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
713*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
714*4bdc9457SAndroid Build Coastguard Worker 
715*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c1 = _mm_load_ps(w + 52);
716*4bdc9457SAndroid Build Coastguard Worker 
717*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
718*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
719*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
720*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
721*4bdc9457SAndroid Build Coastguard Worker 
722*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c1 = _mm_load_ps(w + 56);
723*4bdc9457SAndroid Build Coastguard Worker 
724*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
725*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
726*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
727*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
728*4bdc9457SAndroid Build Coastguard Worker 
729*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c1 = _mm_load_ps(w + 60);
730*4bdc9457SAndroid Build Coastguard Worker 
731*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
732*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
733*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
734*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
735*4bdc9457SAndroid Build Coastguard Worker 
736*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c2 = _mm_load_ps(w + 64);
737*4bdc9457SAndroid Build Coastguard Worker 
738*4bdc9457SAndroid Build Coastguard Worker         // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
739*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
740*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
741*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
742*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
743*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;
744*4bdc9457SAndroid Build Coastguard Worker 
745*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
746*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
747*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
748*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
749*4bdc9457SAndroid Build Coastguard Worker 
750*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c2 = _mm_load_ps(w + 68);
751*4bdc9457SAndroid Build Coastguard Worker 
752*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
753*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
754*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
755*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
756*4bdc9457SAndroid Build Coastguard Worker 
757*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c2 = _mm_load_ps(w + 72);
758*4bdc9457SAndroid Build Coastguard Worker 
759*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
760*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
761*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
762*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
763*4bdc9457SAndroid Build Coastguard Worker 
764*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk02c0 = _mm_load_ps(w + 76);
765*4bdc9457SAndroid Build Coastguard Worker 
766*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
767*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
768*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
769*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
770*4bdc9457SAndroid Build Coastguard Worker 
771*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk12c0 = _mm_load_ps(w + 80);
772*4bdc9457SAndroid Build Coastguard Worker 
773*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
774*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
775*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
776*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));
777*4bdc9457SAndroid Build Coastguard Worker 
778*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk22c0 = _mm_load_ps(w + 84);
779*4bdc9457SAndroid Build Coastguard Worker 
780*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
781*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
782*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
783*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));
784*4bdc9457SAndroid Build Coastguard Worker 
785*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk02c1 = _mm_load_ps(w + 88);
786*4bdc9457SAndroid Build Coastguard Worker 
787*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
788*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
789*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
790*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
791*4bdc9457SAndroid Build Coastguard Worker 
792*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk12c1 = _mm_load_ps(w + 92);
793*4bdc9457SAndroid Build Coastguard Worker 
794*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
795*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
796*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
797*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));
798*4bdc9457SAndroid Build Coastguard Worker 
799*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk22c1 = _mm_load_ps(w + 96);
800*4bdc9457SAndroid Build Coastguard Worker 
801*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
802*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
803*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
804*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));
805*4bdc9457SAndroid Build Coastguard Worker 
806*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk02c2 = _mm_load_ps(w + 100);
807*4bdc9457SAndroid Build Coastguard Worker 
808*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
809*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
810*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
811*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
812*4bdc9457SAndroid Build Coastguard Worker 
813*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk12c2 = _mm_load_ps(w + 104);
814*4bdc9457SAndroid Build Coastguard Worker 
815*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
816*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
817*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
818*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));
819*4bdc9457SAndroid Build Coastguard Worker 
820*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk22c2 = _mm_load_ps(w + 108);
821*4bdc9457SAndroid Build Coastguard Worker 
822*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
823*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
824*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
825*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));
826*4bdc9457SAndroid Build Coastguard Worker 
827*4bdc9457SAndroid Build Coastguard Worker         vi0x0 = vi0x3;
828*4bdc9457SAndroid Build Coastguard Worker         vi1x0 = vi1x3;
829*4bdc9457SAndroid Build Coastguard Worker         vi2x0 = vi2x3;
830*4bdc9457SAndroid Build Coastguard Worker         vi3x0 = vi3x3;
831*4bdc9457SAndroid Build Coastguard Worker         vi4x0 = vi4x3;
832*4bdc9457SAndroid Build Coastguard Worker 
833*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_max_ps(vo0x0, vmin);
834*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_max_ps(vo1x0, vmin);
835*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_max_ps(vo0x1, vmin);
836*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_max_ps(vo1x1, vmin);
837*4bdc9457SAndroid Build Coastguard Worker 
838*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_min_ps(vo0x0, vmax);
839*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_min_ps(vo1x0, vmax);
840*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_min_ps(vo0x1, vmax);
841*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_min_ps(vo1x1, vmax);
842*4bdc9457SAndroid Build Coastguard Worker 
843*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
844*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
845*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
846*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
847*4bdc9457SAndroid Build Coastguard Worker 
848*4bdc9457SAndroid Build Coastguard Worker         // Always 2+ output width elements remaining
849*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
850*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
851*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
852*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
853*4bdc9457SAndroid Build Coastguard Worker 
854*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
855*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
856*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
857*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
858*4bdc9457SAndroid Build Coastguard Worker       }
859*4bdc9457SAndroid Build Coastguard Worker       assert(iw < 4);
860*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNLIKELY(iw != 0) {
861*4bdc9457SAndroid Build Coastguard Worker         __m128 vo0x0 = _mm_load_ps(w);
862*4bdc9457SAndroid Build Coastguard Worker         __m128 vo1x0 = vo0x0;
863*4bdc9457SAndroid Build Coastguard Worker         __m128 vo0x1 = vo0x0;
864*4bdc9457SAndroid Build Coastguard Worker         __m128 vo1x1 = vo0x0;
865*4bdc9457SAndroid Build Coastguard Worker 
866*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c0 = _mm_load_ps(w + 4);
867*4bdc9457SAndroid Build Coastguard Worker 
868*4bdc9457SAndroid Build Coastguard Worker         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
869*4bdc9457SAndroid Build Coastguard Worker         __m128 vi0x1 = _mm_loadu_ps(i0);
870*4bdc9457SAndroid Build Coastguard Worker         __m128 vi1x1 = _mm_loadu_ps(i1);
871*4bdc9457SAndroid Build Coastguard Worker         __m128 vi2x1 = _mm_loadu_ps(i2);
872*4bdc9457SAndroid Build Coastguard Worker         __m128 vi3x1 = _mm_loadu_ps(i3);
873*4bdc9457SAndroid Build Coastguard Worker         __m128 vi4x1 = _mm_loadu_ps(i4);
874*4bdc9457SAndroid Build Coastguard Worker 
875*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
876*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
877*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
878*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
879*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
880*4bdc9457SAndroid Build Coastguard Worker         }
881*4bdc9457SAndroid Build Coastguard Worker 
882*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c0 = _mm_load_ps(w + 8);
883*4bdc9457SAndroid Build Coastguard Worker 
884*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
885*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
886*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
887*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
888*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
889*4bdc9457SAndroid Build Coastguard Worker         }
890*4bdc9457SAndroid Build Coastguard Worker 
891*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c0 = _mm_load_ps(w + 12);
892*4bdc9457SAndroid Build Coastguard Worker 
893*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
894*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
895*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
896*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
897*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
898*4bdc9457SAndroid Build Coastguard Worker         }
899*4bdc9457SAndroid Build Coastguard Worker 
900*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c1 = _mm_load_ps(w + 16);
901*4bdc9457SAndroid Build Coastguard Worker 
902*4bdc9457SAndroid Build Coastguard Worker         __m128 vi0x2 = _mm_setzero_ps();
903*4bdc9457SAndroid Build Coastguard Worker         __m128 vi1x2 = _mm_setzero_ps();
904*4bdc9457SAndroid Build Coastguard Worker         __m128 vi2x2 = _mm_setzero_ps();
905*4bdc9457SAndroid Build Coastguard Worker         __m128 vi3x2 = _mm_setzero_ps();
906*4bdc9457SAndroid Build Coastguard Worker         __m128 vi4x2 = _mm_setzero_ps();
907*4bdc9457SAndroid Build Coastguard Worker         if (iw >= 2) {
908*4bdc9457SAndroid Build Coastguard Worker           // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
909*4bdc9457SAndroid Build Coastguard Worker           vi0x2 = _mm_loadu_ps(i0 + 4);
910*4bdc9457SAndroid Build Coastguard Worker           vi1x2 = _mm_loadu_ps(i1 + 4);
911*4bdc9457SAndroid Build Coastguard Worker           vi2x2 = _mm_loadu_ps(i2 + 4);
912*4bdc9457SAndroid Build Coastguard Worker           vi3x2 = _mm_loadu_ps(i3 + 4);
913*4bdc9457SAndroid Build Coastguard Worker           vi4x2 = _mm_loadu_ps(i4 + 4);
914*4bdc9457SAndroid Build Coastguard Worker         }
915*4bdc9457SAndroid Build Coastguard Worker 
916*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
917*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
918*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
919*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
920*4bdc9457SAndroid Build Coastguard Worker 
921*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c1 = _mm_load_ps(w + 20);
922*4bdc9457SAndroid Build Coastguard Worker 
923*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
924*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
925*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
926*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
927*4bdc9457SAndroid Build Coastguard Worker 
928*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c1 = _mm_load_ps(w + 24);
929*4bdc9457SAndroid Build Coastguard Worker 
930*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
931*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
932*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
933*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
934*4bdc9457SAndroid Build Coastguard Worker 
935*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk00c2 = _mm_load_ps(w + 28);
936*4bdc9457SAndroid Build Coastguard Worker 
937*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
938*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
939*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
940*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
941*4bdc9457SAndroid Build Coastguard Worker 
942*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk10c2 = _mm_load_ps(w + 32);
943*4bdc9457SAndroid Build Coastguard Worker 
944*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
945*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
946*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
947*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
948*4bdc9457SAndroid Build Coastguard Worker 
949*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk20c2 = _mm_load_ps(w + 36);
950*4bdc9457SAndroid Build Coastguard Worker 
951*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
952*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
953*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
954*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
955*4bdc9457SAndroid Build Coastguard Worker 
956*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c0 = _mm_load_ps(w + 40);
957*4bdc9457SAndroid Build Coastguard Worker 
958*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
959*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
960*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
961*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
962*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
963*4bdc9457SAndroid Build Coastguard Worker         }
964*4bdc9457SAndroid Build Coastguard Worker 
965*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c0 = _mm_load_ps(w + 44);
966*4bdc9457SAndroid Build Coastguard Worker 
967*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
968*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
969*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
970*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
971*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
972*4bdc9457SAndroid Build Coastguard Worker         }
973*4bdc9457SAndroid Build Coastguard Worker 
974*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c0 = _mm_load_ps(w + 48);
975*4bdc9457SAndroid Build Coastguard Worker 
976*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
977*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
978*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
979*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
980*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
981*4bdc9457SAndroid Build Coastguard Worker         }
982*4bdc9457SAndroid Build Coastguard Worker 
983*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c1 = _mm_load_ps(w + 52);
984*4bdc9457SAndroid Build Coastguard Worker 
985*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
986*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
987*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
988*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
989*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
990*4bdc9457SAndroid Build Coastguard Worker         }
991*4bdc9457SAndroid Build Coastguard Worker 
992*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c1 = _mm_load_ps(w + 56);
993*4bdc9457SAndroid Build Coastguard Worker 
994*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
995*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
996*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
997*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
998*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
999*4bdc9457SAndroid Build Coastguard Worker         }
1000*4bdc9457SAndroid Build Coastguard Worker 
1001*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c1 = _mm_load_ps(w + 60);
1002*4bdc9457SAndroid Build Coastguard Worker 
1003*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
1004*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
1005*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
1006*4bdc9457SAndroid Build Coastguard Worker           vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
1007*4bdc9457SAndroid Build Coastguard Worker           vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
1008*4bdc9457SAndroid Build Coastguard Worker         }
1009*4bdc9457SAndroid Build Coastguard Worker 
1010*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk01c2 = _mm_load_ps(w + 64);
1011*4bdc9457SAndroid Build Coastguard Worker 
1012*4bdc9457SAndroid Build Coastguard Worker         __m128 vi0x3 = _mm_setzero_ps();
1013*4bdc9457SAndroid Build Coastguard Worker         __m128 vi1x3 = _mm_setzero_ps();
1014*4bdc9457SAndroid Build Coastguard Worker         __m128 vi2x3 = _mm_setzero_ps();
1015*4bdc9457SAndroid Build Coastguard Worker         __m128 vi3x3 = _mm_setzero_ps();
1016*4bdc9457SAndroid Build Coastguard Worker         __m128 vi4x3 = _mm_setzero_ps();
1017*4bdc9457SAndroid Build Coastguard Worker         if (iw > 2) {
1018*4bdc9457SAndroid Build Coastguard Worker           // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
1019*4bdc9457SAndroid Build Coastguard Worker           vi0x3 = _mm_load_ss(i0 + 8);
1020*4bdc9457SAndroid Build Coastguard Worker           vi1x3 = _mm_load_ss(i1 + 8);
1021*4bdc9457SAndroid Build Coastguard Worker           vi2x3 = _mm_load_ss(i2 + 8);
1022*4bdc9457SAndroid Build Coastguard Worker           vi3x3 = _mm_load_ss(i3 + 8);
1023*4bdc9457SAndroid Build Coastguard Worker           vi4x3 = _mm_load_ss(i4 + 8);
1024*4bdc9457SAndroid Build Coastguard Worker         }
1025*4bdc9457SAndroid Build Coastguard Worker 
1026*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
1027*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1028*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
1029*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1030*4bdc9457SAndroid Build Coastguard Worker 
1031*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk11c2 = _mm_load_ps(w + 68);
1032*4bdc9457SAndroid Build Coastguard Worker 
1033*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
1034*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
1035*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
1036*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
1037*4bdc9457SAndroid Build Coastguard Worker 
1038*4bdc9457SAndroid Build Coastguard Worker         const __m128 vk21c2 = _mm_load_ps(w + 72);
1039*4bdc9457SAndroid Build Coastguard Worker 
1040*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
1041*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
1042*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
1043*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
1044*4bdc9457SAndroid Build Coastguard Worker 
1045*4bdc9457SAndroid Build Coastguard Worker         if (iw >= 2) {
1046*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk02c0 = _mm_load_ps(w + 76);
1047*4bdc9457SAndroid Build Coastguard Worker 
1048*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
1049*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1050*4bdc9457SAndroid Build Coastguard Worker 
1051*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk12c0 = _mm_load_ps(w + 80);
1052*4bdc9457SAndroid Build Coastguard Worker 
1053*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
1054*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
1055*4bdc9457SAndroid Build Coastguard Worker 
1056*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk22c0 = _mm_load_ps(w + 84);
1057*4bdc9457SAndroid Build Coastguard Worker 
1058*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
1059*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
1060*4bdc9457SAndroid Build Coastguard Worker 
1061*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk02c1 = _mm_load_ps(w + 88);
1062*4bdc9457SAndroid Build Coastguard Worker 
1063*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
1064*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1065*4bdc9457SAndroid Build Coastguard Worker 
1066*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk12c1 = _mm_load_ps(w + 92);
1067*4bdc9457SAndroid Build Coastguard Worker 
1068*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
1069*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
1070*4bdc9457SAndroid Build Coastguard Worker 
1071*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk22c1 = _mm_load_ps(w + 96);
1072*4bdc9457SAndroid Build Coastguard Worker 
1073*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
1074*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
1075*4bdc9457SAndroid Build Coastguard Worker 
1076*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk02c2 = _mm_load_ps(w + 100);
1077*4bdc9457SAndroid Build Coastguard Worker 
1078*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
1079*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1080*4bdc9457SAndroid Build Coastguard Worker 
1081*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk12c2 = _mm_load_ps(w + 104);
1082*4bdc9457SAndroid Build Coastguard Worker 
1083*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
1084*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
1085*4bdc9457SAndroid Build Coastguard Worker 
1086*4bdc9457SAndroid Build Coastguard Worker           const __m128 vk22c2 = _mm_load_ps(w + 108);
1087*4bdc9457SAndroid Build Coastguard Worker 
1088*4bdc9457SAndroid Build Coastguard Worker           vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
1089*4bdc9457SAndroid Build Coastguard Worker           vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
1090*4bdc9457SAndroid Build Coastguard Worker         }
1091*4bdc9457SAndroid Build Coastguard Worker 
1092*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_max_ps(vo0x0, vmin);
1093*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_max_ps(vo1x0, vmin);
1094*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_max_ps(vo0x1, vmin);
1095*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_max_ps(vo1x1, vmin);
1096*4bdc9457SAndroid Build Coastguard Worker 
1097*4bdc9457SAndroid Build Coastguard Worker         vo0x0 = _mm_min_ps(vo0x0, vmax);
1098*4bdc9457SAndroid Build Coastguard Worker         vo1x0 = _mm_min_ps(vo1x0, vmax);
1099*4bdc9457SAndroid Build Coastguard Worker         vo0x1 = _mm_min_ps(vo0x1, vmax);
1100*4bdc9457SAndroid Build Coastguard Worker         vo1x1 = _mm_min_ps(vo1x1, vmax);
1101*4bdc9457SAndroid Build Coastguard Worker 
1102*4bdc9457SAndroid Build Coastguard Worker         if (iw == 3) {
1103*4bdc9457SAndroid Build Coastguard Worker           // Exactly 2 output width elements remaining
1104*4bdc9457SAndroid Build Coastguard Worker           const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
1105*4bdc9457SAndroid Build Coastguard Worker           const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
1106*4bdc9457SAndroid Build Coastguard Worker           const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
1107*4bdc9457SAndroid Build Coastguard Worker           const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
1108*4bdc9457SAndroid Build Coastguard Worker 
1109*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
1110*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
1111*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
1112*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
1113*4bdc9457SAndroid Build Coastguard Worker 
1114*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
1115*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
1116*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
1117*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
1118*4bdc9457SAndroid Build Coastguard Worker         } else {
1119*4bdc9457SAndroid Build Coastguard Worker           // Exactly 1 output width element remaining
1120*4bdc9457SAndroid Build Coastguard Worker 
1121*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
1122*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
1123*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
1124*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;
1125*4bdc9457SAndroid Build Coastguard Worker 
1126*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
1127*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
1128*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
1129*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
1130*4bdc9457SAndroid Build Coastguard Worker         }
1131*4bdc9457SAndroid Build Coastguard Worker       }
1132*4bdc9457SAndroid Build Coastguard Worker       // Move output pointers back to the position of the first pixel in a row,
1133*4bdc9457SAndroid Build Coastguard Worker       // and forward to the next block of output channels.
1134*4bdc9457SAndroid Build Coastguard Worker       o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
1135*4bdc9457SAndroid Build Coastguard Worker       o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
1136*4bdc9457SAndroid Build Coastguard Worker       o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
1137*4bdc9457SAndroid Build Coastguard Worker       o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
1138*4bdc9457SAndroid Build Coastguard Worker       o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
1139*4bdc9457SAndroid Build Coastguard Worker       o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
1140*4bdc9457SAndroid Build Coastguard Worker       o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
1141*4bdc9457SAndroid Build Coastguard Worker       o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
1142*4bdc9457SAndroid Build Coastguard Worker       // Revert input pointers to the position of the first pixel in a row
1143*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
1144*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
1145*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
1146*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
1147*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
1148*4bdc9457SAndroid Build Coastguard Worker       // Move to the block of weights for the next 4 output channels
1149*4bdc9457SAndroid Build Coastguard Worker       w += 112;
1150*4bdc9457SAndroid Build Coastguard Worker       c = doz(c, 4);
1151*4bdc9457SAndroid Build Coastguard Worker     } while (c != 0);
1152*4bdc9457SAndroid Build Coastguard Worker     // Move output pointers forward to the next two rows
1153*4bdc9457SAndroid Build Coastguard Worker     output0 = (float*) ((uintptr_t) output1 + output_height_stride);
1154*4bdc9457SAndroid Build Coastguard Worker     output1 = (float*) ((uintptr_t) output0 + output_height_stride);
1155*4bdc9457SAndroid Build Coastguard Worker     // Move input pointers forward to the next four rows
1156*4bdc9457SAndroid Build Coastguard Worker     i0 = i4;
1157*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1158*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1159*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
1160*4bdc9457SAndroid Build Coastguard Worker     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
1161*4bdc9457SAndroid Build Coastguard Worker   }
1162*4bdc9457SAndroid Build Coastguard Worker }
1163*4bdc9457SAndroid Build Coastguard Worker 
1164*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up8x25__sse(
1165*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
1166*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
1167*4bdc9457SAndroid Build Coastguard Worker     const float** input,
1168*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
1169*4bdc9457SAndroid Build Coastguard Worker     float* output,
1170*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
1171*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
1172*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
1173*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
1174*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1175*4bdc9457SAndroid Build Coastguard Worker {
1176*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
1177*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
1178*4bdc9457SAndroid Build Coastguard Worker 
1179*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
1180*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
1181*4bdc9457SAndroid Build Coastguard Worker   do {
1182*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
1183*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
1184*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
1185*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
1186*4bdc9457SAndroid Build Coastguard Worker     }
1187*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
1188*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
1189*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
1190*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
1191*4bdc9457SAndroid Build Coastguard Worker     }
1192*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
1193*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
1194*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
1195*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
1196*4bdc9457SAndroid Build Coastguard Worker     }
1197*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = input[3];
1198*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
1199*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
1200*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
1201*4bdc9457SAndroid Build Coastguard Worker     }
1202*4bdc9457SAndroid Build Coastguard Worker     const float* i4 = input[4];
1203*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
1204*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
1205*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
1206*4bdc9457SAndroid Build Coastguard Worker     }
1207*4bdc9457SAndroid Build Coastguard Worker     const float* i5 = input[5];
1208*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
1209*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
1210*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
1211*4bdc9457SAndroid Build Coastguard Worker     }
1212*4bdc9457SAndroid Build Coastguard Worker     const float* i6 = input[6];
1213*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
1214*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
1215*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
1216*4bdc9457SAndroid Build Coastguard Worker     }
1217*4bdc9457SAndroid Build Coastguard Worker     const float* i7 = input[7];
1218*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
1219*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
1220*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
1221*4bdc9457SAndroid Build Coastguard Worker     }
1222*4bdc9457SAndroid Build Coastguard Worker     const float* i8 = input[8];
1223*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
1224*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
1225*4bdc9457SAndroid Build Coastguard Worker       i8 = (const float*) ((uintptr_t) i8 + input_offset);
1226*4bdc9457SAndroid Build Coastguard Worker     }
1227*4bdc9457SAndroid Build Coastguard Worker     const float* i9 = input[9];
1228*4bdc9457SAndroid Build Coastguard Worker     assert(i9 != NULL);
1229*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i9 != zero) {
1230*4bdc9457SAndroid Build Coastguard Worker       i9 = (const float*) ((uintptr_t) i9 + input_offset);
1231*4bdc9457SAndroid Build Coastguard Worker     }
1232*4bdc9457SAndroid Build Coastguard Worker     const float* i10 = input[10];
1233*4bdc9457SAndroid Build Coastguard Worker     assert(i10 != NULL);
1234*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i10 != zero) {
1235*4bdc9457SAndroid Build Coastguard Worker       i10 = (const float*) ((uintptr_t) i10 + input_offset);
1236*4bdc9457SAndroid Build Coastguard Worker     }
1237*4bdc9457SAndroid Build Coastguard Worker     const float* i11 = input[11];
1238*4bdc9457SAndroid Build Coastguard Worker     assert(i11 != NULL);
1239*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i11 != zero) {
1240*4bdc9457SAndroid Build Coastguard Worker       i11 = (const float*) ((uintptr_t) i11 + input_offset);
1241*4bdc9457SAndroid Build Coastguard Worker     }
1242*4bdc9457SAndroid Build Coastguard Worker     const float* i12 = input[12];
1243*4bdc9457SAndroid Build Coastguard Worker     assert(i12 != NULL);
1244*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i12 != zero) {
1245*4bdc9457SAndroid Build Coastguard Worker       i12 = (const float*) ((uintptr_t) i12 + input_offset);
1246*4bdc9457SAndroid Build Coastguard Worker     }
1247*4bdc9457SAndroid Build Coastguard Worker     const float* i13 = input[13];
1248*4bdc9457SAndroid Build Coastguard Worker     assert(i13 != NULL);
1249*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i13 != zero) {
1250*4bdc9457SAndroid Build Coastguard Worker       i13 = (const float*) ((uintptr_t) i13 + input_offset);
1251*4bdc9457SAndroid Build Coastguard Worker     }
1252*4bdc9457SAndroid Build Coastguard Worker     const float* i14 = input[14];
1253*4bdc9457SAndroid Build Coastguard Worker     assert(i14 != NULL);
1254*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i14 != zero) {
1255*4bdc9457SAndroid Build Coastguard Worker       i14 = (const float*) ((uintptr_t) i14 + input_offset);
1256*4bdc9457SAndroid Build Coastguard Worker     }
1257*4bdc9457SAndroid Build Coastguard Worker     const float* i15 = input[15];
1258*4bdc9457SAndroid Build Coastguard Worker     assert(i15 != NULL);
1259*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i15 != zero) {
1260*4bdc9457SAndroid Build Coastguard Worker       i15 = (const float*) ((uintptr_t) i15 + input_offset);
1261*4bdc9457SAndroid Build Coastguard Worker     }
1262*4bdc9457SAndroid Build Coastguard Worker     const float* i16 = input[16];
1263*4bdc9457SAndroid Build Coastguard Worker     assert(i16 != NULL);
1264*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i16 != zero) {
1265*4bdc9457SAndroid Build Coastguard Worker       i16 = (const float*) ((uintptr_t) i16 + input_offset);
1266*4bdc9457SAndroid Build Coastguard Worker     }
1267*4bdc9457SAndroid Build Coastguard Worker     const float* i17 = input[17];
1268*4bdc9457SAndroid Build Coastguard Worker     assert(i17 != NULL);
1269*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i17 != zero) {
1270*4bdc9457SAndroid Build Coastguard Worker       i17 = (const float*) ((uintptr_t) i17 + input_offset);
1271*4bdc9457SAndroid Build Coastguard Worker     }
1272*4bdc9457SAndroid Build Coastguard Worker     const float* i18 = input[18];
1273*4bdc9457SAndroid Build Coastguard Worker     assert(i18 != NULL);
1274*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i18 != zero) {
1275*4bdc9457SAndroid Build Coastguard Worker       i18 = (const float*) ((uintptr_t) i18 + input_offset);
1276*4bdc9457SAndroid Build Coastguard Worker     }
1277*4bdc9457SAndroid Build Coastguard Worker     const float* i19 = input[19];
1278*4bdc9457SAndroid Build Coastguard Worker     assert(i19 != NULL);
1279*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i19 != zero) {
1280*4bdc9457SAndroid Build Coastguard Worker       i19 = (const float*) ((uintptr_t) i19 + input_offset);
1281*4bdc9457SAndroid Build Coastguard Worker     }
1282*4bdc9457SAndroid Build Coastguard Worker     const float* i20 = input[20];
1283*4bdc9457SAndroid Build Coastguard Worker     assert(i20 != NULL);
1284*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i20 != zero) {
1285*4bdc9457SAndroid Build Coastguard Worker       i20 = (const float*) ((uintptr_t) i20 + input_offset);
1286*4bdc9457SAndroid Build Coastguard Worker     }
1287*4bdc9457SAndroid Build Coastguard Worker     const float* i21 = input[21];
1288*4bdc9457SAndroid Build Coastguard Worker     assert(i21 != NULL);
1289*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i21 != zero) {
1290*4bdc9457SAndroid Build Coastguard Worker       i21 = (const float*) ((uintptr_t) i21 + input_offset);
1291*4bdc9457SAndroid Build Coastguard Worker     }
1292*4bdc9457SAndroid Build Coastguard Worker     const float* i22 = input[22];
1293*4bdc9457SAndroid Build Coastguard Worker     assert(i22 != NULL);
1294*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i22 != zero) {
1295*4bdc9457SAndroid Build Coastguard Worker       i22 = (const float*) ((uintptr_t) i22 + input_offset);
1296*4bdc9457SAndroid Build Coastguard Worker     }
1297*4bdc9457SAndroid Build Coastguard Worker     const float* i23 = input[23];
1298*4bdc9457SAndroid Build Coastguard Worker     assert(i23 != NULL);
1299*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i23 != zero) {
1300*4bdc9457SAndroid Build Coastguard Worker       i23 = (const float*) ((uintptr_t) i23 + input_offset);
1301*4bdc9457SAndroid Build Coastguard Worker     }
1302*4bdc9457SAndroid Build Coastguard Worker     const float* i24 = input[24];
1303*4bdc9457SAndroid Build Coastguard Worker     assert(i24 != NULL);
1304*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i24 != zero) {
1305*4bdc9457SAndroid Build Coastguard Worker       i24 = (const float*) ((uintptr_t) i24 + input_offset);
1306*4bdc9457SAndroid Build Coastguard Worker     }
1307*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_stride);
1308*4bdc9457SAndroid Build Coastguard Worker 
1309*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
1310*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
1311*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 8; c -= 8) {
1312*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1313*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567p0 = _mm_load_ps(w + 4);
1314*4bdc9457SAndroid Build Coastguard Worker 
1315*4bdc9457SAndroid Build Coastguard Worker 
1316*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1317*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1318*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
1319*4bdc9457SAndroid Build Coastguard Worker 
1320*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1321*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x4567 = _mm_load_ps(w + 12);
1322*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1323*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1324*4bdc9457SAndroid Build Coastguard Worker 
1325*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1326*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1327*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
1328*4bdc9457SAndroid Build Coastguard Worker 
1329*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1330*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x4567 = _mm_load_ps(w + 20);
1331*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1332*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1333*4bdc9457SAndroid Build Coastguard Worker 
1334*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1335*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1336*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
1337*4bdc9457SAndroid Build Coastguard Worker 
1338*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1339*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x4567 = _mm_load_ps(w + 28);
1340*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1341*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1342*4bdc9457SAndroid Build Coastguard Worker 
1343*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1344*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
1345*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
1346*4bdc9457SAndroid Build Coastguard Worker 
1347*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1348*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x4567 = _mm_load_ps(w + 36);
1349*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1350*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
1351*4bdc9457SAndroid Build Coastguard Worker 
1352*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1353*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
1354*4bdc9457SAndroid Build Coastguard Worker       i4 += 8;
1355*4bdc9457SAndroid Build Coastguard Worker 
1356*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1357*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x4567 = _mm_load_ps(w + 44);
1358*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1359*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
1360*4bdc9457SAndroid Build Coastguard Worker 
1361*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1362*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
1363*4bdc9457SAndroid Build Coastguard Worker       i5 += 8;
1364*4bdc9457SAndroid Build Coastguard Worker 
1365*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1366*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x4567 = _mm_load_ps(w + 52);
1367*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1368*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
1369*4bdc9457SAndroid Build Coastguard Worker 
1370*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1371*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
1372*4bdc9457SAndroid Build Coastguard Worker       i6 += 8;
1373*4bdc9457SAndroid Build Coastguard Worker 
1374*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1375*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x4567 = _mm_load_ps(w + 60);
1376*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1377*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
1378*4bdc9457SAndroid Build Coastguard Worker 
1379*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1380*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
1381*4bdc9457SAndroid Build Coastguard Worker       i7 += 8;
1382*4bdc9457SAndroid Build Coastguard Worker 
1383*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1384*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x4567 = _mm_load_ps(w + 68);
1385*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1386*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
1387*4bdc9457SAndroid Build Coastguard Worker 
1388*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1389*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
1390*4bdc9457SAndroid Build Coastguard Worker       i8 += 8;
1391*4bdc9457SAndroid Build Coastguard Worker 
1392*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1393*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x4567 = _mm_load_ps(w + 76);
1394*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1395*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
1396*4bdc9457SAndroid Build Coastguard Worker 
1397*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1398*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
1399*4bdc9457SAndroid Build Coastguard Worker       i9 += 8;
1400*4bdc9457SAndroid Build Coastguard Worker 
1401*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1402*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk9x4567 = _mm_load_ps(w + 84);
1403*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1404*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi9x4567, vk9x4567));
1405*4bdc9457SAndroid Build Coastguard Worker 
1406*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1407*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
1408*4bdc9457SAndroid Build Coastguard Worker       i10 += 8;
1409*4bdc9457SAndroid Build Coastguard Worker 
1410*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1411*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk10x4567 = _mm_load_ps(w + 92);
1412*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1413*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
1414*4bdc9457SAndroid Build Coastguard Worker 
1415*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1416*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
1417*4bdc9457SAndroid Build Coastguard Worker       i11 += 8;
1418*4bdc9457SAndroid Build Coastguard Worker 
1419*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1420*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk11x4567 = _mm_load_ps(w + 100);
1421*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1422*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi11x4567, vk11x4567));
1423*4bdc9457SAndroid Build Coastguard Worker 
1424*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1425*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
1426*4bdc9457SAndroid Build Coastguard Worker       i12 += 8;
1427*4bdc9457SAndroid Build Coastguard Worker 
1428*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1429*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk12x4567 = _mm_load_ps(w + 108);
1430*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1431*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
1432*4bdc9457SAndroid Build Coastguard Worker 
1433*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1434*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
1435*4bdc9457SAndroid Build Coastguard Worker       i13 += 8;
1436*4bdc9457SAndroid Build Coastguard Worker 
1437*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1438*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk13x4567 = _mm_load_ps(w + 116);
1439*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1440*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi13x4567, vk13x4567));
1441*4bdc9457SAndroid Build Coastguard Worker 
1442*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1443*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
1444*4bdc9457SAndroid Build Coastguard Worker       i14 += 8;
1445*4bdc9457SAndroid Build Coastguard Worker 
1446*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1447*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk14x4567 = _mm_load_ps(w + 124);
1448*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1449*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
1450*4bdc9457SAndroid Build Coastguard Worker 
1451*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1452*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
1453*4bdc9457SAndroid Build Coastguard Worker       i15 += 8;
1454*4bdc9457SAndroid Build Coastguard Worker 
1455*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1456*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk15x4567 = _mm_load_ps(w + 132);
1457*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1458*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi15x4567, vk15x4567));
1459*4bdc9457SAndroid Build Coastguard Worker 
1460*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1461*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
1462*4bdc9457SAndroid Build Coastguard Worker       i16 += 8;
1463*4bdc9457SAndroid Build Coastguard Worker 
1464*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1465*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk16x4567 = _mm_load_ps(w + 140);
1466*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1467*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
1468*4bdc9457SAndroid Build Coastguard Worker 
1469*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1470*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
1471*4bdc9457SAndroid Build Coastguard Worker       i17 += 8;
1472*4bdc9457SAndroid Build Coastguard Worker 
1473*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1474*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk17x4567 = _mm_load_ps(w + 148);
1475*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1476*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi17x4567, vk17x4567));
1477*4bdc9457SAndroid Build Coastguard Worker 
1478*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1479*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
1480*4bdc9457SAndroid Build Coastguard Worker       i18 += 8;
1481*4bdc9457SAndroid Build Coastguard Worker 
1482*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1483*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk18x4567 = _mm_load_ps(w + 156);
1484*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1485*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
1486*4bdc9457SAndroid Build Coastguard Worker 
1487*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1488*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
1489*4bdc9457SAndroid Build Coastguard Worker       i19 += 8;
1490*4bdc9457SAndroid Build Coastguard Worker 
1491*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1492*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk19x4567 = _mm_load_ps(w + 164);
1493*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1494*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi19x4567, vk19x4567));
1495*4bdc9457SAndroid Build Coastguard Worker 
1496*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1497*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
1498*4bdc9457SAndroid Build Coastguard Worker       i20 += 8;
1499*4bdc9457SAndroid Build Coastguard Worker 
1500*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1501*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk20x4567 = _mm_load_ps(w + 172);
1502*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1503*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
1504*4bdc9457SAndroid Build Coastguard Worker 
1505*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1506*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
1507*4bdc9457SAndroid Build Coastguard Worker       i21 += 8;
1508*4bdc9457SAndroid Build Coastguard Worker 
1509*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1510*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk21x4567 = _mm_load_ps(w + 180);
1511*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1512*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi21x4567, vk21x4567));
1513*4bdc9457SAndroid Build Coastguard Worker 
1514*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1515*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
1516*4bdc9457SAndroid Build Coastguard Worker       i22 += 8;
1517*4bdc9457SAndroid Build Coastguard Worker 
1518*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1519*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk22x4567 = _mm_load_ps(w + 188);
1520*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1521*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
1522*4bdc9457SAndroid Build Coastguard Worker 
1523*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1524*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
1525*4bdc9457SAndroid Build Coastguard Worker       i23 += 8;
1526*4bdc9457SAndroid Build Coastguard Worker 
1527*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1528*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk23x4567 = _mm_load_ps(w + 196);
1529*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1530*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi23x4567, vk23x4567));
1531*4bdc9457SAndroid Build Coastguard Worker 
1532*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1533*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
1534*4bdc9457SAndroid Build Coastguard Worker       i24 += 8;
1535*4bdc9457SAndroid Build Coastguard Worker 
1536*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1537*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk24x4567 = _mm_load_ps(w + 204);
1538*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1539*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
1540*4bdc9457SAndroid Build Coastguard Worker 
1541*4bdc9457SAndroid Build Coastguard Worker       w += 208;
1542*4bdc9457SAndroid Build Coastguard Worker 
1543*4bdc9457SAndroid Build Coastguard Worker 
1544*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1545*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1546*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1547*4bdc9457SAndroid Build Coastguard Worker       vacc4567 = _mm_min_ps(vacc4567, vmax);
1548*4bdc9457SAndroid Build Coastguard Worker 
1549*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
1550*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vacc4567);
1551*4bdc9457SAndroid Build Coastguard Worker       output += 8;
1552*4bdc9457SAndroid Build Coastguard Worker     }
1553*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4; c -= 4) {
1554*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1555*4bdc9457SAndroid Build Coastguard Worker 
1556*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1557*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
1558*4bdc9457SAndroid Build Coastguard Worker 
1559*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1560*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1561*4bdc9457SAndroid Build Coastguard Worker 
1562*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1563*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
1564*4bdc9457SAndroid Build Coastguard Worker 
1565*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1566*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1567*4bdc9457SAndroid Build Coastguard Worker 
1568*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1569*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
1570*4bdc9457SAndroid Build Coastguard Worker 
1571*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1572*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1573*4bdc9457SAndroid Build Coastguard Worker 
1574*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1575*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
1576*4bdc9457SAndroid Build Coastguard Worker 
1577*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1578*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1579*4bdc9457SAndroid Build Coastguard Worker 
1580*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1581*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
1582*4bdc9457SAndroid Build Coastguard Worker 
1583*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1584*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1585*4bdc9457SAndroid Build Coastguard Worker 
1586*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1587*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
1588*4bdc9457SAndroid Build Coastguard Worker 
1589*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1590*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1591*4bdc9457SAndroid Build Coastguard Worker 
1592*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1593*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
1594*4bdc9457SAndroid Build Coastguard Worker 
1595*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1596*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1597*4bdc9457SAndroid Build Coastguard Worker 
1598*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1599*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
1600*4bdc9457SAndroid Build Coastguard Worker 
1601*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1602*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1603*4bdc9457SAndroid Build Coastguard Worker 
1604*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1605*4bdc9457SAndroid Build Coastguard Worker       i8 += 4;
1606*4bdc9457SAndroid Build Coastguard Worker 
1607*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1608*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1609*4bdc9457SAndroid Build Coastguard Worker 
1610*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1611*4bdc9457SAndroid Build Coastguard Worker       i9 += 4;
1612*4bdc9457SAndroid Build Coastguard Worker 
1613*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1614*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1615*4bdc9457SAndroid Build Coastguard Worker 
1616*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1617*4bdc9457SAndroid Build Coastguard Worker       i10 += 4;
1618*4bdc9457SAndroid Build Coastguard Worker 
1619*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1620*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1621*4bdc9457SAndroid Build Coastguard Worker 
1622*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1623*4bdc9457SAndroid Build Coastguard Worker       i11 += 4;
1624*4bdc9457SAndroid Build Coastguard Worker 
1625*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1626*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1627*4bdc9457SAndroid Build Coastguard Worker 
1628*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1629*4bdc9457SAndroid Build Coastguard Worker       i12 += 4;
1630*4bdc9457SAndroid Build Coastguard Worker 
1631*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1632*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1633*4bdc9457SAndroid Build Coastguard Worker 
1634*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1635*4bdc9457SAndroid Build Coastguard Worker       i13 += 4;
1636*4bdc9457SAndroid Build Coastguard Worker 
1637*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1638*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1639*4bdc9457SAndroid Build Coastguard Worker 
1640*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1641*4bdc9457SAndroid Build Coastguard Worker       i14 += 4;
1642*4bdc9457SAndroid Build Coastguard Worker 
1643*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1644*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1645*4bdc9457SAndroid Build Coastguard Worker 
1646*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1647*4bdc9457SAndroid Build Coastguard Worker       i15 += 4;
1648*4bdc9457SAndroid Build Coastguard Worker 
1649*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1650*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1651*4bdc9457SAndroid Build Coastguard Worker 
1652*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1653*4bdc9457SAndroid Build Coastguard Worker       i16 += 4;
1654*4bdc9457SAndroid Build Coastguard Worker 
1655*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1656*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1657*4bdc9457SAndroid Build Coastguard Worker 
1658*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1659*4bdc9457SAndroid Build Coastguard Worker       i17 += 4;
1660*4bdc9457SAndroid Build Coastguard Worker 
1661*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1662*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1663*4bdc9457SAndroid Build Coastguard Worker 
1664*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1665*4bdc9457SAndroid Build Coastguard Worker       i18 += 4;
1666*4bdc9457SAndroid Build Coastguard Worker 
1667*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1668*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1669*4bdc9457SAndroid Build Coastguard Worker 
1670*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1671*4bdc9457SAndroid Build Coastguard Worker       i19 += 4;
1672*4bdc9457SAndroid Build Coastguard Worker 
1673*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1674*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1675*4bdc9457SAndroid Build Coastguard Worker 
1676*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1677*4bdc9457SAndroid Build Coastguard Worker       i20 += 4;
1678*4bdc9457SAndroid Build Coastguard Worker 
1679*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1680*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1681*4bdc9457SAndroid Build Coastguard Worker 
1682*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1683*4bdc9457SAndroid Build Coastguard Worker       i21 += 4;
1684*4bdc9457SAndroid Build Coastguard Worker 
1685*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1686*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1687*4bdc9457SAndroid Build Coastguard Worker 
1688*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1689*4bdc9457SAndroid Build Coastguard Worker       i22 += 4;
1690*4bdc9457SAndroid Build Coastguard Worker 
1691*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1692*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1693*4bdc9457SAndroid Build Coastguard Worker 
1694*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1695*4bdc9457SAndroid Build Coastguard Worker       i23 += 4;
1696*4bdc9457SAndroid Build Coastguard Worker 
1697*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1698*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1699*4bdc9457SAndroid Build Coastguard Worker 
1700*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1701*4bdc9457SAndroid Build Coastguard Worker       i24 += 4;
1702*4bdc9457SAndroid Build Coastguard Worker 
1703*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1704*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1705*4bdc9457SAndroid Build Coastguard Worker 
1706*4bdc9457SAndroid Build Coastguard Worker       w += 4;
1707*4bdc9457SAndroid Build Coastguard Worker 
1708*4bdc9457SAndroid Build Coastguard Worker 
1709*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1710*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1711*4bdc9457SAndroid Build Coastguard Worker 
1712*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
1713*4bdc9457SAndroid Build Coastguard Worker       output += 4;
1714*4bdc9457SAndroid Build Coastguard Worker     }
1715*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
1716*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1717*4bdc9457SAndroid Build Coastguard Worker 
1718*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1719*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1720*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1721*4bdc9457SAndroid Build Coastguard Worker 
1722*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1723*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1724*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1725*4bdc9457SAndroid Build Coastguard Worker 
1726*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1727*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1728*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1729*4bdc9457SAndroid Build Coastguard Worker 
1730*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
1731*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
1732*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
1733*4bdc9457SAndroid Build Coastguard Worker 
1734*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
1735*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
1736*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
1737*4bdc9457SAndroid Build Coastguard Worker 
1738*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
1739*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
1740*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
1741*4bdc9457SAndroid Build Coastguard Worker 
1742*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
1743*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
1744*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
1745*4bdc9457SAndroid Build Coastguard Worker 
1746*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
1747*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
1748*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
1749*4bdc9457SAndroid Build Coastguard Worker 
1750*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
1751*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
1752*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
1753*4bdc9457SAndroid Build Coastguard Worker 
1754*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi9x0123 = _mm_loadu_ps(i9);
1755*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk9x0123 = _mm_load_ps(w + 80);
1756*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
1757*4bdc9457SAndroid Build Coastguard Worker 
1758*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi10x0123 = _mm_loadu_ps(i10);
1759*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk10x0123 = _mm_load_ps(w + 88);
1760*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
1761*4bdc9457SAndroid Build Coastguard Worker 
1762*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi11x0123 = _mm_loadu_ps(i11);
1763*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk11x0123 = _mm_load_ps(w + 96);
1764*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
1765*4bdc9457SAndroid Build Coastguard Worker 
1766*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi12x0123 = _mm_loadu_ps(i12);
1767*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk12x0123 = _mm_load_ps(w + 104);
1768*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
1769*4bdc9457SAndroid Build Coastguard Worker 
1770*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi13x0123 = _mm_loadu_ps(i13);
1771*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk13x0123 = _mm_load_ps(w + 112);
1772*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
1773*4bdc9457SAndroid Build Coastguard Worker 
1774*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi14x0123 = _mm_loadu_ps(i14);
1775*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk14x0123 = _mm_load_ps(w + 120);
1776*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
1777*4bdc9457SAndroid Build Coastguard Worker 
1778*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi15x0123 = _mm_loadu_ps(i15);
1779*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk15x0123 = _mm_load_ps(w + 128);
1780*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
1781*4bdc9457SAndroid Build Coastguard Worker 
1782*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi16x0123 = _mm_loadu_ps(i16);
1783*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk16x0123 = _mm_load_ps(w + 136);
1784*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
1785*4bdc9457SAndroid Build Coastguard Worker 
1786*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi17x0123 = _mm_loadu_ps(i17);
1787*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk17x0123 = _mm_load_ps(w + 144);
1788*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
1789*4bdc9457SAndroid Build Coastguard Worker 
1790*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi18x0123 = _mm_loadu_ps(i18);
1791*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk18x0123 = _mm_load_ps(w + 152);
1792*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
1793*4bdc9457SAndroid Build Coastguard Worker 
1794*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi19x0123 = _mm_loadu_ps(i19);
1795*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk19x0123 = _mm_load_ps(w + 160);
1796*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
1797*4bdc9457SAndroid Build Coastguard Worker 
1798*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi20x0123 = _mm_loadu_ps(i20);
1799*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk20x0123 = _mm_load_ps(w + 168);
1800*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
1801*4bdc9457SAndroid Build Coastguard Worker 
1802*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi21x0123 = _mm_loadu_ps(i21);
1803*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk21x0123 = _mm_load_ps(w + 176);
1804*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
1805*4bdc9457SAndroid Build Coastguard Worker 
1806*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi22x0123 = _mm_loadu_ps(i22);
1807*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk22x0123 = _mm_load_ps(w + 184);
1808*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
1809*4bdc9457SAndroid Build Coastguard Worker 
1810*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi23x0123 = _mm_loadu_ps(i23);
1811*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk23x0123 = _mm_load_ps(w + 192);
1812*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
1813*4bdc9457SAndroid Build Coastguard Worker 
1814*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi24x0123 = _mm_loadu_ps(i24);
1815*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk24x0123 = _mm_load_ps(w + 200);
1816*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
1817*4bdc9457SAndroid Build Coastguard Worker 
1818*4bdc9457SAndroid Build Coastguard Worker 
1819*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1820*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1821*4bdc9457SAndroid Build Coastguard Worker 
1822*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
1823*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vacc0123);
1824*4bdc9457SAndroid Build Coastguard Worker         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1825*4bdc9457SAndroid Build Coastguard Worker         output += 2;
1826*4bdc9457SAndroid Build Coastguard Worker       }
1827*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
1828*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vacc0123);
1829*4bdc9457SAndroid Build Coastguard Worker         output += 1;
1830*4bdc9457SAndroid Build Coastguard Worker       }
1831*4bdc9457SAndroid Build Coastguard Worker     }
1832*4bdc9457SAndroid Build Coastguard Worker 
1833*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
1834*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
1835*4bdc9457SAndroid Build Coastguard Worker }
1836*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_dwconv_minmax_ukernel_up8x3__sse(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1837*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up8x3__sse(
1838*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
1839*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
1840*4bdc9457SAndroid Build Coastguard Worker     const float** input,
1841*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
1842*4bdc9457SAndroid Build Coastguard Worker     float* output,
1843*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
1844*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
1845*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
1846*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
1847*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1848*4bdc9457SAndroid Build Coastguard Worker {
1849*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
1850*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
1851*4bdc9457SAndroid Build Coastguard Worker 
1852*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
1853*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
1854*4bdc9457SAndroid Build Coastguard Worker   do {
1855*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
1856*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
1857*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
1858*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
1859*4bdc9457SAndroid Build Coastguard Worker     }
1860*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
1861*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
1862*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
1863*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
1864*4bdc9457SAndroid Build Coastguard Worker     }
1865*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
1866*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
1867*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
1868*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
1869*4bdc9457SAndroid Build Coastguard Worker     }
1870*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_stride);
1871*4bdc9457SAndroid Build Coastguard Worker 
1872*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
1873*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
1874*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 8; c -= 8) {
1875*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1876*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567p0 = _mm_load_ps(w + 4);
1877*4bdc9457SAndroid Build Coastguard Worker 
1878*4bdc9457SAndroid Build Coastguard Worker 
1879*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1880*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1881*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
1882*4bdc9457SAndroid Build Coastguard Worker 
1883*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1884*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x4567 = _mm_load_ps(w + 12);
1885*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1886*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
1887*4bdc9457SAndroid Build Coastguard Worker 
1888*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1889*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1890*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
1891*4bdc9457SAndroid Build Coastguard Worker 
1892*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1893*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x4567 = _mm_load_ps(w + 20);
1894*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1895*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
1896*4bdc9457SAndroid Build Coastguard Worker 
1897*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1898*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
1899*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
1900*4bdc9457SAndroid Build Coastguard Worker 
1901*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1902*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x4567 = _mm_load_ps(w + 28);
1903*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1904*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
1905*4bdc9457SAndroid Build Coastguard Worker 
1906*4bdc9457SAndroid Build Coastguard Worker       w += 32;
1907*4bdc9457SAndroid Build Coastguard Worker 
1908*4bdc9457SAndroid Build Coastguard Worker 
1909*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1910*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
1911*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1912*4bdc9457SAndroid Build Coastguard Worker       vacc4567 = _mm_min_ps(vacc4567, vmax);
1913*4bdc9457SAndroid Build Coastguard Worker 
1914*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
1915*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vacc4567);
1916*4bdc9457SAndroid Build Coastguard Worker       output += 8;
1917*4bdc9457SAndroid Build Coastguard Worker     }
1918*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4; c -= 4) {
1919*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1920*4bdc9457SAndroid Build Coastguard Worker 
1921*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1922*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
1923*4bdc9457SAndroid Build Coastguard Worker 
1924*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1925*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1926*4bdc9457SAndroid Build Coastguard Worker 
1927*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1928*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
1929*4bdc9457SAndroid Build Coastguard Worker 
1930*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1931*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1932*4bdc9457SAndroid Build Coastguard Worker 
1933*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1934*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
1935*4bdc9457SAndroid Build Coastguard Worker 
1936*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1937*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1938*4bdc9457SAndroid Build Coastguard Worker 
1939*4bdc9457SAndroid Build Coastguard Worker       w += 4;
1940*4bdc9457SAndroid Build Coastguard Worker 
1941*4bdc9457SAndroid Build Coastguard Worker 
1942*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1943*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1944*4bdc9457SAndroid Build Coastguard Worker 
1945*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
1946*4bdc9457SAndroid Build Coastguard Worker       output += 4;
1947*4bdc9457SAndroid Build Coastguard Worker     }
1948*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
1949*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
1950*4bdc9457SAndroid Build Coastguard Worker 
1951*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
1952*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
1953*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
1954*4bdc9457SAndroid Build Coastguard Worker 
1955*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
1956*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
1957*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
1958*4bdc9457SAndroid Build Coastguard Worker 
1959*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
1960*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
1961*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
1962*4bdc9457SAndroid Build Coastguard Worker 
1963*4bdc9457SAndroid Build Coastguard Worker 
1964*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
1965*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
1966*4bdc9457SAndroid Build Coastguard Worker 
1967*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
1968*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vacc0123);
1969*4bdc9457SAndroid Build Coastguard Worker         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1970*4bdc9457SAndroid Build Coastguard Worker         output += 2;
1971*4bdc9457SAndroid Build Coastguard Worker       }
1972*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
1973*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vacc0123);
1974*4bdc9457SAndroid Build Coastguard Worker         output += 1;
1975*4bdc9457SAndroid Build Coastguard Worker       }
1976*4bdc9457SAndroid Build Coastguard Worker     }
1977*4bdc9457SAndroid Build Coastguard Worker 
1978*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
1979*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
1980*4bdc9457SAndroid Build Coastguard Worker }
1981*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_dwconv_minmax_ukernel_up8x4__sse(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1982*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up8x4__sse(
1983*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
1984*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
1985*4bdc9457SAndroid Build Coastguard Worker     const float** input,
1986*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
1987*4bdc9457SAndroid Build Coastguard Worker     float* output,
1988*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
1989*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
1990*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
1991*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
1992*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1993*4bdc9457SAndroid Build Coastguard Worker {
1994*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
1995*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
1996*4bdc9457SAndroid Build Coastguard Worker 
1997*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
1998*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
1999*4bdc9457SAndroid Build Coastguard Worker   do {
2000*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
2001*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
2002*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
2003*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
2004*4bdc9457SAndroid Build Coastguard Worker     }
2005*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
2006*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
2007*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
2008*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
2009*4bdc9457SAndroid Build Coastguard Worker     }
2010*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
2011*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
2012*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
2013*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
2014*4bdc9457SAndroid Build Coastguard Worker     }
2015*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = input[3];
2016*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
2017*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
2018*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
2019*4bdc9457SAndroid Build Coastguard Worker     }
2020*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_stride);
2021*4bdc9457SAndroid Build Coastguard Worker 
2022*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
2023*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
2024*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 8; c -= 8) {
2025*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2026*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567p0 = _mm_load_ps(w + 4);
2027*4bdc9457SAndroid Build Coastguard Worker 
2028*4bdc9457SAndroid Build Coastguard Worker 
2029*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2030*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2031*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
2032*4bdc9457SAndroid Build Coastguard Worker 
2033*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2034*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x4567 = _mm_load_ps(w + 12);
2035*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2036*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2037*4bdc9457SAndroid Build Coastguard Worker 
2038*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2039*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2040*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
2041*4bdc9457SAndroid Build Coastguard Worker 
2042*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2043*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x4567 = _mm_load_ps(w + 20);
2044*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2045*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2046*4bdc9457SAndroid Build Coastguard Worker 
2047*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2048*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2049*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
2050*4bdc9457SAndroid Build Coastguard Worker 
2051*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2052*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x4567 = _mm_load_ps(w + 28);
2053*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2054*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2055*4bdc9457SAndroid Build Coastguard Worker 
2056*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2057*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2058*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
2059*4bdc9457SAndroid Build Coastguard Worker 
2060*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2061*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x4567 = _mm_load_ps(w + 36);
2062*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2063*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2064*4bdc9457SAndroid Build Coastguard Worker 
2065*4bdc9457SAndroid Build Coastguard Worker       w += 40;
2066*4bdc9457SAndroid Build Coastguard Worker 
2067*4bdc9457SAndroid Build Coastguard Worker 
2068*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2069*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2070*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2071*4bdc9457SAndroid Build Coastguard Worker       vacc4567 = _mm_min_ps(vacc4567, vmax);
2072*4bdc9457SAndroid Build Coastguard Worker 
2073*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
2074*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vacc4567);
2075*4bdc9457SAndroid Build Coastguard Worker       output += 8;
2076*4bdc9457SAndroid Build Coastguard Worker     }
2077*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4; c -= 4) {
2078*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2079*4bdc9457SAndroid Build Coastguard Worker 
2080*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2081*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
2082*4bdc9457SAndroid Build Coastguard Worker 
2083*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2084*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2085*4bdc9457SAndroid Build Coastguard Worker 
2086*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2087*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
2088*4bdc9457SAndroid Build Coastguard Worker 
2089*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2090*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2091*4bdc9457SAndroid Build Coastguard Worker 
2092*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2093*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
2094*4bdc9457SAndroid Build Coastguard Worker 
2095*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2096*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2097*4bdc9457SAndroid Build Coastguard Worker 
2098*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2099*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
2100*4bdc9457SAndroid Build Coastguard Worker 
2101*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2102*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2103*4bdc9457SAndroid Build Coastguard Worker 
2104*4bdc9457SAndroid Build Coastguard Worker       w += 4;
2105*4bdc9457SAndroid Build Coastguard Worker 
2106*4bdc9457SAndroid Build Coastguard Worker 
2107*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2108*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2109*4bdc9457SAndroid Build Coastguard Worker 
2110*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
2111*4bdc9457SAndroid Build Coastguard Worker       output += 4;
2112*4bdc9457SAndroid Build Coastguard Worker     }
2113*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
2114*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2115*4bdc9457SAndroid Build Coastguard Worker 
2116*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2117*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2118*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2119*4bdc9457SAndroid Build Coastguard Worker 
2120*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2121*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2122*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2123*4bdc9457SAndroid Build Coastguard Worker 
2124*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2125*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2126*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2127*4bdc9457SAndroid Build Coastguard Worker 
2128*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2129*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2130*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2131*4bdc9457SAndroid Build Coastguard Worker 
2132*4bdc9457SAndroid Build Coastguard Worker 
2133*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2134*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2135*4bdc9457SAndroid Build Coastguard Worker 
2136*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
2137*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vacc0123);
2138*4bdc9457SAndroid Build Coastguard Worker         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2139*4bdc9457SAndroid Build Coastguard Worker         output += 2;
2140*4bdc9457SAndroid Build Coastguard Worker       }
2141*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
2142*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vacc0123);
2143*4bdc9457SAndroid Build Coastguard Worker         output += 1;
2144*4bdc9457SAndroid Build Coastguard Worker       }
2145*4bdc9457SAndroid Build Coastguard Worker     }
2146*4bdc9457SAndroid Build Coastguard Worker 
2147*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
2148*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
2149*4bdc9457SAndroid Build Coastguard Worker }
2150*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_dwconv_minmax_ukernel_up8x9__sse(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2151*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up8x9__sse(
2152*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
2153*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
2154*4bdc9457SAndroid Build Coastguard Worker     const float** input,
2155*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
2156*4bdc9457SAndroid Build Coastguard Worker     float* output,
2157*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
2158*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
2159*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
2160*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
2161*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2162*4bdc9457SAndroid Build Coastguard Worker {
2163*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
2164*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
2165*4bdc9457SAndroid Build Coastguard Worker 
2166*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
2167*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
2168*4bdc9457SAndroid Build Coastguard Worker   do {
2169*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
2170*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
2171*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
2172*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
2173*4bdc9457SAndroid Build Coastguard Worker     }
2174*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
2175*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
2176*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
2177*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
2178*4bdc9457SAndroid Build Coastguard Worker     }
2179*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
2180*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
2181*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
2182*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
2183*4bdc9457SAndroid Build Coastguard Worker     }
2184*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = input[3];
2185*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
2186*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
2187*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
2188*4bdc9457SAndroid Build Coastguard Worker     }
2189*4bdc9457SAndroid Build Coastguard Worker     const float* i4 = input[4];
2190*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
2191*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
2192*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
2193*4bdc9457SAndroid Build Coastguard Worker     }
2194*4bdc9457SAndroid Build Coastguard Worker     const float* i5 = input[5];
2195*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
2196*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
2197*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
2198*4bdc9457SAndroid Build Coastguard Worker     }
2199*4bdc9457SAndroid Build Coastguard Worker     const float* i6 = input[6];
2200*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
2201*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
2202*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
2203*4bdc9457SAndroid Build Coastguard Worker     }
2204*4bdc9457SAndroid Build Coastguard Worker     const float* i7 = input[7];
2205*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
2206*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
2207*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
2208*4bdc9457SAndroid Build Coastguard Worker     }
2209*4bdc9457SAndroid Build Coastguard Worker     const float* i8 = input[8];
2210*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
2211*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
2212*4bdc9457SAndroid Build Coastguard Worker       i8 = (const float*) ((uintptr_t) i8 + input_offset);
2213*4bdc9457SAndroid Build Coastguard Worker     }
2214*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_stride);
2215*4bdc9457SAndroid Build Coastguard Worker 
2216*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
2217*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
2218*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 8; c -= 8) {
2219*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2220*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567p0 = _mm_load_ps(w + 4);
2221*4bdc9457SAndroid Build Coastguard Worker 
2222*4bdc9457SAndroid Build Coastguard Worker 
2223*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2224*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
2225*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
2226*4bdc9457SAndroid Build Coastguard Worker 
2227*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2228*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x4567 = _mm_load_ps(w + 12);
2229*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2230*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
2231*4bdc9457SAndroid Build Coastguard Worker 
2232*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2233*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
2234*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
2235*4bdc9457SAndroid Build Coastguard Worker 
2236*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2237*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x4567 = _mm_load_ps(w + 20);
2238*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2239*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
2240*4bdc9457SAndroid Build Coastguard Worker 
2241*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2242*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
2243*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
2244*4bdc9457SAndroid Build Coastguard Worker 
2245*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2246*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x4567 = _mm_load_ps(w + 28);
2247*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2248*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
2249*4bdc9457SAndroid Build Coastguard Worker 
2250*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2251*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
2252*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
2253*4bdc9457SAndroid Build Coastguard Worker 
2254*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2255*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x4567 = _mm_load_ps(w + 36);
2256*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2257*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
2258*4bdc9457SAndroid Build Coastguard Worker 
2259*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2260*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
2261*4bdc9457SAndroid Build Coastguard Worker       i4 += 8;
2262*4bdc9457SAndroid Build Coastguard Worker 
2263*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2264*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x4567 = _mm_load_ps(w + 44);
2265*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2266*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
2267*4bdc9457SAndroid Build Coastguard Worker 
2268*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2269*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
2270*4bdc9457SAndroid Build Coastguard Worker       i5 += 8;
2271*4bdc9457SAndroid Build Coastguard Worker 
2272*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2273*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x4567 = _mm_load_ps(w + 52);
2274*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2275*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
2276*4bdc9457SAndroid Build Coastguard Worker 
2277*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2278*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
2279*4bdc9457SAndroid Build Coastguard Worker       i6 += 8;
2280*4bdc9457SAndroid Build Coastguard Worker 
2281*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2282*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x4567 = _mm_load_ps(w + 60);
2283*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2284*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
2285*4bdc9457SAndroid Build Coastguard Worker 
2286*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2287*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
2288*4bdc9457SAndroid Build Coastguard Worker       i7 += 8;
2289*4bdc9457SAndroid Build Coastguard Worker 
2290*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2291*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x4567 = _mm_load_ps(w + 68);
2292*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2293*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
2294*4bdc9457SAndroid Build Coastguard Worker 
2295*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2296*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
2297*4bdc9457SAndroid Build Coastguard Worker       i8 += 8;
2298*4bdc9457SAndroid Build Coastguard Worker 
2299*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2300*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x4567 = _mm_load_ps(w + 76);
2301*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2302*4bdc9457SAndroid Build Coastguard Worker       vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
2303*4bdc9457SAndroid Build Coastguard Worker 
2304*4bdc9457SAndroid Build Coastguard Worker       w += 80;
2305*4bdc9457SAndroid Build Coastguard Worker 
2306*4bdc9457SAndroid Build Coastguard Worker 
2307*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2308*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
2309*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2310*4bdc9457SAndroid Build Coastguard Worker       vacc4567 = _mm_min_ps(vacc4567, vmax);
2311*4bdc9457SAndroid Build Coastguard Worker 
2312*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
2313*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vacc4567);
2314*4bdc9457SAndroid Build Coastguard Worker       output += 8;
2315*4bdc9457SAndroid Build Coastguard Worker     }
2316*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4; c -= 4) {
2317*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2318*4bdc9457SAndroid Build Coastguard Worker 
2319*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2320*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
2321*4bdc9457SAndroid Build Coastguard Worker 
2322*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2323*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2324*4bdc9457SAndroid Build Coastguard Worker 
2325*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2326*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
2327*4bdc9457SAndroid Build Coastguard Worker 
2328*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2329*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2330*4bdc9457SAndroid Build Coastguard Worker 
2331*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2332*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
2333*4bdc9457SAndroid Build Coastguard Worker 
2334*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2335*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2336*4bdc9457SAndroid Build Coastguard Worker 
2337*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2338*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
2339*4bdc9457SAndroid Build Coastguard Worker 
2340*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2341*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2342*4bdc9457SAndroid Build Coastguard Worker 
2343*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2344*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
2345*4bdc9457SAndroid Build Coastguard Worker 
2346*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2347*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2348*4bdc9457SAndroid Build Coastguard Worker 
2349*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2350*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
2351*4bdc9457SAndroid Build Coastguard Worker 
2352*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2353*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2354*4bdc9457SAndroid Build Coastguard Worker 
2355*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2356*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
2357*4bdc9457SAndroid Build Coastguard Worker 
2358*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2359*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2360*4bdc9457SAndroid Build Coastguard Worker 
2361*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2362*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
2363*4bdc9457SAndroid Build Coastguard Worker 
2364*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2365*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2366*4bdc9457SAndroid Build Coastguard Worker 
2367*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2368*4bdc9457SAndroid Build Coastguard Worker       i8 += 4;
2369*4bdc9457SAndroid Build Coastguard Worker 
2370*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2371*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2372*4bdc9457SAndroid Build Coastguard Worker 
2373*4bdc9457SAndroid Build Coastguard Worker       w += 4;
2374*4bdc9457SAndroid Build Coastguard Worker 
2375*4bdc9457SAndroid Build Coastguard Worker 
2376*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2377*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2378*4bdc9457SAndroid Build Coastguard Worker 
2379*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vacc0123);
2380*4bdc9457SAndroid Build Coastguard Worker       output += 4;
2381*4bdc9457SAndroid Build Coastguard Worker     }
2382*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
2383*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123p0 = _mm_load_ps(w);
2384*4bdc9457SAndroid Build Coastguard Worker 
2385*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x0123 = _mm_loadu_ps(i0);
2386*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk0x0123 = _mm_load_ps(w + 8);
2387*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
2388*4bdc9457SAndroid Build Coastguard Worker 
2389*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x0123 = _mm_loadu_ps(i1);
2390*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk1x0123 = _mm_load_ps(w + 16);
2391*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
2392*4bdc9457SAndroid Build Coastguard Worker 
2393*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x0123 = _mm_loadu_ps(i2);
2394*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk2x0123 = _mm_load_ps(w + 24);
2395*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
2396*4bdc9457SAndroid Build Coastguard Worker 
2397*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x0123 = _mm_loadu_ps(i3);
2398*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk3x0123 = _mm_load_ps(w + 32);
2399*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
2400*4bdc9457SAndroid Build Coastguard Worker 
2401*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x0123 = _mm_loadu_ps(i4);
2402*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk4x0123 = _mm_load_ps(w + 40);
2403*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
2404*4bdc9457SAndroid Build Coastguard Worker 
2405*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x0123 = _mm_loadu_ps(i5);
2406*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk5x0123 = _mm_load_ps(w + 48);
2407*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
2408*4bdc9457SAndroid Build Coastguard Worker 
2409*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x0123 = _mm_loadu_ps(i6);
2410*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk6x0123 = _mm_load_ps(w + 56);
2411*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
2412*4bdc9457SAndroid Build Coastguard Worker 
2413*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x0123 = _mm_loadu_ps(i7);
2414*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk7x0123 = _mm_load_ps(w + 64);
2415*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
2416*4bdc9457SAndroid Build Coastguard Worker 
2417*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8x0123 = _mm_loadu_ps(i8);
2418*4bdc9457SAndroid Build Coastguard Worker       const __m128 vk8x0123 = _mm_load_ps(w + 72);
2419*4bdc9457SAndroid Build Coastguard Worker       vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
2420*4bdc9457SAndroid Build Coastguard Worker 
2421*4bdc9457SAndroid Build Coastguard Worker 
2422*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
2423*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_min_ps(vacc0123, vmax);
2424*4bdc9457SAndroid Build Coastguard Worker 
2425*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
2426*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vacc0123);
2427*4bdc9457SAndroid Build Coastguard Worker         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2428*4bdc9457SAndroid Build Coastguard Worker         output += 2;
2429*4bdc9457SAndroid Build Coastguard Worker       }
2430*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
2431*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vacc0123);
2432*4bdc9457SAndroid Build Coastguard Worker         output += 1;
2433*4bdc9457SAndroid Build Coastguard Worker       }
2434*4bdc9457SAndroid Build Coastguard Worker     }
2435*4bdc9457SAndroid Build Coastguard Worker 
2436*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
2437*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
2438*4bdc9457SAndroid Build Coastguard Worker }
2439*4bdc9457SAndroid Build Coastguard Worker 
2440*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2(
2441*4bdc9457SAndroid Build Coastguard Worker     size_t input_height,
2442*4bdc9457SAndroid Build Coastguard Worker     size_t input_width,
2443*4bdc9457SAndroid Build Coastguard Worker     const float* input,
2444*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
2445*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
2446*4bdc9457SAndroid Build Coastguard Worker     float* output,
2447*4bdc9457SAndroid Build Coastguard Worker     uint32_t padding_top,
2448*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2449*4bdc9457SAndroid Build Coastguard Worker {
2450*4bdc9457SAndroid Build Coastguard Worker   assert(input_height != 0);
2451*4bdc9457SAndroid Build Coastguard Worker   assert(input_width != 0);
2452*4bdc9457SAndroid Build Coastguard Worker   assert(input_width % sizeof(float) == 0);
2453*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top == 1);
2454*4bdc9457SAndroid Build Coastguard Worker 
2455*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2456*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
2457*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
2458*4bdc9457SAndroid Build Coastguard Worker 
2459*4bdc9457SAndroid Build Coastguard Worker   const __m128 vbias = _mm_load1_ps(weights);
2460*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk00 = _mm_load1_ps(weights + 1);
2461*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk01 = _mm_load1_ps(weights + 2);
2462*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk02 = _mm_load1_ps(weights + 3);
2463*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk10 = _mm_load1_ps(weights + 4);
2464*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk11 = _mm_load1_ps(weights + 5);
2465*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk12 = _mm_load1_ps(weights + 6);
2466*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk20 = _mm_load1_ps(weights + 7);
2467*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk21 = _mm_load1_ps(weights + 8);
2468*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk22 = _mm_load1_ps(weights + 9);
2469*4bdc9457SAndroid Build Coastguard Worker 
2470*4bdc9457SAndroid Build Coastguard Worker   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2471*4bdc9457SAndroid Build Coastguard Worker 
2472*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = zero;
2473*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = input;
2474*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2475*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2476*4bdc9457SAndroid Build Coastguard Worker 
2477*4bdc9457SAndroid Build Coastguard Worker   float* o0 = output;
2478*4bdc9457SAndroid Build Coastguard Worker   float* o1 = (float*) ((uintptr_t) o0 + input_width);
2479*4bdc9457SAndroid Build Coastguard Worker 
2480*4bdc9457SAndroid Build Coastguard Worker   size_t output_height = input_height;
2481*4bdc9457SAndroid Build Coastguard Worker   do {
2482*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 2) {
2483*4bdc9457SAndroid Build Coastguard Worker       i2 = zero;
2484*4bdc9457SAndroid Build Coastguard Worker       o1 = o0;
2485*4bdc9457SAndroid Build Coastguard Worker     }
2486*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 3) {
2487*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
2488*4bdc9457SAndroid Build Coastguard Worker     }
2489*4bdc9457SAndroid Build Coastguard Worker 
2490*4bdc9457SAndroid Build Coastguard Worker     // vi0x3012 = ( vi02, vi01, vi{M}0, vi{M}3 )
2491*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x3012 = _mm_setzero_ps();
2492*4bdc9457SAndroid Build Coastguard Worker     // vi1x3012 = ( vi12, vi11, vi{M}0, vi{M}3 )
2493*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x3012 = _mm_setzero_ps();
2494*4bdc9457SAndroid Build Coastguard Worker     // vi2x3012 = ( vi22, vi21, vi{M}0, vi{M}3 )
2495*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x3012 = _mm_setzero_ps();
2496*4bdc9457SAndroid Build Coastguard Worker     // vi3x3012 = ( vi32, vi31, vi{M}0, vi{M}3 )
2497*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x3012 = _mm_setzero_ps();
2498*4bdc9457SAndroid Build Coastguard Worker 
2499*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x4567 = _mm_loadu_ps(i0);
2500*4bdc9457SAndroid Build Coastguard Worker     i0 += 4;
2501*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x4567 = _mm_loadu_ps(i1);
2502*4bdc9457SAndroid Build Coastguard Worker     i1 += 4;
2503*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x4567 = _mm_loadu_ps(i2);
2504*4bdc9457SAndroid Build Coastguard Worker     i2 += 4;
2505*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x4567 = _mm_loadu_ps(i3);
2506*4bdc9457SAndroid Build Coastguard Worker     i3 += 4;
2507*4bdc9457SAndroid Build Coastguard Worker 
2508*4bdc9457SAndroid Build Coastguard Worker     size_t w = input_width;
2509*4bdc9457SAndroid Build Coastguard Worker     for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
2510*4bdc9457SAndroid Build Coastguard Worker       // vi0x89AB = ( vi0B, vi0A, vi09, vi08 )
2511*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2512*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
2513*4bdc9457SAndroid Build Coastguard Worker       // vi1x89AB = ( vi1B, vi1A, vi19, vi18 )
2514*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2515*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
2516*4bdc9457SAndroid Build Coastguard Worker       // vi2x89AB = ( vi2B, vi2A, vi29, vi28 )
2517*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2518*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
2519*4bdc9457SAndroid Build Coastguard Worker       // vi3x89AB = ( vi3B, vi3A, vi39, vi38 )
2520*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x89AB = _mm_loadu_ps(i3);
2521*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
2522*4bdc9457SAndroid Build Coastguard Worker 
2523*4bdc9457SAndroid Build Coastguard Worker       // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2524*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2525*4bdc9457SAndroid Build Coastguard Worker       // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2526*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2527*4bdc9457SAndroid Build Coastguard Worker       // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2528*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2529*4bdc9457SAndroid Build Coastguard Worker       // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2530*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2531*4bdc9457SAndroid Build Coastguard Worker 
2532*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2533*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2534*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2535*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2536*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2537*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2538*4bdc9457SAndroid Build Coastguard Worker 
2539*4bdc9457SAndroid Build Coastguard Worker       // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2540*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2541*4bdc9457SAndroid Build Coastguard Worker       // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2542*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2543*4bdc9457SAndroid Build Coastguard Worker       // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2544*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2545*4bdc9457SAndroid Build Coastguard Worker       // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2546*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2547*4bdc9457SAndroid Build Coastguard Worker 
2548*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2549*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2550*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2551*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2552*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2553*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2554*4bdc9457SAndroid Build Coastguard Worker 
2555*4bdc9457SAndroid Build Coastguard Worker       vi0x3012 = vi0x7456;
2556*4bdc9457SAndroid Build Coastguard Worker       vi1x3012 = vi1x7456;
2557*4bdc9457SAndroid Build Coastguard Worker       vi2x3012 = vi2x7456;
2558*4bdc9457SAndroid Build Coastguard Worker       vi3x3012 = vi3x7456;
2559*4bdc9457SAndroid Build Coastguard Worker 
2560*4bdc9457SAndroid Build Coastguard Worker       // vi0x8567 = ( vi07, vi06, vi05, vi08 )
2561*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
2562*4bdc9457SAndroid Build Coastguard Worker       // vi1x8567 = ( vi17, vi16, vi15, vi18 )
2563*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
2564*4bdc9457SAndroid Build Coastguard Worker       // vi2x8567 = ( vi27, vi26, vi25, vi28 )
2565*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
2566*4bdc9457SAndroid Build Coastguard Worker       // vi3x8567 = ( vi37, vi36, vi35, vi38 )
2567*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
2568*4bdc9457SAndroid Build Coastguard Worker 
2569*4bdc9457SAndroid Build Coastguard Worker       // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2570*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2571*4bdc9457SAndroid Build Coastguard Worker       // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2572*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2573*4bdc9457SAndroid Build Coastguard Worker       // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2574*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2575*4bdc9457SAndroid Build Coastguard Worker       // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2576*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2577*4bdc9457SAndroid Build Coastguard Worker 
2578*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2579*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2580*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2581*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2582*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2583*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2584*4bdc9457SAndroid Build Coastguard Worker 
2585*4bdc9457SAndroid Build Coastguard Worker       vi0x4567 = vi0x89AB;
2586*4bdc9457SAndroid Build Coastguard Worker       vi1x4567 = vi1x89AB;
2587*4bdc9457SAndroid Build Coastguard Worker       vi2x4567 = vi2x89AB;
2588*4bdc9457SAndroid Build Coastguard Worker       vi3x4567 = vi3x89AB;
2589*4bdc9457SAndroid Build Coastguard Worker 
2590*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2591*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2592*4bdc9457SAndroid Build Coastguard Worker 
2593*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2594*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2595*4bdc9457SAndroid Build Coastguard Worker 
2596*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
2597*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
2598*4bdc9457SAndroid Build Coastguard Worker 
2599*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, vo1);
2600*4bdc9457SAndroid Build Coastguard Worker       o1 += 4;
2601*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vo0);
2602*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
2603*4bdc9457SAndroid Build Coastguard Worker     }
2604*4bdc9457SAndroid Build Coastguard Worker     // Always process the last block of 1..4 pixels.
2605*4bdc9457SAndroid Build Coastguard Worker     assert(w >= 1 * sizeof(float));
2606*4bdc9457SAndroid Build Coastguard Worker     assert(w <= 4 * sizeof(float));
2607*4bdc9457SAndroid Build Coastguard Worker     {
2608*4bdc9457SAndroid Build Coastguard Worker       vi0x4567 = _mm_and_ps(vmask, vi0x4567);
2609*4bdc9457SAndroid Build Coastguard Worker       vi1x4567 = _mm_and_ps(vmask, vi1x4567);
2610*4bdc9457SAndroid Build Coastguard Worker       vi2x4567 = _mm_and_ps(vmask, vi2x4567);
2611*4bdc9457SAndroid Build Coastguard Worker       vi3x4567 = _mm_and_ps(vmask, vi3x4567);
2612*4bdc9457SAndroid Build Coastguard Worker 
2613*4bdc9457SAndroid Build Coastguard Worker       // vi0x7456 = ( vi06, vi05, vi04, vi07 )
2614*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
2615*4bdc9457SAndroid Build Coastguard Worker       // vi1x7456 = ( vi16, vi15, vi14, vi17 )
2616*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
2617*4bdc9457SAndroid Build Coastguard Worker       // vi2x7456 = ( vi26, vi25, vi24, vi27 )
2618*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
2619*4bdc9457SAndroid Build Coastguard Worker       // vi3x7456 = ( vi36, vi35, vi34, vi37 )
2620*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
2621*4bdc9457SAndroid Build Coastguard Worker 
2622*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
2623*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
2624*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
2625*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
2626*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
2627*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
2628*4bdc9457SAndroid Build Coastguard Worker 
2629*4bdc9457SAndroid Build Coastguard Worker       // vi0x3456 = ( vi06, vi05, vi04, vi03 )
2630*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
2631*4bdc9457SAndroid Build Coastguard Worker       // vi1x3456 = ( vi16, vi15, vi14, vi13 )
2632*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
2633*4bdc9457SAndroid Build Coastguard Worker       // vi2x3456 = ( vi26, vi25, vi24, vi23 )
2634*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
2635*4bdc9457SAndroid Build Coastguard Worker       // vi3x3456 = ( vi36, vi35, vi34, vi33 )
2636*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
2637*4bdc9457SAndroid Build Coastguard Worker 
2638*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
2639*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
2640*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
2641*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
2642*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
2643*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
2644*4bdc9457SAndroid Build Coastguard Worker 
2645*4bdc9457SAndroid Build Coastguard Worker       const __m128 vzero = _mm_setzero_ps();
2646*4bdc9457SAndroid Build Coastguard Worker       // vi0x8567 = ( vi07, vi06, vi05, 0.0 )
2647*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
2648*4bdc9457SAndroid Build Coastguard Worker       // vi1x8567 = ( vi17, vi16, vi15, 0.0 )
2649*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
2650*4bdc9457SAndroid Build Coastguard Worker       // vi2x8567 = ( vi27, vi26, vi25, 0.0 )
2651*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
2652*4bdc9457SAndroid Build Coastguard Worker       // vi3x8567 = ( vi37, vi36, vi35, 0.0 )
2653*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
2654*4bdc9457SAndroid Build Coastguard Worker 
2655*4bdc9457SAndroid Build Coastguard Worker       // vi0x5678 = ( vi08, vi07, vi06, vi05 )
2656*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
2657*4bdc9457SAndroid Build Coastguard Worker       // vi1x5678 = ( vi18, vi17, vi16, vi15 )
2658*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
2659*4bdc9457SAndroid Build Coastguard Worker       // vi2x5678 = ( vi28, vi27, vi26, vi25 )
2660*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
2661*4bdc9457SAndroid Build Coastguard Worker       // vi3x5678 = ( vi38, vi37, vi36, vi35 )
2662*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
2663*4bdc9457SAndroid Build Coastguard Worker 
2664*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
2665*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
2666*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
2667*4bdc9457SAndroid Build Coastguard Worker       vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
2668*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
2669*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
2670*4bdc9457SAndroid Build Coastguard Worker 
2671*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2672*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, vo1p1);
2673*4bdc9457SAndroid Build Coastguard Worker 
2674*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2675*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
2676*4bdc9457SAndroid Build Coastguard Worker 
2677*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
2678*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
2679*4bdc9457SAndroid Build Coastguard Worker 
2680*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(w == 4 * sizeof(float)) {
2681*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o1, vo1);
2682*4bdc9457SAndroid Build Coastguard Worker         o1 += 4;
2683*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o0, vo0);
2684*4bdc9457SAndroid Build Coastguard Worker         o0 += 4;
2685*4bdc9457SAndroid Build Coastguard Worker       } else {
2686*4bdc9457SAndroid Build Coastguard Worker         if (w & (2 * sizeof(float))) {
2687*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o1, vo1);
2688*4bdc9457SAndroid Build Coastguard Worker           o1 += 2;
2689*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o0, vo0);
2690*4bdc9457SAndroid Build Coastguard Worker           o0 += 2;
2691*4bdc9457SAndroid Build Coastguard Worker 
2692*4bdc9457SAndroid Build Coastguard Worker           vo0 = _mm_movehl_ps(vo0, vo0);
2693*4bdc9457SAndroid Build Coastguard Worker           vo1 = _mm_movehl_ps(vo1, vo1);
2694*4bdc9457SAndroid Build Coastguard Worker         }
2695*4bdc9457SAndroid Build Coastguard Worker         if (w & (1 * sizeof(float))) {
2696*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1, vo1);
2697*4bdc9457SAndroid Build Coastguard Worker           o1 += 1;
2698*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0, vo0);
2699*4bdc9457SAndroid Build Coastguard Worker           o0 += 1;
2700*4bdc9457SAndroid Build Coastguard Worker         }
2701*4bdc9457SAndroid Build Coastguard Worker       }
2702*4bdc9457SAndroid Build Coastguard Worker     }
2703*4bdc9457SAndroid Build Coastguard Worker 
2704*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2705*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i3 - input_decrement);
2706*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + input_width);
2707*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + input_width);
2708*4bdc9457SAndroid Build Coastguard Worker 
2709*4bdc9457SAndroid Build Coastguard Worker     o0 = o1;
2710*4bdc9457SAndroid Build Coastguard Worker     o1 = (float*) ((uintptr_t) o0 + input_width);
2711*4bdc9457SAndroid Build Coastguard Worker 
2712*4bdc9457SAndroid Build Coastguard Worker     output_height = doz(output_height, 2);
2713*4bdc9457SAndroid Build Coastguard Worker   } while (output_height != 0);
2714*4bdc9457SAndroid Build Coastguard Worker }
2715*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])2716*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
2717*4bdc9457SAndroid Build Coastguard Worker     size_t input_height,
2718*4bdc9457SAndroid Build Coastguard Worker     size_t input_width,
2719*4bdc9457SAndroid Build Coastguard Worker     const float* input,
2720*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
2721*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
2722*4bdc9457SAndroid Build Coastguard Worker     float* output,
2723*4bdc9457SAndroid Build Coastguard Worker     uint32_t padding_top,
2724*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2725*4bdc9457SAndroid Build Coastguard Worker {
2726*4bdc9457SAndroid Build Coastguard Worker   assert(input_height != 0);
2727*4bdc9457SAndroid Build Coastguard Worker   assert(input_width != 0);
2728*4bdc9457SAndroid Build Coastguard Worker   assert(input_width % sizeof(float) == 0);
2729*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top >= 0);
2730*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top <= 1);
2731*4bdc9457SAndroid Build Coastguard Worker 
2732*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
2733*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
2734*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
2735*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
2736*4bdc9457SAndroid Build Coastguard Worker 
2737*4bdc9457SAndroid Build Coastguard Worker   const __m128 vbias = _mm_load1_ps(weights);
2738*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk00 = _mm_load1_ps(weights + 1);
2739*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk01 = _mm_load1_ps(weights + 2);
2740*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk02 = _mm_load1_ps(weights + 3);
2741*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk10 = _mm_load1_ps(weights + 4);
2742*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk11 = _mm_load1_ps(weights + 5);
2743*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk12 = _mm_load1_ps(weights + 6);
2744*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk20 = _mm_load1_ps(weights + 7);
2745*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk21 = _mm_load1_ps(weights + 8);
2746*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk22 = _mm_load1_ps(weights + 9);
2747*4bdc9457SAndroid Build Coastguard Worker 
2748*4bdc9457SAndroid Build Coastguard Worker   const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
2749*4bdc9457SAndroid Build Coastguard Worker 
2750*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
2751*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
2752*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(padding_top != 0) {
2753*4bdc9457SAndroid Build Coastguard Worker     i0 = zero;
2754*4bdc9457SAndroid Build Coastguard Worker   }
2755*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
2756*4bdc9457SAndroid Build Coastguard Worker 
2757*4bdc9457SAndroid Build Coastguard Worker   float* o0 = output;
2758*4bdc9457SAndroid Build Coastguard Worker 
2759*4bdc9457SAndroid Build Coastguard Worker   size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
2760*4bdc9457SAndroid Build Coastguard Worker   size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
2761*4bdc9457SAndroid Build Coastguard Worker   do {
2762*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(padded_input_height < 4) {
2763*4bdc9457SAndroid Build Coastguard Worker       i2 = zero;
2764*4bdc9457SAndroid Build Coastguard Worker     }
2765*4bdc9457SAndroid Build Coastguard Worker 
2766*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x7531 = _mm_setzero_ps();
2767*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x7531 = _mm_setzero_ps();
2768*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x7531 = _mm_setzero_ps();
2769*4bdc9457SAndroid Build Coastguard Worker 
2770*4bdc9457SAndroid Build Coastguard Worker     size_t w = input_width;
2771*4bdc9457SAndroid Build Coastguard Worker     for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
2772*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2773*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2774*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
2775*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2776*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2777*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
2778*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2779*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2780*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
2781*4bdc9457SAndroid Build Coastguard Worker 
2782*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2783*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2784*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2785*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2786*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
2787*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
2788*4bdc9457SAndroid Build Coastguard Worker 
2789*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2790*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2791*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2792*4bdc9457SAndroid Build Coastguard Worker 
2793*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2794*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2795*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2796*4bdc9457SAndroid Build Coastguard Worker 
2797*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2798*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2799*4bdc9457SAndroid Build Coastguard Worker       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2800*4bdc9457SAndroid Build Coastguard Worker 
2801*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2802*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2803*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2804*4bdc9457SAndroid Build Coastguard Worker 
2805*4bdc9457SAndroid Build Coastguard Worker       vi0x7531 = vi0xF9BD;
2806*4bdc9457SAndroid Build Coastguard Worker       vi1x7531 = vi1xF9BD;
2807*4bdc9457SAndroid Build Coastguard Worker       vi2x7531 = vi2xF9BD;
2808*4bdc9457SAndroid Build Coastguard Worker 
2809*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2810*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2811*4bdc9457SAndroid Build Coastguard Worker       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2812*4bdc9457SAndroid Build Coastguard Worker 
2813*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2814*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2815*4bdc9457SAndroid Build Coastguard Worker 
2816*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2817*4bdc9457SAndroid Build Coastguard Worker 
2818*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
2819*4bdc9457SAndroid Build Coastguard Worker 
2820*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vo0);
2821*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
2822*4bdc9457SAndroid Build Coastguard Worker     }
2823*4bdc9457SAndroid Build Coastguard Worker     // Potentially process the last block of 0..7 pixels.
2824*4bdc9457SAndroid Build Coastguard Worker     assert(w < 8 * sizeof(float));
2825*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(w != 0) {
2826*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x89AB = _mm_loadu_ps(i0);
2827*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
2828*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x89AB = _mm_loadu_ps(i1);
2829*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
2830*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x89AB = _mm_loadu_ps(i2);
2831*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
2832*4bdc9457SAndroid Build Coastguard Worker 
2833*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2834*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2835*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2836*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2837*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
2838*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
2839*4bdc9457SAndroid Build Coastguard Worker 
2840*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
2841*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
2842*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
2843*4bdc9457SAndroid Build Coastguard Worker 
2844*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2845*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2846*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
2847*4bdc9457SAndroid Build Coastguard Worker 
2848*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
2849*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
2850*4bdc9457SAndroid Build Coastguard Worker       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
2851*4bdc9457SAndroid Build Coastguard Worker 
2852*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
2853*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
2854*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
2855*4bdc9457SAndroid Build Coastguard Worker 
2856*4bdc9457SAndroid Build Coastguard Worker       vi0x7531 = vi0xF9BD;
2857*4bdc9457SAndroid Build Coastguard Worker       vi1x7531 = vi1xF9BD;
2858*4bdc9457SAndroid Build Coastguard Worker       vi2x7531 = vi2xF9BD;
2859*4bdc9457SAndroid Build Coastguard Worker 
2860*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
2861*4bdc9457SAndroid Build Coastguard Worker       vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
2862*4bdc9457SAndroid Build Coastguard Worker       vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
2863*4bdc9457SAndroid Build Coastguard Worker 
2864*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p1);
2865*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, vo0p2);
2866*4bdc9457SAndroid Build Coastguard Worker 
2867*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
2868*4bdc9457SAndroid Build Coastguard Worker 
2869*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
2870*4bdc9457SAndroid Build Coastguard Worker 
2871*4bdc9457SAndroid Build Coastguard Worker       if (w == 7 * sizeof(float)) {
2872*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o0, vo0);
2873*4bdc9457SAndroid Build Coastguard Worker         o0 += 4;
2874*4bdc9457SAndroid Build Coastguard Worker       } else {
2875*4bdc9457SAndroid Build Coastguard Worker         w += 1 * sizeof(float);
2876*4bdc9457SAndroid Build Coastguard Worker         if (w & (4 * sizeof(float))) {
2877*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o0, vo0);
2878*4bdc9457SAndroid Build Coastguard Worker           o0 += 2;
2879*4bdc9457SAndroid Build Coastguard Worker 
2880*4bdc9457SAndroid Build Coastguard Worker           vo0 = _mm_movehl_ps(vo0, vo0);
2881*4bdc9457SAndroid Build Coastguard Worker         }
2882*4bdc9457SAndroid Build Coastguard Worker         if (w & (2 * sizeof(float))) {
2883*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0, vo0);
2884*4bdc9457SAndroid Build Coastguard Worker           o0 += 1;
2885*4bdc9457SAndroid Build Coastguard Worker         }
2886*4bdc9457SAndroid Build Coastguard Worker       }
2887*4bdc9457SAndroid Build Coastguard Worker     }
2888*4bdc9457SAndroid Build Coastguard Worker 
2889*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i2 - input_decrement);
2890*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i0 + input_width);
2891*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + input_width);
2892*4bdc9457SAndroid Build Coastguard Worker 
2893*4bdc9457SAndroid Build Coastguard Worker 
2894*4bdc9457SAndroid Build Coastguard Worker     output_height -= 1;
2895*4bdc9457SAndroid Build Coastguard Worker     padded_input_height -= 2;
2896*4bdc9457SAndroid Build Coastguard Worker   } while (output_height != 0);
2897*4bdc9457SAndroid Build Coastguard Worker }
2898*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])2899*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
2900*4bdc9457SAndroid Build Coastguard Worker     size_t input_height,
2901*4bdc9457SAndroid Build Coastguard Worker     size_t input_width,
2902*4bdc9457SAndroid Build Coastguard Worker     const float* input,
2903*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
2904*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
2905*4bdc9457SAndroid Build Coastguard Worker     float* output,
2906*4bdc9457SAndroid Build Coastguard Worker     uint32_t padding_top,
2907*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2908*4bdc9457SAndroid Build Coastguard Worker {
2909*4bdc9457SAndroid Build Coastguard Worker   assert(input_height != 0);
2910*4bdc9457SAndroid Build Coastguard Worker   assert(input_width != 0);
2911*4bdc9457SAndroid Build Coastguard Worker   assert(input_width % sizeof(float) == 0);
2912*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top == 2);
2913*4bdc9457SAndroid Build Coastguard Worker 
2914*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
2915*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
2916*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
2917*4bdc9457SAndroid Build Coastguard Worker 
2918*4bdc9457SAndroid Build Coastguard Worker   const __m128 vbias = _mm_load1_ps(weights);
2919*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk00 = _mm_load1_ps(weights + 1);
2920*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk01 = _mm_load1_ps(weights + 2);
2921*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk02 = _mm_load1_ps(weights + 3);
2922*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk03 = _mm_load1_ps(weights + 4);
2923*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk04 = _mm_load1_ps(weights + 5);
2924*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk10 = _mm_load1_ps(weights + 6);
2925*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk11 = _mm_load1_ps(weights + 7);
2926*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk12 = _mm_load1_ps(weights + 8);
2927*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk13 = _mm_load1_ps(weights + 9);
2928*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk14 = _mm_load1_ps(weights + 10);
2929*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk20 = _mm_load1_ps(weights + 11);
2930*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk21 = _mm_load1_ps(weights + 12);
2931*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk22 = _mm_load1_ps(weights + 13);
2932*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk23 = _mm_load1_ps(weights + 14);
2933*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk24 = _mm_load1_ps(weights + 15);
2934*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk30 = _mm_load1_ps(weights + 16);
2935*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk31 = _mm_load1_ps(weights + 17);
2936*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk32 = _mm_load1_ps(weights + 18);
2937*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk33 = _mm_load1_ps(weights + 19);
2938*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk34 = _mm_load1_ps(weights + 20);
2939*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk40 = _mm_load1_ps(weights + 21);
2940*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk41 = _mm_load1_ps(weights + 22);
2941*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk42 = _mm_load1_ps(weights + 23);
2942*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk43 = _mm_load1_ps(weights + 24);
2943*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk44 = _mm_load1_ps(weights + 25);
2944*4bdc9457SAndroid Build Coastguard Worker 
2945*4bdc9457SAndroid Build Coastguard Worker   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
2946*4bdc9457SAndroid Build Coastguard Worker 
2947*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = zero;
2948*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = zero;
2949*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = input;
2950*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
2951*4bdc9457SAndroid Build Coastguard Worker   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
2952*4bdc9457SAndroid Build Coastguard Worker   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
2953*4bdc9457SAndroid Build Coastguard Worker   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
2954*4bdc9457SAndroid Build Coastguard Worker   const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
2955*4bdc9457SAndroid Build Coastguard Worker 
2956*4bdc9457SAndroid Build Coastguard Worker   float* o0 = output;
2957*4bdc9457SAndroid Build Coastguard Worker   float* o1 = (float*) ((uintptr_t) o0 + input_width);
2958*4bdc9457SAndroid Build Coastguard Worker   float* o2 = (float*) ((uintptr_t) o1 + input_width);
2959*4bdc9457SAndroid Build Coastguard Worker   float* o3 = (float*) ((uintptr_t) o2 + input_width);
2960*4bdc9457SAndroid Build Coastguard Worker 
2961*4bdc9457SAndroid Build Coastguard Worker   size_t output_height = input_height;
2962*4bdc9457SAndroid Build Coastguard Worker   do {
2963*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 2) {
2964*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
2965*4bdc9457SAndroid Build Coastguard Worker       o1 = o0;
2966*4bdc9457SAndroid Build Coastguard Worker     }
2967*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 3) {
2968*4bdc9457SAndroid Build Coastguard Worker       i4 = zero;
2969*4bdc9457SAndroid Build Coastguard Worker       o2 = o1;
2970*4bdc9457SAndroid Build Coastguard Worker     }
2971*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 4) {
2972*4bdc9457SAndroid Build Coastguard Worker       i5 = zero;
2973*4bdc9457SAndroid Build Coastguard Worker       o3 = o2;
2974*4bdc9457SAndroid Build Coastguard Worker     }
2975*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 5) {
2976*4bdc9457SAndroid Build Coastguard Worker       i6 = zero;
2977*4bdc9457SAndroid Build Coastguard Worker     }
2978*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(output_height < 6) {
2979*4bdc9457SAndroid Build Coastguard Worker       i7 = zero;
2980*4bdc9457SAndroid Build Coastguard Worker     }
2981*4bdc9457SAndroid Build Coastguard Worker 
2982*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x3012 = _mm_setzero_ps();
2983*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x3012 = _mm_setzero_ps();
2984*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x3012 = _mm_setzero_ps();
2985*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x3012 = _mm_setzero_ps();
2986*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x3012 = _mm_setzero_ps();
2987*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x3012 = _mm_setzero_ps();
2988*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x3012 = _mm_setzero_ps();
2989*4bdc9457SAndroid Build Coastguard Worker     __m128 vi7x3012 = _mm_setzero_ps();
2990*4bdc9457SAndroid Build Coastguard Worker 
2991*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x4567 = _mm_loadu_ps(i0);
2992*4bdc9457SAndroid Build Coastguard Worker     i0 += 4;
2993*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x4567 = _mm_loadu_ps(i1);
2994*4bdc9457SAndroid Build Coastguard Worker     i1 += 4;
2995*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x4567 = _mm_loadu_ps(i2);
2996*4bdc9457SAndroid Build Coastguard Worker     i2 += 4;
2997*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x4567 = _mm_loadu_ps(i3);
2998*4bdc9457SAndroid Build Coastguard Worker     i3 += 4;
2999*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x4567 = _mm_loadu_ps(i4);
3000*4bdc9457SAndroid Build Coastguard Worker     i4 += 4;
3001*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x4567 = _mm_loadu_ps(i5);
3002*4bdc9457SAndroid Build Coastguard Worker     i5 += 4;
3003*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x4567 = _mm_loadu_ps(i6);
3004*4bdc9457SAndroid Build Coastguard Worker     i6 += 4;
3005*4bdc9457SAndroid Build Coastguard Worker     __m128 vi7x4567 = _mm_loadu_ps(i7);
3006*4bdc9457SAndroid Build Coastguard Worker     i7 += 4;
3007*4bdc9457SAndroid Build Coastguard Worker 
3008*4bdc9457SAndroid Build Coastguard Worker     size_t w = input_width;
3009*4bdc9457SAndroid Build Coastguard Worker     for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
3010*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3011*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3012*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3013*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3014*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3015*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3016*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3017*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3018*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3019*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3020*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3021*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3022*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3023*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3024*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3025*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3026*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3027*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3028*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3029*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3030*4bdc9457SAndroid Build Coastguard Worker 
3031*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3032*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3033*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3034*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3035*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3036*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3037*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3038*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3039*4bdc9457SAndroid Build Coastguard Worker 
3040*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x89AB = _mm_loadu_ps(i0);
3041*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
3042*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x89AB = _mm_loadu_ps(i1);
3043*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
3044*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x89AB = _mm_loadu_ps(i2);
3045*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
3046*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x89AB = _mm_loadu_ps(i3);
3047*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
3048*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x89AB = _mm_loadu_ps(i4);
3049*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
3050*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x89AB = _mm_loadu_ps(i5);
3051*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
3052*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x89AB = _mm_loadu_ps(i6);
3053*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
3054*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x89AB = _mm_loadu_ps(i7);
3055*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
3056*4bdc9457SAndroid Build Coastguard Worker 
3057*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3058*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3059*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3060*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3061*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3062*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3063*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3064*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3065*4bdc9457SAndroid Build Coastguard Worker 
3066*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3067*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3068*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3069*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3070*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3071*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3072*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3073*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3074*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3075*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3076*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3077*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3078*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3079*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3080*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3081*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3082*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3083*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3084*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3085*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3086*4bdc9457SAndroid Build Coastguard Worker 
3087*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3088*4bdc9457SAndroid Build Coastguard Worker       vi0x3012 = vi0x7456;
3089*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3090*4bdc9457SAndroid Build Coastguard Worker       vi1x3012 = vi1x7456;
3091*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3092*4bdc9457SAndroid Build Coastguard Worker       vi2x3012 = vi2x7456;
3093*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3094*4bdc9457SAndroid Build Coastguard Worker       vi3x3012 = vi3x7456;
3095*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3096*4bdc9457SAndroid Build Coastguard Worker       vi4x3012 = vi4x7456;
3097*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3098*4bdc9457SAndroid Build Coastguard Worker       vi5x3012 = vi5x7456;
3099*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3100*4bdc9457SAndroid Build Coastguard Worker       vi6x3012 = vi6x7456;
3101*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3102*4bdc9457SAndroid Build Coastguard Worker       vi7x3012 = vi7x7456;
3103*4bdc9457SAndroid Build Coastguard Worker 
3104*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3105*4bdc9457SAndroid Build Coastguard Worker       vi0x4567 = vi0x89AB;
3106*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3107*4bdc9457SAndroid Build Coastguard Worker       vi1x4567 = vi1x89AB;
3108*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3109*4bdc9457SAndroid Build Coastguard Worker       vi2x4567 = vi2x89AB;
3110*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3111*4bdc9457SAndroid Build Coastguard Worker       vi3x4567 = vi3x89AB;
3112*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3113*4bdc9457SAndroid Build Coastguard Worker       vi4x4567 = vi4x89AB;
3114*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3115*4bdc9457SAndroid Build Coastguard Worker       vi5x4567 = vi5x89AB;
3116*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3117*4bdc9457SAndroid Build Coastguard Worker       vi6x4567 = vi6x89AB;
3118*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3119*4bdc9457SAndroid Build Coastguard Worker       vi7x4567 = vi7x89AB;
3120*4bdc9457SAndroid Build Coastguard Worker 
3121*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3122*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3123*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3124*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3125*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3126*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3127*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3128*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3129*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3130*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3131*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3132*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3133*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3134*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3135*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3136*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3137*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3138*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3139*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3140*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3141*4bdc9457SAndroid Build Coastguard Worker 
3142*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3143*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3144*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3145*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3146*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3147*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3148*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3149*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3150*4bdc9457SAndroid Build Coastguard Worker 
3151*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3152*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3153*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3154*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3155*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3156*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3157*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3158*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3159*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3160*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3161*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3162*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3163*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3164*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3165*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3166*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3167*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3168*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3169*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3170*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3171*4bdc9457SAndroid Build Coastguard Worker 
3172*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3173*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3174*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3175*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3176*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3177*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3178*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3179*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3180*4bdc9457SAndroid Build Coastguard Worker 
3181*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3182*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3183*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3184*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3185*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3186*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3187*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3188*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3189*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3190*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3191*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3192*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3193*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3194*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3195*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3196*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3197*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3198*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3199*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3200*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3201*4bdc9457SAndroid Build Coastguard Worker 
3202*4bdc9457SAndroid Build Coastguard Worker 
3203*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3204*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3205*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3206*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3207*4bdc9457SAndroid Build Coastguard Worker 
3208*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
3209*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
3210*4bdc9457SAndroid Build Coastguard Worker       vo2 = _mm_min_ps(vo2, vmax);
3211*4bdc9457SAndroid Build Coastguard Worker       vo3 = _mm_min_ps(vo3, vmax);
3212*4bdc9457SAndroid Build Coastguard Worker 
3213*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o3, vo3);
3214*4bdc9457SAndroid Build Coastguard Worker       o3 += 4;
3215*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o2, vo2);
3216*4bdc9457SAndroid Build Coastguard Worker       o2 += 4;
3217*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, vo1);
3218*4bdc9457SAndroid Build Coastguard Worker       o1 += 4;
3219*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vo0);
3220*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
3221*4bdc9457SAndroid Build Coastguard Worker     }
3222*4bdc9457SAndroid Build Coastguard Worker     // Always process the last block of 5..8 pixels.
3223*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(w > 4 * sizeof(float)) {
3224*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3225*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3226*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3227*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3228*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3229*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3230*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3231*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3232*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3233*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3234*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3235*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3236*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3237*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3238*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3239*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3240*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3241*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3242*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3243*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3244*4bdc9457SAndroid Build Coastguard Worker 
3245*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3246*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3247*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3248*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3249*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3250*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3251*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3252*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3253*4bdc9457SAndroid Build Coastguard Worker 
3254*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
3255*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
3256*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
3257*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
3258*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
3259*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
3260*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
3261*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
3262*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
3263*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
3264*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
3265*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
3266*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
3267*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
3268*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
3269*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
3270*4bdc9457SAndroid Build Coastguard Worker 
3271*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3272*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3273*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3274*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3275*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3276*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3277*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3278*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3279*4bdc9457SAndroid Build Coastguard Worker 
3280*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3281*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3282*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3283*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3284*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3285*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3286*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3287*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3288*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3289*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3290*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3291*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3292*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3293*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3294*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3295*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3296*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3297*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3298*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3299*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3300*4bdc9457SAndroid Build Coastguard Worker 
3301*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3302*4bdc9457SAndroid Build Coastguard Worker       vi0x3012 = vi0x7456;
3303*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3304*4bdc9457SAndroid Build Coastguard Worker       vi1x3012 = vi1x7456;
3305*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3306*4bdc9457SAndroid Build Coastguard Worker       vi2x3012 = vi2x7456;
3307*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3308*4bdc9457SAndroid Build Coastguard Worker       vi3x3012 = vi3x7456;
3309*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3310*4bdc9457SAndroid Build Coastguard Worker       vi4x3012 = vi4x7456;
3311*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3312*4bdc9457SAndroid Build Coastguard Worker       vi5x3012 = vi5x7456;
3313*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3314*4bdc9457SAndroid Build Coastguard Worker       vi6x3012 = vi6x7456;
3315*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3316*4bdc9457SAndroid Build Coastguard Worker       vi7x3012 = vi7x7456;
3317*4bdc9457SAndroid Build Coastguard Worker 
3318*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
3319*4bdc9457SAndroid Build Coastguard Worker       vi0x4567 = vi0x89AB;
3320*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
3321*4bdc9457SAndroid Build Coastguard Worker       vi1x4567 = vi1x89AB;
3322*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
3323*4bdc9457SAndroid Build Coastguard Worker       vi2x4567 = vi2x89AB;
3324*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
3325*4bdc9457SAndroid Build Coastguard Worker       vi3x4567 = vi3x89AB;
3326*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
3327*4bdc9457SAndroid Build Coastguard Worker       vi4x4567 = vi4x89AB;
3328*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
3329*4bdc9457SAndroid Build Coastguard Worker       vi5x4567 = vi5x89AB;
3330*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
3331*4bdc9457SAndroid Build Coastguard Worker       vi6x4567 = vi6x89AB;
3332*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
3333*4bdc9457SAndroid Build Coastguard Worker       vi7x4567 = vi7x89AB;
3334*4bdc9457SAndroid Build Coastguard Worker 
3335*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3336*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3337*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3338*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3339*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3340*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3341*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3342*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3343*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3344*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3345*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3346*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3347*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3348*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3349*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3350*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3351*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3352*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3353*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3354*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3355*4bdc9457SAndroid Build Coastguard Worker 
3356*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3357*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3358*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3359*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3360*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3361*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3362*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3363*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3364*4bdc9457SAndroid Build Coastguard Worker 
3365*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3366*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3367*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3368*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3369*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3370*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3371*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3372*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3373*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3374*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3375*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3376*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3377*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3378*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3379*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3380*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3381*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3382*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3383*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3384*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3385*4bdc9457SAndroid Build Coastguard Worker 
3386*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3387*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3388*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3389*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3390*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3391*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3392*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3393*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
3394*4bdc9457SAndroid Build Coastguard Worker 
3395*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3396*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3397*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3398*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3399*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3400*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3401*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3402*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3403*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3404*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3405*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3406*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3407*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3408*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3409*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3410*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3411*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3412*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3413*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3414*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3415*4bdc9457SAndroid Build Coastguard Worker 
3416*4bdc9457SAndroid Build Coastguard Worker 
3417*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3418*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3419*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3420*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3421*4bdc9457SAndroid Build Coastguard Worker 
3422*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
3423*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
3424*4bdc9457SAndroid Build Coastguard Worker       vo2 = _mm_min_ps(vo2, vmax);
3425*4bdc9457SAndroid Build Coastguard Worker       vo3 = _mm_min_ps(vo3, vmax);
3426*4bdc9457SAndroid Build Coastguard Worker 
3427*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o3, vo3);
3428*4bdc9457SAndroid Build Coastguard Worker       o3 += 4;
3429*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o2, vo2);
3430*4bdc9457SAndroid Build Coastguard Worker       o2 += 4;
3431*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, vo1);
3432*4bdc9457SAndroid Build Coastguard Worker       o1 += 4;
3433*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vo0);
3434*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
3435*4bdc9457SAndroid Build Coastguard Worker 
3436*4bdc9457SAndroid Build Coastguard Worker       w -= 4 * sizeof(float);
3437*4bdc9457SAndroid Build Coastguard Worker     }
3438*4bdc9457SAndroid Build Coastguard Worker     assert(w >= 1 * sizeof(float));
3439*4bdc9457SAndroid Build Coastguard Worker     assert(w <= 4 * sizeof(float));
3440*4bdc9457SAndroid Build Coastguard Worker     {
3441*4bdc9457SAndroid Build Coastguard Worker       vi0x4567 = _mm_and_ps(vi0x4567, vmask);
3442*4bdc9457SAndroid Build Coastguard Worker       vi1x4567 = _mm_and_ps(vi1x4567, vmask);
3443*4bdc9457SAndroid Build Coastguard Worker       vi2x4567 = _mm_and_ps(vi2x4567, vmask);
3444*4bdc9457SAndroid Build Coastguard Worker       vi3x4567 = _mm_and_ps(vi3x4567, vmask);
3445*4bdc9457SAndroid Build Coastguard Worker       vi4x4567 = _mm_and_ps(vi4x4567, vmask);
3446*4bdc9457SAndroid Build Coastguard Worker       vi5x4567 = _mm_and_ps(vi5x4567, vmask);
3447*4bdc9457SAndroid Build Coastguard Worker       vi6x4567 = _mm_and_ps(vi6x4567, vmask);
3448*4bdc9457SAndroid Build Coastguard Worker       vi7x4567 = _mm_and_ps(vi7x4567, vmask);
3449*4bdc9457SAndroid Build Coastguard Worker 
3450*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
3451*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
3452*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
3453*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
3454*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
3455*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
3456*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
3457*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
3458*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
3459*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
3460*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
3461*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
3462*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
3463*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
3464*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
3465*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
3466*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
3467*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
3468*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
3469*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
3470*4bdc9457SAndroid Build Coastguard Worker 
3471*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
3472*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
3473*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
3474*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
3475*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
3476*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
3477*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
3478*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
3479*4bdc9457SAndroid Build Coastguard Worker 
3480*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
3481*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
3482*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
3483*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
3484*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
3485*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
3486*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
3487*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
3488*4bdc9457SAndroid Build Coastguard Worker 
3489*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
3490*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
3491*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
3492*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
3493*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
3494*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
3495*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
3496*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
3497*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
3498*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
3499*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
3500*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
3501*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
3502*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
3503*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
3504*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
3505*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
3506*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
3507*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
3508*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
3509*4bdc9457SAndroid Build Coastguard Worker 
3510*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
3511*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
3512*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
3513*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
3514*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
3515*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
3516*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
3517*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
3518*4bdc9457SAndroid Build Coastguard Worker 
3519*4bdc9457SAndroid Build Coastguard Worker       const __m128 vzero = _mm_setzero_ps();
3520*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
3521*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
3522*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
3523*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
3524*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
3525*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
3526*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
3527*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);
3528*4bdc9457SAndroid Build Coastguard Worker 
3529*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
3530*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
3531*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
3532*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
3533*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
3534*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
3535*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
3536*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
3537*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
3538*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
3539*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
3540*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
3541*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
3542*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
3543*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
3544*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
3545*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
3546*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
3547*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
3548*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
3549*4bdc9457SAndroid Build Coastguard Worker 
3550*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
3551*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
3552*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
3553*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
3554*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
3555*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
3556*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
3557*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
3558*4bdc9457SAndroid Build Coastguard Worker 
3559*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
3560*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
3561*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
3562*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
3563*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
3564*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
3565*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
3566*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
3567*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
3568*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
3569*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
3570*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
3571*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
3572*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
3573*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
3574*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
3575*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
3576*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
3577*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
3578*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
3579*4bdc9457SAndroid Build Coastguard Worker 
3580*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3581*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3582*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3583*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3584*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3585*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3586*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3587*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
3588*4bdc9457SAndroid Build Coastguard Worker 
3589*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
3590*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
3591*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
3592*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
3593*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
3594*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
3595*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
3596*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
3597*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
3598*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
3599*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
3600*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
3601*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
3602*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
3603*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
3604*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
3605*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
3606*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
3607*4bdc9457SAndroid Build Coastguard Worker       vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
3608*4bdc9457SAndroid Build Coastguard Worker       vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
3609*4bdc9457SAndroid Build Coastguard Worker 
3610*4bdc9457SAndroid Build Coastguard Worker 
3611*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3612*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3613*4bdc9457SAndroid Build Coastguard Worker       __m128 vo2 = _mm_max_ps(vo2p0, vmin);
3614*4bdc9457SAndroid Build Coastguard Worker       __m128 vo3 = _mm_max_ps(vo3p0, vmin);
3615*4bdc9457SAndroid Build Coastguard Worker 
3616*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
3617*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
3618*4bdc9457SAndroid Build Coastguard Worker       vo2 = _mm_min_ps(vo2, vmax);
3619*4bdc9457SAndroid Build Coastguard Worker       vo3 = _mm_min_ps(vo3, vmax);
3620*4bdc9457SAndroid Build Coastguard Worker 
3621*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(w & (4 * sizeof(float))) {
3622*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o3, vo3);
3623*4bdc9457SAndroid Build Coastguard Worker         o3 += 4;
3624*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o2, vo2);
3625*4bdc9457SAndroid Build Coastguard Worker         o2 += 4;
3626*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o1, vo1);
3627*4bdc9457SAndroid Build Coastguard Worker         o1 += 4;
3628*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o0, vo0);
3629*4bdc9457SAndroid Build Coastguard Worker         o0 += 4;
3630*4bdc9457SAndroid Build Coastguard Worker       } else {
3631*4bdc9457SAndroid Build Coastguard Worker         if (w & (2 * sizeof(float))) {
3632*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o3, vo3);
3633*4bdc9457SAndroid Build Coastguard Worker           o3 += 2;
3634*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o2, vo2);
3635*4bdc9457SAndroid Build Coastguard Worker           o2 += 2;
3636*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o1, vo1);
3637*4bdc9457SAndroid Build Coastguard Worker           o1 += 2;
3638*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o0, vo0);
3639*4bdc9457SAndroid Build Coastguard Worker           o0 += 2;
3640*4bdc9457SAndroid Build Coastguard Worker 
3641*4bdc9457SAndroid Build Coastguard Worker           vo0 = _mm_movehl_ps(vo0, vo0);
3642*4bdc9457SAndroid Build Coastguard Worker           vo1 = _mm_movehl_ps(vo1, vo1);
3643*4bdc9457SAndroid Build Coastguard Worker           vo2 = _mm_movehl_ps(vo2, vo2);
3644*4bdc9457SAndroid Build Coastguard Worker           vo3 = _mm_movehl_ps(vo3, vo3);
3645*4bdc9457SAndroid Build Coastguard Worker         }
3646*4bdc9457SAndroid Build Coastguard Worker         if (w & (1 * sizeof(float))) {
3647*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o3, vo3);
3648*4bdc9457SAndroid Build Coastguard Worker           o3 += 1;
3649*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o2, vo2);
3650*4bdc9457SAndroid Build Coastguard Worker           o2 += 1;
3651*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1, vo1);
3652*4bdc9457SAndroid Build Coastguard Worker           o1 += 1;
3653*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0, vo0);
3654*4bdc9457SAndroid Build Coastguard Worker           o0 += 1;
3655*4bdc9457SAndroid Build Coastguard Worker         }
3656*4bdc9457SAndroid Build Coastguard Worker       }
3657*4bdc9457SAndroid Build Coastguard Worker     }
3658*4bdc9457SAndroid Build Coastguard Worker 
3659*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i4 - input_decrement);
3660*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i5 - input_decrement);
3661*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + input_width);
3662*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + input_width);
3663*4bdc9457SAndroid Build Coastguard Worker     i4 = (const float*) ((uintptr_t) i3 + input_width);
3664*4bdc9457SAndroid Build Coastguard Worker     i5 = (const float*) ((uintptr_t) i4 + input_width);
3665*4bdc9457SAndroid Build Coastguard Worker     i6 = (const float*) ((uintptr_t) i5 + input_width);
3666*4bdc9457SAndroid Build Coastguard Worker     i7 = (const float*) ((uintptr_t) i6 + input_width);
3667*4bdc9457SAndroid Build Coastguard Worker 
3668*4bdc9457SAndroid Build Coastguard Worker     o0 = o3;
3669*4bdc9457SAndroid Build Coastguard Worker     o1 = (float*) ((uintptr_t) o0 + input_width);
3670*4bdc9457SAndroid Build Coastguard Worker     o2 = (float*) ((uintptr_t) o1 + input_width);
3671*4bdc9457SAndroid Build Coastguard Worker     o3 = (float*) ((uintptr_t) o2 + input_width);
3672*4bdc9457SAndroid Build Coastguard Worker 
3673*4bdc9457SAndroid Build Coastguard Worker     output_height = doz(output_height, 4);
3674*4bdc9457SAndroid Build Coastguard Worker   } while (output_height != 0);
3675*4bdc9457SAndroid Build Coastguard Worker }
3676*4bdc9457SAndroid Build Coastguard Worker 
3677*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4(
3678*4bdc9457SAndroid Build Coastguard Worker     size_t input_height,
3679*4bdc9457SAndroid Build Coastguard Worker     size_t input_width,
3680*4bdc9457SAndroid Build Coastguard Worker     const float* input,
3681*4bdc9457SAndroid Build Coastguard Worker     const float* weights,
3682*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
3683*4bdc9457SAndroid Build Coastguard Worker     float* output,
3684*4bdc9457SAndroid Build Coastguard Worker     uint32_t padding_top,
3685*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3686*4bdc9457SAndroid Build Coastguard Worker {
3687*4bdc9457SAndroid Build Coastguard Worker   assert(input_height != 0);
3688*4bdc9457SAndroid Build Coastguard Worker   assert(input_width != 0);
3689*4bdc9457SAndroid Build Coastguard Worker   assert(input_width % sizeof(float) == 0);
3690*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top >= 1);
3691*4bdc9457SAndroid Build Coastguard Worker   assert(padding_top <= 2);
3692*4bdc9457SAndroid Build Coastguard Worker 
3693*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
3694*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask_odd  = _mm_load_ps((const float*) params->sse.mask_odd);
3695*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
3696*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
3697*4bdc9457SAndroid Build Coastguard Worker 
3698*4bdc9457SAndroid Build Coastguard Worker   const __m128 vbias = _mm_load1_ps(weights);
3699*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk00 = _mm_load1_ps(weights + 1);
3700*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk01 = _mm_load1_ps(weights + 2);
3701*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk02 = _mm_load1_ps(weights + 3);
3702*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk03 = _mm_load1_ps(weights + 4);
3703*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk04 = _mm_load1_ps(weights + 5);
3704*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk10 = _mm_load1_ps(weights + 6);
3705*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk11 = _mm_load1_ps(weights + 7);
3706*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk12 = _mm_load1_ps(weights + 8);
3707*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk13 = _mm_load1_ps(weights + 9);
3708*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk14 = _mm_load1_ps(weights + 10);
3709*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk20 = _mm_load1_ps(weights + 11);
3710*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk21 = _mm_load1_ps(weights + 12);
3711*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk22 = _mm_load1_ps(weights + 13);
3712*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk23 = _mm_load1_ps(weights + 14);
3713*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk24 = _mm_load1_ps(weights + 15);
3714*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk30 = _mm_load1_ps(weights + 16);
3715*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk31 = _mm_load1_ps(weights + 17);
3716*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk32 = _mm_load1_ps(weights + 18);
3717*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk33 = _mm_load1_ps(weights + 19);
3718*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk34 = _mm_load1_ps(weights + 20);
3719*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk40 = _mm_load1_ps(weights + 21);
3720*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk41 = _mm_load1_ps(weights + 22);
3721*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk42 = _mm_load1_ps(weights + 23);
3722*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk43 = _mm_load1_ps(weights + 24);
3723*4bdc9457SAndroid Build Coastguard Worker   const __m128 vk44 = _mm_load1_ps(weights + 25);
3724*4bdc9457SAndroid Build Coastguard Worker 
3725*4bdc9457SAndroid Build Coastguard Worker   const uint32_t padding_top_less_1 = padding_top - 1;
3726*4bdc9457SAndroid Build Coastguard Worker   const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));
3727*4bdc9457SAndroid Build Coastguard Worker 
3728*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = zero;
3729*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
3730*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3731*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
3732*4bdc9457SAndroid Build Coastguard Worker     i1 = zero;
3733*4bdc9457SAndroid Build Coastguard Worker   }
3734*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3735*4bdc9457SAndroid Build Coastguard Worker   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3736*4bdc9457SAndroid Build Coastguard Worker   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3737*4bdc9457SAndroid Build Coastguard Worker   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
3738*4bdc9457SAndroid Build Coastguard Worker 
3739*4bdc9457SAndroid Build Coastguard Worker   const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
3740*4bdc9457SAndroid Build Coastguard Worker 
3741*4bdc9457SAndroid Build Coastguard Worker   float* o0 = output;
3742*4bdc9457SAndroid Build Coastguard Worker   float* o1 = (float*) ((uintptr_t) o0 + output_width);
3743*4bdc9457SAndroid Build Coastguard Worker 
3744*4bdc9457SAndroid Build Coastguard Worker   size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
3745*4bdc9457SAndroid Build Coastguard Worker   size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
3746*4bdc9457SAndroid Build Coastguard Worker   do {
3747*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(padded_input_height < 6) {
3748*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
3749*4bdc9457SAndroid Build Coastguard Worker     }
3750*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(padded_input_height < 7) {
3751*4bdc9457SAndroid Build Coastguard Worker       i4 = zero;
3752*4bdc9457SAndroid Build Coastguard Worker       o1 = o0;
3753*4bdc9457SAndroid Build Coastguard Worker     }
3754*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(padded_input_height < 8) {
3755*4bdc9457SAndroid Build Coastguard Worker       i5 = zero;
3756*4bdc9457SAndroid Build Coastguard Worker     }
3757*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(padded_input_height < 9) {
3758*4bdc9457SAndroid Build Coastguard Worker       i6 = zero;
3759*4bdc9457SAndroid Build Coastguard Worker     }
3760*4bdc9457SAndroid Build Coastguard Worker 
3761*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x6024 = _mm_setzero_ps();
3762*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x6024 = _mm_setzero_ps();
3763*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x6024 = _mm_setzero_ps();
3764*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x6024 = _mm_setzero_ps();
3765*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x6024 = _mm_setzero_ps();
3766*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x6024 = _mm_setzero_ps();
3767*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x6024 = _mm_setzero_ps();
3768*4bdc9457SAndroid Build Coastguard Worker 
3769*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x7135 = _mm_setzero_ps();
3770*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x7135 = _mm_setzero_ps();
3771*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x7135 = _mm_setzero_ps();
3772*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x7135 = _mm_setzero_ps();
3773*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x7135 = _mm_setzero_ps();
3774*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x7135 = _mm_setzero_ps();
3775*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x7135 = _mm_setzero_ps();
3776*4bdc9457SAndroid Build Coastguard Worker 
3777*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0x89AB = _mm_loadu_ps(i0);
3778*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
3779*4bdc9457SAndroid Build Coastguard Worker     i0 += 8;
3780*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1x89AB = _mm_loadu_ps(i1);
3781*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
3782*4bdc9457SAndroid Build Coastguard Worker     i1 += 8;
3783*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2x89AB = _mm_loadu_ps(i2);
3784*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
3785*4bdc9457SAndroid Build Coastguard Worker     i2 += 8;
3786*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3x89AB = _mm_loadu_ps(i3);
3787*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
3788*4bdc9457SAndroid Build Coastguard Worker     i3 += 8;
3789*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4x89AB = _mm_loadu_ps(i4);
3790*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
3791*4bdc9457SAndroid Build Coastguard Worker     i4 += 8;
3792*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5x89AB = _mm_loadu_ps(i5);
3793*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
3794*4bdc9457SAndroid Build Coastguard Worker     i5 += 8;
3795*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6x89AB = _mm_loadu_ps(i6);
3796*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
3797*4bdc9457SAndroid Build Coastguard Worker     i6 += 8;
3798*4bdc9457SAndroid Build Coastguard Worker 
3799*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3800*4bdc9457SAndroid Build Coastguard Worker     __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3801*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3802*4bdc9457SAndroid Build Coastguard Worker     __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3803*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3804*4bdc9457SAndroid Build Coastguard Worker     __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3805*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3806*4bdc9457SAndroid Build Coastguard Worker     __m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3807*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3808*4bdc9457SAndroid Build Coastguard Worker     __m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3809*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3810*4bdc9457SAndroid Build Coastguard Worker     __m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3811*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
3812*4bdc9457SAndroid Build Coastguard Worker     __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
3813*4bdc9457SAndroid Build Coastguard Worker 
3814*4bdc9457SAndroid Build Coastguard Worker     size_t w = input_width;
3815*4bdc9457SAndroid Build Coastguard Worker     for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
3816*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
3817*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
3818*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
3819*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
3820*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
3821*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
3822*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
3823*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
3824*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
3825*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
3826*4bdc9457SAndroid Build Coastguard Worker 
3827*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3828*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3829*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3830*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3831*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3832*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3833*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
3834*4bdc9457SAndroid Build Coastguard Worker 
3835*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
3836*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
3837*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
3838*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
3839*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
3840*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
3841*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
3842*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
3843*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
3844*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
3845*4bdc9457SAndroid Build Coastguard Worker 
3846*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
3847*4bdc9457SAndroid Build Coastguard Worker       vi0x6024 = vi0xE8AC;
3848*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
3849*4bdc9457SAndroid Build Coastguard Worker       vi1x6024 = vi1xE8AC;
3850*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
3851*4bdc9457SAndroid Build Coastguard Worker       vi2x6024 = vi2xE8AC;
3852*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
3853*4bdc9457SAndroid Build Coastguard Worker       vi3x6024 = vi3xE8AC;
3854*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
3855*4bdc9457SAndroid Build Coastguard Worker       vi4x6024 = vi4xE8AC;
3856*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
3857*4bdc9457SAndroid Build Coastguard Worker       vi5x6024 = vi5xE8AC;
3858*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
3859*4bdc9457SAndroid Build Coastguard Worker       vi6x6024 = vi6xE8AC;
3860*4bdc9457SAndroid Build Coastguard Worker 
3861*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3862*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3863*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3864*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3865*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3866*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3867*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
3868*4bdc9457SAndroid Build Coastguard Worker 
3869*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
3870*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
3871*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
3872*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
3873*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
3874*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
3875*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
3876*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
3877*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
3878*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
3879*4bdc9457SAndroid Build Coastguard Worker 
3880*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
3881*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
3882*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
3883*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
3884*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
3885*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
3886*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
3887*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
3888*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
3889*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
3890*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
3891*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
3892*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
3893*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
3894*4bdc9457SAndroid Build Coastguard Worker       i4 += 8;
3895*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
3896*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
3897*4bdc9457SAndroid Build Coastguard Worker       i5 += 8;
3898*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
3899*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
3900*4bdc9457SAndroid Build Coastguard Worker       i6 += 8;
3901*4bdc9457SAndroid Build Coastguard Worker 
3902*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
3903*4bdc9457SAndroid Build Coastguard Worker       vi0x7135 = vi0xF9BD;
3904*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
3905*4bdc9457SAndroid Build Coastguard Worker       vi1x7135 = vi1xF9BD;
3906*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
3907*4bdc9457SAndroid Build Coastguard Worker       vi2x7135 = vi2xF9BD;
3908*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
3909*4bdc9457SAndroid Build Coastguard Worker       vi3x7135 = vi3xF9BD;
3910*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
3911*4bdc9457SAndroid Build Coastguard Worker       vi4x7135 = vi4xF9BD;
3912*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
3913*4bdc9457SAndroid Build Coastguard Worker       vi5x7135 = vi5xF9BD;
3914*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
3915*4bdc9457SAndroid Build Coastguard Worker       vi6x7135 = vi6xF9BD;
3916*4bdc9457SAndroid Build Coastguard Worker 
3917*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3918*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3919*4bdc9457SAndroid Build Coastguard Worker       vi0x9BDF = vi0xHJLN;
3920*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3921*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3922*4bdc9457SAndroid Build Coastguard Worker       vi1x9BDF = vi1xHJLN;
3923*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3924*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3925*4bdc9457SAndroid Build Coastguard Worker       vi2x9BDF = vi2xHJLN;
3926*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3927*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3928*4bdc9457SAndroid Build Coastguard Worker       vi3x9BDF = vi3xHJLN;
3929*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3930*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3931*4bdc9457SAndroid Build Coastguard Worker       vi4x9BDF = vi4xHJLN;
3932*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3933*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3934*4bdc9457SAndroid Build Coastguard Worker       vi5x9BDF = vi5xHJLN;
3935*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
3936*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
3937*4bdc9457SAndroid Build Coastguard Worker       vi6x9BDF = vi6xHJLN;
3938*4bdc9457SAndroid Build Coastguard Worker 
3939*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
3940*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
3941*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
3942*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
3943*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
3944*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
3945*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
3946*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
3947*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
3948*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
3949*4bdc9457SAndroid Build Coastguard Worker 
3950*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
3951*4bdc9457SAndroid Build Coastguard Worker       vi0x8ACE = vi0xGIKM;
3952*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
3953*4bdc9457SAndroid Build Coastguard Worker       vi1x8ACE = vi1xGIKM;
3954*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
3955*4bdc9457SAndroid Build Coastguard Worker       vi2x8ACE = vi2xGIKM;
3956*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
3957*4bdc9457SAndroid Build Coastguard Worker       vi3x8ACE = vi3xGIKM;
3958*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
3959*4bdc9457SAndroid Build Coastguard Worker       vi4x8ACE = vi4xGIKM;
3960*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
3961*4bdc9457SAndroid Build Coastguard Worker       vi5x8ACE = vi5xGIKM;
3962*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
3963*4bdc9457SAndroid Build Coastguard Worker       vi6x8ACE = vi6xGIKM;
3964*4bdc9457SAndroid Build Coastguard Worker 
3965*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3966*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3967*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3968*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3969*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3970*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3971*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
3972*4bdc9457SAndroid Build Coastguard Worker 
3973*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
3974*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
3975*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
3976*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
3977*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
3978*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
3979*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
3980*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
3981*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
3982*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
3983*4bdc9457SAndroid Build Coastguard Worker 
3984*4bdc9457SAndroid Build Coastguard Worker 
3985*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
3986*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
3987*4bdc9457SAndroid Build Coastguard Worker 
3988*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
3989*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
3990*4bdc9457SAndroid Build Coastguard Worker 
3991*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, vo1);
3992*4bdc9457SAndroid Build Coastguard Worker       o1 += 4;
3993*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vo0);
3994*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
3995*4bdc9457SAndroid Build Coastguard Worker     }
3996*4bdc9457SAndroid Build Coastguard Worker     // Last block has 1-8 pixels to process.
3997*4bdc9457SAndroid Build Coastguard Worker     assert(w <= 8 * sizeof(float));
3998*4bdc9457SAndroid Build Coastguard Worker     assert(w >= 1 * sizeof(float));
3999*4bdc9457SAndroid Build Coastguard Worker     {
4000*4bdc9457SAndroid Build Coastguard Worker       vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
4001*4bdc9457SAndroid Build Coastguard Worker       vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
4002*4bdc9457SAndroid Build Coastguard Worker       vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
4003*4bdc9457SAndroid Build Coastguard Worker       vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
4004*4bdc9457SAndroid Build Coastguard Worker       vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
4005*4bdc9457SAndroid Build Coastguard Worker       vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
4006*4bdc9457SAndroid Build Coastguard Worker       vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
4007*4bdc9457SAndroid Build Coastguard Worker       vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
4008*4bdc9457SAndroid Build Coastguard Worker       vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
4009*4bdc9457SAndroid Build Coastguard Worker       vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
4010*4bdc9457SAndroid Build Coastguard Worker       vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
4011*4bdc9457SAndroid Build Coastguard Worker       vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
4012*4bdc9457SAndroid Build Coastguard Worker       vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
4013*4bdc9457SAndroid Build Coastguard Worker       vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
4014*4bdc9457SAndroid Build Coastguard Worker 
4015*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
4016*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
4017*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
4018*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
4019*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
4020*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
4021*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
4022*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
4023*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
4024*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
4025*4bdc9457SAndroid Build Coastguard Worker 
4026*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4027*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4028*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4029*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4030*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4031*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4032*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
4033*4bdc9457SAndroid Build Coastguard Worker 
4034*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
4035*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
4036*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
4037*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
4038*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
4039*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
4040*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
4041*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
4042*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
4043*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
4044*4bdc9457SAndroid Build Coastguard Worker 
4045*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
4046*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
4047*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
4048*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
4049*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
4050*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
4051*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
4052*4bdc9457SAndroid Build Coastguard Worker 
4053*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4054*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4055*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4056*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4057*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4058*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4059*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
4060*4bdc9457SAndroid Build Coastguard Worker 
4061*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
4062*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
4063*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
4064*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
4065*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
4066*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
4067*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
4068*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
4069*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
4070*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
4071*4bdc9457SAndroid Build Coastguard Worker 
4072*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
4073*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
4074*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
4075*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
4076*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
4077*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
4078*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
4079*4bdc9457SAndroid Build Coastguard Worker 
4080*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
4081*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
4082*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
4083*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
4084*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
4085*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
4086*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
4087*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
4088*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
4089*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
4090*4bdc9457SAndroid Build Coastguard Worker 
4091*4bdc9457SAndroid Build Coastguard Worker       const __m128 vzero = _mm_setzero_ps();
4092*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
4093*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
4094*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
4095*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
4096*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
4097*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
4098*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
4099*4bdc9457SAndroid Build Coastguard Worker 
4100*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4101*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4102*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4103*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4104*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4105*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4106*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
4107*4bdc9457SAndroid Build Coastguard Worker 
4108*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
4109*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
4110*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
4111*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
4112*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
4113*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
4114*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
4115*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
4116*4bdc9457SAndroid Build Coastguard Worker       vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
4117*4bdc9457SAndroid Build Coastguard Worker       vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
4118*4bdc9457SAndroid Build Coastguard Worker 
4119*4bdc9457SAndroid Build Coastguard Worker 
4120*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0 = _mm_max_ps(vo0p0, vmin);
4121*4bdc9457SAndroid Build Coastguard Worker       __m128 vo1 = _mm_max_ps(vo1p0, vmin);
4122*4bdc9457SAndroid Build Coastguard Worker 
4123*4bdc9457SAndroid Build Coastguard Worker       vo0 = _mm_min_ps(vo0, vmax);
4124*4bdc9457SAndroid Build Coastguard Worker       vo1 = _mm_min_ps(vo1, vmax);
4125*4bdc9457SAndroid Build Coastguard Worker 
4126*4bdc9457SAndroid Build Coastguard Worker       size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
4127*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(w_tmp >= 4) {
4128*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o1, vo1);
4129*4bdc9457SAndroid Build Coastguard Worker         o1 += 4;
4130*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o0, vo0);
4131*4bdc9457SAndroid Build Coastguard Worker         o0 += 4;
4132*4bdc9457SAndroid Build Coastguard Worker       } else {
4133*4bdc9457SAndroid Build Coastguard Worker         if (w_tmp & 2) {
4134*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o1, vo1);
4135*4bdc9457SAndroid Build Coastguard Worker           o1 += 2;
4136*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o0, vo0);
4137*4bdc9457SAndroid Build Coastguard Worker           o0 += 2;
4138*4bdc9457SAndroid Build Coastguard Worker 
4139*4bdc9457SAndroid Build Coastguard Worker           vo0 = _mm_movehl_ps(vo0, vo0);
4140*4bdc9457SAndroid Build Coastguard Worker           vo1 = _mm_movehl_ps(vo1, vo1);
4141*4bdc9457SAndroid Build Coastguard Worker         }
4142*4bdc9457SAndroid Build Coastguard Worker         if (w_tmp & 1) {
4143*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o1, vo1);
4144*4bdc9457SAndroid Build Coastguard Worker           o1 += 1;
4145*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o0, vo0);
4146*4bdc9457SAndroid Build Coastguard Worker           o0 += 1;
4147*4bdc9457SAndroid Build Coastguard Worker         }
4148*4bdc9457SAndroid Build Coastguard Worker       }
4149*4bdc9457SAndroid Build Coastguard Worker     }
4150*4bdc9457SAndroid Build Coastguard Worker 
4151*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i4 - input_decrement);
4152*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i5 - input_decrement);
4153*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i6 - input_decrement);
4154*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + input_width);
4155*4bdc9457SAndroid Build Coastguard Worker     i4 = (const float*) ((uintptr_t) i3 + input_width);
4156*4bdc9457SAndroid Build Coastguard Worker     i5 = (const float*) ((uintptr_t) i4 + input_width);
4157*4bdc9457SAndroid Build Coastguard Worker     i6 = (const float*) ((uintptr_t) i5 + input_width);
4158*4bdc9457SAndroid Build Coastguard Worker 
4159*4bdc9457SAndroid Build Coastguard Worker     o0 = o1;
4160*4bdc9457SAndroid Build Coastguard Worker     o1 = (float*) ((uintptr_t) o0 + output_width);
4161*4bdc9457SAndroid Build Coastguard Worker 
4162*4bdc9457SAndroid Build Coastguard Worker     output_height = doz(output_height, 2);
4163*4bdc9457SAndroid Build Coastguard Worker     padded_input_height = doz(padded_input_height, 4);
4164*4bdc9457SAndroid Build Coastguard Worker   } while (output_height != 0);
4165*4bdc9457SAndroid Build Coastguard Worker }
4166*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gavgpool_cw_ukernel__sse_x4(size_t elements,size_t channels,const float * input,float * output,const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS (1)])4167*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gavgpool_cw_ukernel__sse_x4(
4168*4bdc9457SAndroid Build Coastguard Worker     size_t elements,
4169*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4170*4bdc9457SAndroid Build Coastguard Worker     const float* input,
4171*4bdc9457SAndroid Build Coastguard Worker     float* output,
4172*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4173*4bdc9457SAndroid Build Coastguard Worker {
4174*4bdc9457SAndroid Build Coastguard Worker   assert(elements != 0);
4175*4bdc9457SAndroid Build Coastguard Worker   assert(elements % sizeof(float) == 0);
4176*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4177*4bdc9457SAndroid Build Coastguard Worker 
4178*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = input;
4179*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + elements);
4180*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + elements);
4181*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + elements);
4182*4bdc9457SAndroid Build Coastguard Worker 
4183*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
4184*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmultiplier = _mm_load_ps(params->sse.multiplier);
4185*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_min = _mm_load_ps(params->sse.output_min);
4186*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_max = _mm_load_ps(params->sse.output_max);
4187*4bdc9457SAndroid Build Coastguard Worker 
4188*4bdc9457SAndroid Build Coastguard Worker   while (channels >= 4) {
4189*4bdc9457SAndroid Build Coastguard Worker     __m128 vsum0 = _mm_setzero_ps();
4190*4bdc9457SAndroid Build Coastguard Worker     __m128 vsum1 = _mm_setzero_ps();
4191*4bdc9457SAndroid Build Coastguard Worker     __m128 vsum2 = _mm_setzero_ps();
4192*4bdc9457SAndroid Build Coastguard Worker     __m128 vsum3 = _mm_setzero_ps();
4193*4bdc9457SAndroid Build Coastguard Worker     size_t n = elements;
4194*4bdc9457SAndroid Build Coastguard Worker     while (n >= 4 * sizeof(float)) {
4195*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
4196*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
4197*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
4198*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
4199*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
4200*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
4201*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
4202*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
4203*4bdc9457SAndroid Build Coastguard Worker 
4204*4bdc9457SAndroid Build Coastguard Worker       vsum0 = _mm_add_ps(vsum0, vi0);
4205*4bdc9457SAndroid Build Coastguard Worker       vsum1 = _mm_add_ps(vsum1, vi1);
4206*4bdc9457SAndroid Build Coastguard Worker       vsum2 = _mm_add_ps(vsum2, vi2);
4207*4bdc9457SAndroid Build Coastguard Worker       vsum3 = _mm_add_ps(vsum3, vi3);
4208*4bdc9457SAndroid Build Coastguard Worker       n -= 4 * sizeof(float);
4209*4bdc9457SAndroid Build Coastguard Worker     }
4210*4bdc9457SAndroid Build Coastguard Worker 
4211*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(n != 0) {
4212*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4213*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + n);
4214*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_and_ps(_mm_loadu_ps(i1), vmask);
4215*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + n);
4216*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_and_ps(_mm_loadu_ps(i2), vmask);
4217*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + n);
4218*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_and_ps(_mm_loadu_ps(i3), vmask);
4219*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + n);
4220*4bdc9457SAndroid Build Coastguard Worker 
4221*4bdc9457SAndroid Build Coastguard Worker       vsum0 = _mm_add_ps(vsum0, vi0);
4222*4bdc9457SAndroid Build Coastguard Worker       vsum1 = _mm_add_ps(vsum1, vi1);
4223*4bdc9457SAndroid Build Coastguard Worker       vsum2 = _mm_add_ps(vsum2, vi2);
4224*4bdc9457SAndroid Build Coastguard Worker       vsum3 = _mm_add_ps(vsum3, vi3);
4225*4bdc9457SAndroid Build Coastguard Worker     }
4226*4bdc9457SAndroid Build Coastguard Worker 
4227*4bdc9457SAndroid Build Coastguard Worker     // Having exactly 4 rows makes this work out nicely as we end up with
4228*4bdc9457SAndroid Build Coastguard Worker     // the 4 totals in 4 different lanes of the same vector.
4229*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(_mm_unpacklo_ps(vsum0, vsum1), _mm_unpackhi_ps(vsum0, vsum1));
4230*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(_mm_unpacklo_ps(vsum2, vsum3), _mm_unpackhi_ps(vsum2, vsum3));
4231*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));
4232*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ps(vsum, vmultiplier);
4233*4bdc9457SAndroid Build Coastguard Worker 
4234*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ps(vout, voutput_min);
4235*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ps(vout, voutput_max);
4236*4bdc9457SAndroid Build Coastguard Worker 
4237*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(output, vout);
4238*4bdc9457SAndroid Build Coastguard Worker     output += 4;
4239*4bdc9457SAndroid Build Coastguard Worker     i0 = i3;
4240*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i0 + elements);
4241*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + elements);
4242*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + elements);
4243*4bdc9457SAndroid Build Coastguard Worker     channels -= 4;
4244*4bdc9457SAndroid Build Coastguard Worker   }
4245*4bdc9457SAndroid Build Coastguard Worker 
4246*4bdc9457SAndroid Build Coastguard Worker   while (channels != 0) {
4247*4bdc9457SAndroid Build Coastguard Worker     __m128 vsum = _mm_setzero_ps();
4248*4bdc9457SAndroid Build Coastguard Worker     size_t n = elements;
4249*4bdc9457SAndroid Build Coastguard Worker     while (n >= 4 * sizeof(float)) {
4250*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
4251*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
4252*4bdc9457SAndroid Build Coastguard Worker       vsum = _mm_add_ps(vsum, vi0);
4253*4bdc9457SAndroid Build Coastguard Worker       n -= 4 * sizeof(float);
4254*4bdc9457SAndroid Build Coastguard Worker     }
4255*4bdc9457SAndroid Build Coastguard Worker 
4256*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(n != 0) {
4257*4bdc9457SAndroid Build Coastguard Worker       __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
4258*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + n);
4259*4bdc9457SAndroid Build Coastguard Worker       vsum = _mm_add_ps(vsum, vi0);
4260*4bdc9457SAndroid Build Coastguard Worker     }
4261*4bdc9457SAndroid Build Coastguard Worker 
4262*4bdc9457SAndroid Build Coastguard Worker     vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
4263*4bdc9457SAndroid Build Coastguard Worker     vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
4264*4bdc9457SAndroid Build Coastguard Worker 
4265*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ss(vsum, vmultiplier);
4266*4bdc9457SAndroid Build Coastguard Worker 
4267*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ss(vout, voutput_min);
4268*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ss(vout, voutput_max);
4269*4bdc9457SAndroid Build Coastguard Worker 
4270*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ss(output, vout);
4271*4bdc9457SAndroid Build Coastguard Worker     output += 1;
4272*4bdc9457SAndroid Build Coastguard Worker     channels -= 1;
4273*4bdc9457SAndroid Build Coastguard Worker   }
4274*4bdc9457SAndroid Build Coastguard Worker }
4275*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(size_t rows,size_t channels,const float * input,size_t input_stride,const float * zero,float * buffer,float * output,const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])4276*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(
4277*4bdc9457SAndroid Build Coastguard Worker     size_t rows,
4278*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4279*4bdc9457SAndroid Build Coastguard Worker     const float* input,
4280*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
4281*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
4282*4bdc9457SAndroid Build Coastguard Worker     float* buffer,
4283*4bdc9457SAndroid Build Coastguard Worker     float* output,
4284*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4285*4bdc9457SAndroid Build Coastguard Worker {
4286*4bdc9457SAndroid Build Coastguard Worker   assert(rows > 7);
4287*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4288*4bdc9457SAndroid Build Coastguard Worker 
4289*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = input;
4290*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4291*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4292*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4293*4bdc9457SAndroid Build Coastguard Worker   const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4294*4bdc9457SAndroid Build Coastguard Worker   const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4295*4bdc9457SAndroid Build Coastguard Worker   const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4296*4bdc9457SAndroid Build Coastguard Worker   const size_t packed_channels = round_up_po2(channels, 4);
4297*4bdc9457SAndroid Build Coastguard Worker   const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);
4298*4bdc9457SAndroid Build Coastguard Worker 
4299*4bdc9457SAndroid Build Coastguard Worker   float* b = buffer;
4300*4bdc9457SAndroid Build Coastguard Worker   for (size_t c = 0; c < channels; c += 4) {
4301*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0 = _mm_loadu_ps(i0);
4302*4bdc9457SAndroid Build Coastguard Worker     i0 += 4;
4303*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1 = _mm_loadu_ps(i1);
4304*4bdc9457SAndroid Build Coastguard Worker     i1 += 4;
4305*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2 = _mm_loadu_ps(i2);
4306*4bdc9457SAndroid Build Coastguard Worker     i2 += 4;
4307*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3 = _mm_loadu_ps(i3);
4308*4bdc9457SAndroid Build Coastguard Worker     i3 += 4;
4309*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4 = _mm_loadu_ps(i4);
4310*4bdc9457SAndroid Build Coastguard Worker     i4 += 4;
4311*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5 = _mm_loadu_ps(i5);
4312*4bdc9457SAndroid Build Coastguard Worker     i5 += 4;
4313*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6 = _mm_loadu_ps(i6);
4314*4bdc9457SAndroid Build Coastguard Worker     i6 += 4;
4315*4bdc9457SAndroid Build Coastguard Worker 
4316*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4317*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4318*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4319*4bdc9457SAndroid Build Coastguard Worker 
4320*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4321*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4322*4bdc9457SAndroid Build Coastguard Worker 
4323*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4324*4bdc9457SAndroid Build Coastguard Worker 
4325*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ps(b, vsum); b += 4;
4326*4bdc9457SAndroid Build Coastguard Worker   }
4327*4bdc9457SAndroid Build Coastguard Worker   for (rows -= 7; rows > 7; rows -= 7) {
4328*4bdc9457SAndroid Build Coastguard Worker     b = buffer;
4329*4bdc9457SAndroid Build Coastguard Worker 
4330*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i0 + input_increment);
4331*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i1 + input_increment);
4332*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i2 + input_increment);
4333*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i3 + input_increment);
4334*4bdc9457SAndroid Build Coastguard Worker     i4 = (const float*) ((uintptr_t) i4 + input_increment);
4335*4bdc9457SAndroid Build Coastguard Worker     i5 = (const float*) ((uintptr_t) i5 + input_increment);
4336*4bdc9457SAndroid Build Coastguard Worker     i6 = (const float*) ((uintptr_t) i6 + input_increment);
4337*4bdc9457SAndroid Build Coastguard Worker 
4338*4bdc9457SAndroid Build Coastguard Worker     for (size_t c = 0; c < channels; c += 4) {
4339*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
4340*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
4341*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
4342*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
4343*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
4344*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
4345*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
4346*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
4347*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4 = _mm_loadu_ps(i4);
4348*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
4349*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5 = _mm_loadu_ps(i5);
4350*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
4351*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6 = _mm_loadu_ps(i6);
4352*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
4353*4bdc9457SAndroid Build Coastguard Worker       const __m128 vacc = _mm_load_ps(b);
4354*4bdc9457SAndroid Build Coastguard Worker 
4355*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4356*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4357*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4358*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4359*4bdc9457SAndroid Build Coastguard Worker 
4360*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4361*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4362*4bdc9457SAndroid Build Coastguard Worker 
4363*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4364*4bdc9457SAndroid Build Coastguard Worker 
4365*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ps(b, vsum); b += 4;
4366*4bdc9457SAndroid Build Coastguard Worker     }
4367*4bdc9457SAndroid Build Coastguard Worker   }
4368*4bdc9457SAndroid Build Coastguard Worker 
4369*4bdc9457SAndroid Build Coastguard Worker   i0 = (const float*) ((uintptr_t) i0 + input_increment);
4370*4bdc9457SAndroid Build Coastguard Worker   i1 = (const float*) ((uintptr_t) i1 + input_increment);
4371*4bdc9457SAndroid Build Coastguard Worker   if (rows < 2) {
4372*4bdc9457SAndroid Build Coastguard Worker     i1 = zero;
4373*4bdc9457SAndroid Build Coastguard Worker   }
4374*4bdc9457SAndroid Build Coastguard Worker   i2 = (const float*) ((uintptr_t) i2 + input_increment);
4375*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 2) {
4376*4bdc9457SAndroid Build Coastguard Worker     i2 = zero;
4377*4bdc9457SAndroid Build Coastguard Worker   }
4378*4bdc9457SAndroid Build Coastguard Worker   i3 = (const float*) ((uintptr_t) i3 + input_increment);
4379*4bdc9457SAndroid Build Coastguard Worker   if (rows < 4) {
4380*4bdc9457SAndroid Build Coastguard Worker     i3 = zero;
4381*4bdc9457SAndroid Build Coastguard Worker   }
4382*4bdc9457SAndroid Build Coastguard Worker   i4 = (const float*) ((uintptr_t) i4 + input_increment);
4383*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 4) {
4384*4bdc9457SAndroid Build Coastguard Worker     i4 = zero;
4385*4bdc9457SAndroid Build Coastguard Worker   }
4386*4bdc9457SAndroid Build Coastguard Worker   i5 = (const float*) ((uintptr_t) i5 + input_increment);
4387*4bdc9457SAndroid Build Coastguard Worker   if (rows < 6) {
4388*4bdc9457SAndroid Build Coastguard Worker     i5 = zero;
4389*4bdc9457SAndroid Build Coastguard Worker   }
4390*4bdc9457SAndroid Build Coastguard Worker   i6 = (const float*) ((uintptr_t) i6 + input_increment);
4391*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 6) {
4392*4bdc9457SAndroid Build Coastguard Worker     i6 = zero;
4393*4bdc9457SAndroid Build Coastguard Worker   }
4394*4bdc9457SAndroid Build Coastguard Worker   const __m128 vscale = _mm_load_ps(params->sse.scale);
4395*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
4396*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
4397*4bdc9457SAndroid Build Coastguard Worker 
4398*4bdc9457SAndroid Build Coastguard Worker   b = buffer;
4399*4bdc9457SAndroid Build Coastguard Worker   while (channels >= 4) {
4400*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0 = _mm_loadu_ps(i0);
4401*4bdc9457SAndroid Build Coastguard Worker     i0 += 4;
4402*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1 = _mm_loadu_ps(i1);
4403*4bdc9457SAndroid Build Coastguard Worker     i1 += 4;
4404*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2 = _mm_loadu_ps(i2);
4405*4bdc9457SAndroid Build Coastguard Worker     i2 += 4;
4406*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3 = _mm_loadu_ps(i3);
4407*4bdc9457SAndroid Build Coastguard Worker     i3 += 4;
4408*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4 = _mm_loadu_ps(i4);
4409*4bdc9457SAndroid Build Coastguard Worker     i4 += 4;
4410*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5 = _mm_loadu_ps(i5);
4411*4bdc9457SAndroid Build Coastguard Worker     i5 += 4;
4412*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6 = _mm_loadu_ps(i6);
4413*4bdc9457SAndroid Build Coastguard Worker     i6 += 4;
4414*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc = _mm_load_ps(b);
4415*4bdc9457SAndroid Build Coastguard Worker     b += 4;
4416*4bdc9457SAndroid Build Coastguard Worker 
4417*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4418*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4419*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4420*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4421*4bdc9457SAndroid Build Coastguard Worker 
4422*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4423*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4424*4bdc9457SAndroid Build Coastguard Worker 
4425*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4426*4bdc9457SAndroid Build Coastguard Worker 
4427*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ps(vsum, vscale);
4428*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ps(vout, vmin);
4429*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ps(vout, vmax);
4430*4bdc9457SAndroid Build Coastguard Worker 
4431*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(output, vout);
4432*4bdc9457SAndroid Build Coastguard Worker     output += 4;
4433*4bdc9457SAndroid Build Coastguard Worker 
4434*4bdc9457SAndroid Build Coastguard Worker     channels -= 4;
4435*4bdc9457SAndroid Build Coastguard Worker   }
4436*4bdc9457SAndroid Build Coastguard Worker   if (channels != 0) {
4437*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0 = _mm_loadu_ps(i0);
4438*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1 = _mm_loadu_ps(i1);
4439*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2 = _mm_loadu_ps(i2);
4440*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3 = _mm_loadu_ps(i3);
4441*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4 = _mm_loadu_ps(i4);
4442*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5 = _mm_loadu_ps(i5);
4443*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6 = _mm_loadu_ps(i6);
4444*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc = _mm_loadu_ps(b);
4445*4bdc9457SAndroid Build Coastguard Worker 
4446*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4447*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4448*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4449*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum6a = _mm_add_ps(vi6, vacc);
4450*4bdc9457SAndroid Build Coastguard Worker 
4451*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
4452*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
4453*4bdc9457SAndroid Build Coastguard Worker 
4454*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
4455*4bdc9457SAndroid Build Coastguard Worker 
4456*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ps(vsum, vscale);
4457*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ps(vout, vmin);
4458*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ps(vout, vmax);
4459*4bdc9457SAndroid Build Coastguard Worker 
4460*4bdc9457SAndroid Build Coastguard Worker     if (channels & 2) {
4461*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) output, vout);
4462*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_movehl_ps(vout, vout);
4463*4bdc9457SAndroid Build Coastguard Worker       output += 2;
4464*4bdc9457SAndroid Build Coastguard Worker     }
4465*4bdc9457SAndroid Build Coastguard Worker     if (channels & 1) {
4466*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(output, vout);
4467*4bdc9457SAndroid Build Coastguard Worker     }
4468*4bdc9457SAndroid Build Coastguard Worker   }
4469*4bdc9457SAndroid Build Coastguard Worker }
4470*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(size_t rows,size_t channels,const float * input,size_t input_stride,const float * zero,float * output,const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])4471*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(
4472*4bdc9457SAndroid Build Coastguard Worker     size_t rows,
4473*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4474*4bdc9457SAndroid Build Coastguard Worker     const float* input,
4475*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
4476*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
4477*4bdc9457SAndroid Build Coastguard Worker     float* output,
4478*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4479*4bdc9457SAndroid Build Coastguard Worker {
4480*4bdc9457SAndroid Build Coastguard Worker   assert(rows != 0);
4481*4bdc9457SAndroid Build Coastguard Worker   assert(rows <= 7);
4482*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4483*4bdc9457SAndroid Build Coastguard Worker 
4484*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = input;
4485*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
4486*4bdc9457SAndroid Build Coastguard Worker   if (rows < 2) {
4487*4bdc9457SAndroid Build Coastguard Worker     i1 = zero;
4488*4bdc9457SAndroid Build Coastguard Worker   }
4489*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
4490*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 2) {
4491*4bdc9457SAndroid Build Coastguard Worker     i2 = zero;
4492*4bdc9457SAndroid Build Coastguard Worker   }
4493*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
4494*4bdc9457SAndroid Build Coastguard Worker   if (rows < 4) {
4495*4bdc9457SAndroid Build Coastguard Worker     i3 = zero;
4496*4bdc9457SAndroid Build Coastguard Worker   }
4497*4bdc9457SAndroid Build Coastguard Worker   const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
4498*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 4) {
4499*4bdc9457SAndroid Build Coastguard Worker     i4 = zero;
4500*4bdc9457SAndroid Build Coastguard Worker   }
4501*4bdc9457SAndroid Build Coastguard Worker   const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
4502*4bdc9457SAndroid Build Coastguard Worker   if (rows < 6) {
4503*4bdc9457SAndroid Build Coastguard Worker     i5 = zero;
4504*4bdc9457SAndroid Build Coastguard Worker   }
4505*4bdc9457SAndroid Build Coastguard Worker   const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
4506*4bdc9457SAndroid Build Coastguard Worker   if (rows <= 6) {
4507*4bdc9457SAndroid Build Coastguard Worker     i6 = zero;
4508*4bdc9457SAndroid Build Coastguard Worker   }
4509*4bdc9457SAndroid Build Coastguard Worker   const __m128 vscale = _mm_load_ps(params->sse.scale);
4510*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
4511*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
4512*4bdc9457SAndroid Build Coastguard Worker 
4513*4bdc9457SAndroid Build Coastguard Worker   while (channels >= 4) {
4514*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0 = _mm_loadu_ps(i0);
4515*4bdc9457SAndroid Build Coastguard Worker     i0 += 4;
4516*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1 = _mm_loadu_ps(i1);
4517*4bdc9457SAndroid Build Coastguard Worker     i1 += 4;
4518*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2 = _mm_loadu_ps(i2);
4519*4bdc9457SAndroid Build Coastguard Worker     i2 += 4;
4520*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3 = _mm_loadu_ps(i3);
4521*4bdc9457SAndroid Build Coastguard Worker     i3 += 4;
4522*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4 = _mm_loadu_ps(i4);
4523*4bdc9457SAndroid Build Coastguard Worker     i4 += 4;
4524*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5 = _mm_loadu_ps(i5);
4525*4bdc9457SAndroid Build Coastguard Worker     i5 += 4;
4526*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6 = _mm_loadu_ps(i6);
4527*4bdc9457SAndroid Build Coastguard Worker     i6 += 4;
4528*4bdc9457SAndroid Build Coastguard Worker 
4529*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4530*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4531*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4532*4bdc9457SAndroid Build Coastguard Worker 
4533*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4534*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4535*4bdc9457SAndroid Build Coastguard Worker 
4536*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4537*4bdc9457SAndroid Build Coastguard Worker 
4538*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ps(vsum, vscale);
4539*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ps(vout, vmin);
4540*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ps(vout, vmax);
4541*4bdc9457SAndroid Build Coastguard Worker 
4542*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(output, vout);
4543*4bdc9457SAndroid Build Coastguard Worker     output += 4;
4544*4bdc9457SAndroid Build Coastguard Worker 
4545*4bdc9457SAndroid Build Coastguard Worker     channels -= 4;
4546*4bdc9457SAndroid Build Coastguard Worker   }
4547*4bdc9457SAndroid Build Coastguard Worker   if (channels != 0) {
4548*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi0 = _mm_loadu_ps(i0);
4549*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi1 = _mm_loadu_ps(i1);
4550*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi2 = _mm_loadu_ps(i2);
4551*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi3 = _mm_loadu_ps(i3);
4552*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi4 = _mm_loadu_ps(i4);
4553*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi5 = _mm_loadu_ps(i5);
4554*4bdc9457SAndroid Build Coastguard Worker     const __m128 vi6 = _mm_loadu_ps(i6);
4555*4bdc9457SAndroid Build Coastguard Worker 
4556*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum01 = _mm_add_ps(vi0, vi1);
4557*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum23 = _mm_add_ps(vi2, vi3);
4558*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum45 = _mm_add_ps(vi4, vi5);
4559*4bdc9457SAndroid Build Coastguard Worker 
4560*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
4561*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
4562*4bdc9457SAndroid Build Coastguard Worker 
4563*4bdc9457SAndroid Build Coastguard Worker     const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
4564*4bdc9457SAndroid Build Coastguard Worker 
4565*4bdc9457SAndroid Build Coastguard Worker     __m128 vout = _mm_mul_ps(vsum, vscale);
4566*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_max_ps(vout, vmin);
4567*4bdc9457SAndroid Build Coastguard Worker     vout = _mm_min_ps(vout, vmax);
4568*4bdc9457SAndroid Build Coastguard Worker 
4569*4bdc9457SAndroid Build Coastguard Worker     if (channels & 2) {
4570*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) output, vout);
4571*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_movehl_ps(vout, vout);
4572*4bdc9457SAndroid Build Coastguard Worker       output += 2;
4573*4bdc9457SAndroid Build Coastguard Worker     }
4574*4bdc9457SAndroid Build Coastguard Worker     if (channels & 1) {
4575*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(output, vout);
4576*4bdc9457SAndroid Build Coastguard Worker     }
4577*4bdc9457SAndroid Build Coastguard Worker   }
4578*4bdc9457SAndroid Build Coastguard Worker }
4579*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4580*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(
4581*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
4582*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
4583*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
4584*4bdc9457SAndroid Build Coastguard Worker     const float*restrict a,
4585*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
4586*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w,
4587*4bdc9457SAndroid Build Coastguard Worker     float*restrict c,
4588*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
4589*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
4590*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4591*4bdc9457SAndroid Build Coastguard Worker {
4592*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
4593*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
4594*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
4595*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
4596*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
4597*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
4598*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
4599*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
4600*4bdc9457SAndroid Build Coastguard Worker 
4601*4bdc9457SAndroid Build Coastguard Worker   const float* a0 = a;
4602*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
4603*4bdc9457SAndroid Build Coastguard Worker 
4604*4bdc9457SAndroid Build Coastguard Worker   do {
4605*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0123 = _mm_load_ps(w + 0);
4606*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x4567 = _mm_load_ps(w + 4);
4607*4bdc9457SAndroid Build Coastguard Worker     w += 8;
4608*4bdc9457SAndroid Build Coastguard Worker 
4609*4bdc9457SAndroid Build Coastguard Worker     size_t k = kc;
4610*4bdc9457SAndroid Build Coastguard Worker     do {
4611*4bdc9457SAndroid Build Coastguard Worker       const __m128 va0 = _mm_load1_ps(a0);
4612*4bdc9457SAndroid Build Coastguard Worker       a0 += 1;
4613*4bdc9457SAndroid Build Coastguard Worker 
4614*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0123 = _mm_load_ps(w);
4615*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb4567 = _mm_load_ps(w + 4);
4616*4bdc9457SAndroid Build Coastguard Worker       w += 8;
4617*4bdc9457SAndroid Build Coastguard Worker 
4618*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4619*4bdc9457SAndroid Build Coastguard Worker       vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4620*4bdc9457SAndroid Build Coastguard Worker 
4621*4bdc9457SAndroid Build Coastguard Worker       k -= sizeof(float);
4622*4bdc9457SAndroid Build Coastguard Worker     } while (k != 0);
4623*4bdc9457SAndroid Build Coastguard Worker 
4624*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
4625*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4626*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4627*4bdc9457SAndroid Build Coastguard Worker 
4628*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
4629*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4630*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4631*4bdc9457SAndroid Build Coastguard Worker 
4632*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 8) {
4633*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0, vacc0x0123);
4634*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0 + 4, vacc0x4567);
4635*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4636*4bdc9457SAndroid Build Coastguard Worker 
4637*4bdc9457SAndroid Build Coastguard Worker       a0 = (const float*) ((uintptr_t) a0 - kc);
4638*4bdc9457SAndroid Build Coastguard Worker 
4639*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
4640*4bdc9457SAndroid Build Coastguard Worker     } else {
4641*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
4642*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c0, vacc0x0123);
4643*4bdc9457SAndroid Build Coastguard Worker 
4644*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = vacc0x4567;
4645*4bdc9457SAndroid Build Coastguard Worker 
4646*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
4647*4bdc9457SAndroid Build Coastguard Worker       }
4648*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
4649*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c0, vacc0x0123);
4650*4bdc9457SAndroid Build Coastguard Worker 
4651*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4652*4bdc9457SAndroid Build Coastguard Worker 
4653*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
4654*4bdc9457SAndroid Build Coastguard Worker       }
4655*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
4656*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c0, vacc0x0123);
4657*4bdc9457SAndroid Build Coastguard Worker       }
4658*4bdc9457SAndroid Build Coastguard Worker 
4659*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
4660*4bdc9457SAndroid Build Coastguard Worker     }
4661*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
4662*4bdc9457SAndroid Build Coastguard Worker }
4663*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gemm_minmax_ukernel_4x2c4__sse(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4664*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_4x2c4__sse(
4665*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
4666*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
4667*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
4668*4bdc9457SAndroid Build Coastguard Worker     const float* restrict a,
4669*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
4670*4bdc9457SAndroid Build Coastguard Worker     const float* restrict w,
4671*4bdc9457SAndroid Build Coastguard Worker     float* restrict c,
4672*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
4673*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
4674*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4675*4bdc9457SAndroid Build Coastguard Worker {
4676*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
4677*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
4678*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
4679*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
4680*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
4681*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
4682*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
4683*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
4684*4bdc9457SAndroid Build Coastguard Worker 
4685*4bdc9457SAndroid Build Coastguard Worker   const float* a0 = a;
4686*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
4687*4bdc9457SAndroid Build Coastguard Worker   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4688*4bdc9457SAndroid Build Coastguard Worker   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4689*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
4690*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
4691*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
4692*4bdc9457SAndroid Build Coastguard Worker   }
4693*4bdc9457SAndroid Build Coastguard Worker   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4694*4bdc9457SAndroid Build Coastguard Worker   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4695*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
4696*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
4697*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
4698*4bdc9457SAndroid Build Coastguard Worker   }
4699*4bdc9457SAndroid Build Coastguard Worker   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4700*4bdc9457SAndroid Build Coastguard Worker   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4701*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
4702*4bdc9457SAndroid Build Coastguard Worker     a3 = a2;
4703*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
4704*4bdc9457SAndroid Build Coastguard Worker   }
4705*4bdc9457SAndroid Build Coastguard Worker 
4706*4bdc9457SAndroid Build Coastguard Worker   do {
4707*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0c4 = _mm_load_ss(w);
4708*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x1c4 = _mm_load_ss(w + 1);
4709*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x0c4 = vacc0x0c4;
4710*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x1c4 = vacc0x1c4;
4711*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x0c4 = vacc0x0c4;
4712*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x1c4 = vacc0x1c4;
4713*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x0c4 = vacc0x0c4;
4714*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x1c4 = vacc0x1c4;
4715*4bdc9457SAndroid Build Coastguard Worker     w += 2;
4716*4bdc9457SAndroid Build Coastguard Worker 
4717*4bdc9457SAndroid Build Coastguard Worker     size_t k = kc;
4718*4bdc9457SAndroid Build Coastguard Worker     for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
4719*4bdc9457SAndroid Build Coastguard Worker       const __m128 va0 = _mm_loadu_ps(a0);
4720*4bdc9457SAndroid Build Coastguard Worker       a0 += 4;
4721*4bdc9457SAndroid Build Coastguard Worker       const __m128 va1 = _mm_loadu_ps(a1);
4722*4bdc9457SAndroid Build Coastguard Worker       a1 += 4;
4723*4bdc9457SAndroid Build Coastguard Worker       const __m128 va2 = _mm_loadu_ps(a2);
4724*4bdc9457SAndroid Build Coastguard Worker       a2 += 4;
4725*4bdc9457SAndroid Build Coastguard Worker       const __m128 va3 = _mm_loadu_ps(a3);
4726*4bdc9457SAndroid Build Coastguard Worker       a3 += 4;
4727*4bdc9457SAndroid Build Coastguard Worker 
4728*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0 = _mm_loadu_ps(w);
4729*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb1 = _mm_loadu_ps(w + 4);
4730*4bdc9457SAndroid Build Coastguard Worker       w += 8;
4731*4bdc9457SAndroid Build Coastguard Worker 
4732*4bdc9457SAndroid Build Coastguard Worker       vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
4733*4bdc9457SAndroid Build Coastguard Worker       vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
4734*4bdc9457SAndroid Build Coastguard Worker       vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
4735*4bdc9457SAndroid Build Coastguard Worker       vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
4736*4bdc9457SAndroid Build Coastguard Worker       vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
4737*4bdc9457SAndroid Build Coastguard Worker       vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
4738*4bdc9457SAndroid Build Coastguard Worker       vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
4739*4bdc9457SAndroid Build Coastguard Worker       vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
4740*4bdc9457SAndroid Build Coastguard Worker     }
4741*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(k != 0) {
4742*4bdc9457SAndroid Build Coastguard Worker       const __m128 va0 = _mm_loadu_ps(a0);
4743*4bdc9457SAndroid Build Coastguard Worker       a0 = (const float*) ((uintptr_t) a0 + k);
4744*4bdc9457SAndroid Build Coastguard Worker       const __m128 va1 = _mm_loadu_ps(a1);
4745*4bdc9457SAndroid Build Coastguard Worker       a1 = (const float*) ((uintptr_t) a1 + k);
4746*4bdc9457SAndroid Build Coastguard Worker       const __m128 va2 = _mm_loadu_ps(a2);
4747*4bdc9457SAndroid Build Coastguard Worker       a2 = (const float*) ((uintptr_t) a2 + k);
4748*4bdc9457SAndroid Build Coastguard Worker       const __m128 va3 = _mm_loadu_ps(a3);
4749*4bdc9457SAndroid Build Coastguard Worker       a3 = (const float*) ((uintptr_t) a3 + k);
4750*4bdc9457SAndroid Build Coastguard Worker 
4751*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0 = _mm_loadu_ps(w);
4752*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb1 = _mm_loadu_ps(w + 4);
4753*4bdc9457SAndroid Build Coastguard Worker       w += 8;
4754*4bdc9457SAndroid Build Coastguard Worker 
4755*4bdc9457SAndroid Build Coastguard Worker       const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
4756*4bdc9457SAndroid Build Coastguard Worker       const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
4757*4bdc9457SAndroid Build Coastguard Worker 
4758*4bdc9457SAndroid Build Coastguard Worker       vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
4759*4bdc9457SAndroid Build Coastguard Worker       vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
4760*4bdc9457SAndroid Build Coastguard Worker       vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
4761*4bdc9457SAndroid Build Coastguard Worker       vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
4762*4bdc9457SAndroid Build Coastguard Worker       vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
4763*4bdc9457SAndroid Build Coastguard Worker       vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
4764*4bdc9457SAndroid Build Coastguard Worker       vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
4765*4bdc9457SAndroid Build Coastguard Worker       vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
4766*4bdc9457SAndroid Build Coastguard Worker     }
4767*4bdc9457SAndroid Build Coastguard Worker 
4768*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
4769*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
4770*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
4771*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
4772*4bdc9457SAndroid Build Coastguard Worker 
4773*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
4774*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
4775*4bdc9457SAndroid Build Coastguard Worker 
4776*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
4777*4bdc9457SAndroid Build Coastguard Worker     vacc01x01 = _mm_min_ps(vacc01x01, vmax);
4778*4bdc9457SAndroid Build Coastguard Worker     vacc23x01 = _mm_min_ps(vacc23x01, vmax);
4779*4bdc9457SAndroid Build Coastguard Worker 
4780*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
4781*4bdc9457SAndroid Build Coastguard Worker     vacc01x01 = _mm_max_ps(vacc01x01, vmin);
4782*4bdc9457SAndroid Build Coastguard Worker     vacc23x01 = _mm_max_ps(vacc23x01, vmin);
4783*4bdc9457SAndroid Build Coastguard Worker 
4784*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 2) {
4785*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) c2, vacc23x01);
4786*4bdc9457SAndroid Build Coastguard Worker       c2 = (float*) ((uintptr_t) c2 + cn_stride);
4787*4bdc9457SAndroid Build Coastguard Worker       a2 = (const float*) ((uintptr_t) a2 - kc);
4788*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c3, vacc23x01);
4789*4bdc9457SAndroid Build Coastguard Worker       c3 = (float*) ((uintptr_t) c3 + cn_stride);
4790*4bdc9457SAndroid Build Coastguard Worker       a3 = (const float*) ((uintptr_t) a3 - kc);
4791*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) c0, vacc01x01);
4792*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4793*4bdc9457SAndroid Build Coastguard Worker       a0 = (const float*) ((uintptr_t) a0 - kc);
4794*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c1, vacc01x01);
4795*4bdc9457SAndroid Build Coastguard Worker       c1 = (float*) ((uintptr_t) c1 + cn_stride);
4796*4bdc9457SAndroid Build Coastguard Worker       a1 = (const float*) ((uintptr_t) a1 - kc);
4797*4bdc9457SAndroid Build Coastguard Worker 
4798*4bdc9457SAndroid Build Coastguard Worker       nc -= 2;
4799*4bdc9457SAndroid Build Coastguard Worker     } else {
4800*4bdc9457SAndroid Build Coastguard Worker       assert(nc == 1);
4801*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c2, vacc23x01);
4802*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
4803*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c0, vacc01x01);
4804*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
4805*4bdc9457SAndroid Build Coastguard Worker 
4806*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
4807*4bdc9457SAndroid Build Coastguard Worker     }
4808*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
4809*4bdc9457SAndroid Build Coastguard Worker }
4810*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4811*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(
4812*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
4813*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
4814*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
4815*4bdc9457SAndroid Build Coastguard Worker     const float*restrict a,
4816*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
4817*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w,
4818*4bdc9457SAndroid Build Coastguard Worker     float*restrict c,
4819*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
4820*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
4821*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
4822*4bdc9457SAndroid Build Coastguard Worker {
4823*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
4824*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
4825*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
4826*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
4827*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
4828*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
4829*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
4830*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
4831*4bdc9457SAndroid Build Coastguard Worker 
4832*4bdc9457SAndroid Build Coastguard Worker   const float* a0 = a;
4833*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
4834*4bdc9457SAndroid Build Coastguard Worker   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
4835*4bdc9457SAndroid Build Coastguard Worker   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
4836*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
4837*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
4838*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
4839*4bdc9457SAndroid Build Coastguard Worker   }
4840*4bdc9457SAndroid Build Coastguard Worker   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
4841*4bdc9457SAndroid Build Coastguard Worker   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
4842*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
4843*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
4844*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
4845*4bdc9457SAndroid Build Coastguard Worker   }
4846*4bdc9457SAndroid Build Coastguard Worker   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
4847*4bdc9457SAndroid Build Coastguard Worker   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
4848*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
4849*4bdc9457SAndroid Build Coastguard Worker     a3 = a2;
4850*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
4851*4bdc9457SAndroid Build Coastguard Worker   }
4852*4bdc9457SAndroid Build Coastguard Worker 
4853*4bdc9457SAndroid Build Coastguard Worker   do {
4854*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0123 = _mm_load_ps(w + 0);
4855*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x4567 = _mm_load_ps(w + 4);
4856*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x0123 = vacc0x0123;
4857*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x4567 = vacc0x4567;
4858*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x0123 = vacc0x0123;
4859*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x4567 = vacc0x4567;
4860*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x0123 = vacc0x0123;
4861*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x4567 = vacc0x4567;
4862*4bdc9457SAndroid Build Coastguard Worker     w += 8;
4863*4bdc9457SAndroid Build Coastguard Worker 
4864*4bdc9457SAndroid Build Coastguard Worker     size_t k = kc;
4865*4bdc9457SAndroid Build Coastguard Worker     do {
4866*4bdc9457SAndroid Build Coastguard Worker       const __m128 va0 = _mm_load1_ps(a0);
4867*4bdc9457SAndroid Build Coastguard Worker       a0 += 1;
4868*4bdc9457SAndroid Build Coastguard Worker       const __m128 va1 = _mm_load1_ps(a1);
4869*4bdc9457SAndroid Build Coastguard Worker       a1 += 1;
4870*4bdc9457SAndroid Build Coastguard Worker       const __m128 va2 = _mm_load1_ps(a2);
4871*4bdc9457SAndroid Build Coastguard Worker       a2 += 1;
4872*4bdc9457SAndroid Build Coastguard Worker       const __m128 va3 = _mm_load1_ps(a3);
4873*4bdc9457SAndroid Build Coastguard Worker       a3 += 1;
4874*4bdc9457SAndroid Build Coastguard Worker 
4875*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0123 = _mm_load_ps(w);
4876*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb4567 = _mm_load_ps(w + 4);
4877*4bdc9457SAndroid Build Coastguard Worker       w += 8;
4878*4bdc9457SAndroid Build Coastguard Worker 
4879*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
4880*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
4881*4bdc9457SAndroid Build Coastguard Worker       vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
4882*4bdc9457SAndroid Build Coastguard Worker       vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
4883*4bdc9457SAndroid Build Coastguard Worker       vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
4884*4bdc9457SAndroid Build Coastguard Worker       vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
4885*4bdc9457SAndroid Build Coastguard Worker       vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
4886*4bdc9457SAndroid Build Coastguard Worker       vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
4887*4bdc9457SAndroid Build Coastguard Worker 
4888*4bdc9457SAndroid Build Coastguard Worker       k -= sizeof(float);
4889*4bdc9457SAndroid Build Coastguard Worker     } while (k != 0);
4890*4bdc9457SAndroid Build Coastguard Worker 
4891*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
4892*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
4893*4bdc9457SAndroid Build Coastguard Worker     vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
4894*4bdc9457SAndroid Build Coastguard Worker     vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
4895*4bdc9457SAndroid Build Coastguard Worker     vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
4896*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
4897*4bdc9457SAndroid Build Coastguard Worker     vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
4898*4bdc9457SAndroid Build Coastguard Worker     vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
4899*4bdc9457SAndroid Build Coastguard Worker     vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
4900*4bdc9457SAndroid Build Coastguard Worker 
4901*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
4902*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
4903*4bdc9457SAndroid Build Coastguard Worker     vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
4904*4bdc9457SAndroid Build Coastguard Worker     vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
4905*4bdc9457SAndroid Build Coastguard Worker     vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
4906*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
4907*4bdc9457SAndroid Build Coastguard Worker     vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
4908*4bdc9457SAndroid Build Coastguard Worker     vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
4909*4bdc9457SAndroid Build Coastguard Worker     vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
4910*4bdc9457SAndroid Build Coastguard Worker 
4911*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 8) {
4912*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c3, vacc3x0123);
4913*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c3 + 4, vacc3x4567);
4914*4bdc9457SAndroid Build Coastguard Worker       c3 = (float*) ((uintptr_t) c3 + cn_stride);
4915*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c2, vacc2x0123);
4916*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c2 + 4, vacc2x4567);
4917*4bdc9457SAndroid Build Coastguard Worker       c2 = (float*) ((uintptr_t) c2 + cn_stride);
4918*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c1, vacc1x0123);
4919*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c1 + 4, vacc1x4567);
4920*4bdc9457SAndroid Build Coastguard Worker       c1 = (float*) ((uintptr_t) c1 + cn_stride);
4921*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0, vacc0x0123);
4922*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0 + 4, vacc0x4567);
4923*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
4924*4bdc9457SAndroid Build Coastguard Worker 
4925*4bdc9457SAndroid Build Coastguard Worker       a3 = (const float*) ((uintptr_t) a3 - kc);
4926*4bdc9457SAndroid Build Coastguard Worker       a2 = (const float*) ((uintptr_t) a2 - kc);
4927*4bdc9457SAndroid Build Coastguard Worker       a1 = (const float*) ((uintptr_t) a1 - kc);
4928*4bdc9457SAndroid Build Coastguard Worker       a0 = (const float*) ((uintptr_t) a0 - kc);
4929*4bdc9457SAndroid Build Coastguard Worker 
4930*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
4931*4bdc9457SAndroid Build Coastguard Worker     } else {
4932*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
4933*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c3, vacc3x0123);
4934*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c2, vacc2x0123);
4935*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c1, vacc1x0123);
4936*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c0, vacc0x0123);
4937*4bdc9457SAndroid Build Coastguard Worker 
4938*4bdc9457SAndroid Build Coastguard Worker         vacc3x0123 = vacc3x4567;
4939*4bdc9457SAndroid Build Coastguard Worker         vacc2x0123 = vacc2x4567;
4940*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = vacc1x4567;
4941*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = vacc0x4567;
4942*4bdc9457SAndroid Build Coastguard Worker 
4943*4bdc9457SAndroid Build Coastguard Worker         c3 += 4;
4944*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
4945*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
4946*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
4947*4bdc9457SAndroid Build Coastguard Worker       }
4948*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
4949*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c3, vacc3x0123);
4950*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c2, vacc2x0123);
4951*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c1, vacc1x0123);
4952*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c0, vacc0x0123);
4953*4bdc9457SAndroid Build Coastguard Worker 
4954*4bdc9457SAndroid Build Coastguard Worker         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
4955*4bdc9457SAndroid Build Coastguard Worker         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
4956*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
4957*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4958*4bdc9457SAndroid Build Coastguard Worker 
4959*4bdc9457SAndroid Build Coastguard Worker         c3 += 2;
4960*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
4961*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
4962*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
4963*4bdc9457SAndroid Build Coastguard Worker       }
4964*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
4965*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c3, vacc3x0123);
4966*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c2, vacc2x0123);
4967*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c1, vacc1x0123);
4968*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c0, vacc0x0123);
4969*4bdc9457SAndroid Build Coastguard Worker       }
4970*4bdc9457SAndroid Build Coastguard Worker 
4971*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
4972*4bdc9457SAndroid Build Coastguard Worker     }
4973*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
4974*4bdc9457SAndroid Build Coastguard Worker }
4975*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_ibilinear_chw_ukernel__sse_p8(size_t output_pixels,size_t channels,const float ** restrict input,size_t input_offset,const float * restrict weights,float * restrict output,size_t input_increment)4976*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_ibilinear_chw_ukernel__sse_p8(
4977*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
4978*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4979*4bdc9457SAndroid Build Coastguard Worker     const float**restrict input,
4980*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
4981*4bdc9457SAndroid Build Coastguard Worker     const float*restrict weights,
4982*4bdc9457SAndroid Build Coastguard Worker     float*restrict output,
4983*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment) XNN_OOB_READS
4984*4bdc9457SAndroid Build Coastguard Worker {
4985*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
4986*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4987*4bdc9457SAndroid Build Coastguard Worker   assert(input_increment % sizeof(float) == 0);
4988*4bdc9457SAndroid Build Coastguard Worker 
4989*4bdc9457SAndroid Build Coastguard Worker   do {
4990*4bdc9457SAndroid Build Coastguard Worker     const float** i = input;
4991*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
4992*4bdc9457SAndroid Build Coastguard Worker     size_t p = output_pixels;
4993*4bdc9457SAndroid Build Coastguard Worker     for (; p >= 8; p -= 8) {
4994*4bdc9457SAndroid Build Coastguard Worker       const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
4995*4bdc9457SAndroid Build Coastguard Worker       const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
4996*4bdc9457SAndroid Build Coastguard Worker       const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
4997*4bdc9457SAndroid Build Coastguard Worker       const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
4998*4bdc9457SAndroid Build Coastguard Worker       const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
4999*4bdc9457SAndroid Build Coastguard Worker       const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5000*4bdc9457SAndroid Build Coastguard Worker       const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5001*4bdc9457SAndroid Build Coastguard Worker       const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5002*4bdc9457SAndroid Build Coastguard Worker       const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
5003*4bdc9457SAndroid Build Coastguard Worker       const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
5004*4bdc9457SAndroid Build Coastguard Worker       const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
5005*4bdc9457SAndroid Build Coastguard Worker       const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
5006*4bdc9457SAndroid Build Coastguard Worker       const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
5007*4bdc9457SAndroid Build Coastguard Worker       const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
5008*4bdc9457SAndroid Build Coastguard Worker       const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
5009*4bdc9457SAndroid Build Coastguard Worker       const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
5010*4bdc9457SAndroid Build Coastguard Worker       i += 2 * 8;
5011*4bdc9457SAndroid Build Coastguard Worker 
5012*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw0123p0 = _mm_loadu_ps(w + 0);
5013*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw0123p1 = _mm_loadu_ps(w + 4);
5014*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw4567p0 = _mm_loadu_ps(w + 8);
5015*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw4567p1 = _mm_loadu_ps(w + 12);
5016*4bdc9457SAndroid Build Coastguard Worker       w += 2 * 8;
5017*4bdc9457SAndroid Build Coastguard Worker 
5018*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5019*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5020*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5021*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5022*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl4);
5023*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl4);
5024*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl6);
5025*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl6);
5026*4bdc9457SAndroid Build Coastguard Worker 
5027*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphah0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(2, 0, 2, 0));
5028*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphav0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(3, 1, 3, 1));
5029*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphah4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(2, 0, 2, 0));
5030*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphav4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(3, 1, 3, 1));
5031*4bdc9457SAndroid Build Coastguard Worker 
5032*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5033*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5034*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5035*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5036*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr45 = _mm_loadh_pi(vtltr4, (const __m64*) itl5);
5037*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr45 = _mm_loadh_pi(vblbr4, (const __m64*) ibl5);
5038*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr67 = _mm_loadh_pi(vtltr6, (const __m64*) itl7);
5039*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr67 = _mm_loadh_pi(vblbr6, (const __m64*) ibl7);
5040*4bdc9457SAndroid Build Coastguard Worker 
5041*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5042*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5043*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd45 = _mm_sub_ps(vblbr45, vtltr45);
5044*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd67 = _mm_sub_ps(vblbr67, vtltr67);
5045*4bdc9457SAndroid Build Coastguard Worker 
5046*4bdc9457SAndroid Build Coastguard Worker       const __m128 vld0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5047*4bdc9457SAndroid Build Coastguard Worker       const __m128 vrd0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5048*4bdc9457SAndroid Build Coastguard Worker       const __m128 vld4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(2, 0, 2, 0));
5049*4bdc9457SAndroid Build Coastguard Worker       const __m128 vrd4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(3, 1, 3, 1));
5050*4bdc9457SAndroid Build Coastguard Worker 
5051*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5052*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5053*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(2, 0, 2, 0));
5054*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(3, 1, 3, 1));
5055*4bdc9457SAndroid Build Coastguard Worker 
5056*4bdc9457SAndroid Build Coastguard Worker       const __m128 vl0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vld0123, valphav0123));
5057*4bdc9457SAndroid Build Coastguard Worker       const __m128 vr0123 = _mm_add_ps(vtr0123, _mm_mul_ps(vrd0123, valphav0123));
5058*4bdc9457SAndroid Build Coastguard Worker       const __m128 vl4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vld4567, valphav4567));
5059*4bdc9457SAndroid Build Coastguard Worker       const __m128 vr4567 = _mm_add_ps(vtr4567, _mm_mul_ps(vrd4567, valphav4567));
5060*4bdc9457SAndroid Build Coastguard Worker 
5061*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd0123 = _mm_sub_ps(vr0123, vl0123);
5062*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd4567 = _mm_sub_ps(vr4567, vl4567);
5063*4bdc9457SAndroid Build Coastguard Worker 
5064*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo0123 = _mm_add_ps(vl0123, _mm_mul_ps(vd0123, valphah0123));
5065*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo4567 = _mm_add_ps(vl4567, _mm_mul_ps(vd4567, valphah4567));
5066*4bdc9457SAndroid Build Coastguard Worker 
5067*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 0, vo0123);
5068*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vo4567);
5069*4bdc9457SAndroid Build Coastguard Worker       output += 8;
5070*4bdc9457SAndroid Build Coastguard Worker     }
5071*4bdc9457SAndroid Build Coastguard Worker 
5072*4bdc9457SAndroid Build Coastguard Worker     for (; p >= 4; p -= 4) {
5073*4bdc9457SAndroid Build Coastguard Worker       const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5074*4bdc9457SAndroid Build Coastguard Worker       const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5075*4bdc9457SAndroid Build Coastguard Worker       const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5076*4bdc9457SAndroid Build Coastguard Worker       const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5077*4bdc9457SAndroid Build Coastguard Worker       const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
5078*4bdc9457SAndroid Build Coastguard Worker       const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
5079*4bdc9457SAndroid Build Coastguard Worker       const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
5080*4bdc9457SAndroid Build Coastguard Worker       const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
5081*4bdc9457SAndroid Build Coastguard Worker       i += 8;
5082*4bdc9457SAndroid Build Coastguard Worker 
5083*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw0 = _mm_loadu_ps(w);
5084*4bdc9457SAndroid Build Coastguard Worker       const __m128 vw1 = _mm_loadu_ps(w + 4);
5085*4bdc9457SAndroid Build Coastguard Worker       w += 8;
5086*4bdc9457SAndroid Build Coastguard Worker 
5087*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
5088*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
5089*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
5090*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
5091*4bdc9457SAndroid Build Coastguard Worker 
5092*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphah = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(2, 0, 2, 0));
5093*4bdc9457SAndroid Build Coastguard Worker       const __m128 valphav = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(3, 1, 3, 1));
5094*4bdc9457SAndroid Build Coastguard Worker 
5095*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
5096*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
5097*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
5098*4bdc9457SAndroid Build Coastguard Worker       const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
5099*4bdc9457SAndroid Build Coastguard Worker 
5100*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
5101*4bdc9457SAndroid Build Coastguard Worker       const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
5102*4bdc9457SAndroid Build Coastguard Worker 
5103*4bdc9457SAndroid Build Coastguard Worker       const __m128 vld = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
5104*4bdc9457SAndroid Build Coastguard Worker       const __m128 vrd = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
5105*4bdc9457SAndroid Build Coastguard Worker 
5106*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
5107*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
5108*4bdc9457SAndroid Build Coastguard Worker 
5109*4bdc9457SAndroid Build Coastguard Worker       const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5110*4bdc9457SAndroid Build Coastguard Worker       const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5111*4bdc9457SAndroid Build Coastguard Worker 
5112*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd = _mm_sub_ps(vr, vl);
5113*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5114*4bdc9457SAndroid Build Coastguard Worker 
5115*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vo);
5116*4bdc9457SAndroid Build Coastguard Worker       output += 4;
5117*4bdc9457SAndroid Build Coastguard Worker     }
5118*4bdc9457SAndroid Build Coastguard Worker 
5119*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(p != 0) {
5120*4bdc9457SAndroid Build Coastguard Worker       if (p & 2) {
5121*4bdc9457SAndroid Build Coastguard Worker         const __m128 vw = _mm_loadu_ps(w);
5122*4bdc9457SAndroid Build Coastguard Worker         w += 4;
5123*4bdc9457SAndroid Build Coastguard Worker 
5124*4bdc9457SAndroid Build Coastguard Worker         const __m128 valphah = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(2, 0, 2, 0));
5125*4bdc9457SAndroid Build Coastguard Worker         const __m128 valphav = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(3, 1, 3, 1));
5126*4bdc9457SAndroid Build Coastguard Worker 
5127*4bdc9457SAndroid Build Coastguard Worker         const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
5128*4bdc9457SAndroid Build Coastguard Worker         const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
5129*4bdc9457SAndroid Build Coastguard Worker         const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
5130*4bdc9457SAndroid Build Coastguard Worker         const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
5131*4bdc9457SAndroid Build Coastguard Worker         i += 4;
5132*4bdc9457SAndroid Build Coastguard Worker 
5133*4bdc9457SAndroid Build Coastguard Worker         const __m128 vtltr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0), (const __m64*) itl1);
5134*4bdc9457SAndroid Build Coastguard Worker         const __m128 vblbr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0), (const __m64*) ibl1);
5135*4bdc9457SAndroid Build Coastguard Worker 
5136*4bdc9457SAndroid Build Coastguard Worker         const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5137*4bdc9457SAndroid Build Coastguard Worker         const __m128 vld = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(2, 0, 2, 0));
5138*4bdc9457SAndroid Build Coastguard Worker         const __m128 vrd = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(3, 1, 3, 1));
5139*4bdc9457SAndroid Build Coastguard Worker 
5140*4bdc9457SAndroid Build Coastguard Worker         const __m128 vtl = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(2, 0, 2, 0));
5141*4bdc9457SAndroid Build Coastguard Worker         const __m128 vtr = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(3, 1, 3, 1));
5142*4bdc9457SAndroid Build Coastguard Worker 
5143*4bdc9457SAndroid Build Coastguard Worker         const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
5144*4bdc9457SAndroid Build Coastguard Worker         const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
5145*4bdc9457SAndroid Build Coastguard Worker 
5146*4bdc9457SAndroid Build Coastguard Worker         const __m128 vd = _mm_sub_ps(vr, vl);
5147*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
5148*4bdc9457SAndroid Build Coastguard Worker 
5149*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vo);
5150*4bdc9457SAndroid Build Coastguard Worker         output += 2;
5151*4bdc9457SAndroid Build Coastguard Worker       }
5152*4bdc9457SAndroid Build Coastguard Worker 
5153*4bdc9457SAndroid Build Coastguard Worker       if (p & 1) {
5154*4bdc9457SAndroid Build Coastguard Worker         // We are computing the following formula:
5155*4bdc9457SAndroid Build Coastguard Worker         //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
5156*4bdc9457SAndroid Build Coastguard Worker         //                 alpha_h  * (1 - alpha_v) * top_right +
5157*4bdc9457SAndroid Build Coastguard Worker         //            (1 - alpha_h) *      alpha_v  * bottom_left +
5158*4bdc9457SAndroid Build Coastguard Worker         //                 alpha_h  *      alpha_v  * bottom_right.
5159*4bdc9457SAndroid Build Coastguard Worker         //
5160*4bdc9457SAndroid Build Coastguard Worker         // Rearranging gives
5161*4bdc9457SAndroid Build Coastguard Worker         //   result =    left + alpha_h * (right        - left),
5162*4bdc9457SAndroid Build Coastguard Worker         // where
5163*4bdc9457SAndroid Build Coastguard Worker         //   left =  top_left + alpha_v * (bottom_left  - top_left),
5164*4bdc9457SAndroid Build Coastguard Worker         //  right = top_right + alpha_v * (bottom_right - top_right).
5165*4bdc9457SAndroid Build Coastguard Worker 
5166*4bdc9457SAndroid Build Coastguard Worker         const float alphah = *w;
5167*4bdc9457SAndroid Build Coastguard Worker         const __m128 valphav = _mm_load_ps1(w + 1);
5168*4bdc9457SAndroid Build Coastguard Worker         w += 2;
5169*4bdc9457SAndroid Build Coastguard Worker 
5170*4bdc9457SAndroid Build Coastguard Worker         const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
5171*4bdc9457SAndroid Build Coastguard Worker         const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
5172*4bdc9457SAndroid Build Coastguard Worker         i += 2;
5173*4bdc9457SAndroid Build Coastguard Worker 
5174*4bdc9457SAndroid Build Coastguard Worker         const __m128 vtltr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl);
5175*4bdc9457SAndroid Build Coastguard Worker         const __m128 vblbr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl);
5176*4bdc9457SAndroid Build Coastguard Worker 
5177*4bdc9457SAndroid Build Coastguard Worker         // Compute at once
5178*4bdc9457SAndroid Build Coastguard Worker         //    left_diff = bottom_left  - top_left
5179*4bdc9457SAndroid Build Coastguard Worker         //   right_diff = bottom_right - top_right
5180*4bdc9457SAndroid Build Coastguard Worker         const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
5181*4bdc9457SAndroid Build Coastguard Worker         const __m128 vlr = _mm_add_ps(vtltr, _mm_mul_ps(vldrd, valphav));
5182*4bdc9457SAndroid Build Coastguard Worker 
5183*4bdc9457SAndroid Build Coastguard Worker         // Extract them and compute the result.
5184*4bdc9457SAndroid Build Coastguard Worker         const float l = _mm_cvtss_f32(vlr);
5185*4bdc9457SAndroid Build Coastguard Worker         const float r = _mm_cvtss_f32(_mm_shuffle_ps(vlr, vlr, 1));
5186*4bdc9457SAndroid Build Coastguard Worker 
5187*4bdc9457SAndroid Build Coastguard Worker         *output++ = l + alphah * (r - l);
5188*4bdc9457SAndroid Build Coastguard Worker       }
5189*4bdc9457SAndroid Build Coastguard Worker     }
5190*4bdc9457SAndroid Build Coastguard Worker 
5191*4bdc9457SAndroid Build Coastguard Worker     input_offset += input_increment;
5192*4bdc9457SAndroid Build Coastguard Worker   } while (--channels != 0);
5193*4bdc9457SAndroid Build Coastguard Worker }
5194*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_ibilinear_ukernel__sse_c8(size_t output_pixels,size_t channels,const float ** restrict input,size_t input_offset,const float * restrict weights,float * restrict output,size_t output_increment)5195*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_ibilinear_ukernel__sse_c8(
5196*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
5197*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
5198*4bdc9457SAndroid Build Coastguard Worker     const float**restrict input,
5199*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
5200*4bdc9457SAndroid Build Coastguard Worker     const float*restrict weights,
5201*4bdc9457SAndroid Build Coastguard Worker     float*restrict output,
5202*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment) XNN_OOB_READS
5203*4bdc9457SAndroid Build Coastguard Worker {
5204*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
5205*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
5206*4bdc9457SAndroid Build Coastguard Worker   assert(channels % sizeof(float) == 0);
5207*4bdc9457SAndroid Build Coastguard Worker 
5208*4bdc9457SAndroid Build Coastguard Worker   do {
5209*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset);
5210*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset);
5211*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset);
5212*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset);
5213*4bdc9457SAndroid Build Coastguard Worker     input += 4;
5214*4bdc9457SAndroid Build Coastguard Worker 
5215*4bdc9457SAndroid Build Coastguard Worker     __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights);
5216*4bdc9457SAndroid Build Coastguard Worker     valphahv = _mm_unpacklo_ps(valphahv, valphahv);
5217*4bdc9457SAndroid Build Coastguard Worker     const __m128 valphah = _mm_movelh_ps(valphahv, valphahv);
5218*4bdc9457SAndroid Build Coastguard Worker     const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
5219*4bdc9457SAndroid Build Coastguard Worker     weights += 2;
5220*4bdc9457SAndroid Build Coastguard Worker 
5221*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
5222*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
5223*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl0123 = _mm_loadu_ps(i0);
5224*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr0123 = _mm_loadu_ps(i1);
5225*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbl0123 = _mm_loadu_ps(i2);
5226*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbr0123 = _mm_loadu_ps(i3);
5227*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl4567 = _mm_loadu_ps(i0 + 4);
5228*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr4567 = _mm_loadu_ps(i1 + 4);
5229*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbl4567 = _mm_loadu_ps(i2 + 4);
5230*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbr4567 = _mm_loadu_ps(i3 + 4);
5231*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
5232*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
5233*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
5234*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
5235*4bdc9457SAndroid Build Coastguard Worker 
5236*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5237*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5238*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtd4567 = _mm_sub_ps(vtr4567, vtl4567);
5239*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbd4567 = _mm_sub_ps(vbr4567, vbl4567);
5240*4bdc9457SAndroid Build Coastguard Worker 
5241*4bdc9457SAndroid Build Coastguard Worker       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5242*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5243*4bdc9457SAndroid Build Coastguard Worker       const __m128 vt4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vtd4567, valphah));
5244*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb4567 = _mm_add_ps(vbl4567, _mm_mul_ps(vbd4567, valphah));
5245*4bdc9457SAndroid Build Coastguard Worker 
5246*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5247*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd4567 = _mm_sub_ps(vb4567, vt4567);
5248*4bdc9457SAndroid Build Coastguard Worker 
5249*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5250*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo4567 = _mm_add_ps(vt4567, _mm_mul_ps(vd4567, valphav));
5251*4bdc9457SAndroid Build Coastguard Worker 
5252*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vo0123);
5253*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vo4567);
5254*4bdc9457SAndroid Build Coastguard Worker       output += 8;
5255*4bdc9457SAndroid Build Coastguard Worker     }
5256*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
5257*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl0123 = _mm_loadu_ps(i0);
5258*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr0123 = _mm_loadu_ps(i1);
5259*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbl0123 = _mm_loadu_ps(i2);
5260*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbr0123 = _mm_loadu_ps(i3);
5261*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
5262*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
5263*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
5264*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
5265*4bdc9457SAndroid Build Coastguard Worker 
5266*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5267*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5268*4bdc9457SAndroid Build Coastguard Worker 
5269*4bdc9457SAndroid Build Coastguard Worker       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5270*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5271*4bdc9457SAndroid Build Coastguard Worker 
5272*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5273*4bdc9457SAndroid Build Coastguard Worker 
5274*4bdc9457SAndroid Build Coastguard Worker       const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5275*4bdc9457SAndroid Build Coastguard Worker 
5276*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vo0123);
5277*4bdc9457SAndroid Build Coastguard Worker       output += 4;
5278*4bdc9457SAndroid Build Coastguard Worker     }
5279*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
5280*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtl0123 = _mm_loadu_ps(i0);
5281*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtr0123 = _mm_loadu_ps(i1);
5282*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbl0123 = _mm_loadu_ps(i2);
5283*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbr0123 = _mm_loadu_ps(i3);
5284*4bdc9457SAndroid Build Coastguard Worker 
5285*4bdc9457SAndroid Build Coastguard Worker       const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
5286*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
5287*4bdc9457SAndroid Build Coastguard Worker 
5288*4bdc9457SAndroid Build Coastguard Worker       const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
5289*4bdc9457SAndroid Build Coastguard Worker       const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
5290*4bdc9457SAndroid Build Coastguard Worker 
5291*4bdc9457SAndroid Build Coastguard Worker       const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
5292*4bdc9457SAndroid Build Coastguard Worker 
5293*4bdc9457SAndroid Build Coastguard Worker       __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
5294*4bdc9457SAndroid Build Coastguard Worker 
5295*4bdc9457SAndroid Build Coastguard Worker       if (c & (2 * sizeof(float))) {
5296*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vo0123);
5297*4bdc9457SAndroid Build Coastguard Worker         vo0123 = _mm_movehl_ps(vo0123, vo0123);
5298*4bdc9457SAndroid Build Coastguard Worker         output += 2;
5299*4bdc9457SAndroid Build Coastguard Worker       }
5300*4bdc9457SAndroid Build Coastguard Worker       if (c & (1 * sizeof(float))) {
5301*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vo0123);
5302*4bdc9457SAndroid Build Coastguard Worker         output += 1;
5303*4bdc9457SAndroid Build Coastguard Worker       }
5304*4bdc9457SAndroid Build Coastguard Worker     }
5305*4bdc9457SAndroid Build Coastguard Worker 
5306*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
5307*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
5308*4bdc9457SAndroid Build Coastguard Worker }
5309*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5310*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(
5311*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5312*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5313*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5314*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
5315*4bdc9457SAndroid Build Coastguard Worker     const float**restrict a,
5316*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w,
5317*4bdc9457SAndroid Build Coastguard Worker     float*restrict c,
5318*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5319*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5320*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
5321*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
5322*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5323*4bdc9457SAndroid Build Coastguard Worker {
5324*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5325*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
5326*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5327*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5328*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
5329*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
5330*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (1 * sizeof(void*)) == 0);
5331*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(float) == 0);
5332*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5333*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5334*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5335*4bdc9457SAndroid Build Coastguard Worker 
5336*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
5337*4bdc9457SAndroid Build Coastguard Worker 
5338*4bdc9457SAndroid Build Coastguard Worker   do {
5339*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0123 = _mm_load_ps(w);
5340*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x4567 = _mm_load_ps(w + 4);
5341*4bdc9457SAndroid Build Coastguard Worker     w += 8;
5342*4bdc9457SAndroid Build Coastguard Worker 
5343*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
5344*4bdc9457SAndroid Build Coastguard Worker     do {
5345*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a0 = a[0];
5346*4bdc9457SAndroid Build Coastguard Worker       assert(a0 != NULL);
5347*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
5348*4bdc9457SAndroid Build Coastguard Worker         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5349*4bdc9457SAndroid Build Coastguard Worker       }
5350*4bdc9457SAndroid Build Coastguard Worker       a += 1;
5351*4bdc9457SAndroid Build Coastguard Worker 
5352*4bdc9457SAndroid Build Coastguard Worker       size_t k = kc;
5353*4bdc9457SAndroid Build Coastguard Worker       do {
5354*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb0123 = _mm_load_ps(w);
5355*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb4567 = _mm_load_ps(w + 4);
5356*4bdc9457SAndroid Build Coastguard Worker         w += 8;
5357*4bdc9457SAndroid Build Coastguard Worker 
5358*4bdc9457SAndroid Build Coastguard Worker         const __m128 va0 = _mm_load1_ps(a0);
5359*4bdc9457SAndroid Build Coastguard Worker         a0 += 1;
5360*4bdc9457SAndroid Build Coastguard Worker 
5361*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5362*4bdc9457SAndroid Build Coastguard Worker         vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5363*4bdc9457SAndroid Build Coastguard Worker         k -= sizeof(float);
5364*4bdc9457SAndroid Build Coastguard Worker       } while (k != 0);
5365*4bdc9457SAndroid Build Coastguard Worker       p -= 1 * sizeof(void*);
5366*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
5367*4bdc9457SAndroid Build Coastguard Worker 
5368*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
5369*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5370*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5371*4bdc9457SAndroid Build Coastguard Worker 
5372*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
5373*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5374*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5375*4bdc9457SAndroid Build Coastguard Worker 
5376*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 8) {
5377*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0, vacc0x0123);
5378*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0 + 4, vacc0x4567);
5379*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5380*4bdc9457SAndroid Build Coastguard Worker 
5381*4bdc9457SAndroid Build Coastguard Worker       a = (const float**restrict) ((uintptr_t) a - ks);
5382*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5383*4bdc9457SAndroid Build Coastguard Worker     } else {
5384*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5385*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c0, vacc0x0123);
5386*4bdc9457SAndroid Build Coastguard Worker 
5387*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = vacc0x4567;
5388*4bdc9457SAndroid Build Coastguard Worker 
5389*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5390*4bdc9457SAndroid Build Coastguard Worker       }
5391*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5392*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c0, vacc0x0123);
5393*4bdc9457SAndroid Build Coastguard Worker 
5394*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5395*4bdc9457SAndroid Build Coastguard Worker 
5396*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5397*4bdc9457SAndroid Build Coastguard Worker       }
5398*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5399*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c0, vacc0x0123);
5400*4bdc9457SAndroid Build Coastguard Worker       }
5401*4bdc9457SAndroid Build Coastguard Worker 
5402*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5403*4bdc9457SAndroid Build Coastguard Worker     }
5404*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5405*4bdc9457SAndroid Build Coastguard Worker }
5406*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_igemm_minmax_ukernel_4x2c4__sse(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5407*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_4x2c4__sse(
5408*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5409*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5410*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5411*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
5412*4bdc9457SAndroid Build Coastguard Worker     const float**restrict a,
5413*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w,
5414*4bdc9457SAndroid Build Coastguard Worker     float*restrict c,
5415*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5416*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5417*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
5418*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
5419*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5420*4bdc9457SAndroid Build Coastguard Worker {
5421*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5422*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
5423*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5424*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5425*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
5426*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
5427*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (4 * sizeof(void*)) == 0);
5428*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(float) == 0);
5429*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5430*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5431*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5432*4bdc9457SAndroid Build Coastguard Worker 
5433*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
5434*4bdc9457SAndroid Build Coastguard Worker   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5435*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
5436*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
5437*4bdc9457SAndroid Build Coastguard Worker   }
5438*4bdc9457SAndroid Build Coastguard Worker   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5439*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
5440*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
5441*4bdc9457SAndroid Build Coastguard Worker   }
5442*4bdc9457SAndroid Build Coastguard Worker   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5443*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
5444*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
5445*4bdc9457SAndroid Build Coastguard Worker   }
5446*4bdc9457SAndroid Build Coastguard Worker 
5447*4bdc9457SAndroid Build Coastguard Worker   do {
5448*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0c4 = _mm_load_ss(w);
5449*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x1c4 = _mm_load_ss(w + 1);
5450*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x0c4 = vacc0x0c4;
5451*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x1c4 = vacc0x1c4;
5452*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x0c4 = vacc0x0c4;
5453*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x1c4 = vacc0x1c4;
5454*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x0c4 = vacc0x0c4;
5455*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x1c4 = vacc0x1c4;
5456*4bdc9457SAndroid Build Coastguard Worker     w += 2;
5457*4bdc9457SAndroid Build Coastguard Worker 
5458*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
5459*4bdc9457SAndroid Build Coastguard Worker     do {
5460*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a0 = a[0];
5461*4bdc9457SAndroid Build Coastguard Worker       assert(a0 != NULL);
5462*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
5463*4bdc9457SAndroid Build Coastguard Worker         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5464*4bdc9457SAndroid Build Coastguard Worker       }
5465*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a1 = a[1];
5466*4bdc9457SAndroid Build Coastguard Worker       assert(a1 != NULL);
5467*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
5468*4bdc9457SAndroid Build Coastguard Worker         a1 = (const float*) ((uintptr_t) a1 + a_offset);
5469*4bdc9457SAndroid Build Coastguard Worker       }
5470*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a2 = a[2];
5471*4bdc9457SAndroid Build Coastguard Worker       assert(a2 != NULL);
5472*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
5473*4bdc9457SAndroid Build Coastguard Worker         a2 = (const float*) ((uintptr_t) a2 + a_offset);
5474*4bdc9457SAndroid Build Coastguard Worker       }
5475*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a3 = a[3];
5476*4bdc9457SAndroid Build Coastguard Worker       assert(a3 != NULL);
5477*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a3 != zero) {
5478*4bdc9457SAndroid Build Coastguard Worker         a3 = (const float*) ((uintptr_t) a3 + a_offset);
5479*4bdc9457SAndroid Build Coastguard Worker       }
5480*4bdc9457SAndroid Build Coastguard Worker       a += 4;
5481*4bdc9457SAndroid Build Coastguard Worker 
5482*4bdc9457SAndroid Build Coastguard Worker       size_t k = kc;
5483*4bdc9457SAndroid Build Coastguard Worker       for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
5484*4bdc9457SAndroid Build Coastguard Worker         const __m128 va0 = _mm_loadu_ps(a0);
5485*4bdc9457SAndroid Build Coastguard Worker         a0 += 4;
5486*4bdc9457SAndroid Build Coastguard Worker         const __m128 va1 = _mm_loadu_ps(a1);
5487*4bdc9457SAndroid Build Coastguard Worker         a1 += 4;
5488*4bdc9457SAndroid Build Coastguard Worker         const __m128 va2 = _mm_loadu_ps(a2);
5489*4bdc9457SAndroid Build Coastguard Worker         a2 += 4;
5490*4bdc9457SAndroid Build Coastguard Worker         const __m128 va3 = _mm_loadu_ps(a3);
5491*4bdc9457SAndroid Build Coastguard Worker         a3 += 4;
5492*4bdc9457SAndroid Build Coastguard Worker 
5493*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb0 = _mm_loadu_ps(w);
5494*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb1 = _mm_loadu_ps(w + 4);
5495*4bdc9457SAndroid Build Coastguard Worker         w += 8;
5496*4bdc9457SAndroid Build Coastguard Worker 
5497*4bdc9457SAndroid Build Coastguard Worker         vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
5498*4bdc9457SAndroid Build Coastguard Worker         vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
5499*4bdc9457SAndroid Build Coastguard Worker         vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
5500*4bdc9457SAndroid Build Coastguard Worker         vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
5501*4bdc9457SAndroid Build Coastguard Worker         vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
5502*4bdc9457SAndroid Build Coastguard Worker         vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
5503*4bdc9457SAndroid Build Coastguard Worker         vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
5504*4bdc9457SAndroid Build Coastguard Worker         vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
5505*4bdc9457SAndroid Build Coastguard Worker       }
5506*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNLIKELY(k != 0) {
5507*4bdc9457SAndroid Build Coastguard Worker         const __m128 va0 = _mm_loadu_ps(a0);
5508*4bdc9457SAndroid Build Coastguard Worker         const __m128 va1 = _mm_loadu_ps(a1);
5509*4bdc9457SAndroid Build Coastguard Worker         const __m128 va2 = _mm_loadu_ps(a2);
5510*4bdc9457SAndroid Build Coastguard Worker         const __m128 va3 = _mm_loadu_ps(a3);
5511*4bdc9457SAndroid Build Coastguard Worker 
5512*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb0 = _mm_loadu_ps(w);
5513*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb1 = _mm_loadu_ps(w + 4);
5514*4bdc9457SAndroid Build Coastguard Worker         w += 8;
5515*4bdc9457SAndroid Build Coastguard Worker 
5516*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
5517*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
5518*4bdc9457SAndroid Build Coastguard Worker 
5519*4bdc9457SAndroid Build Coastguard Worker         vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
5520*4bdc9457SAndroid Build Coastguard Worker         vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
5521*4bdc9457SAndroid Build Coastguard Worker         vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
5522*4bdc9457SAndroid Build Coastguard Worker         vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
5523*4bdc9457SAndroid Build Coastguard Worker         vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
5524*4bdc9457SAndroid Build Coastguard Worker         vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
5525*4bdc9457SAndroid Build Coastguard Worker         vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
5526*4bdc9457SAndroid Build Coastguard Worker         vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
5527*4bdc9457SAndroid Build Coastguard Worker       }
5528*4bdc9457SAndroid Build Coastguard Worker       p -= 4 * sizeof(void*);
5529*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
5530*4bdc9457SAndroid Build Coastguard Worker 
5531*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
5532*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
5533*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
5534*4bdc9457SAndroid Build Coastguard Worker     const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
5535*4bdc9457SAndroid Build Coastguard Worker 
5536*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
5537*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
5538*4bdc9457SAndroid Build Coastguard Worker 
5539*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
5540*4bdc9457SAndroid Build Coastguard Worker     vacc01x01 = _mm_min_ps(vacc01x01, vmax);
5541*4bdc9457SAndroid Build Coastguard Worker     vacc23x01 = _mm_min_ps(vacc23x01, vmax);
5542*4bdc9457SAndroid Build Coastguard Worker 
5543*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
5544*4bdc9457SAndroid Build Coastguard Worker     vacc01x01 = _mm_max_ps(vacc01x01, vmin);
5545*4bdc9457SAndroid Build Coastguard Worker     vacc23x01 = _mm_max_ps(vacc23x01, vmin);
5546*4bdc9457SAndroid Build Coastguard Worker 
5547*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 2) {
5548*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c3, vacc23x01);
5549*4bdc9457SAndroid Build Coastguard Worker       c3 = (float*) ((uintptr_t) c3 + cn_stride);
5550*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) c2, vacc23x01);
5551*4bdc9457SAndroid Build Coastguard Worker       c2 = (float*) ((uintptr_t) c2 + cn_stride);
5552*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c1, vacc01x01);
5553*4bdc9457SAndroid Build Coastguard Worker       c1 = (float*) ((uintptr_t) c1 + cn_stride);
5554*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) c0, vacc01x01);
5555*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5556*4bdc9457SAndroid Build Coastguard Worker 
5557*4bdc9457SAndroid Build Coastguard Worker       a = (const float**restrict) ((uintptr_t) a - ks);
5558*4bdc9457SAndroid Build Coastguard Worker       nc -= 2;
5559*4bdc9457SAndroid Build Coastguard Worker     } else {
5560*4bdc9457SAndroid Build Coastguard Worker       assert(nc == 1);
5561*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
5562*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c2, vacc23x01);
5563*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
5564*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(c0, vacc01x01);
5565*4bdc9457SAndroid Build Coastguard Worker 
5566*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5567*4bdc9457SAndroid Build Coastguard Worker     }
5568*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5569*4bdc9457SAndroid Build Coastguard Worker }
5570*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5571*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(
5572*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5573*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5574*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5575*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
5576*4bdc9457SAndroid Build Coastguard Worker     const float**restrict a,
5577*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w,
5578*4bdc9457SAndroid Build Coastguard Worker     float*restrict c,
5579*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5580*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5581*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
5582*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
5583*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5584*4bdc9457SAndroid Build Coastguard Worker {
5585*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5586*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
5587*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5588*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5589*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(float) == 0);
5590*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
5591*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (4 * sizeof(void*)) == 0);
5592*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(float) == 0);
5593*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5594*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5595*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5596*4bdc9457SAndroid Build Coastguard Worker 
5597*4bdc9457SAndroid Build Coastguard Worker   float* c0 = c;
5598*4bdc9457SAndroid Build Coastguard Worker   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5599*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
5600*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
5601*4bdc9457SAndroid Build Coastguard Worker   }
5602*4bdc9457SAndroid Build Coastguard Worker   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
5603*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
5604*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
5605*4bdc9457SAndroid Build Coastguard Worker   }
5606*4bdc9457SAndroid Build Coastguard Worker   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
5607*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
5608*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
5609*4bdc9457SAndroid Build Coastguard Worker   }
5610*4bdc9457SAndroid Build Coastguard Worker 
5611*4bdc9457SAndroid Build Coastguard Worker   do {
5612*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x0123 = _mm_load_ps(w);
5613*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0x4567 = _mm_load_ps(w + 4);
5614*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x0123 = vacc0x0123;
5615*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc1x4567 = vacc0x4567;
5616*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x0123 = vacc0x0123;
5617*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc2x4567 = vacc0x4567;
5618*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x0123 = vacc0x0123;
5619*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc3x4567 = vacc0x4567;
5620*4bdc9457SAndroid Build Coastguard Worker     w += 8;
5621*4bdc9457SAndroid Build Coastguard Worker 
5622*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
5623*4bdc9457SAndroid Build Coastguard Worker     do {
5624*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a0 = a[0];
5625*4bdc9457SAndroid Build Coastguard Worker       assert(a0 != NULL);
5626*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
5627*4bdc9457SAndroid Build Coastguard Worker         a0 = (const float*) ((uintptr_t) a0 + a_offset);
5628*4bdc9457SAndroid Build Coastguard Worker       }
5629*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a1 = a[1];
5630*4bdc9457SAndroid Build Coastguard Worker       assert(a1 != NULL);
5631*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
5632*4bdc9457SAndroid Build Coastguard Worker         a1 = (const float*) ((uintptr_t) a1 + a_offset);
5633*4bdc9457SAndroid Build Coastguard Worker       }
5634*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a2 = a[2];
5635*4bdc9457SAndroid Build Coastguard Worker       assert(a2 != NULL);
5636*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
5637*4bdc9457SAndroid Build Coastguard Worker         a2 = (const float*) ((uintptr_t) a2 + a_offset);
5638*4bdc9457SAndroid Build Coastguard Worker       }
5639*4bdc9457SAndroid Build Coastguard Worker       const float* restrict a3 = a[3];
5640*4bdc9457SAndroid Build Coastguard Worker       assert(a3 != NULL);
5641*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a3 != zero) {
5642*4bdc9457SAndroid Build Coastguard Worker         a3 = (const float*) ((uintptr_t) a3 + a_offset);
5643*4bdc9457SAndroid Build Coastguard Worker       }
5644*4bdc9457SAndroid Build Coastguard Worker       a += 4;
5645*4bdc9457SAndroid Build Coastguard Worker 
5646*4bdc9457SAndroid Build Coastguard Worker       size_t k = kc;
5647*4bdc9457SAndroid Build Coastguard Worker       do {
5648*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb0123 = _mm_load_ps(w);
5649*4bdc9457SAndroid Build Coastguard Worker         const __m128 vb4567 = _mm_load_ps(w + 4);
5650*4bdc9457SAndroid Build Coastguard Worker         w += 8;
5651*4bdc9457SAndroid Build Coastguard Worker 
5652*4bdc9457SAndroid Build Coastguard Worker         const __m128 va0 = _mm_load1_ps(a0);
5653*4bdc9457SAndroid Build Coastguard Worker         a0 += 1;
5654*4bdc9457SAndroid Build Coastguard Worker         const __m128 va1 = _mm_load1_ps(a1);
5655*4bdc9457SAndroid Build Coastguard Worker         a1 += 1;
5656*4bdc9457SAndroid Build Coastguard Worker         const __m128 va2 = _mm_load1_ps(a2);
5657*4bdc9457SAndroid Build Coastguard Worker         a2 += 1;
5658*4bdc9457SAndroid Build Coastguard Worker         const __m128 va3 = _mm_load1_ps(a3);
5659*4bdc9457SAndroid Build Coastguard Worker         a3 += 1;
5660*4bdc9457SAndroid Build Coastguard Worker 
5661*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
5662*4bdc9457SAndroid Build Coastguard Worker         vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
5663*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
5664*4bdc9457SAndroid Build Coastguard Worker         vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
5665*4bdc9457SAndroid Build Coastguard Worker         vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
5666*4bdc9457SAndroid Build Coastguard Worker         vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
5667*4bdc9457SAndroid Build Coastguard Worker         vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
5668*4bdc9457SAndroid Build Coastguard Worker         vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
5669*4bdc9457SAndroid Build Coastguard Worker         k -= sizeof(float);
5670*4bdc9457SAndroid Build Coastguard Worker       } while (k != 0);
5671*4bdc9457SAndroid Build Coastguard Worker       p -= 4 * sizeof(void*);
5672*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
5673*4bdc9457SAndroid Build Coastguard Worker 
5674*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmax = _mm_load_ps(params->sse.max);
5675*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
5676*4bdc9457SAndroid Build Coastguard Worker     vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
5677*4bdc9457SAndroid Build Coastguard Worker     vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
5678*4bdc9457SAndroid Build Coastguard Worker     vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
5679*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
5680*4bdc9457SAndroid Build Coastguard Worker     vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
5681*4bdc9457SAndroid Build Coastguard Worker     vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
5682*4bdc9457SAndroid Build Coastguard Worker     vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
5683*4bdc9457SAndroid Build Coastguard Worker 
5684*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmin = _mm_load_ps(params->sse.min);
5685*4bdc9457SAndroid Build Coastguard Worker     vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
5686*4bdc9457SAndroid Build Coastguard Worker     vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
5687*4bdc9457SAndroid Build Coastguard Worker     vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
5688*4bdc9457SAndroid Build Coastguard Worker     vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
5689*4bdc9457SAndroid Build Coastguard Worker     vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
5690*4bdc9457SAndroid Build Coastguard Worker     vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
5691*4bdc9457SAndroid Build Coastguard Worker     vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
5692*4bdc9457SAndroid Build Coastguard Worker     vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
5693*4bdc9457SAndroid Build Coastguard Worker 
5694*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 8) {
5695*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c3, vacc3x0123);
5696*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c3 + 4, vacc3x4567);
5697*4bdc9457SAndroid Build Coastguard Worker       c3 = (float*) ((uintptr_t) c3 + cn_stride);
5698*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c2, vacc2x0123);
5699*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c2 + 4, vacc2x4567);
5700*4bdc9457SAndroid Build Coastguard Worker       c2 = (float*) ((uintptr_t) c2 + cn_stride);
5701*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c1, vacc1x0123);
5702*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c1 + 4, vacc1x4567);
5703*4bdc9457SAndroid Build Coastguard Worker       c1 = (float*) ((uintptr_t) c1 + cn_stride);
5704*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0, vacc0x0123);
5705*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(c0 + 4, vacc0x4567);
5706*4bdc9457SAndroid Build Coastguard Worker       c0 = (float*) ((uintptr_t) c0 + cn_stride);
5707*4bdc9457SAndroid Build Coastguard Worker 
5708*4bdc9457SAndroid Build Coastguard Worker       a = (const float**restrict) ((uintptr_t) a - ks);
5709*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5710*4bdc9457SAndroid Build Coastguard Worker     } else {
5711*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5712*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c3, vacc3x0123);
5713*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c2, vacc2x0123);
5714*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c1, vacc1x0123);
5715*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(c0, vacc0x0123);
5716*4bdc9457SAndroid Build Coastguard Worker 
5717*4bdc9457SAndroid Build Coastguard Worker         vacc3x0123 = vacc3x4567;
5718*4bdc9457SAndroid Build Coastguard Worker         vacc2x0123 = vacc2x4567;
5719*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = vacc1x4567;
5720*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = vacc0x4567;
5721*4bdc9457SAndroid Build Coastguard Worker 
5722*4bdc9457SAndroid Build Coastguard Worker         c3 += 4;
5723*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
5724*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
5725*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5726*4bdc9457SAndroid Build Coastguard Worker       }
5727*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5728*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c3, vacc3x0123);
5729*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c2, vacc2x0123);
5730*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c1, vacc1x0123);
5731*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) c0, vacc0x0123);
5732*4bdc9457SAndroid Build Coastguard Worker 
5733*4bdc9457SAndroid Build Coastguard Worker         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
5734*4bdc9457SAndroid Build Coastguard Worker         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
5735*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
5736*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
5737*4bdc9457SAndroid Build Coastguard Worker 
5738*4bdc9457SAndroid Build Coastguard Worker         c3 += 2;
5739*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
5740*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
5741*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5742*4bdc9457SAndroid Build Coastguard Worker       }
5743*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5744*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c3, vacc3x0123);
5745*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c2, vacc2x0123);
5746*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c1, vacc1x0123);
5747*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(c0, vacc0x0123);
5748*4bdc9457SAndroid Build Coastguard Worker       }
5749*4bdc9457SAndroid Build Coastguard Worker 
5750*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5751*4bdc9457SAndroid Build Coastguard Worker     }
5752*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5753*4bdc9457SAndroid Build Coastguard Worker }
5754*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(size_t output_pixels,size_t kernel_elements,size_t channels,const float ** input,size_t input_offset,float * output,size_t input_increment,size_t output_increment,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5755*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
5756*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
5757*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
5758*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
5759*4bdc9457SAndroid Build Coastguard Worker     const float** input,
5760*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
5761*4bdc9457SAndroid Build Coastguard Worker     float* output,
5762*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
5763*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
5764*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5765*4bdc9457SAndroid Build Coastguard Worker {
5766*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
5767*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements != 0);
5768*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
5769*4bdc9457SAndroid Build Coastguard Worker 
5770*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_max = _mm_load_ps(params->sse.max);
5771*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_min = _mm_load_ps(params->sse.min);
5772*4bdc9457SAndroid Build Coastguard Worker   do {
5773*4bdc9457SAndroid Build Coastguard Worker     float* o = output;
5774*4bdc9457SAndroid Build Coastguard Worker     {
5775*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
5776*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
5777*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
5778*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
5779*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
5780*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
5781*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
5782*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
5783*4bdc9457SAndroid Build Coastguard Worker       const float* i8 = *input++;
5784*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
5785*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
5786*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
5787*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
5788*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
5789*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
5790*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
5791*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
5792*4bdc9457SAndroid Build Coastguard Worker       i8 = (const float*) ((uintptr_t) i8 + input_offset);
5793*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements < 2) {
5794*4bdc9457SAndroid Build Coastguard Worker         i1 = i0;
5795*4bdc9457SAndroid Build Coastguard Worker       }
5796*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements <= 2) {
5797*4bdc9457SAndroid Build Coastguard Worker         i2 = i0;
5798*4bdc9457SAndroid Build Coastguard Worker       }
5799*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements < 4) {
5800*4bdc9457SAndroid Build Coastguard Worker         i3 = i0;
5801*4bdc9457SAndroid Build Coastguard Worker       }
5802*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements <= 4) {
5803*4bdc9457SAndroid Build Coastguard Worker         i4 = i0;
5804*4bdc9457SAndroid Build Coastguard Worker       }
5805*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements < 6) {
5806*4bdc9457SAndroid Build Coastguard Worker         i5 = i0;
5807*4bdc9457SAndroid Build Coastguard Worker       }
5808*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements <= 6) {
5809*4bdc9457SAndroid Build Coastguard Worker         i6 = i0;
5810*4bdc9457SAndroid Build Coastguard Worker       }
5811*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements < 8) {
5812*4bdc9457SAndroid Build Coastguard Worker         i7 = i0;
5813*4bdc9457SAndroid Build Coastguard Worker       }
5814*4bdc9457SAndroid Build Coastguard Worker       if (kernel_elements <= 8) {
5815*4bdc9457SAndroid Build Coastguard Worker         i8 = i0;
5816*4bdc9457SAndroid Build Coastguard Worker       }
5817*4bdc9457SAndroid Build Coastguard Worker 
5818*4bdc9457SAndroid Build Coastguard Worker       size_t c = channels;
5819*4bdc9457SAndroid Build Coastguard Worker       for (; c >= 4; c -= 4) {
5820*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
5821*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
5822*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
5823*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
5824*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
5825*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
5826*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
5827*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
5828*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
5829*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
5830*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
5831*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
5832*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
5833*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
5834*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
5835*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
5836*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi8 = _mm_loadu_ps(i8);
5837*4bdc9457SAndroid Build Coastguard Worker         i8 += 4;
5838*4bdc9457SAndroid Build Coastguard Worker 
5839*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5840*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5841*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5842*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5843*4bdc9457SAndroid Build Coastguard Worker 
5844*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5845*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5846*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5847*4bdc9457SAndroid Build Coastguard Worker         const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5848*4bdc9457SAndroid Build Coastguard Worker 
5849*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o, vout);
5850*4bdc9457SAndroid Build Coastguard Worker         o += 4;
5851*4bdc9457SAndroid Build Coastguard Worker       }
5852*4bdc9457SAndroid Build Coastguard Worker       if (c != 0) {
5853*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
5854*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
5855*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
5856*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
5857*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
5858*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
5859*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
5860*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
5861*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
5862*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
5863*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
5864*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
5865*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
5866*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
5867*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
5868*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
5869*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi8 = _mm_loadu_ps(i8);
5870*4bdc9457SAndroid Build Coastguard Worker         i8 += 4;
5871*4bdc9457SAndroid Build Coastguard Worker 
5872*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
5873*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5874*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5875*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5876*4bdc9457SAndroid Build Coastguard Worker 
5877*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5878*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
5879*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
5880*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5881*4bdc9457SAndroid Build Coastguard Worker 
5882*4bdc9457SAndroid Build Coastguard Worker         if (c & 2) {
5883*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o, vout);
5884*4bdc9457SAndroid Build Coastguard Worker           o += 2;
5885*4bdc9457SAndroid Build Coastguard Worker           vout = _mm_movehl_ps(vout, vout);
5886*4bdc9457SAndroid Build Coastguard Worker         }
5887*4bdc9457SAndroid Build Coastguard Worker         if (c & 1) {
5888*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o, vout);
5889*4bdc9457SAndroid Build Coastguard Worker           o += 1;
5890*4bdc9457SAndroid Build Coastguard Worker         }
5891*4bdc9457SAndroid Build Coastguard Worker       }
5892*4bdc9457SAndroid Build Coastguard Worker     }
5893*4bdc9457SAndroid Build Coastguard Worker 
5894*4bdc9457SAndroid Build Coastguard Worker     for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
5895*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
5896*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
5897*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
5898*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
5899*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
5900*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
5901*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
5902*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
5903*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
5904*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
5905*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
5906*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
5907*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
5908*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
5909*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
5910*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
5911*4bdc9457SAndroid Build Coastguard Worker       if (k < 2) {
5912*4bdc9457SAndroid Build Coastguard Worker         i1 = i0;
5913*4bdc9457SAndroid Build Coastguard Worker       }
5914*4bdc9457SAndroid Build Coastguard Worker       if (k <= 2) {
5915*4bdc9457SAndroid Build Coastguard Worker         i2 = i0;
5916*4bdc9457SAndroid Build Coastguard Worker       }
5917*4bdc9457SAndroid Build Coastguard Worker       if (k < 4) {
5918*4bdc9457SAndroid Build Coastguard Worker         i3 = i0;
5919*4bdc9457SAndroid Build Coastguard Worker       }
5920*4bdc9457SAndroid Build Coastguard Worker       if (k <= 4) {
5921*4bdc9457SAndroid Build Coastguard Worker         i4 = i0;
5922*4bdc9457SAndroid Build Coastguard Worker       }
5923*4bdc9457SAndroid Build Coastguard Worker       if (k < 6) {
5924*4bdc9457SAndroid Build Coastguard Worker         i5 = i0;
5925*4bdc9457SAndroid Build Coastguard Worker       }
5926*4bdc9457SAndroid Build Coastguard Worker       if (k <= 6) {
5927*4bdc9457SAndroid Build Coastguard Worker         i6 = i0;
5928*4bdc9457SAndroid Build Coastguard Worker       }
5929*4bdc9457SAndroid Build Coastguard Worker       if (k < 8) {
5930*4bdc9457SAndroid Build Coastguard Worker         i7 = i0;
5931*4bdc9457SAndroid Build Coastguard Worker       }
5932*4bdc9457SAndroid Build Coastguard Worker 
5933*4bdc9457SAndroid Build Coastguard Worker       o = output;
5934*4bdc9457SAndroid Build Coastguard Worker       size_t c = channels;
5935*4bdc9457SAndroid Build Coastguard Worker       for (; c >= 4; c -= 4) {
5936*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
5937*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
5938*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
5939*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
5940*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
5941*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
5942*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
5943*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
5944*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
5945*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
5946*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
5947*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
5948*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
5949*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
5950*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
5951*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
5952*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo = _mm_loadu_ps(o);
5953*4bdc9457SAndroid Build Coastguard Worker 
5954*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5955*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5956*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5957*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5958*4bdc9457SAndroid Build Coastguard Worker 
5959*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5960*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5961*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5962*4bdc9457SAndroid Build Coastguard Worker         const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5963*4bdc9457SAndroid Build Coastguard Worker 
5964*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(o, vout);
5965*4bdc9457SAndroid Build Coastguard Worker         o += 4;
5966*4bdc9457SAndroid Build Coastguard Worker       }
5967*4bdc9457SAndroid Build Coastguard Worker       if (c != 0) {
5968*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
5969*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
5970*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
5971*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
5972*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
5973*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
5974*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
5975*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
5976*4bdc9457SAndroid Build Coastguard Worker         const __m128 vo = _mm_loadu_ps(o);
5977*4bdc9457SAndroid Build Coastguard Worker 
5978*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
5979*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax23 = _mm_max_ps(vi2, vi3);
5980*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax45 = _mm_max_ps(vi4, vi5);
5981*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax67 = _mm_max_ps(vi6, vi7);
5982*4bdc9457SAndroid Build Coastguard Worker 
5983*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
5984*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
5985*4bdc9457SAndroid Build Coastguard Worker         const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
5986*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
5987*4bdc9457SAndroid Build Coastguard Worker 
5988*4bdc9457SAndroid Build Coastguard Worker         if (c & 2) {
5989*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) o, vout);
5990*4bdc9457SAndroid Build Coastguard Worker           o += 2;
5991*4bdc9457SAndroid Build Coastguard Worker           vout = _mm_movehl_ps(vout, vout);
5992*4bdc9457SAndroid Build Coastguard Worker         }
5993*4bdc9457SAndroid Build Coastguard Worker         if (c & 1) {
5994*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(o, vout);
5995*4bdc9457SAndroid Build Coastguard Worker           o += 1;
5996*4bdc9457SAndroid Build Coastguard Worker         }
5997*4bdc9457SAndroid Build Coastguard Worker       }
5998*4bdc9457SAndroid Build Coastguard Worker     }
5999*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_increment);
6000*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) o + output_increment);
6001*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
6002*4bdc9457SAndroid Build Coastguard Worker }
6003*4bdc9457SAndroid Build Coastguard Worker 
6004*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4(
6005*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
6006*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
6007*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
6008*4bdc9457SAndroid Build Coastguard Worker     const float** input,
6009*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
6010*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
6011*4bdc9457SAndroid Build Coastguard Worker     const float* multiplier,
6012*4bdc9457SAndroid Build Coastguard Worker     float* buffer,
6013*4bdc9457SAndroid Build Coastguard Worker     float* output,
6014*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
6015*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
6016*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6017*4bdc9457SAndroid Build Coastguard Worker {
6018*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
6019*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements > 9);
6020*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
6021*4bdc9457SAndroid Build Coastguard Worker 
6022*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_min = _mm_load_ps(params->sse.min);
6023*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_max = _mm_load_ps(params->sse.max);
6024*4bdc9457SAndroid Build Coastguard Worker 
6025*4bdc9457SAndroid Build Coastguard Worker   do {
6026*4bdc9457SAndroid Build Coastguard Worker     {
6027*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
6028*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
6029*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
6030*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6031*4bdc9457SAndroid Build Coastguard Worker       }
6032*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
6033*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
6034*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
6035*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6036*4bdc9457SAndroid Build Coastguard Worker       }
6037*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
6038*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
6039*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
6040*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6041*4bdc9457SAndroid Build Coastguard Worker       }
6042*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
6043*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
6044*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
6045*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6046*4bdc9457SAndroid Build Coastguard Worker       }
6047*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
6048*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
6049*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
6050*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6051*4bdc9457SAndroid Build Coastguard Worker       }
6052*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
6053*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
6054*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
6055*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6056*4bdc9457SAndroid Build Coastguard Worker       }
6057*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
6058*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
6059*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
6060*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6061*4bdc9457SAndroid Build Coastguard Worker       }
6062*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
6063*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
6064*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
6065*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6066*4bdc9457SAndroid Build Coastguard Worker       }
6067*4bdc9457SAndroid Build Coastguard Worker       const float* i8 = *input++;
6068*4bdc9457SAndroid Build Coastguard Worker       assert(i8 != NULL);
6069*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i8 != zero) {
6070*4bdc9457SAndroid Build Coastguard Worker         i8 = (const float*) ((uintptr_t) i8 + input_offset);
6071*4bdc9457SAndroid Build Coastguard Worker       }
6072*4bdc9457SAndroid Build Coastguard Worker 
6073*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
6074*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 4) {
6075*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
6076*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
6077*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
6078*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
6079*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
6080*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
6081*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
6082*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
6083*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
6084*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
6085*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
6086*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
6087*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
6088*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
6089*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
6090*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
6091*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi8 = _mm_loadu_ps(i8);
6092*4bdc9457SAndroid Build Coastguard Worker         i8 += 4;
6093*4bdc9457SAndroid Build Coastguard Worker 
6094*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6095*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6096*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6097*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6098*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6099*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6100*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6101*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6102*4bdc9457SAndroid Build Coastguard Worker 
6103*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ps(b, vsum); b += 4;
6104*4bdc9457SAndroid Build Coastguard Worker       }
6105*4bdc9457SAndroid Build Coastguard Worker     }
6106*4bdc9457SAndroid Build Coastguard Worker 
6107*4bdc9457SAndroid Build Coastguard Worker     size_t k = kernel_elements;
6108*4bdc9457SAndroid Build Coastguard Worker     for (k -= 9; k > 8; k -= 8) {
6109*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = *input++;
6110*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
6111*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
6112*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6113*4bdc9457SAndroid Build Coastguard Worker       }
6114*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = *input++;
6115*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
6116*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
6117*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6118*4bdc9457SAndroid Build Coastguard Worker       }
6119*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = *input++;
6120*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
6121*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
6122*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6123*4bdc9457SAndroid Build Coastguard Worker       }
6124*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = *input++;
6125*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
6126*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
6127*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6128*4bdc9457SAndroid Build Coastguard Worker       }
6129*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = *input++;
6130*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
6131*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
6132*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6133*4bdc9457SAndroid Build Coastguard Worker       }
6134*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = *input++;
6135*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
6136*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
6137*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6138*4bdc9457SAndroid Build Coastguard Worker       }
6139*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = *input++;
6140*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
6141*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
6142*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6143*4bdc9457SAndroid Build Coastguard Worker       }
6144*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = *input++;
6145*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
6146*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
6147*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6148*4bdc9457SAndroid Build Coastguard Worker       }
6149*4bdc9457SAndroid Build Coastguard Worker 
6150*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
6151*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 4) {
6152*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
6153*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
6154*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
6155*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
6156*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
6157*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
6158*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
6159*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
6160*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
6161*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
6162*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
6163*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
6164*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
6165*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
6166*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
6167*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
6168*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
6169*4bdc9457SAndroid Build Coastguard Worker 
6170*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6171*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6172*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6173*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6174*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6175*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6176*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6177*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6178*4bdc9457SAndroid Build Coastguard Worker 
6179*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ps(b, vsum); b += 4;
6180*4bdc9457SAndroid Build Coastguard Worker       }
6181*4bdc9457SAndroid Build Coastguard Worker     }
6182*4bdc9457SAndroid Build Coastguard Worker 
6183*4bdc9457SAndroid Build Coastguard Worker     {
6184*4bdc9457SAndroid Build Coastguard Worker       const float* i0 = input[0];
6185*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
6186*4bdc9457SAndroid Build Coastguard Worker       const float* i1 = input[1];
6187*4bdc9457SAndroid Build Coastguard Worker       const float* i2 = input[2];
6188*4bdc9457SAndroid Build Coastguard Worker       const float* i3 = input[3];
6189*4bdc9457SAndroid Build Coastguard Worker       const float* i4 = input[4];
6190*4bdc9457SAndroid Build Coastguard Worker       const float* i5 = input[5];
6191*4bdc9457SAndroid Build Coastguard Worker       const float* i6 = input[6];
6192*4bdc9457SAndroid Build Coastguard Worker       const float* i7 = input[7];
6193*4bdc9457SAndroid Build Coastguard Worker       input = (const float**) ((uintptr_t) input + input_increment);
6194*4bdc9457SAndroid Build Coastguard Worker       if (k < 2) {
6195*4bdc9457SAndroid Build Coastguard Worker         i1 = zero;
6196*4bdc9457SAndroid Build Coastguard Worker       }
6197*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
6198*4bdc9457SAndroid Build Coastguard Worker       if (k <= 2) {
6199*4bdc9457SAndroid Build Coastguard Worker         i2 = zero;
6200*4bdc9457SAndroid Build Coastguard Worker       }
6201*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
6202*4bdc9457SAndroid Build Coastguard Worker       if (k < 4) {
6203*4bdc9457SAndroid Build Coastguard Worker         i3 = zero;
6204*4bdc9457SAndroid Build Coastguard Worker       }
6205*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
6206*4bdc9457SAndroid Build Coastguard Worker       if (k <= 4) {
6207*4bdc9457SAndroid Build Coastguard Worker         i4 = zero;
6208*4bdc9457SAndroid Build Coastguard Worker       }
6209*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
6210*4bdc9457SAndroid Build Coastguard Worker       if (k < 6) {
6211*4bdc9457SAndroid Build Coastguard Worker         i5 = zero;
6212*4bdc9457SAndroid Build Coastguard Worker       }
6213*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
6214*4bdc9457SAndroid Build Coastguard Worker       if (k <= 6) {
6215*4bdc9457SAndroid Build Coastguard Worker         i6 = zero;
6216*4bdc9457SAndroid Build Coastguard Worker       }
6217*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
6218*4bdc9457SAndroid Build Coastguard Worker       if (k < 8) {
6219*4bdc9457SAndroid Build Coastguard Worker         i7 = zero;
6220*4bdc9457SAndroid Build Coastguard Worker       }
6221*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
6222*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
6223*4bdc9457SAndroid Build Coastguard Worker         i0 = (const float*) ((uintptr_t) i0 + input_offset);
6224*4bdc9457SAndroid Build Coastguard Worker       }
6225*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
6226*4bdc9457SAndroid Build Coastguard Worker         i1 = (const float*) ((uintptr_t) i1 + input_offset);
6227*4bdc9457SAndroid Build Coastguard Worker       }
6228*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
6229*4bdc9457SAndroid Build Coastguard Worker         i2 = (const float*) ((uintptr_t) i2 + input_offset);
6230*4bdc9457SAndroid Build Coastguard Worker       }
6231*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
6232*4bdc9457SAndroid Build Coastguard Worker         i3 = (const float*) ((uintptr_t) i3 + input_offset);
6233*4bdc9457SAndroid Build Coastguard Worker       }
6234*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
6235*4bdc9457SAndroid Build Coastguard Worker         i4 = (const float*) ((uintptr_t) i4 + input_offset);
6236*4bdc9457SAndroid Build Coastguard Worker       }
6237*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
6238*4bdc9457SAndroid Build Coastguard Worker         i5 = (const float*) ((uintptr_t) i5 + input_offset);
6239*4bdc9457SAndroid Build Coastguard Worker       }
6240*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
6241*4bdc9457SAndroid Build Coastguard Worker         i6 = (const float*) ((uintptr_t) i6 + input_offset);
6242*4bdc9457SAndroid Build Coastguard Worker       }
6243*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
6244*4bdc9457SAndroid Build Coastguard Worker         i7 = (const float*) ((uintptr_t) i7 + input_offset);
6245*4bdc9457SAndroid Build Coastguard Worker       }
6246*4bdc9457SAndroid Build Coastguard Worker 
6247*4bdc9457SAndroid Build Coastguard Worker       const __m128 vmultiplier = _mm_load1_ps(multiplier);
6248*4bdc9457SAndroid Build Coastguard Worker       multiplier += 1;
6249*4bdc9457SAndroid Build Coastguard Worker 
6250*4bdc9457SAndroid Build Coastguard Worker       size_t c = channels;
6251*4bdc9457SAndroid Build Coastguard Worker       float* b = buffer;
6252*4bdc9457SAndroid Build Coastguard Worker       while (c >= 4) {
6253*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
6254*4bdc9457SAndroid Build Coastguard Worker         i0 += 4;
6255*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
6256*4bdc9457SAndroid Build Coastguard Worker         i1 += 4;
6257*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
6258*4bdc9457SAndroid Build Coastguard Worker         i2 += 4;
6259*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
6260*4bdc9457SAndroid Build Coastguard Worker         i3 += 4;
6261*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
6262*4bdc9457SAndroid Build Coastguard Worker         i4 += 4;
6263*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
6264*4bdc9457SAndroid Build Coastguard Worker         i5 += 4;
6265*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
6266*4bdc9457SAndroid Build Coastguard Worker         i6 += 4;
6267*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
6268*4bdc9457SAndroid Build Coastguard Worker         i7 += 4;
6269*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
6270*4bdc9457SAndroid Build Coastguard Worker         b += 4;
6271*4bdc9457SAndroid Build Coastguard Worker 
6272*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6273*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6274*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6275*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6276*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6277*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6278*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6279*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6280*4bdc9457SAndroid Build Coastguard Worker 
6281*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6282*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_max_ps(vout, voutput_min);
6283*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_min_ps(vout, voutput_max);
6284*4bdc9457SAndroid Build Coastguard Worker 
6285*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output, vout);
6286*4bdc9457SAndroid Build Coastguard Worker         output += 4;
6287*4bdc9457SAndroid Build Coastguard Worker 
6288*4bdc9457SAndroid Build Coastguard Worker         c -= 4;
6289*4bdc9457SAndroid Build Coastguard Worker       }
6290*4bdc9457SAndroid Build Coastguard Worker       if (c != 0) {
6291*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi0 = _mm_loadu_ps(i0);
6292*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi1 = _mm_loadu_ps(i1);
6293*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi2 = _mm_loadu_ps(i2);
6294*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi3 = _mm_loadu_ps(i3);
6295*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi4 = _mm_loadu_ps(i4);
6296*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi5 = _mm_loadu_ps(i5);
6297*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi6 = _mm_loadu_ps(i6);
6298*4bdc9457SAndroid Build Coastguard Worker         const __m128 vi7 = _mm_loadu_ps(i7);
6299*4bdc9457SAndroid Build Coastguard Worker         const __m128 vacc = _mm_load_ps(b);
6300*4bdc9457SAndroid Build Coastguard Worker 
6301*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6302*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6303*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6304*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6305*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
6306*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6307*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
6308*4bdc9457SAndroid Build Coastguard Worker         const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
6309*4bdc9457SAndroid Build Coastguard Worker 
6310*4bdc9457SAndroid Build Coastguard Worker         __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6311*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_max_ps(vout, voutput_min);
6312*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_min_ps(vout, voutput_max);
6313*4bdc9457SAndroid Build Coastguard Worker 
6314*4bdc9457SAndroid Build Coastguard Worker         if (c & 2) {
6315*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_pi((__m64*) output, vout);
6316*4bdc9457SAndroid Build Coastguard Worker           vout = _mm_movehl_ps(vout, vout);
6317*4bdc9457SAndroid Build Coastguard Worker           output += 2;
6318*4bdc9457SAndroid Build Coastguard Worker         }
6319*4bdc9457SAndroid Build Coastguard Worker         if (c & 1) {
6320*4bdc9457SAndroid Build Coastguard Worker           _mm_store_ss(output, vout);
6321*4bdc9457SAndroid Build Coastguard Worker           output += 1;
6322*4bdc9457SAndroid Build Coastguard Worker         }
6323*4bdc9457SAndroid Build Coastguard Worker       }
6324*4bdc9457SAndroid Build Coastguard Worker     }
6325*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
6326*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
6327*4bdc9457SAndroid Build Coastguard Worker }
6328*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(size_t output_pixels,size_t kernel_elements,size_t channels,const float ** input,size_t input_offset,const float * zero,const float * multiplier,float * output,size_t input_increment,size_t output_increment,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6329*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(
6330*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
6331*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
6332*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
6333*4bdc9457SAndroid Build Coastguard Worker     const float** input,
6334*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
6335*4bdc9457SAndroid Build Coastguard Worker     const float* zero,
6336*4bdc9457SAndroid Build Coastguard Worker     const float* multiplier,
6337*4bdc9457SAndroid Build Coastguard Worker     float* output,
6338*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
6339*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
6340*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6341*4bdc9457SAndroid Build Coastguard Worker {
6342*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
6343*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements != 0);
6344*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements <= 9);
6345*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
6346*4bdc9457SAndroid Build Coastguard Worker 
6347*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_min = _mm_load_ps(params->sse.min);
6348*4bdc9457SAndroid Build Coastguard Worker   const __m128 voutput_max = _mm_load_ps(params->sse.max);
6349*4bdc9457SAndroid Build Coastguard Worker 
6350*4bdc9457SAndroid Build Coastguard Worker   do {
6351*4bdc9457SAndroid Build Coastguard Worker     const float* i0 = input[0];
6352*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
6353*4bdc9457SAndroid Build Coastguard Worker     const float* i1 = input[1];
6354*4bdc9457SAndroid Build Coastguard Worker     const float* i2 = input[2];
6355*4bdc9457SAndroid Build Coastguard Worker     const float* i3 = input[3];
6356*4bdc9457SAndroid Build Coastguard Worker     const float* i4 = input[4];
6357*4bdc9457SAndroid Build Coastguard Worker     const float* i5 = input[5];
6358*4bdc9457SAndroid Build Coastguard Worker     const float* i6 = input[6];
6359*4bdc9457SAndroid Build Coastguard Worker     const float* i7 = input[7];
6360*4bdc9457SAndroid Build Coastguard Worker     const float* i8 = input[8];
6361*4bdc9457SAndroid Build Coastguard Worker     input = (const float**) ((uintptr_t) input + input_increment);
6362*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 2) {
6363*4bdc9457SAndroid Build Coastguard Worker       i1 = zero;
6364*4bdc9457SAndroid Build Coastguard Worker     }
6365*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
6366*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 2) {
6367*4bdc9457SAndroid Build Coastguard Worker       i2 = zero;
6368*4bdc9457SAndroid Build Coastguard Worker     }
6369*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
6370*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 4) {
6371*4bdc9457SAndroid Build Coastguard Worker       i3 = zero;
6372*4bdc9457SAndroid Build Coastguard Worker     }
6373*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
6374*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 4) {
6375*4bdc9457SAndroid Build Coastguard Worker       i4 = zero;
6376*4bdc9457SAndroid Build Coastguard Worker     }
6377*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
6378*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 6) {
6379*4bdc9457SAndroid Build Coastguard Worker       i5 = zero;
6380*4bdc9457SAndroid Build Coastguard Worker     }
6381*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
6382*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 6) {
6383*4bdc9457SAndroid Build Coastguard Worker       i6 = zero;
6384*4bdc9457SAndroid Build Coastguard Worker     }
6385*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
6386*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 8) {
6387*4bdc9457SAndroid Build Coastguard Worker       i7 = zero;
6388*4bdc9457SAndroid Build Coastguard Worker     }
6389*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
6390*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 8) {
6391*4bdc9457SAndroid Build Coastguard Worker       i8 = zero;
6392*4bdc9457SAndroid Build Coastguard Worker     }
6393*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
6394*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
6395*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
6396*4bdc9457SAndroid Build Coastguard Worker     }
6397*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
6398*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
6399*4bdc9457SAndroid Build Coastguard Worker     }
6400*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
6401*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
6402*4bdc9457SAndroid Build Coastguard Worker     }
6403*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
6404*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
6405*4bdc9457SAndroid Build Coastguard Worker     }
6406*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
6407*4bdc9457SAndroid Build Coastguard Worker       i4 = (const float*) ((uintptr_t) i4 + input_offset);
6408*4bdc9457SAndroid Build Coastguard Worker     }
6409*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
6410*4bdc9457SAndroid Build Coastguard Worker       i5 = (const float*) ((uintptr_t) i5 + input_offset);
6411*4bdc9457SAndroid Build Coastguard Worker     }
6412*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
6413*4bdc9457SAndroid Build Coastguard Worker       i6 = (const float*) ((uintptr_t) i6 + input_offset);
6414*4bdc9457SAndroid Build Coastguard Worker     }
6415*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
6416*4bdc9457SAndroid Build Coastguard Worker       i7 = (const float*) ((uintptr_t) i7 + input_offset);
6417*4bdc9457SAndroid Build Coastguard Worker     }
6418*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
6419*4bdc9457SAndroid Build Coastguard Worker       i8 = (const float*) ((uintptr_t) i8 + input_offset);
6420*4bdc9457SAndroid Build Coastguard Worker     }
6421*4bdc9457SAndroid Build Coastguard Worker 
6422*4bdc9457SAndroid Build Coastguard Worker     const __m128 vmultiplier = _mm_load1_ps(multiplier);
6423*4bdc9457SAndroid Build Coastguard Worker     multiplier += 1;
6424*4bdc9457SAndroid Build Coastguard Worker 
6425*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
6426*4bdc9457SAndroid Build Coastguard Worker     while (c >= 4) {
6427*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
6428*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
6429*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
6430*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
6431*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
6432*4bdc9457SAndroid Build Coastguard Worker       i2 += 4;
6433*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
6434*4bdc9457SAndroid Build Coastguard Worker       i3 += 4;
6435*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4 = _mm_loadu_ps(i4);
6436*4bdc9457SAndroid Build Coastguard Worker       i4 += 4;
6437*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5 = _mm_loadu_ps(i5);
6438*4bdc9457SAndroid Build Coastguard Worker       i5 += 4;
6439*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6 = _mm_loadu_ps(i6);
6440*4bdc9457SAndroid Build Coastguard Worker       i6 += 4;
6441*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7 = _mm_loadu_ps(i7);
6442*4bdc9457SAndroid Build Coastguard Worker       i7 += 4;
6443*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8 = _mm_loadu_ps(i8);
6444*4bdc9457SAndroid Build Coastguard Worker       i8 += 4;
6445*4bdc9457SAndroid Build Coastguard Worker 
6446*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
6447*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6448*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6449*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6450*4bdc9457SAndroid Build Coastguard Worker 
6451*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6452*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6453*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6454*4bdc9457SAndroid Build Coastguard Worker 
6455*4bdc9457SAndroid Build Coastguard Worker       __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6456*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_max_ps(vout, voutput_min);
6457*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_min_ps(vout, voutput_max);
6458*4bdc9457SAndroid Build Coastguard Worker 
6459*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vout); output += 4;
6460*4bdc9457SAndroid Build Coastguard Worker 
6461*4bdc9457SAndroid Build Coastguard Worker       c -= 4;
6462*4bdc9457SAndroid Build Coastguard Worker     }
6463*4bdc9457SAndroid Build Coastguard Worker     if (c != 0) {
6464*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi0 = _mm_loadu_ps(i0);
6465*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi1 = _mm_loadu_ps(i1);
6466*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi2 = _mm_loadu_ps(i2);
6467*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi3 = _mm_loadu_ps(i3);
6468*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi4 = _mm_loadu_ps(i4);
6469*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi5 = _mm_loadu_ps(i5);
6470*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi6 = _mm_loadu_ps(i6);
6471*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi7 = _mm_loadu_ps(i7);
6472*4bdc9457SAndroid Build Coastguard Worker       const __m128 vi8 = _mm_loadu_ps(i8);
6473*4bdc9457SAndroid Build Coastguard Worker 
6474*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01 = _mm_add_ps(vi0, vi1);
6475*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum23 = _mm_add_ps(vi2, vi3);
6476*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum45 = _mm_add_ps(vi4, vi5);
6477*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum67 = _mm_add_ps(vi6, vi7);
6478*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
6479*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
6480*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
6481*4bdc9457SAndroid Build Coastguard Worker       const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
6482*4bdc9457SAndroid Build Coastguard Worker 
6483*4bdc9457SAndroid Build Coastguard Worker       __m128 vout = _mm_mul_ps(vsum, vmultiplier);
6484*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_max_ps(vout, voutput_min);
6485*4bdc9457SAndroid Build Coastguard Worker       vout = _mm_min_ps(vout, voutput_max);
6486*4bdc9457SAndroid Build Coastguard Worker 
6487*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
6488*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vout);
6489*4bdc9457SAndroid Build Coastguard Worker         vout = _mm_movehl_ps(vout, vout);
6490*4bdc9457SAndroid Build Coastguard Worker         output += 2;
6491*4bdc9457SAndroid Build Coastguard Worker       }
6492*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
6493*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vout);
6494*4bdc9457SAndroid Build Coastguard Worker         output += 1;
6495*4bdc9457SAndroid Build Coastguard Worker       }
6496*4bdc9457SAndroid Build Coastguard Worker     }
6497*4bdc9457SAndroid Build Coastguard Worker     output = (float*) ((uintptr_t) output + output_increment);
6498*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
6499*4bdc9457SAndroid Build Coastguard Worker }
6500*4bdc9457SAndroid Build Coastguard Worker 
// Reduces an array of floats to its maximum value using SSE.
//
// Parameters:
//   n - size of the input in BYTES (not elements); asserted to be non-zero
//       and a multiple of sizeof(float).
//   x - input elements. Only unaligned/scalar loads are used, and no load
//       extends past x + n bytes.
//   y - receives the single maximum value.
void xnn_f32_rmax_ukernel__sse(
    size_t n,
    const float* x,
    float* y)
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  // Seed every accumulator lane with x[0] so that lanes which never see
  // further data still hold a value taken from the input.
  __m128 vmax0 = _mm_load_ss(x);
  vmax0 = _mm_shuffle_ps(vmax0, vmax0, _MM_SHUFFLE(0, 0, 0, 0));
  __m128 vmax1 = vmax0;
  __m128 vmax2 = vmax0;
  __m128 vmax3 = vmax0;
  // Main loop: 16 floats (64 bytes) per iteration into four independent accumulators.
  for (; n >= 64; n -= 64) {
    const __m128 vx0 = _mm_loadu_ps(x);
    const __m128 vx1 = _mm_loadu_ps(x + 4);
    const __m128 vx2 = _mm_loadu_ps(x + 8);
    const __m128 vx3 = _mm_loadu_ps(x + 12);
    x += 16;

    vmax0 = _mm_max_ps(vmax0, vx0);
    vmax1 = _mm_max_ps(vmax1, vx1);
    vmax2 = _mm_max_ps(vmax2, vx2);
    vmax3 = _mm_max_ps(vmax3, vx3);
  }
  // Fold the four accumulators into one, then consume 4 floats (16 bytes) at a time.
  __m128 vmax = _mm_max_ps(_mm_max_ps(vmax0, vmax1), _mm_max_ps(vmax2, vmax3));
  for (; n >= 16; n -= 16) {
    const __m128 vx = _mm_loadu_ps(x);
    vmax = _mm_max_ps(vmax, vx);
    x += 4;
  }
  // Horizontal reduction: lanes {0,1} = max(lanes {0,1}, lanes {2,3}),
  // then lane 0 = max(lane 0, lane 1).
  __m128 vmax_lo = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
  vmax_lo = _mm_max_ss(vmax_lo, _mm_shuffle_ps(vmax_lo, vmax_lo, _MM_SHUFFLE(3, 3, 1, 1)));
  // Scalar tail: the remaining 1..3 floats, one at a time.
  if XNN_UNLIKELY(n != 0) {
    do {
      vmax_lo = _mm_max_ss(vmax_lo, _mm_load_ss(x));
      x += 1;
      n -= 4;
    } while (n != 0);
  }
  _mm_store_ss(y, vmax_lo);
}
6543*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_spmm_minmax_ukernel_32x1__sse(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6544*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_spmm_minmax_ukernel_32x1__sse(
6545*4bdc9457SAndroid Build Coastguard Worker     size_t mc,
6546*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
6547*4bdc9457SAndroid Build Coastguard Worker     const float*restrict input,
6548*4bdc9457SAndroid Build Coastguard Worker     const float*restrict weights,
6549*4bdc9457SAndroid Build Coastguard Worker     const int32_t*restrict widx_dmap,
6550*4bdc9457SAndroid Build Coastguard Worker     const uint32_t*restrict nidx_nnzmap,
6551*4bdc9457SAndroid Build Coastguard Worker     float*restrict output,
6552*4bdc9457SAndroid Build Coastguard Worker     size_t output_stride,
6553*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
6554*4bdc9457SAndroid Build Coastguard Worker {
6555*4bdc9457SAndroid Build Coastguard Worker   assert(mc != 0);
6556*4bdc9457SAndroid Build Coastguard Worker   assert(mc % sizeof(float) == 0);
6557*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
6558*4bdc9457SAndroid Build Coastguard Worker 
6559*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
6560*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
6561*4bdc9457SAndroid Build Coastguard Worker   size_t output_decrement = output_stride * nc - 32 * sizeof(float);
6562*4bdc9457SAndroid Build Coastguard Worker   while XNN_LIKELY(mc >= 32 * sizeof(float)) {
6563*4bdc9457SAndroid Build Coastguard Worker     const float*restrict w = weights;
6564*4bdc9457SAndroid Build Coastguard Worker     const int32_t* dmap = widx_dmap;
6565*4bdc9457SAndroid Build Coastguard Worker     const uint32_t* nnzmap = nidx_nnzmap;
6566*4bdc9457SAndroid Build Coastguard Worker     size_t n = nc;
6567*4bdc9457SAndroid Build Coastguard Worker     do {
6568*4bdc9457SAndroid Build Coastguard Worker       uint32_t nnz = *nnzmap++;
6569*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6570*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc4567 = vacc0123;
6571*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc89AB = vacc0123;
6572*4bdc9457SAndroid Build Coastguard Worker       __m128 vaccCDEF = vacc0123;
6573*4bdc9457SAndroid Build Coastguard Worker       __m128 vaccGHIJ = vacc0123;
6574*4bdc9457SAndroid Build Coastguard Worker       __m128 vaccKLMN = vacc0123;
6575*4bdc9457SAndroid Build Coastguard Worker       __m128 vaccOPQR = vacc0123;
6576*4bdc9457SAndroid Build Coastguard Worker       __m128 vaccSTUV = vacc0123;
6577*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(nnz != 0) {
6578*4bdc9457SAndroid Build Coastguard Worker         do {
6579*4bdc9457SAndroid Build Coastguard Worker           const intptr_t diff = *dmap++;
6580*4bdc9457SAndroid Build Coastguard Worker           const __m128 vi0123 = _mm_loadu_ps(input);
6581*4bdc9457SAndroid Build Coastguard Worker           const __m128 vi4567 = _mm_loadu_ps(input + 4);
6582*4bdc9457SAndroid Build Coastguard Worker           const __m128 vi89AB = _mm_loadu_ps(input + 8);
6583*4bdc9457SAndroid Build Coastguard Worker           const __m128 viCDEF = _mm_loadu_ps(input + 12);
6584*4bdc9457SAndroid Build Coastguard Worker           const __m128 viGHIJ = _mm_loadu_ps(input + 16);
6585*4bdc9457SAndroid Build Coastguard Worker           const __m128 viKLMN = _mm_loadu_ps(input + 20);
6586*4bdc9457SAndroid Build Coastguard Worker           const __m128 viOPQR = _mm_loadu_ps(input + 24);
6587*4bdc9457SAndroid Build Coastguard Worker           const __m128 viSTUV = _mm_loadu_ps(input + 28);
6588*4bdc9457SAndroid Build Coastguard Worker           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6589*4bdc9457SAndroid Build Coastguard Worker           const __m128 vw = _mm_load1_ps(w); w += 1;
6590*4bdc9457SAndroid Build Coastguard Worker           vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6591*4bdc9457SAndroid Build Coastguard Worker           vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6592*4bdc9457SAndroid Build Coastguard Worker           vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6593*4bdc9457SAndroid Build Coastguard Worker           vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6594*4bdc9457SAndroid Build Coastguard Worker           vaccGHIJ = _mm_add_ps(vaccGHIJ, _mm_mul_ps(viGHIJ, vw));
6595*4bdc9457SAndroid Build Coastguard Worker           vaccKLMN = _mm_add_ps(vaccKLMN, _mm_mul_ps(viKLMN, vw));
6596*4bdc9457SAndroid Build Coastguard Worker           vaccOPQR = _mm_add_ps(vaccOPQR, _mm_mul_ps(viOPQR, vw));
6597*4bdc9457SAndroid Build Coastguard Worker           vaccSTUV = _mm_add_ps(vaccSTUV, _mm_mul_ps(viSTUV, vw));
6598*4bdc9457SAndroid Build Coastguard Worker         } while (--nnz != 0);
6599*4bdc9457SAndroid Build Coastguard Worker       }
6600*4bdc9457SAndroid Build Coastguard Worker       __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6601*4bdc9457SAndroid Build Coastguard Worker       __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6602*4bdc9457SAndroid Build Coastguard Worker       __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6603*4bdc9457SAndroid Build Coastguard Worker       __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6604*4bdc9457SAndroid Build Coastguard Worker       __m128 voutGHIJ = _mm_min_ps(vaccGHIJ, vmax);
6605*4bdc9457SAndroid Build Coastguard Worker       __m128 voutKLMN = _mm_min_ps(vaccKLMN, vmax);
6606*4bdc9457SAndroid Build Coastguard Worker       __m128 voutOPQR = _mm_min_ps(vaccOPQR, vmax);
6607*4bdc9457SAndroid Build Coastguard Worker       __m128 voutSTUV = _mm_min_ps(vaccSTUV, vmax);
6608*4bdc9457SAndroid Build Coastguard Worker       vout0123 = _mm_max_ps(vout0123, vmin);
6609*4bdc9457SAndroid Build Coastguard Worker       vout4567 = _mm_max_ps(vout4567, vmin);
6610*4bdc9457SAndroid Build Coastguard Worker       vout89AB = _mm_max_ps(vout89AB, vmin);
6611*4bdc9457SAndroid Build Coastguard Worker       voutCDEF = _mm_max_ps(voutCDEF, vmin);
6612*4bdc9457SAndroid Build Coastguard Worker       voutGHIJ = _mm_max_ps(voutGHIJ, vmin);
6613*4bdc9457SAndroid Build Coastguard Worker       voutKLMN = _mm_max_ps(voutKLMN, vmin);
6614*4bdc9457SAndroid Build Coastguard Worker       voutOPQR = _mm_max_ps(voutOPQR, vmin);
6615*4bdc9457SAndroid Build Coastguard Worker       voutSTUV = _mm_max_ps(voutSTUV, vmin);
6616*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output, vout0123);
6617*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 4, vout4567);
6618*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 8, vout89AB);
6619*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 12, voutCDEF);
6620*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 16, voutGHIJ);
6621*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 20, voutKLMN);
6622*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 24, voutOPQR);
6623*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(output + 28, voutSTUV);
6624*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output + output_stride);
6625*4bdc9457SAndroid Build Coastguard Worker     } while (--n != 0);
6626*4bdc9457SAndroid Build Coastguard Worker     output = (float*restrict) ((uintptr_t) output - output_decrement);
6627*4bdc9457SAndroid Build Coastguard Worker     input += 32;
6628*4bdc9457SAndroid Build Coastguard Worker     mc -= 32 * sizeof(float);
6629*4bdc9457SAndroid Build Coastguard Worker   }
6630*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(mc != 0) {
6631*4bdc9457SAndroid Build Coastguard Worker     output_decrement += 16 * sizeof(float);
6632*4bdc9457SAndroid Build Coastguard Worker     if (mc & (16 * sizeof(float))) {
6633*4bdc9457SAndroid Build Coastguard Worker       const float*restrict w = weights;
6634*4bdc9457SAndroid Build Coastguard Worker       const int32_t* dmap = widx_dmap;
6635*4bdc9457SAndroid Build Coastguard Worker       const uint32_t* nnzmap = nidx_nnzmap;
6636*4bdc9457SAndroid Build Coastguard Worker       size_t n = nc;
6637*4bdc9457SAndroid Build Coastguard Worker       do {
6638*4bdc9457SAndroid Build Coastguard Worker         uint32_t nnz = *nnzmap++;
6639*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6640*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc4567 = vacc0123;
6641*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc89AB = vacc0123;
6642*4bdc9457SAndroid Build Coastguard Worker         __m128 vaccCDEF = vacc0123;
6643*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(nnz != 0) {
6644*4bdc9457SAndroid Build Coastguard Worker           do {
6645*4bdc9457SAndroid Build Coastguard Worker             const intptr_t diff = *dmap++;
6646*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi0123 = _mm_loadu_ps(input);
6647*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi4567 = _mm_loadu_ps(input + 4);
6648*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi89AB = _mm_loadu_ps(input + 8);
6649*4bdc9457SAndroid Build Coastguard Worker             const __m128 viCDEF = _mm_loadu_ps(input + 12);
6650*4bdc9457SAndroid Build Coastguard Worker             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6651*4bdc9457SAndroid Build Coastguard Worker             const __m128 vw = _mm_load1_ps(w); w += 1;
6652*4bdc9457SAndroid Build Coastguard Worker             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6653*4bdc9457SAndroid Build Coastguard Worker             vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6654*4bdc9457SAndroid Build Coastguard Worker             vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
6655*4bdc9457SAndroid Build Coastguard Worker             vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
6656*4bdc9457SAndroid Build Coastguard Worker           } while (--nnz != 0);
6657*4bdc9457SAndroid Build Coastguard Worker         }
6658*4bdc9457SAndroid Build Coastguard Worker         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6659*4bdc9457SAndroid Build Coastguard Worker         __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6660*4bdc9457SAndroid Build Coastguard Worker         __m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
6661*4bdc9457SAndroid Build Coastguard Worker         __m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
6662*4bdc9457SAndroid Build Coastguard Worker         vout0123 = _mm_max_ps(vout0123, vmin);
6663*4bdc9457SAndroid Build Coastguard Worker         vout4567 = _mm_max_ps(vout4567, vmin);
6664*4bdc9457SAndroid Build Coastguard Worker         vout89AB = _mm_max_ps(vout89AB, vmin);
6665*4bdc9457SAndroid Build Coastguard Worker         voutCDEF = _mm_max_ps(voutCDEF, vmin);
6666*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output, vout0123);
6667*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output + 4, vout4567);
6668*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output + 8, vout89AB);
6669*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output + 12, voutCDEF);
6670*4bdc9457SAndroid Build Coastguard Worker         output = (float*restrict) ((uintptr_t) output + output_stride);
6671*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
6672*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output - output_decrement);
6673*4bdc9457SAndroid Build Coastguard Worker       input += 16;
6674*4bdc9457SAndroid Build Coastguard Worker     }
6675*4bdc9457SAndroid Build Coastguard Worker     output_decrement += 8 * sizeof(float);
6676*4bdc9457SAndroid Build Coastguard Worker     if (mc & (8 * sizeof(float))) {
6677*4bdc9457SAndroid Build Coastguard Worker       const float*restrict w = weights;
6678*4bdc9457SAndroid Build Coastguard Worker       const int32_t* dmap = widx_dmap;
6679*4bdc9457SAndroid Build Coastguard Worker       const uint32_t* nnzmap = nidx_nnzmap;
6680*4bdc9457SAndroid Build Coastguard Worker       size_t n = nc;
6681*4bdc9457SAndroid Build Coastguard Worker       do {
6682*4bdc9457SAndroid Build Coastguard Worker         uint32_t nnz = *nnzmap++;
6683*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6684*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc4567 = vacc0123;
6685*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(nnz != 0) {
6686*4bdc9457SAndroid Build Coastguard Worker           do {
6687*4bdc9457SAndroid Build Coastguard Worker             const intptr_t diff = *dmap++;
6688*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi0123 = _mm_loadu_ps(input);
6689*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi4567 = _mm_loadu_ps(input + 4);
6690*4bdc9457SAndroid Build Coastguard Worker             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6691*4bdc9457SAndroid Build Coastguard Worker             const __m128 vw = _mm_load1_ps(w); w += 1;
6692*4bdc9457SAndroid Build Coastguard Worker             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6693*4bdc9457SAndroid Build Coastguard Worker             vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
6694*4bdc9457SAndroid Build Coastguard Worker           } while (--nnz != 0);
6695*4bdc9457SAndroid Build Coastguard Worker         }
6696*4bdc9457SAndroid Build Coastguard Worker         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6697*4bdc9457SAndroid Build Coastguard Worker         __m128 vout4567 = _mm_min_ps(vacc4567, vmax);
6698*4bdc9457SAndroid Build Coastguard Worker         vout0123 = _mm_max_ps(vout0123, vmin);
6699*4bdc9457SAndroid Build Coastguard Worker         vout4567 = _mm_max_ps(vout4567, vmin);
6700*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output, vout0123);
6701*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output + 4, vout4567);
6702*4bdc9457SAndroid Build Coastguard Worker         output = (float*restrict) ((uintptr_t) output + output_stride);
6703*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
6704*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output - output_decrement);
6705*4bdc9457SAndroid Build Coastguard Worker       input += 8;
6706*4bdc9457SAndroid Build Coastguard Worker     }
6707*4bdc9457SAndroid Build Coastguard Worker     output_decrement += 4 * sizeof(float);
6708*4bdc9457SAndroid Build Coastguard Worker     if (mc & (4 * sizeof(float))) {
6709*4bdc9457SAndroid Build Coastguard Worker       const float*restrict w = weights;
6710*4bdc9457SAndroid Build Coastguard Worker       const int32_t* dmap = widx_dmap;
6711*4bdc9457SAndroid Build Coastguard Worker       const uint32_t* nnzmap = nidx_nnzmap;
6712*4bdc9457SAndroid Build Coastguard Worker       size_t n = nc;
6713*4bdc9457SAndroid Build Coastguard Worker       do {
6714*4bdc9457SAndroid Build Coastguard Worker         uint32_t nnz = *nnzmap++;
6715*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc0123 = _mm_load1_ps(w); w += 1;
6716*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(nnz != 0) {
6717*4bdc9457SAndroid Build Coastguard Worker           do {
6718*4bdc9457SAndroid Build Coastguard Worker             const intptr_t diff = *dmap++;
6719*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi0123 = _mm_loadu_ps(input);
6720*4bdc9457SAndroid Build Coastguard Worker             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6721*4bdc9457SAndroid Build Coastguard Worker             const __m128 vw = _mm_load1_ps(w); w += 1;
6722*4bdc9457SAndroid Build Coastguard Worker             vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
6723*4bdc9457SAndroid Build Coastguard Worker           } while (--nnz != 0);
6724*4bdc9457SAndroid Build Coastguard Worker         }
6725*4bdc9457SAndroid Build Coastguard Worker         __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
6726*4bdc9457SAndroid Build Coastguard Worker         vout0123 = _mm_max_ps(vout0123, vmin);
6727*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_ps(output, vout0123);
6728*4bdc9457SAndroid Build Coastguard Worker         output = (float*restrict) ((uintptr_t) output + output_stride);
6729*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
6730*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output - output_decrement);
6731*4bdc9457SAndroid Build Coastguard Worker       input += 4;
6732*4bdc9457SAndroid Build Coastguard Worker     }
6733*4bdc9457SAndroid Build Coastguard Worker     output_decrement += 2 * sizeof(float);
6734*4bdc9457SAndroid Build Coastguard Worker     if (mc & (2 * sizeof(float))) {
6735*4bdc9457SAndroid Build Coastguard Worker       const float*restrict w = weights;
6736*4bdc9457SAndroid Build Coastguard Worker       const int32_t* dmap = widx_dmap;
6737*4bdc9457SAndroid Build Coastguard Worker       const uint32_t* nnzmap = nidx_nnzmap;
6738*4bdc9457SAndroid Build Coastguard Worker       size_t n = nc;
6739*4bdc9457SAndroid Build Coastguard Worker       do {
6740*4bdc9457SAndroid Build Coastguard Worker         uint32_t nnz = *nnzmap++;
6741*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc01 = _mm_load_ss(w); w += 1;
6742*4bdc9457SAndroid Build Coastguard Worker         vacc01 = _mm_unpacklo_ps(vacc01, vacc01);
6743*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(nnz != 0) {
6744*4bdc9457SAndroid Build Coastguard Worker           do {
6745*4bdc9457SAndroid Build Coastguard Worker             const intptr_t diff = *dmap++;
6746*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi01 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) input);
6747*4bdc9457SAndroid Build Coastguard Worker             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6748*4bdc9457SAndroid Build Coastguard Worker             __m128 vw = _mm_load_ss(w); w += 1;
6749*4bdc9457SAndroid Build Coastguard Worker             vw = _mm_unpacklo_ps(vw, vw);
6750*4bdc9457SAndroid Build Coastguard Worker             vacc01 = _mm_add_ps(vacc01, _mm_mul_ps(vi01, vw));
6751*4bdc9457SAndroid Build Coastguard Worker           } while (--nnz != 0);
6752*4bdc9457SAndroid Build Coastguard Worker         }
6753*4bdc9457SAndroid Build Coastguard Worker         __m128 vout01 = _mm_min_ps(vacc01, vmax);
6754*4bdc9457SAndroid Build Coastguard Worker         vout01 = _mm_max_ps(vout01, vmin);
6755*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) output, vout01);
6756*4bdc9457SAndroid Build Coastguard Worker         output = (float*restrict) ((uintptr_t) output + output_stride);
6757*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
6758*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output - output_decrement);
6759*4bdc9457SAndroid Build Coastguard Worker       input += 2;
6760*4bdc9457SAndroid Build Coastguard Worker     }
6761*4bdc9457SAndroid Build Coastguard Worker     output_decrement += 1 * sizeof(float);
6762*4bdc9457SAndroid Build Coastguard Worker     if (mc & (1 * sizeof(float))) {
6763*4bdc9457SAndroid Build Coastguard Worker       const float*restrict w = weights;
6764*4bdc9457SAndroid Build Coastguard Worker       const int32_t* dmap = widx_dmap;
6765*4bdc9457SAndroid Build Coastguard Worker       const uint32_t* nnzmap = nidx_nnzmap;
6766*4bdc9457SAndroid Build Coastguard Worker       size_t n = nc;
6767*4bdc9457SAndroid Build Coastguard Worker       do {
6768*4bdc9457SAndroid Build Coastguard Worker         uint32_t nnz = *nnzmap++;
6769*4bdc9457SAndroid Build Coastguard Worker         __m128 vacc0 = _mm_load_ss(w); w += 1;
6770*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(nnz != 0) {
6771*4bdc9457SAndroid Build Coastguard Worker           do {
6772*4bdc9457SAndroid Build Coastguard Worker             const intptr_t diff = *dmap++;
6773*4bdc9457SAndroid Build Coastguard Worker             const __m128 vi0 = _mm_load_ss(input);
6774*4bdc9457SAndroid Build Coastguard Worker             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
6775*4bdc9457SAndroid Build Coastguard Worker             const __m128 vw = _mm_load_ss(w); w += 1;
6776*4bdc9457SAndroid Build Coastguard Worker             vacc0 = _mm_add_ss(vacc0, _mm_mul_ss(vi0, vw));
6777*4bdc9457SAndroid Build Coastguard Worker           } while (--nnz != 0);
6778*4bdc9457SAndroid Build Coastguard Worker         }
6779*4bdc9457SAndroid Build Coastguard Worker         __m128 vout0 = _mm_min_ss(vacc0, vmax);
6780*4bdc9457SAndroid Build Coastguard Worker         vout0 = _mm_max_ss(vout0, vmin);
6781*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(output, vout0);
6782*4bdc9457SAndroid Build Coastguard Worker         output = (float*restrict) ((uintptr_t) output + output_stride);
6783*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
6784*4bdc9457SAndroid Build Coastguard Worker       output = (float*restrict) ((uintptr_t) output - output_decrement);
6785*4bdc9457SAndroid Build Coastguard Worker       input += 1;
6786*4bdc9457SAndroid Build Coastguard Worker     }
6787*4bdc9457SAndroid Build Coastguard Worker   }
6788*4bdc9457SAndroid Build Coastguard Worker }
6789*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vadd_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6790*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vadd_minmax_ukernel__sse_x8(
6791*4bdc9457SAndroid Build Coastguard Worker     size_t n,
6792*4bdc9457SAndroid Build Coastguard Worker     const float* a,
6793*4bdc9457SAndroid Build Coastguard Worker     const float* b,
6794*4bdc9457SAndroid Build Coastguard Worker     float* y,
6795*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6796*4bdc9457SAndroid Build Coastguard Worker {
6797*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
6798*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
6799*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
6800*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
6801*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
6802*4bdc9457SAndroid Build Coastguard Worker 
6803*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
6804*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
6805*4bdc9457SAndroid Build Coastguard Worker 
6806*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6807*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6808*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
6809*4bdc9457SAndroid Build Coastguard Worker     a += 8;
6810*4bdc9457SAndroid Build Coastguard Worker 
6811*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6812*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
6813*4bdc9457SAndroid Build Coastguard Worker     b += 8;
6814*4bdc9457SAndroid Build Coastguard Worker 
6815*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6816*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_add_ps(va4567, vb4567);
6817*4bdc9457SAndroid Build Coastguard Worker 
6818*4bdc9457SAndroid Build Coastguard Worker 
6819*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6820*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
6821*4bdc9457SAndroid Build Coastguard Worker 
6822*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6823*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
6824*4bdc9457SAndroid Build Coastguard Worker 
6825*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6826*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
6827*4bdc9457SAndroid Build Coastguard Worker     y += 8;
6828*4bdc9457SAndroid Build Coastguard Worker   }
6829*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6830*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6831*4bdc9457SAndroid Build Coastguard Worker     a += 4;
6832*4bdc9457SAndroid Build Coastguard Worker 
6833*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6834*4bdc9457SAndroid Build Coastguard Worker     b += 4;
6835*4bdc9457SAndroid Build Coastguard Worker 
6836*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6837*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6838*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6839*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6840*4bdc9457SAndroid Build Coastguard Worker     y += 4;
6841*4bdc9457SAndroid Build Coastguard Worker   }
6842*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
6843*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6844*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6845*4bdc9457SAndroid Build Coastguard Worker 
6846*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb0123);
6847*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6848*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6849*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
6850*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
6851*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6852*4bdc9457SAndroid Build Coastguard Worker       y += 2;
6853*4bdc9457SAndroid Build Coastguard Worker     }
6854*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
6855*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
6856*4bdc9457SAndroid Build Coastguard Worker     }
6857*4bdc9457SAndroid Build Coastguard Worker   }
6858*4bdc9457SAndroid Build Coastguard Worker }
6859*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vaddc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6860*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vaddc_minmax_ukernel__sse_x8(
6861*4bdc9457SAndroid Build Coastguard Worker     size_t n,
6862*4bdc9457SAndroid Build Coastguard Worker     const float* a,
6863*4bdc9457SAndroid Build Coastguard Worker     const float* b,
6864*4bdc9457SAndroid Build Coastguard Worker     float* y,
6865*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6866*4bdc9457SAndroid Build Coastguard Worker {
6867*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
6868*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
6869*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
6870*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
6871*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
6872*4bdc9457SAndroid Build Coastguard Worker 
6873*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
6874*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
6875*4bdc9457SAndroid Build Coastguard Worker 
6876*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
6877*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6878*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6879*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
6880*4bdc9457SAndroid Build Coastguard Worker     a += 8;
6881*4bdc9457SAndroid Build Coastguard Worker 
6882*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb);
6883*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_add_ps(va4567, vb);
6884*4bdc9457SAndroid Build Coastguard Worker 
6885*4bdc9457SAndroid Build Coastguard Worker 
6886*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6887*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
6888*4bdc9457SAndroid Build Coastguard Worker 
6889*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6890*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
6891*4bdc9457SAndroid Build Coastguard Worker 
6892*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6893*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
6894*4bdc9457SAndroid Build Coastguard Worker     y += 8;
6895*4bdc9457SAndroid Build Coastguard Worker   }
6896*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6897*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6898*4bdc9457SAndroid Build Coastguard Worker     a += 4;
6899*4bdc9457SAndroid Build Coastguard Worker 
6900*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb);
6901*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6902*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6903*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6904*4bdc9457SAndroid Build Coastguard Worker     y += 4;
6905*4bdc9457SAndroid Build Coastguard Worker   }
6906*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
6907*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6908*4bdc9457SAndroid Build Coastguard Worker 
6909*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_add_ps(va0123, vb);
6910*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6911*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6912*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
6913*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
6914*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6915*4bdc9457SAndroid Build Coastguard Worker       y += 2;
6916*4bdc9457SAndroid Build Coastguard Worker     }
6917*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
6918*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
6919*4bdc9457SAndroid Build Coastguard Worker     }
6920*4bdc9457SAndroid Build Coastguard Worker   }
6921*4bdc9457SAndroid Build Coastguard Worker }
6922*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vdiv_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6923*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vdiv_minmax_ukernel__sse_x8(
6924*4bdc9457SAndroid Build Coastguard Worker     size_t n,
6925*4bdc9457SAndroid Build Coastguard Worker     const float* a,
6926*4bdc9457SAndroid Build Coastguard Worker     const float* b,
6927*4bdc9457SAndroid Build Coastguard Worker     float* y,
6928*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6929*4bdc9457SAndroid Build Coastguard Worker {
6930*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
6931*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
6932*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
6933*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
6934*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
6935*4bdc9457SAndroid Build Coastguard Worker 
6936*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
6937*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
6938*4bdc9457SAndroid Build Coastguard Worker 
6939*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
6940*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6941*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
6942*4bdc9457SAndroid Build Coastguard Worker     a += 8;
6943*4bdc9457SAndroid Build Coastguard Worker 
6944*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6945*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
6946*4bdc9457SAndroid Build Coastguard Worker     b += 8;
6947*4bdc9457SAndroid Build Coastguard Worker 
6948*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6949*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_div_ps(va4567, vb4567);
6950*4bdc9457SAndroid Build Coastguard Worker 
6951*4bdc9457SAndroid Build Coastguard Worker 
6952*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6953*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
6954*4bdc9457SAndroid Build Coastguard Worker 
6955*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6956*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
6957*4bdc9457SAndroid Build Coastguard Worker 
6958*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6959*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
6960*4bdc9457SAndroid Build Coastguard Worker     y += 8;
6961*4bdc9457SAndroid Build Coastguard Worker   }
6962*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
6963*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6964*4bdc9457SAndroid Build Coastguard Worker     a += 4;
6965*4bdc9457SAndroid Build Coastguard Worker 
6966*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6967*4bdc9457SAndroid Build Coastguard Worker     b += 4;
6968*4bdc9457SAndroid Build Coastguard Worker 
6969*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6970*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6971*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6972*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
6973*4bdc9457SAndroid Build Coastguard Worker     y += 4;
6974*4bdc9457SAndroid Build Coastguard Worker   }
6975*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
6976*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
6977*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
6978*4bdc9457SAndroid Build Coastguard Worker 
6979*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb0123);
6980*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
6981*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
6982*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
6983*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
6984*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
6985*4bdc9457SAndroid Build Coastguard Worker       y += 2;
6986*4bdc9457SAndroid Build Coastguard Worker     }
6987*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
6988*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
6989*4bdc9457SAndroid Build Coastguard Worker     }
6990*4bdc9457SAndroid Build Coastguard Worker   }
6991*4bdc9457SAndroid Build Coastguard Worker }
6992*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vdivc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6993*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vdivc_minmax_ukernel__sse_x8(
6994*4bdc9457SAndroid Build Coastguard Worker     size_t n,
6995*4bdc9457SAndroid Build Coastguard Worker     const float* a,
6996*4bdc9457SAndroid Build Coastguard Worker     const float* b,
6997*4bdc9457SAndroid Build Coastguard Worker     float* y,
6998*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6999*4bdc9457SAndroid Build Coastguard Worker {
7000*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7001*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7002*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7003*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7004*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7005*4bdc9457SAndroid Build Coastguard Worker 
7006*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7007*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7008*4bdc9457SAndroid Build Coastguard Worker 
7009*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7010*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7011*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7012*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7013*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7014*4bdc9457SAndroid Build Coastguard Worker 
7015*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb);
7016*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_div_ps(va4567, vb);
7017*4bdc9457SAndroid Build Coastguard Worker 
7018*4bdc9457SAndroid Build Coastguard Worker 
7019*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7020*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7021*4bdc9457SAndroid Build Coastguard Worker 
7022*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7023*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7024*4bdc9457SAndroid Build Coastguard Worker 
7025*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7026*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7027*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7028*4bdc9457SAndroid Build Coastguard Worker   }
7029*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7030*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7031*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7032*4bdc9457SAndroid Build Coastguard Worker 
7033*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb);
7034*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7035*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7036*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7037*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7038*4bdc9457SAndroid Build Coastguard Worker   }
7039*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7040*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7041*4bdc9457SAndroid Build Coastguard Worker 
7042*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(va0123, vb);
7043*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7044*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7045*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7046*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7047*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7048*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7049*4bdc9457SAndroid Build Coastguard Worker     }
7050*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7051*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7052*4bdc9457SAndroid Build Coastguard Worker     }
7053*4bdc9457SAndroid Build Coastguard Worker   }
7054*4bdc9457SAndroid Build Coastguard Worker }
7055*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7056*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmax_ukernel__sse_x8(
7057*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7058*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7059*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7060*4bdc9457SAndroid Build Coastguard Worker     float* y,
7061*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7062*4bdc9457SAndroid Build Coastguard Worker {
7063*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7064*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7065*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7066*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7067*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7068*4bdc9457SAndroid Build Coastguard Worker 
7069*4bdc9457SAndroid Build Coastguard Worker 
7070*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7071*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7072*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7073*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7074*4bdc9457SAndroid Build Coastguard Worker 
7075*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7076*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7077*4bdc9457SAndroid Build Coastguard Worker     b += 8;
7078*4bdc9457SAndroid Build Coastguard Worker 
7079*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7080*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_max_ps(va4567, vb4567);
7081*4bdc9457SAndroid Build Coastguard Worker 
7082*4bdc9457SAndroid Build Coastguard Worker 
7083*4bdc9457SAndroid Build Coastguard Worker 
7084*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7085*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7086*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7087*4bdc9457SAndroid Build Coastguard Worker   }
7088*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7089*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7090*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7091*4bdc9457SAndroid Build Coastguard Worker 
7092*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7093*4bdc9457SAndroid Build Coastguard Worker     b += 4;
7094*4bdc9457SAndroid Build Coastguard Worker 
7095*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7096*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7097*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7098*4bdc9457SAndroid Build Coastguard Worker   }
7099*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7100*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7101*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7102*4bdc9457SAndroid Build Coastguard Worker 
7103*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb0123);
7104*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7105*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7106*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7107*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7108*4bdc9457SAndroid Build Coastguard Worker     }
7109*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7110*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7111*4bdc9457SAndroid Build Coastguard Worker     }
7112*4bdc9457SAndroid Build Coastguard Worker   }
7113*4bdc9457SAndroid Build Coastguard Worker }
7114*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmaxc_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7115*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmaxc_ukernel__sse_x8(
7116*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7117*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7118*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7119*4bdc9457SAndroid Build Coastguard Worker     float* y,
7120*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7121*4bdc9457SAndroid Build Coastguard Worker {
7122*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7123*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7124*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7125*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7126*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7127*4bdc9457SAndroid Build Coastguard Worker 
7128*4bdc9457SAndroid Build Coastguard Worker 
7129*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7130*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7131*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7132*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7133*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7134*4bdc9457SAndroid Build Coastguard Worker 
7135*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb);
7136*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_max_ps(va4567, vb);
7137*4bdc9457SAndroid Build Coastguard Worker 
7138*4bdc9457SAndroid Build Coastguard Worker 
7139*4bdc9457SAndroid Build Coastguard Worker 
7140*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7141*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7142*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7143*4bdc9457SAndroid Build Coastguard Worker   }
7144*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7145*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7146*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7147*4bdc9457SAndroid Build Coastguard Worker 
7148*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb);
7149*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7150*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7151*4bdc9457SAndroid Build Coastguard Worker   }
7152*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7153*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7154*4bdc9457SAndroid Build Coastguard Worker 
7155*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_max_ps(va0123, vb);
7156*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7157*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7158*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7159*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7160*4bdc9457SAndroid Build Coastguard Worker     }
7161*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7162*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7163*4bdc9457SAndroid Build Coastguard Worker     }
7164*4bdc9457SAndroid Build Coastguard Worker   }
7165*4bdc9457SAndroid Build Coastguard Worker }
7166*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmin_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7167*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmin_ukernel__sse_x8(
7168*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7169*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7170*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7171*4bdc9457SAndroid Build Coastguard Worker     float* y,
7172*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7173*4bdc9457SAndroid Build Coastguard Worker {
7174*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7175*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7176*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7177*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7178*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7179*4bdc9457SAndroid Build Coastguard Worker 
7180*4bdc9457SAndroid Build Coastguard Worker 
7181*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7182*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7183*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7184*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7185*4bdc9457SAndroid Build Coastguard Worker 
7186*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7187*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7188*4bdc9457SAndroid Build Coastguard Worker     b += 8;
7189*4bdc9457SAndroid Build Coastguard Worker 
7190*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7191*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_min_ps(va4567, vb4567);
7192*4bdc9457SAndroid Build Coastguard Worker 
7193*4bdc9457SAndroid Build Coastguard Worker 
7194*4bdc9457SAndroid Build Coastguard Worker 
7195*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7196*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7197*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7198*4bdc9457SAndroid Build Coastguard Worker   }
7199*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7200*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7201*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7202*4bdc9457SAndroid Build Coastguard Worker 
7203*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7204*4bdc9457SAndroid Build Coastguard Worker     b += 4;
7205*4bdc9457SAndroid Build Coastguard Worker 
7206*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7207*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7208*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7209*4bdc9457SAndroid Build Coastguard Worker   }
7210*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7211*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7212*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7213*4bdc9457SAndroid Build Coastguard Worker 
7214*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb0123);
7215*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7216*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7217*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7218*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7219*4bdc9457SAndroid Build Coastguard Worker     }
7220*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7221*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7222*4bdc9457SAndroid Build Coastguard Worker     }
7223*4bdc9457SAndroid Build Coastguard Worker   }
7224*4bdc9457SAndroid Build Coastguard Worker }
7225*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vminc_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7226*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vminc_ukernel__sse_x8(
7227*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7228*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7229*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7230*4bdc9457SAndroid Build Coastguard Worker     float* y,
7231*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7232*4bdc9457SAndroid Build Coastguard Worker {
7233*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7234*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7235*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7236*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7237*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7238*4bdc9457SAndroid Build Coastguard Worker 
7239*4bdc9457SAndroid Build Coastguard Worker 
7240*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7241*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7242*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7243*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7244*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7245*4bdc9457SAndroid Build Coastguard Worker 
7246*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb);
7247*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_min_ps(va4567, vb);
7248*4bdc9457SAndroid Build Coastguard Worker 
7249*4bdc9457SAndroid Build Coastguard Worker 
7250*4bdc9457SAndroid Build Coastguard Worker 
7251*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7252*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7253*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7254*4bdc9457SAndroid Build Coastguard Worker   }
7255*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7256*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7257*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7258*4bdc9457SAndroid Build Coastguard Worker 
7259*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb);
7260*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7261*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7262*4bdc9457SAndroid Build Coastguard Worker   }
7263*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7264*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7265*4bdc9457SAndroid Build Coastguard Worker 
7266*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_min_ps(va0123, vb);
7267*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7268*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7269*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7270*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7271*4bdc9457SAndroid Build Coastguard Worker     }
7272*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7273*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7274*4bdc9457SAndroid Build Coastguard Worker     }
7275*4bdc9457SAndroid Build Coastguard Worker   }
7276*4bdc9457SAndroid Build Coastguard Worker }
7277*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmul_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7278*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmul_minmax_ukernel__sse_x8(
7279*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7280*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7281*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7282*4bdc9457SAndroid Build Coastguard Worker     float* y,
7283*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7284*4bdc9457SAndroid Build Coastguard Worker {
7285*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7286*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7287*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7288*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7289*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7290*4bdc9457SAndroid Build Coastguard Worker 
7291*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7292*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7293*4bdc9457SAndroid Build Coastguard Worker 
7294*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7295*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7296*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7297*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7298*4bdc9457SAndroid Build Coastguard Worker 
7299*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7300*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7301*4bdc9457SAndroid Build Coastguard Worker     b += 8;
7302*4bdc9457SAndroid Build Coastguard Worker 
7303*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7304*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_mul_ps(va4567, vb4567);
7305*4bdc9457SAndroid Build Coastguard Worker 
7306*4bdc9457SAndroid Build Coastguard Worker 
7307*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7308*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7309*4bdc9457SAndroid Build Coastguard Worker 
7310*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7311*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7312*4bdc9457SAndroid Build Coastguard Worker 
7313*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7314*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7315*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7316*4bdc9457SAndroid Build Coastguard Worker   }
7317*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7318*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7319*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7320*4bdc9457SAndroid Build Coastguard Worker 
7321*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7322*4bdc9457SAndroid Build Coastguard Worker     b += 4;
7323*4bdc9457SAndroid Build Coastguard Worker 
7324*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7325*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7326*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7327*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7328*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7329*4bdc9457SAndroid Build Coastguard Worker   }
7330*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7331*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7332*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7333*4bdc9457SAndroid Build Coastguard Worker 
7334*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb0123);
7335*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7336*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7337*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7338*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7339*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7340*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7341*4bdc9457SAndroid Build Coastguard Worker     }
7342*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7343*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7344*4bdc9457SAndroid Build Coastguard Worker     }
7345*4bdc9457SAndroid Build Coastguard Worker   }
7346*4bdc9457SAndroid Build Coastguard Worker }
7347*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmulc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7348*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmulc_minmax_ukernel__sse_x8(
7349*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7350*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7351*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7352*4bdc9457SAndroid Build Coastguard Worker     float* y,
7353*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7354*4bdc9457SAndroid Build Coastguard Worker {
7355*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7356*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7357*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7358*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7359*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7360*4bdc9457SAndroid Build Coastguard Worker 
7361*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7362*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7363*4bdc9457SAndroid Build Coastguard Worker 
7364*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7365*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7366*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7367*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7368*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7369*4bdc9457SAndroid Build Coastguard Worker 
7370*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7371*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_mul_ps(va4567, vb);
7372*4bdc9457SAndroid Build Coastguard Worker 
7373*4bdc9457SAndroid Build Coastguard Worker 
7374*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7375*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7376*4bdc9457SAndroid Build Coastguard Worker 
7377*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7378*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7379*4bdc9457SAndroid Build Coastguard Worker 
7380*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7381*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7382*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7383*4bdc9457SAndroid Build Coastguard Worker   }
7384*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7385*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7386*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7387*4bdc9457SAndroid Build Coastguard Worker 
7388*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7389*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7390*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7391*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7392*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7393*4bdc9457SAndroid Build Coastguard Worker   }
7394*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7395*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7396*4bdc9457SAndroid Build Coastguard Worker 
7397*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_mul_ps(va0123, vb);
7398*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7399*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7400*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7401*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7402*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7403*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7404*4bdc9457SAndroid Build Coastguard Worker     }
7405*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7406*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7407*4bdc9457SAndroid Build Coastguard Worker     }
7408*4bdc9457SAndroid Build Coastguard Worker   }
7409*4bdc9457SAndroid Build Coastguard Worker }
7410*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vrdivc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7411*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vrdivc_minmax_ukernel__sse_x8(
7412*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7413*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7414*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7415*4bdc9457SAndroid Build Coastguard Worker     float* y,
7416*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7417*4bdc9457SAndroid Build Coastguard Worker {
7418*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7419*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7420*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7421*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7422*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7423*4bdc9457SAndroid Build Coastguard Worker 
7424*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7425*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7426*4bdc9457SAndroid Build Coastguard Worker 
7427*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7428*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7429*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7430*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7431*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7432*4bdc9457SAndroid Build Coastguard Worker 
7433*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(vb, va0123);
7434*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_div_ps(vb, va4567);
7435*4bdc9457SAndroid Build Coastguard Worker 
7436*4bdc9457SAndroid Build Coastguard Worker 
7437*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7438*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7439*4bdc9457SAndroid Build Coastguard Worker 
7440*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7441*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7442*4bdc9457SAndroid Build Coastguard Worker 
7443*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7444*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7445*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7446*4bdc9457SAndroid Build Coastguard Worker   }
7447*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7448*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7449*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7450*4bdc9457SAndroid Build Coastguard Worker 
7451*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(vb, va0123);
7452*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7453*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7454*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7455*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7456*4bdc9457SAndroid Build Coastguard Worker   }
7457*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7458*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7459*4bdc9457SAndroid Build Coastguard Worker 
7460*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_div_ps(vb, va0123);
7461*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7462*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7463*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7464*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7465*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7466*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7467*4bdc9457SAndroid Build Coastguard Worker     }
7468*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7469*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7470*4bdc9457SAndroid Build Coastguard Worker     }
7471*4bdc9457SAndroid Build Coastguard Worker   }
7472*4bdc9457SAndroid Build Coastguard Worker }
7473*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vrsubc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7474*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vrsubc_minmax_ukernel__sse_x8(
7475*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7476*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7477*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7478*4bdc9457SAndroid Build Coastguard Worker     float* y,
7479*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7480*4bdc9457SAndroid Build Coastguard Worker {
7481*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7482*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7483*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7484*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7485*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7486*4bdc9457SAndroid Build Coastguard Worker 
7487*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7488*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7489*4bdc9457SAndroid Build Coastguard Worker 
7490*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7491*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7492*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7493*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7494*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7495*4bdc9457SAndroid Build Coastguard Worker 
7496*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7497*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_sub_ps(vb, va4567);
7498*4bdc9457SAndroid Build Coastguard Worker 
7499*4bdc9457SAndroid Build Coastguard Worker 
7500*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7501*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7502*4bdc9457SAndroid Build Coastguard Worker 
7503*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7504*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7505*4bdc9457SAndroid Build Coastguard Worker 
7506*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7507*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7508*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7509*4bdc9457SAndroid Build Coastguard Worker   }
7510*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7511*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7512*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7513*4bdc9457SAndroid Build Coastguard Worker 
7514*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7515*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7516*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7517*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7518*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7519*4bdc9457SAndroid Build Coastguard Worker   }
7520*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7521*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7522*4bdc9457SAndroid Build Coastguard Worker 
7523*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(vb, va0123);
7524*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7525*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7526*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7527*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7528*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7529*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7530*4bdc9457SAndroid Build Coastguard Worker     }
7531*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7532*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7533*4bdc9457SAndroid Build Coastguard Worker     }
7534*4bdc9457SAndroid Build Coastguard Worker   }
7535*4bdc9457SAndroid Build Coastguard Worker }
7536*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsqrdiff_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7537*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsqrdiff_ukernel__sse_x8(
7538*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7539*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7540*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7541*4bdc9457SAndroid Build Coastguard Worker     float* y,
7542*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7543*4bdc9457SAndroid Build Coastguard Worker {
7544*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7545*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7546*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7547*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7548*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7549*4bdc9457SAndroid Build Coastguard Worker 
7550*4bdc9457SAndroid Build Coastguard Worker 
7551*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7552*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7553*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7554*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7555*4bdc9457SAndroid Build Coastguard Worker 
7556*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7557*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7558*4bdc9457SAndroid Build Coastguard Worker     b += 8;
7559*4bdc9457SAndroid Build Coastguard Worker 
7560*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7561*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7562*4bdc9457SAndroid Build Coastguard Worker 
7563*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7564*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_mul_ps(vy4567, vy4567);
7565*4bdc9457SAndroid Build Coastguard Worker 
7566*4bdc9457SAndroid Build Coastguard Worker 
7567*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7568*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7569*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7570*4bdc9457SAndroid Build Coastguard Worker   }
7571*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7572*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7573*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7574*4bdc9457SAndroid Build Coastguard Worker 
7575*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7576*4bdc9457SAndroid Build Coastguard Worker     b += 4;
7577*4bdc9457SAndroid Build Coastguard Worker 
7578*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7579*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7580*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7581*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7582*4bdc9457SAndroid Build Coastguard Worker   }
7583*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7584*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7585*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7586*4bdc9457SAndroid Build Coastguard Worker 
7587*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7588*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7589*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7590*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7591*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7592*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7593*4bdc9457SAndroid Build Coastguard Worker     }
7594*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7595*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7596*4bdc9457SAndroid Build Coastguard Worker     }
7597*4bdc9457SAndroid Build Coastguard Worker   }
7598*4bdc9457SAndroid Build Coastguard Worker }
7599*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsqrdiffc_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])7600*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsqrdiffc_ukernel__sse_x8(
7601*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7602*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7603*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7604*4bdc9457SAndroid Build Coastguard Worker     float* y,
7605*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7606*4bdc9457SAndroid Build Coastguard Worker {
7607*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7608*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7609*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7610*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7611*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7612*4bdc9457SAndroid Build Coastguard Worker 
7613*4bdc9457SAndroid Build Coastguard Worker 
7614*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7615*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7616*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7617*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7618*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7619*4bdc9457SAndroid Build Coastguard Worker 
7620*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7621*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_sub_ps(va4567, vb);
7622*4bdc9457SAndroid Build Coastguard Worker 
7623*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7624*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_mul_ps(vy4567, vy4567);
7625*4bdc9457SAndroid Build Coastguard Worker 
7626*4bdc9457SAndroid Build Coastguard Worker 
7627*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7628*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7629*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7630*4bdc9457SAndroid Build Coastguard Worker   }
7631*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7632*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7633*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7634*4bdc9457SAndroid Build Coastguard Worker 
7635*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7636*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7637*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7638*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7639*4bdc9457SAndroid Build Coastguard Worker   }
7640*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7641*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7642*4bdc9457SAndroid Build Coastguard Worker 
7643*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7644*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_mul_ps(vy0123, vy0123);
7645*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7646*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7647*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7648*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7649*4bdc9457SAndroid Build Coastguard Worker     }
7650*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7651*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7652*4bdc9457SAndroid Build Coastguard Worker     }
7653*4bdc9457SAndroid Build Coastguard Worker   }
7654*4bdc9457SAndroid Build Coastguard Worker }
7655*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsub_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7656*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsub_minmax_ukernel__sse_x8(
7657*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7658*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7659*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7660*4bdc9457SAndroid Build Coastguard Worker     float* y,
7661*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7662*4bdc9457SAndroid Build Coastguard Worker {
7663*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7664*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7665*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7666*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7667*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7668*4bdc9457SAndroid Build Coastguard Worker 
7669*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7670*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7671*4bdc9457SAndroid Build Coastguard Worker 
7672*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7673*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7674*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7675*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7676*4bdc9457SAndroid Build Coastguard Worker 
7677*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7678*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb4567 = _mm_loadu_ps(b + 4);
7679*4bdc9457SAndroid Build Coastguard Worker     b += 8;
7680*4bdc9457SAndroid Build Coastguard Worker 
7681*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7682*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_sub_ps(va4567, vb4567);
7683*4bdc9457SAndroid Build Coastguard Worker 
7684*4bdc9457SAndroid Build Coastguard Worker 
7685*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7686*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7687*4bdc9457SAndroid Build Coastguard Worker 
7688*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7689*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7690*4bdc9457SAndroid Build Coastguard Worker 
7691*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7692*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7693*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7694*4bdc9457SAndroid Build Coastguard Worker   }
7695*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7696*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7697*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7698*4bdc9457SAndroid Build Coastguard Worker 
7699*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7700*4bdc9457SAndroid Build Coastguard Worker     b += 4;
7701*4bdc9457SAndroid Build Coastguard Worker 
7702*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7703*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7704*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7705*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7706*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7707*4bdc9457SAndroid Build Coastguard Worker   }
7708*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7709*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7710*4bdc9457SAndroid Build Coastguard Worker     const __m128 vb0123 = _mm_loadu_ps(b);
7711*4bdc9457SAndroid Build Coastguard Worker 
7712*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb0123);
7713*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7714*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7715*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7716*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7717*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7718*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7719*4bdc9457SAndroid Build Coastguard Worker     }
7720*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7721*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7722*4bdc9457SAndroid Build Coastguard Worker     }
7723*4bdc9457SAndroid Build Coastguard Worker   }
7724*4bdc9457SAndroid Build Coastguard Worker }
7725*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsubc_minmax_ukernel__sse_x8(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7726*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsubc_minmax_ukernel__sse_x8(
7727*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7728*4bdc9457SAndroid Build Coastguard Worker     const float* a,
7729*4bdc9457SAndroid Build Coastguard Worker     const float* b,
7730*4bdc9457SAndroid Build Coastguard Worker     float* y,
7731*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7732*4bdc9457SAndroid Build Coastguard Worker {
7733*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7734*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7735*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7736*4bdc9457SAndroid Build Coastguard Worker   assert(b != NULL);
7737*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7738*4bdc9457SAndroid Build Coastguard Worker 
7739*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7740*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7741*4bdc9457SAndroid Build Coastguard Worker 
7742*4bdc9457SAndroid Build Coastguard Worker   const __m128 vb = _mm_load1_ps(b);
7743*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7744*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7745*4bdc9457SAndroid Build Coastguard Worker     const __m128 va4567 = _mm_loadu_ps(a + 4);
7746*4bdc9457SAndroid Build Coastguard Worker     a += 8;
7747*4bdc9457SAndroid Build Coastguard Worker 
7748*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7749*4bdc9457SAndroid Build Coastguard Worker     __m128 vy4567 = _mm_sub_ps(va4567, vb);
7750*4bdc9457SAndroid Build Coastguard Worker 
7751*4bdc9457SAndroid Build Coastguard Worker 
7752*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7753*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_max_ps(vy4567, vy_min);
7754*4bdc9457SAndroid Build Coastguard Worker 
7755*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7756*4bdc9457SAndroid Build Coastguard Worker     vy4567 = _mm_min_ps(vy4567, vy_max);
7757*4bdc9457SAndroid Build Coastguard Worker 
7758*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7759*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
7760*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7761*4bdc9457SAndroid Build Coastguard Worker   }
7762*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7763*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7764*4bdc9457SAndroid Build Coastguard Worker     a += 4;
7765*4bdc9457SAndroid Build Coastguard Worker 
7766*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7767*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7768*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7769*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
7770*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7771*4bdc9457SAndroid Build Coastguard Worker   }
7772*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7773*4bdc9457SAndroid Build Coastguard Worker     const __m128 va0123 = _mm_loadu_ps(a);
7774*4bdc9457SAndroid Build Coastguard Worker 
7775*4bdc9457SAndroid Build Coastguard Worker     __m128 vy0123 = _mm_sub_ps(va0123, vb);
7776*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_max_ps(vy0123, vy_min);
7777*4bdc9457SAndroid Build Coastguard Worker     vy0123 = _mm_min_ps(vy0123, vy_max);
7778*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7779*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy0123);
7780*4bdc9457SAndroid Build Coastguard Worker       vy0123 = _mm_movehl_ps(vy0123, vy0123);
7781*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7782*4bdc9457SAndroid Build Coastguard Worker     }
7783*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7784*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy0123);
7785*4bdc9457SAndroid Build Coastguard Worker     }
7786*4bdc9457SAndroid Build Coastguard Worker   }
7787*4bdc9457SAndroid Build Coastguard Worker }
7788*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vclamp_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7789*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vclamp_ukernel__sse_x8(
7790*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7791*4bdc9457SAndroid Build Coastguard Worker     const float* x,
7792*4bdc9457SAndroid Build Coastguard Worker     float* y,
7793*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7794*4bdc9457SAndroid Build Coastguard Worker {
7795*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7796*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7797*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
7798*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7799*4bdc9457SAndroid Build Coastguard Worker 
7800*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_min = _mm_load_ps(params->sse.min);
7801*4bdc9457SAndroid Build Coastguard Worker   const __m128 vy_max = _mm_load_ps(params->sse.max);
7802*4bdc9457SAndroid Build Coastguard Worker 
7803*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7804*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0123 = _mm_loadu_ps(x);
7805*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc4567 = _mm_loadu_ps(x + 4);
7806*4bdc9457SAndroid Build Coastguard Worker     x += 8;
7807*4bdc9457SAndroid Build Coastguard Worker 
7808*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_max_ps(vacc0123, vy_min);
7809*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_max_ps(vacc4567, vy_min);
7810*4bdc9457SAndroid Build Coastguard Worker 
7811*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_min_ps(vacc0123, vy_max);
7812*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_min_ps(vacc4567, vy_max);
7813*4bdc9457SAndroid Build Coastguard Worker 
7814*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc0123);
7815*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vacc4567);
7816*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7817*4bdc9457SAndroid Build Coastguard Worker   }
7818*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7819*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc = _mm_loadu_ps(x);
7820*4bdc9457SAndroid Build Coastguard Worker     x += 4;
7821*4bdc9457SAndroid Build Coastguard Worker 
7822*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_max_ps(vacc, vy_min);
7823*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_min_ps(vacc, vy_max);
7824*4bdc9457SAndroid Build Coastguard Worker 
7825*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc);
7826*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7827*4bdc9457SAndroid Build Coastguard Worker   }
7828*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7829*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc = _mm_loadu_ps(x);
7830*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_max_ps(vacc, vy_min);
7831*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_min_ps(vacc, vy_max);
7832*4bdc9457SAndroid Build Coastguard Worker 
7833*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7834*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vacc);
7835*4bdc9457SAndroid Build Coastguard Worker       vacc = _mm_movehl_ps(vacc, vacc);
7836*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7837*4bdc9457SAndroid Build Coastguard Worker     }
7838*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7839*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vacc);
7840*4bdc9457SAndroid Build Coastguard Worker     }
7841*4bdc9457SAndroid Build Coastguard Worker   }
7842*4bdc9457SAndroid Build Coastguard Worker }
7843*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vhswish_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS (1)])7844*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vhswish_ukernel__sse_x8(
7845*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7846*4bdc9457SAndroid Build Coastguard Worker     const float* x,
7847*4bdc9457SAndroid Build Coastguard Worker     float* y,
7848*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7849*4bdc9457SAndroid Build Coastguard Worker {
7850*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7851*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7852*4bdc9457SAndroid Build Coastguard Worker 
7853*4bdc9457SAndroid Build Coastguard Worker   const __m128 vsixth = _mm_load_ps(params->sse.sixth);
7854*4bdc9457SAndroid Build Coastguard Worker   const __m128 vhalf = _mm_load_ps(params->sse.half);
7855*4bdc9457SAndroid Build Coastguard Worker   const __m128 vone = _mm_load_ps(params->sse.one);
7856*4bdc9457SAndroid Build Coastguard Worker   const __m128 vzero = _mm_setzero_ps();
7857*4bdc9457SAndroid Build Coastguard Worker 
7858*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7859*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
7860*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx4567 = _mm_loadu_ps(x + 4);
7861*4bdc9457SAndroid Build Coastguard Worker     x += 8;
7862*4bdc9457SAndroid Build Coastguard Worker 
7863*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7864*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);
7865*4bdc9457SAndroid Build Coastguard Worker 
7866*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7867*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_add_ps(vacc4567, vhalf);
7868*4bdc9457SAndroid Build Coastguard Worker 
7869*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_max_ps(vacc0123, vzero);
7870*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_max_ps(vacc4567, vzero);
7871*4bdc9457SAndroid Build Coastguard Worker 
7872*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_min_ps(vacc0123, vone);
7873*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_min_ps(vacc4567, vone);
7874*4bdc9457SAndroid Build Coastguard Worker 
7875*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7876*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_mul_ps(vacc4567, vx4567);
7877*4bdc9457SAndroid Build Coastguard Worker 
7878*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc0123);
7879*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vacc4567);
7880*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7881*4bdc9457SAndroid Build Coastguard Worker   }
7882*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7883*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
7884*4bdc9457SAndroid Build Coastguard Worker     x += 4;
7885*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7886*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7887*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_max_ps(vacc0123, vzero);
7888*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_min_ps(vacc0123, vone);
7889*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7890*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc0123);
7891*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7892*4bdc9457SAndroid Build Coastguard Worker   }
7893*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7894*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
7895*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
7896*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_add_ps(vacc0123, vhalf);
7897*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_max_ps(vacc0123, vzero);
7898*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_min_ps(vacc0123, vone);
7899*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_mul_ps(vacc0123, vx0123);
7900*4bdc9457SAndroid Build Coastguard Worker 
7901*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7902*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vacc0123);
7903*4bdc9457SAndroid Build Coastguard Worker       vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
7904*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7905*4bdc9457SAndroid Build Coastguard Worker     }
7906*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7907*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vacc0123);
7908*4bdc9457SAndroid Build Coastguard Worker     }
7909*4bdc9457SAndroid Build Coastguard Worker   }
7910*4bdc9457SAndroid Build Coastguard Worker }
7911*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vlrelu_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])7912*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vlrelu_ukernel__sse_x8(
7913*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7914*4bdc9457SAndroid Build Coastguard Worker     const float* x,
7915*4bdc9457SAndroid Build Coastguard Worker     float* y,
7916*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7917*4bdc9457SAndroid Build Coastguard Worker {
7918*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7919*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
7920*4bdc9457SAndroid Build Coastguard Worker 
7921*4bdc9457SAndroid Build Coastguard Worker   const __m128 vslope = _mm_load_ps(params->sse.slope);
7922*4bdc9457SAndroid Build Coastguard Worker   const __m128 vzero = _mm_setzero_ps();
7923*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
7924*4bdc9457SAndroid Build Coastguard Worker     __m128 vx0123 = _mm_loadu_ps(x);
7925*4bdc9457SAndroid Build Coastguard Worker     __m128 vx4567 = _mm_loadu_ps(x + 4);
7926*4bdc9457SAndroid Build Coastguard Worker     x += 8;
7927*4bdc9457SAndroid Build Coastguard Worker 
7928*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc0123 = _mm_max_ps(_mm_setzero_ps(), vx0123);
7929*4bdc9457SAndroid Build Coastguard Worker     vx0123 = _mm_min_ps(vx0123, vzero);
7930*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc4567 = _mm_max_ps(_mm_setzero_ps(), vx4567);
7931*4bdc9457SAndroid Build Coastguard Worker     vx4567 = _mm_min_ps(vx4567, vzero);
7932*4bdc9457SAndroid Build Coastguard Worker 
7933*4bdc9457SAndroid Build Coastguard Worker     vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vx0123, vslope));
7934*4bdc9457SAndroid Build Coastguard Worker     vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vx4567, vslope));
7935*4bdc9457SAndroid Build Coastguard Worker 
7936*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc0123);
7937*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vacc4567);
7938*4bdc9457SAndroid Build Coastguard Worker     y += 8;
7939*4bdc9457SAndroid Build Coastguard Worker   }
7940*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
7941*4bdc9457SAndroid Build Coastguard Worker     __m128 vx = _mm_loadu_ps(x);
7942*4bdc9457SAndroid Build Coastguard Worker     x += 4;
7943*4bdc9457SAndroid Build Coastguard Worker 
7944*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7945*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_min_ps(vx, vzero);
7946*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7947*4bdc9457SAndroid Build Coastguard Worker 
7948*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vacc);
7949*4bdc9457SAndroid Build Coastguard Worker     y += 4;
7950*4bdc9457SAndroid Build Coastguard Worker   }
7951*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7952*4bdc9457SAndroid Build Coastguard Worker     __m128 vx = _mm_loadu_ps(x);
7953*4bdc9457SAndroid Build Coastguard Worker 
7954*4bdc9457SAndroid Build Coastguard Worker     __m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
7955*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_min_ps(vx, vzero);
7956*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
7957*4bdc9457SAndroid Build Coastguard Worker 
7958*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
7959*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vacc);
7960*4bdc9457SAndroid Build Coastguard Worker       vacc = _mm_movehl_ps(vacc, vacc);
7961*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7962*4bdc9457SAndroid Build Coastguard Worker     }
7963*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
7964*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vacc);
7965*4bdc9457SAndroid Build Coastguard Worker     }
7966*4bdc9457SAndroid Build Coastguard Worker   }
7967*4bdc9457SAndroid Build Coastguard Worker }
7968*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(size_t rows,size_t channels,const float * restrict input,size_t input_stride,const float * restrict weights,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7969*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(
7970*4bdc9457SAndroid Build Coastguard Worker     size_t rows,
7971*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
7972*4bdc9457SAndroid Build Coastguard Worker     const float*restrict input,
7973*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
7974*4bdc9457SAndroid Build Coastguard Worker     const float*restrict weights,
7975*4bdc9457SAndroid Build Coastguard Worker     float*restrict output,
7976*4bdc9457SAndroid Build Coastguard Worker     size_t output_stride,
7977*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7978*4bdc9457SAndroid Build Coastguard Worker {
7979*4bdc9457SAndroid Build Coastguard Worker   assert(rows != 0);
7980*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
7981*4bdc9457SAndroid Build Coastguard Worker   assert(channels % sizeof(float) == 0);
7982*4bdc9457SAndroid Build Coastguard Worker 
7983*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = input;
7984*4bdc9457SAndroid Build Coastguard Worker   float* o0 = output;
7985*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
7986*4bdc9457SAndroid Build Coastguard Worker   float* o1 = (float*) ((uintptr_t) o0 + output_stride);
7987*4bdc9457SAndroid Build Coastguard Worker 
7988*4bdc9457SAndroid Build Coastguard Worker   const size_t input_increment = input_stride * 2 - channels;
7989*4bdc9457SAndroid Build Coastguard Worker   const size_t output_increment = output_stride * 2 - channels;
7990*4bdc9457SAndroid Build Coastguard Worker 
7991*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmin = _mm_load_ps(params->sse.min);
7992*4bdc9457SAndroid Build Coastguard Worker   const __m128 vmax = _mm_load_ps(params->sse.max);
7993*4bdc9457SAndroid Build Coastguard Worker   do {
7994*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(rows < 2) {
7995*4bdc9457SAndroid Build Coastguard Worker       i1 = i0;
7996*4bdc9457SAndroid Build Coastguard Worker       o1 = o0;
7997*4bdc9457SAndroid Build Coastguard Worker     }
7998*4bdc9457SAndroid Build Coastguard Worker 
7999*4bdc9457SAndroid Build Coastguard Worker     const float* w = weights;
8000*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
8001*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
8002*4bdc9457SAndroid Build Coastguard Worker       const __m128 vscale0123 = _mm_load_ps(w);
8003*4bdc9457SAndroid Build Coastguard Worker 
8004*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0x0123 = _mm_loadu_ps(i0);
8005*4bdc9457SAndroid Build Coastguard Worker       i0 += 4;
8006*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc1x0123 = _mm_loadu_ps(i1);
8007*4bdc9457SAndroid Build Coastguard Worker       i1 += 4;
8008*4bdc9457SAndroid Build Coastguard Worker 
8009*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8010*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8011*4bdc9457SAndroid Build Coastguard Worker 
8012*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbias0123 = _mm_load_ps(w + 4);
8013*4bdc9457SAndroid Build Coastguard Worker 
8014*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8015*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8016*4bdc9457SAndroid Build Coastguard Worker 
8017*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8018*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8019*4bdc9457SAndroid Build Coastguard Worker 
8020*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8021*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8022*4bdc9457SAndroid Build Coastguard Worker 
8023*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, vacc0x0123);
8024*4bdc9457SAndroid Build Coastguard Worker       o0 += 4;
8025*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, vacc1x0123);
8026*4bdc9457SAndroid Build Coastguard Worker       o1 += 4;
8027*4bdc9457SAndroid Build Coastguard Worker 
8028*4bdc9457SAndroid Build Coastguard Worker       w += 8;
8029*4bdc9457SAndroid Build Coastguard Worker     }
8030*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
8031*4bdc9457SAndroid Build Coastguard Worker       const __m128 vscale0123 = _mm_load_ps(w);
8032*4bdc9457SAndroid Build Coastguard Worker 
8033*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc0x0123 = _mm_loadu_ps(i0);
8034*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + c);
8035*4bdc9457SAndroid Build Coastguard Worker       __m128 vacc1x0123 = _mm_loadu_ps(i1);
8036*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + c);
8037*4bdc9457SAndroid Build Coastguard Worker 
8038*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
8039*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
8040*4bdc9457SAndroid Build Coastguard Worker 
8041*4bdc9457SAndroid Build Coastguard Worker       const __m128 vbias0123 = _mm_load_ps(w + 4);
8042*4bdc9457SAndroid Build Coastguard Worker 
8043*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
8044*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
8045*4bdc9457SAndroid Build Coastguard Worker 
8046*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
8047*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
8048*4bdc9457SAndroid Build Coastguard Worker 
8049*4bdc9457SAndroid Build Coastguard Worker       vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
8050*4bdc9457SAndroid Build Coastguard Worker       vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
8051*4bdc9457SAndroid Build Coastguard Worker 
8052*4bdc9457SAndroid Build Coastguard Worker       if (c & (2 * sizeof(float))) {
8053*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o0, vacc0x0123);
8054*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o1, vacc1x0123);
8055*4bdc9457SAndroid Build Coastguard Worker 
8056*4bdc9457SAndroid Build Coastguard Worker         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
8057*4bdc9457SAndroid Build Coastguard Worker         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
8058*4bdc9457SAndroid Build Coastguard Worker 
8059*4bdc9457SAndroid Build Coastguard Worker         o0 += 2;
8060*4bdc9457SAndroid Build Coastguard Worker         o1 += 2;
8061*4bdc9457SAndroid Build Coastguard Worker       }
8062*4bdc9457SAndroid Build Coastguard Worker       if (c & (1 * sizeof(float))) {
8063*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o0, vacc0x0123);
8064*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o1, vacc1x0123);
8065*4bdc9457SAndroid Build Coastguard Worker 
8066*4bdc9457SAndroid Build Coastguard Worker         o0 += 1;
8067*4bdc9457SAndroid Build Coastguard Worker         o1 += 1;
8068*4bdc9457SAndroid Build Coastguard Worker       }
8069*4bdc9457SAndroid Build Coastguard Worker     }
8070*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i0 + input_increment);
8071*4bdc9457SAndroid Build Coastguard Worker     o0 = (float*) ((uintptr_t) o0 + output_increment);
8072*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i1 + input_increment);
8073*4bdc9457SAndroid Build Coastguard Worker     o1 = (float*) ((uintptr_t) o1 + output_increment);
8074*4bdc9457SAndroid Build Coastguard Worker     rows = doz(rows, 2);
8075*4bdc9457SAndroid Build Coastguard Worker   } while (rows != 0);
8076*4bdc9457SAndroid Build Coastguard Worker }
8077*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsqrt_ukernel__sse_sqrt_x4(size_t n,const float * x,float * y,const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS (1)])8078*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsqrt_ukernel__sse_sqrt_x4(
8079*4bdc9457SAndroid Build Coastguard Worker     size_t n,
8080*4bdc9457SAndroid Build Coastguard Worker     const float* x,
8081*4bdc9457SAndroid Build Coastguard Worker     float* y,
8082*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8083*4bdc9457SAndroid Build Coastguard Worker {
8084*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
8085*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
8086*4bdc9457SAndroid Build Coastguard Worker 
8087*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8088*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8089*4bdc9457SAndroid Build Coastguard Worker     x += 4;
8090*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy = _mm_sqrt_ps(vx);
8091*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy);
8092*4bdc9457SAndroid Build Coastguard Worker     y += 4;
8093*4bdc9457SAndroid Build Coastguard Worker   }
8094*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
8095*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8096*4bdc9457SAndroid Build Coastguard Worker     __m128 vy = _mm_sqrt_ps(vx);
8097*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
8098*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy);
8099*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_movehl_ps(vy, vy);
8100*4bdc9457SAndroid Build Coastguard Worker       y += 2;
8101*4bdc9457SAndroid Build Coastguard Worker     }
8102*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
8103*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy);
8104*4bdc9457SAndroid Build Coastguard Worker     }
8105*4bdc9457SAndroid Build Coastguard Worker   }
8106*4bdc9457SAndroid Build Coastguard Worker }
8107*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vabs_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS (1)])8108*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vabs_ukernel__sse_x8(
8109*4bdc9457SAndroid Build Coastguard Worker     size_t n,
8110*4bdc9457SAndroid Build Coastguard Worker     const float* x,
8111*4bdc9457SAndroid Build Coastguard Worker     float* y,
8112*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8113*4bdc9457SAndroid Build Coastguard Worker {
8114*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
8115*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
8116*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
8117*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
8118*4bdc9457SAndroid Build Coastguard Worker 
8119*4bdc9457SAndroid Build Coastguard Worker   const __m128 vnonsign_mask = _mm_load_ps(params->sse.nonsign_mask);
8120*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8121*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
8122*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8123*4bdc9457SAndroid Build Coastguard Worker     x += 8;
8124*4bdc9457SAndroid Build Coastguard Worker 
8125*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy0123 = _mm_and_ps(vx0123, vnonsign_mask);
8126*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy4567 = _mm_and_ps(vx4567, vnonsign_mask);
8127*4bdc9457SAndroid Build Coastguard Worker 
8128*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
8129*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
8130*4bdc9457SAndroid Build Coastguard Worker     y += 8;
8131*4bdc9457SAndroid Build Coastguard Worker   }
8132*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8133*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8134*4bdc9457SAndroid Build Coastguard Worker     x += 4;
8135*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8136*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy);
8137*4bdc9457SAndroid Build Coastguard Worker     y += 4;
8138*4bdc9457SAndroid Build Coastguard Worker   }
8139*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
8140*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8141*4bdc9457SAndroid Build Coastguard Worker     __m128 vy = _mm_and_ps(vx, vnonsign_mask);
8142*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
8143*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy);
8144*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_movehl_ps(vy, vy);
8145*4bdc9457SAndroid Build Coastguard Worker       y += 2;
8146*4bdc9457SAndroid Build Coastguard Worker     }
8147*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
8148*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy);
8149*4bdc9457SAndroid Build Coastguard Worker     }
8150*4bdc9457SAndroid Build Coastguard Worker   }
8151*4bdc9457SAndroid Build Coastguard Worker }
8152*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vneg_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS (1)])8153*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vneg_ukernel__sse_x8(
8154*4bdc9457SAndroid Build Coastguard Worker     size_t n,
8155*4bdc9457SAndroid Build Coastguard Worker     const float* x,
8156*4bdc9457SAndroid Build Coastguard Worker     float* y,
8157*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8158*4bdc9457SAndroid Build Coastguard Worker {
8159*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
8160*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
8161*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
8162*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
8163*4bdc9457SAndroid Build Coastguard Worker 
8164*4bdc9457SAndroid Build Coastguard Worker   const __m128 vsign_mask = _mm_load_ps(params->sse.sign_mask);
8165*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8166*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
8167*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8168*4bdc9457SAndroid Build Coastguard Worker     x += 8;
8169*4bdc9457SAndroid Build Coastguard Worker 
8170*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy0123 = _mm_xor_ps(vx0123, vsign_mask);
8171*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy4567 = _mm_xor_ps(vx4567, vsign_mask);
8172*4bdc9457SAndroid Build Coastguard Worker 
8173*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
8174*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
8175*4bdc9457SAndroid Build Coastguard Worker     y += 8;
8176*4bdc9457SAndroid Build Coastguard Worker   }
8177*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8178*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8179*4bdc9457SAndroid Build Coastguard Worker     x += 4;
8180*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy = _mm_xor_ps(vx, vsign_mask);
8181*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy);
8182*4bdc9457SAndroid Build Coastguard Worker     y += 4;
8183*4bdc9457SAndroid Build Coastguard Worker   }
8184*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
8185*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8186*4bdc9457SAndroid Build Coastguard Worker     __m128 vy = _mm_xor_ps(vx, vsign_mask);
8187*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
8188*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy);
8189*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_movehl_ps(vy, vy);
8190*4bdc9457SAndroid Build Coastguard Worker       y += 2;
8191*4bdc9457SAndroid Build Coastguard Worker     }
8192*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
8193*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy);
8194*4bdc9457SAndroid Build Coastguard Worker     }
8195*4bdc9457SAndroid Build Coastguard Worker   }
8196*4bdc9457SAndroid Build Coastguard Worker }
8197*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsqr_ukernel__sse_x8(size_t n,const float * x,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])8198*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsqr_ukernel__sse_x8(
8199*4bdc9457SAndroid Build Coastguard Worker     size_t n,
8200*4bdc9457SAndroid Build Coastguard Worker     const float* x,
8201*4bdc9457SAndroid Build Coastguard Worker     float* y,
8202*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8203*4bdc9457SAndroid Build Coastguard Worker {
8204*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
8205*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
8206*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
8207*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
8208*4bdc9457SAndroid Build Coastguard Worker 
8209*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
8210*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0123 = _mm_loadu_ps(x);
8211*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx4567 = _mm_loadu_ps(x + 4);
8212*4bdc9457SAndroid Build Coastguard Worker     x += 8;
8213*4bdc9457SAndroid Build Coastguard Worker 
8214*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy0123 = _mm_mul_ps(vx0123, vx0123);
8215*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy4567 = _mm_mul_ps(vx4567, vx4567);
8216*4bdc9457SAndroid Build Coastguard Worker 
8217*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy0123);
8218*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y + 4, vy4567);
8219*4bdc9457SAndroid Build Coastguard Worker     y += 8;
8220*4bdc9457SAndroid Build Coastguard Worker   }
8221*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
8222*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8223*4bdc9457SAndroid Build Coastguard Worker     x += 4;
8224*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy = _mm_mul_ps(vx, vx);
8225*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_ps(y, vy);
8226*4bdc9457SAndroid Build Coastguard Worker     y += 4;
8227*4bdc9457SAndroid Build Coastguard Worker   }
8228*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
8229*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx = _mm_loadu_ps(x);
8230*4bdc9457SAndroid Build Coastguard Worker     __m128 vy = _mm_mul_ps(vx, vx);
8231*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
8232*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy);
8233*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_movehl_ps(vy, vy);
8234*4bdc9457SAndroid Build Coastguard Worker       y += 2;
8235*4bdc9457SAndroid Build Coastguard Worker     }
8236*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
8237*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy);
8238*4bdc9457SAndroid Build Coastguard Worker     }
8239*4bdc9457SAndroid Build Coastguard Worker   }
8240*4bdc9457SAndroid Build Coastguard Worker }
8241*4bdc9457SAndroid Build Coastguard Worker 
xnn_x32_packx_ukernel_4x__sse(size_t m,size_t k,const uint32_t * restrict x,size_t x_stride,uint32_t * restrict y)8242*4bdc9457SAndroid Build Coastguard Worker void xnn_x32_packx_ukernel_4x__sse(
8243*4bdc9457SAndroid Build Coastguard Worker     size_t m,
8244*4bdc9457SAndroid Build Coastguard Worker     size_t k,
8245*4bdc9457SAndroid Build Coastguard Worker     const uint32_t* restrict x,
8246*4bdc9457SAndroid Build Coastguard Worker     size_t x_stride,
8247*4bdc9457SAndroid Build Coastguard Worker     uint32_t* restrict y)
8248*4bdc9457SAndroid Build Coastguard Worker {
8249*4bdc9457SAndroid Build Coastguard Worker   assert(m != 0);
8250*4bdc9457SAndroid Build Coastguard Worker   assert(k != 0);
8251*4bdc9457SAndroid Build Coastguard Worker 
8252*4bdc9457SAndroid Build Coastguard Worker   const float* x0 = (const float*) x;
8253*4bdc9457SAndroid Build Coastguard Worker   const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
8254*4bdc9457SAndroid Build Coastguard Worker   if (m < 2) {
8255*4bdc9457SAndroid Build Coastguard Worker     x1 = x0;
8256*4bdc9457SAndroid Build Coastguard Worker   }
8257*4bdc9457SAndroid Build Coastguard Worker   const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
8258*4bdc9457SAndroid Build Coastguard Worker   if (m <= 2) {
8259*4bdc9457SAndroid Build Coastguard Worker     x2 = x1;
8260*4bdc9457SAndroid Build Coastguard Worker   }
8261*4bdc9457SAndroid Build Coastguard Worker   const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
8262*4bdc9457SAndroid Build Coastguard Worker   if (m != 4) {
8263*4bdc9457SAndroid Build Coastguard Worker     x3 = x2;
8264*4bdc9457SAndroid Build Coastguard Worker   }
8265*4bdc9457SAndroid Build Coastguard Worker 
8266*4bdc9457SAndroid Build Coastguard Worker   float*restrict y_f32 = (float*) y;
8267*4bdc9457SAndroid Build Coastguard Worker 
8268*4bdc9457SAndroid Build Coastguard Worker   for (; k >= 4; k -= 4) {
8269*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx0 = _mm_loadu_ps(x0);
8270*4bdc9457SAndroid Build Coastguard Worker     x0 += 4;
8271*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx1 = _mm_loadu_ps(x1);
8272*4bdc9457SAndroid Build Coastguard Worker     x1 += 4;
8273*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx2 = _mm_loadu_ps(x2);
8274*4bdc9457SAndroid Build Coastguard Worker     x2 += 4;
8275*4bdc9457SAndroid Build Coastguard Worker     const __m128 vx3 = _mm_loadu_ps(x3);
8276*4bdc9457SAndroid Build Coastguard Worker     x3 += 4;
8277*4bdc9457SAndroid Build Coastguard Worker 
8278*4bdc9457SAndroid Build Coastguard Worker     const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
8279*4bdc9457SAndroid Build Coastguard Worker     const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
8280*4bdc9457SAndroid Build Coastguard Worker     const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
8281*4bdc9457SAndroid Build Coastguard Worker     const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
8282*4bdc9457SAndroid Build Coastguard Worker 
8283*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
8284*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ps(y_f32, vy0);
8285*4bdc9457SAndroid Build Coastguard Worker 
8286*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
8287*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ps(y_f32 + 4, vy1);
8288*4bdc9457SAndroid Build Coastguard Worker 
8289*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
8290*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ps(y_f32 + 8, vy2);
8291*4bdc9457SAndroid Build Coastguard Worker 
8292*4bdc9457SAndroid Build Coastguard Worker     const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
8293*4bdc9457SAndroid Build Coastguard Worker     _mm_store_ps(y_f32 + 12, vy3);
8294*4bdc9457SAndroid Build Coastguard Worker 
8295*4bdc9457SAndroid Build Coastguard Worker     y_f32 += 16;
8296*4bdc9457SAndroid Build Coastguard Worker   }
8297*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(k != 0) {
8298*4bdc9457SAndroid Build Coastguard Worker     do {
8299*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx0 = _mm_load_ss(x0);
8300*4bdc9457SAndroid Build Coastguard Worker       x0 += 1;
8301*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx1 = _mm_load_ss(x1);
8302*4bdc9457SAndroid Build Coastguard Worker       x1 += 1;
8303*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx2 = _mm_load_ss(x2);
8304*4bdc9457SAndroid Build Coastguard Worker       x2 += 1;
8305*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx3 = _mm_load_ss(x3);
8306*4bdc9457SAndroid Build Coastguard Worker       x3 += 1;
8307*4bdc9457SAndroid Build Coastguard Worker 
8308*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
8309*4bdc9457SAndroid Build Coastguard Worker       const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
8310*4bdc9457SAndroid Build Coastguard Worker       const __m128 vy = _mm_movelh_ps(vx01, vx23);
8311*4bdc9457SAndroid Build Coastguard Worker 
8312*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ps(y_f32, vy);
8313*4bdc9457SAndroid Build Coastguard Worker       y_f32 += 4;
8314*4bdc9457SAndroid Build Coastguard Worker     } while (--k != 0);
8315*4bdc9457SAndroid Build Coastguard Worker   }
8316*4bdc9457SAndroid Build Coastguard Worker }
8317*4bdc9457SAndroid Build Coastguard Worker 
xnn_x32_transposec_ukernel__4x4_sse(const uint32_t * input,uint32_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)8318*4bdc9457SAndroid Build Coastguard Worker void xnn_x32_transposec_ukernel__4x4_sse(
8319*4bdc9457SAndroid Build Coastguard Worker     const uint32_t* input,
8320*4bdc9457SAndroid Build Coastguard Worker     uint32_t* output,
8321*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
8322*4bdc9457SAndroid Build Coastguard Worker     size_t output_stride,
8323*4bdc9457SAndroid Build Coastguard Worker     size_t block_width,
8324*4bdc9457SAndroid Build Coastguard Worker     size_t block_height) XNN_OOB_READS
8325*4bdc9457SAndroid Build Coastguard Worker {
8326*4bdc9457SAndroid Build Coastguard Worker   assert(output_stride >= block_height * sizeof(uint32_t));
8327*4bdc9457SAndroid Build Coastguard Worker   assert(input_stride >= block_width * sizeof(uint32_t));
8328*4bdc9457SAndroid Build Coastguard Worker 
8329*4bdc9457SAndroid Build Coastguard Worker   const size_t tile_height = 4;
8330*4bdc9457SAndroid Build Coastguard Worker   const size_t tile_width = 4;
8331*4bdc9457SAndroid Build Coastguard Worker   const size_t tile_wbytes = tile_width * sizeof(float);
8332*4bdc9457SAndroid Build Coastguard Worker   const size_t input_vreset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
8333*4bdc9457SAndroid Build Coastguard Worker   const size_t output_vreset = tile_height * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
8334*4bdc9457SAndroid Build Coastguard Worker   const size_t input_offset = tile_height * input_stride;
8335*4bdc9457SAndroid Build Coastguard Worker 
8336*4bdc9457SAndroid Build Coastguard Worker   const float* i0 = (const float*) input;
8337*4bdc9457SAndroid Build Coastguard Worker   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
8338*4bdc9457SAndroid Build Coastguard Worker   const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
8339*4bdc9457SAndroid Build Coastguard Worker   const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
8340*4bdc9457SAndroid Build Coastguard Worker 
8341*4bdc9457SAndroid Build Coastguard Worker   float* o0 = (float*) output;
8342*4bdc9457SAndroid Build Coastguard Worker   float* o1 = (float*) ((uintptr_t) o0 + output_stride);
8343*4bdc9457SAndroid Build Coastguard Worker   float* o2 = (float*) ((uintptr_t) o1 + output_stride);
8344*4bdc9457SAndroid Build Coastguard Worker   float* o3 = (float*) ((uintptr_t) o2 + output_stride);
8345*4bdc9457SAndroid Build Coastguard Worker 
8346*4bdc9457SAndroid Build Coastguard Worker   do {
8347*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(block_width < 2) {
8348*4bdc9457SAndroid Build Coastguard Worker       o1 = o0;
8349*4bdc9457SAndroid Build Coastguard Worker     }
8350*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(block_width <= 2) {
8351*4bdc9457SAndroid Build Coastguard Worker       o2 = o0;
8352*4bdc9457SAndroid Build Coastguard Worker     }
8353*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(block_width < 4) {
8354*4bdc9457SAndroid Build Coastguard Worker       o3 = o0;
8355*4bdc9457SAndroid Build Coastguard Worker     }
8356*4bdc9457SAndroid Build Coastguard Worker     size_t bh = block_height;
8357*4bdc9457SAndroid Build Coastguard Worker     for (; bh >= 4; bh -= 4) {
8358*4bdc9457SAndroid Build Coastguard Worker       __m128 v0 = _mm_loadu_ps(i0);
8359*4bdc9457SAndroid Build Coastguard Worker       i0 = (const float*) ((uintptr_t) i0 + input_offset);
8360*4bdc9457SAndroid Build Coastguard Worker       __m128 v1 = _mm_loadu_ps(i1);
8361*4bdc9457SAndroid Build Coastguard Worker       i1 = (const float*) ((uintptr_t) i1 + input_offset);
8362*4bdc9457SAndroid Build Coastguard Worker       __m128 v2 = _mm_loadu_ps(i2);
8363*4bdc9457SAndroid Build Coastguard Worker       i2 = (const float*) ((uintptr_t) i2 + input_offset);
8364*4bdc9457SAndroid Build Coastguard Worker       __m128 v3 = _mm_loadu_ps(i3);
8365*4bdc9457SAndroid Build Coastguard Worker       i3 = (const float*) ((uintptr_t) i3 + input_offset);
8366*4bdc9457SAndroid Build Coastguard Worker 
8367*4bdc9457SAndroid Build Coastguard Worker       _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8368*4bdc9457SAndroid Build Coastguard Worker 
8369*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o3, v3);
8370*4bdc9457SAndroid Build Coastguard Worker       o3 = (float*) ((uintptr_t) o3 + tile_wbytes);
8371*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o2, v2);
8372*4bdc9457SAndroid Build Coastguard Worker       o2 = (float*) ((uintptr_t) o2 + tile_wbytes);
8373*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o1, v1);
8374*4bdc9457SAndroid Build Coastguard Worker       o1 = (float*) ((uintptr_t) o1 + tile_wbytes);
8375*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(o0, v0);
8376*4bdc9457SAndroid Build Coastguard Worker       o0 = (float*) ((uintptr_t) o0 + tile_wbytes);
8377*4bdc9457SAndroid Build Coastguard Worker     }
8378*4bdc9457SAndroid Build Coastguard Worker 
8379*4bdc9457SAndroid Build Coastguard Worker     if (bh != 0) {
8380*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(bh <= 2) {
8381*4bdc9457SAndroid Build Coastguard Worker         i2 = i0;
8382*4bdc9457SAndroid Build Coastguard Worker       }
8383*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(bh < 2) {
8384*4bdc9457SAndroid Build Coastguard Worker         i1 = i0;
8385*4bdc9457SAndroid Build Coastguard Worker       }
8386*4bdc9457SAndroid Build Coastguard Worker       __m128 v0 = _mm_loadu_ps(i0);
8387*4bdc9457SAndroid Build Coastguard Worker       __m128 v1 = _mm_loadu_ps(i1);
8388*4bdc9457SAndroid Build Coastguard Worker       __m128 v2 = _mm_loadu_ps(i2);
8389*4bdc9457SAndroid Build Coastguard Worker       __m128 v3 = _mm_setzero_ps();
8390*4bdc9457SAndroid Build Coastguard Worker 
8391*4bdc9457SAndroid Build Coastguard Worker       _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
8392*4bdc9457SAndroid Build Coastguard Worker 
8393*4bdc9457SAndroid Build Coastguard Worker       if (bh & 2) {
8394*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o3, v3);
8395*4bdc9457SAndroid Build Coastguard Worker         o3 += 2;
8396*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o2, v2);
8397*4bdc9457SAndroid Build Coastguard Worker         o2 += 2;
8398*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o1, v1);
8399*4bdc9457SAndroid Build Coastguard Worker         o1 += 2;
8400*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_pi((__m64*) o0, v0);
8401*4bdc9457SAndroid Build Coastguard Worker         o0 += 2;
8402*4bdc9457SAndroid Build Coastguard Worker         v0 = _mm_movehl_ps(v0, v0);
8403*4bdc9457SAndroid Build Coastguard Worker         v1 = _mm_movehl_ps(v1, v1);
8404*4bdc9457SAndroid Build Coastguard Worker         v2 = _mm_movehl_ps(v2, v2);
8405*4bdc9457SAndroid Build Coastguard Worker         v3 = _mm_movehl_ps(v3, v3);
8406*4bdc9457SAndroid Build Coastguard Worker       }
8407*4bdc9457SAndroid Build Coastguard Worker       if (bh & 1) {
8408*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o3, v3);
8409*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o2, v2);
8410*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o1, v1);
8411*4bdc9457SAndroid Build Coastguard Worker         _mm_store_ss(o0, v0);
8412*4bdc9457SAndroid Build Coastguard Worker       }
8413*4bdc9457SAndroid Build Coastguard Worker     }
8414*4bdc9457SAndroid Build Coastguard Worker     i0 = (const float*) ((uintptr_t) i0 + input_vreset);
8415*4bdc9457SAndroid Build Coastguard Worker     i1 = (const float*) ((uintptr_t) i0 + input_stride);
8416*4bdc9457SAndroid Build Coastguard Worker     i2 = (const float*) ((uintptr_t) i1 + input_stride);
8417*4bdc9457SAndroid Build Coastguard Worker     i3 = (const float*) ((uintptr_t) i2 + input_stride);
8418*4bdc9457SAndroid Build Coastguard Worker     o0 = (float*) ((uintptr_t) o0 + output_vreset);
8419*4bdc9457SAndroid Build Coastguard Worker     o1 = (float*) ((uintptr_t) o1 + output_vreset);
8420*4bdc9457SAndroid Build Coastguard Worker     o2 = (float*) ((uintptr_t) o2 + output_vreset);
8421*4bdc9457SAndroid Build Coastguard Worker     o3 = (float*) ((uintptr_t) o3 + output_vreset);
8422*4bdc9457SAndroid Build Coastguard Worker     block_width = doz(block_width, tile_width);
8423*4bdc9457SAndroid Build Coastguard Worker   } while (block_width != 0);
8424*4bdc9457SAndroid Build Coastguard Worker }
8425