1*4bdc9457SAndroid Build Coastguard Worker // Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker //
3*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
6*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
7*4bdc9457SAndroid Build Coastguard Worker
8*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
9*4bdc9457SAndroid Build Coastguard Worker
10*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/dwconv.h>
12*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/gemm.h>
13*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/ibilinear.h>
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/igemm.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/intrinsics-polyfill.h>
16*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
17*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vmulcaddc.h>
18*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vunary.h>
19*4bdc9457SAndroid Build Coastguard Worker
20*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_dwconv_minmax_ukernel_up16x3__fma3(size_t channels,size_t output_width,const void ** input,const void * weights,void * output,size_t input_stride,size_t output_increment,size_t input_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])21*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_dwconv_minmax_ukernel_up16x3__fma3(
22*4bdc9457SAndroid Build Coastguard Worker size_t channels,
23*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
24*4bdc9457SAndroid Build Coastguard Worker const void** input,
25*4bdc9457SAndroid Build Coastguard Worker const void* weights,
26*4bdc9457SAndroid Build Coastguard Worker void* output,
27*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
28*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
29*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
30*4bdc9457SAndroid Build Coastguard Worker const void* zero,
31*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
32*4bdc9457SAndroid Build Coastguard Worker {
33*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
34*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
35*4bdc9457SAndroid Build Coastguard Worker
36*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
37*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
38*4bdc9457SAndroid Build Coastguard Worker
39*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
40*4bdc9457SAndroid Build Coastguard Worker do {
41*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input[0];
42*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
43*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
44*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
45*4bdc9457SAndroid Build Coastguard Worker }
46*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = input[1];
47*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
48*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
49*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
50*4bdc9457SAndroid Build Coastguard Worker }
51*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = input[2];
52*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
53*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
54*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
55*4bdc9457SAndroid Build Coastguard Worker }
56*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_stride);
57*4bdc9457SAndroid Build Coastguard Worker
58*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
59*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = weights;
60*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
61*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
62*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
63*4bdc9457SAndroid Build Coastguard Worker
64*4bdc9457SAndroid Build Coastguard Worker
65*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
66*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
67*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
68*4bdc9457SAndroid Build Coastguard Worker
69*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
70*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
71*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
72*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
73*4bdc9457SAndroid Build Coastguard Worker
74*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
75*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
76*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
77*4bdc9457SAndroid Build Coastguard Worker
78*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
79*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
80*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
81*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
82*4bdc9457SAndroid Build Coastguard Worker
83*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
84*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
85*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
86*4bdc9457SAndroid Build Coastguard Worker
87*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
88*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
89*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
90*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
91*4bdc9457SAndroid Build Coastguard Worker
92*4bdc9457SAndroid Build Coastguard Worker w += 64;
93*4bdc9457SAndroid Build Coastguard Worker
94*4bdc9457SAndroid Build Coastguard Worker
95*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
96*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
97*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
98*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
99*4bdc9457SAndroid Build Coastguard Worker
100*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
101*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
102*4bdc9457SAndroid Build Coastguard Worker o += 16;
103*4bdc9457SAndroid Build Coastguard Worker }
104*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
105*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
106*4bdc9457SAndroid Build Coastguard Worker
107*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
108*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
109*4bdc9457SAndroid Build Coastguard Worker
110*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
111*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
112*4bdc9457SAndroid Build Coastguard Worker
113*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
114*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
115*4bdc9457SAndroid Build Coastguard Worker
116*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
117*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
118*4bdc9457SAndroid Build Coastguard Worker
119*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
120*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
121*4bdc9457SAndroid Build Coastguard Worker
122*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
123*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
124*4bdc9457SAndroid Build Coastguard Worker
125*4bdc9457SAndroid Build Coastguard Worker w += 8;
126*4bdc9457SAndroid Build Coastguard Worker
127*4bdc9457SAndroid Build Coastguard Worker
128*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
129*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
130*4bdc9457SAndroid Build Coastguard Worker
131*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
132*4bdc9457SAndroid Build Coastguard Worker o += 8;
133*4bdc9457SAndroid Build Coastguard Worker }
134*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
135*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
136*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
137*4bdc9457SAndroid Build Coastguard Worker
138*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
139*4bdc9457SAndroid Build Coastguard Worker
140*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
141*4bdc9457SAndroid Build Coastguard Worker
142*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
143*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
144*4bdc9457SAndroid Build Coastguard Worker
145*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
146*4bdc9457SAndroid Build Coastguard Worker
147*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
148*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
149*4bdc9457SAndroid Build Coastguard Worker
150*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
151*4bdc9457SAndroid Build Coastguard Worker
152*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
153*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
154*4bdc9457SAndroid Build Coastguard Worker
155*4bdc9457SAndroid Build Coastguard Worker
156*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
157*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
158*4bdc9457SAndroid Build Coastguard Worker
159*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
160*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
161*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
162*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
163*4bdc9457SAndroid Build Coastguard Worker o += 4;
164*4bdc9457SAndroid Build Coastguard Worker }
165*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
166*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
167*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
168*4bdc9457SAndroid Build Coastguard Worker o += 2;
169*4bdc9457SAndroid Build Coastguard Worker }
170*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
171*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
172*4bdc9457SAndroid Build Coastguard Worker o += 1;
173*4bdc9457SAndroid Build Coastguard Worker }
174*4bdc9457SAndroid Build Coastguard Worker }
175*4bdc9457SAndroid Build Coastguard Worker
176*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
177*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
178*4bdc9457SAndroid Build Coastguard Worker }
179*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_dwconv_minmax_ukernel_up16x4__fma3(size_t channels,size_t output_width,const void ** input,const void * weights,void * output,size_t input_stride,size_t output_increment,size_t input_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])180*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_dwconv_minmax_ukernel_up16x4__fma3(
181*4bdc9457SAndroid Build Coastguard Worker size_t channels,
182*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
183*4bdc9457SAndroid Build Coastguard Worker const void** input,
184*4bdc9457SAndroid Build Coastguard Worker const void* weights,
185*4bdc9457SAndroid Build Coastguard Worker void* output,
186*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
187*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
188*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
189*4bdc9457SAndroid Build Coastguard Worker const void* zero,
190*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
191*4bdc9457SAndroid Build Coastguard Worker {
192*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
193*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
194*4bdc9457SAndroid Build Coastguard Worker
195*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
196*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
197*4bdc9457SAndroid Build Coastguard Worker
198*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
199*4bdc9457SAndroid Build Coastguard Worker do {
200*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input[0];
201*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
202*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
203*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
204*4bdc9457SAndroid Build Coastguard Worker }
205*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = input[1];
206*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
207*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
208*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
209*4bdc9457SAndroid Build Coastguard Worker }
210*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = input[2];
211*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
212*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
213*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
214*4bdc9457SAndroid Build Coastguard Worker }
215*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = input[3];
216*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
217*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
218*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
219*4bdc9457SAndroid Build Coastguard Worker }
220*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_stride);
221*4bdc9457SAndroid Build Coastguard Worker
222*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
223*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = weights;
224*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
225*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
226*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
227*4bdc9457SAndroid Build Coastguard Worker
228*4bdc9457SAndroid Build Coastguard Worker
229*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
230*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
231*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
232*4bdc9457SAndroid Build Coastguard Worker
233*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
234*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
235*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
236*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
237*4bdc9457SAndroid Build Coastguard Worker
238*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
239*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
240*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
241*4bdc9457SAndroid Build Coastguard Worker
242*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
243*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
244*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
245*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
246*4bdc9457SAndroid Build Coastguard Worker
247*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
248*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
249*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
250*4bdc9457SAndroid Build Coastguard Worker
251*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
252*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
253*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
254*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
255*4bdc9457SAndroid Build Coastguard Worker
256*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
257*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
258*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
259*4bdc9457SAndroid Build Coastguard Worker
260*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
261*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
262*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
263*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
264*4bdc9457SAndroid Build Coastguard Worker
265*4bdc9457SAndroid Build Coastguard Worker w += 80;
266*4bdc9457SAndroid Build Coastguard Worker
267*4bdc9457SAndroid Build Coastguard Worker
268*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
269*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
270*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
271*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
272*4bdc9457SAndroid Build Coastguard Worker
273*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
274*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
275*4bdc9457SAndroid Build Coastguard Worker o += 16;
276*4bdc9457SAndroid Build Coastguard Worker }
277*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
278*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
279*4bdc9457SAndroid Build Coastguard Worker
280*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
281*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
282*4bdc9457SAndroid Build Coastguard Worker
283*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
284*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
285*4bdc9457SAndroid Build Coastguard Worker
286*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
287*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
288*4bdc9457SAndroid Build Coastguard Worker
289*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
290*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
291*4bdc9457SAndroid Build Coastguard Worker
292*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
293*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
294*4bdc9457SAndroid Build Coastguard Worker
295*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
296*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
297*4bdc9457SAndroid Build Coastguard Worker
298*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
299*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
300*4bdc9457SAndroid Build Coastguard Worker
301*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
302*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
303*4bdc9457SAndroid Build Coastguard Worker
304*4bdc9457SAndroid Build Coastguard Worker w += 8;
305*4bdc9457SAndroid Build Coastguard Worker
306*4bdc9457SAndroid Build Coastguard Worker
307*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
308*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
309*4bdc9457SAndroid Build Coastguard Worker
310*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
311*4bdc9457SAndroid Build Coastguard Worker o += 8;
312*4bdc9457SAndroid Build Coastguard Worker }
313*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
314*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
315*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
316*4bdc9457SAndroid Build Coastguard Worker
317*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
318*4bdc9457SAndroid Build Coastguard Worker
319*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
320*4bdc9457SAndroid Build Coastguard Worker
321*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
322*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
323*4bdc9457SAndroid Build Coastguard Worker
324*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
325*4bdc9457SAndroid Build Coastguard Worker
326*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
327*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
328*4bdc9457SAndroid Build Coastguard Worker
329*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
330*4bdc9457SAndroid Build Coastguard Worker
331*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
332*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
333*4bdc9457SAndroid Build Coastguard Worker
334*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
335*4bdc9457SAndroid Build Coastguard Worker
336*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
337*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
338*4bdc9457SAndroid Build Coastguard Worker
339*4bdc9457SAndroid Build Coastguard Worker
340*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
341*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
342*4bdc9457SAndroid Build Coastguard Worker
343*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
344*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
345*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
346*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
347*4bdc9457SAndroid Build Coastguard Worker o += 4;
348*4bdc9457SAndroid Build Coastguard Worker }
349*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
350*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
351*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
352*4bdc9457SAndroid Build Coastguard Worker o += 2;
353*4bdc9457SAndroid Build Coastguard Worker }
354*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
355*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
356*4bdc9457SAndroid Build Coastguard Worker o += 1;
357*4bdc9457SAndroid Build Coastguard Worker }
358*4bdc9457SAndroid Build Coastguard Worker }
359*4bdc9457SAndroid Build Coastguard Worker
360*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
361*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
362*4bdc9457SAndroid Build Coastguard Worker }
363*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_dwconv_minmax_ukernel_up16x9__fma3(size_t channels,size_t output_width,const void ** input,const void * weights,void * output,size_t input_stride,size_t output_increment,size_t input_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])364*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_dwconv_minmax_ukernel_up16x9__fma3(
365*4bdc9457SAndroid Build Coastguard Worker size_t channels,
366*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
367*4bdc9457SAndroid Build Coastguard Worker const void** input,
368*4bdc9457SAndroid Build Coastguard Worker const void* weights,
369*4bdc9457SAndroid Build Coastguard Worker void* output,
370*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
371*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
372*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
373*4bdc9457SAndroid Build Coastguard Worker const void* zero,
374*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
375*4bdc9457SAndroid Build Coastguard Worker {
376*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
377*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
378*4bdc9457SAndroid Build Coastguard Worker
379*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
380*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
381*4bdc9457SAndroid Build Coastguard Worker
382*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
383*4bdc9457SAndroid Build Coastguard Worker do {
384*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input[0];
385*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
386*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
387*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
388*4bdc9457SAndroid Build Coastguard Worker }
389*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = input[1];
390*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
391*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
392*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
393*4bdc9457SAndroid Build Coastguard Worker }
394*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = input[2];
395*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
396*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
397*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
398*4bdc9457SAndroid Build Coastguard Worker }
399*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = input[3];
400*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
401*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
402*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
403*4bdc9457SAndroid Build Coastguard Worker }
404*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = input[4];
405*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
406*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
407*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
408*4bdc9457SAndroid Build Coastguard Worker }
409*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = input[5];
410*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
411*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
412*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
413*4bdc9457SAndroid Build Coastguard Worker }
414*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = input[6];
415*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
416*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
417*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
418*4bdc9457SAndroid Build Coastguard Worker }
419*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = input[7];
420*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
421*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
422*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
423*4bdc9457SAndroid Build Coastguard Worker }
424*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i8 = input[8];
425*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
426*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
427*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
428*4bdc9457SAndroid Build Coastguard Worker }
429*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_stride);
430*4bdc9457SAndroid Build Coastguard Worker
431*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
432*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = weights;
433*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
434*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
435*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
436*4bdc9457SAndroid Build Coastguard Worker
437*4bdc9457SAndroid Build Coastguard Worker
438*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
439*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
440*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
441*4bdc9457SAndroid Build Coastguard Worker
442*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
443*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
444*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
445*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
446*4bdc9457SAndroid Build Coastguard Worker
447*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
448*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
449*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
450*4bdc9457SAndroid Build Coastguard Worker
451*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
452*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
453*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
454*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
455*4bdc9457SAndroid Build Coastguard Worker
456*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
457*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
458*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
459*4bdc9457SAndroid Build Coastguard Worker
460*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
461*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
462*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
463*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
464*4bdc9457SAndroid Build Coastguard Worker
465*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
466*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
467*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
468*4bdc9457SAndroid Build Coastguard Worker
469*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
470*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
471*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
472*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
473*4bdc9457SAndroid Build Coastguard Worker
474*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
475*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8)));
476*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
477*4bdc9457SAndroid Build Coastguard Worker
478*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
479*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
480*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
481*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x89ABCDEF, vk4x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
482*4bdc9457SAndroid Build Coastguard Worker
483*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
484*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8)));
485*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
486*4bdc9457SAndroid Build Coastguard Worker
487*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
488*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
489*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
490*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x89ABCDEF, vk5x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
491*4bdc9457SAndroid Build Coastguard Worker
492*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
493*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8)));
494*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
495*4bdc9457SAndroid Build Coastguard Worker
496*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
497*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
498*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
499*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x89ABCDEF, vk6x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
500*4bdc9457SAndroid Build Coastguard Worker
501*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
502*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i7 + 8)));
503*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
504*4bdc9457SAndroid Build Coastguard Worker
505*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
506*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
507*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
508*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x89ABCDEF, vk7x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
509*4bdc9457SAndroid Build Coastguard Worker
510*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
511*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i8 + 8)));
512*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
513*4bdc9457SAndroid Build Coastguard Worker
514*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
515*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
516*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
517*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x89ABCDEF, vk8x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
518*4bdc9457SAndroid Build Coastguard Worker
519*4bdc9457SAndroid Build Coastguard Worker w += 160;
520*4bdc9457SAndroid Build Coastguard Worker
521*4bdc9457SAndroid Build Coastguard Worker
522*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
523*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
524*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
525*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
526*4bdc9457SAndroid Build Coastguard Worker
527*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
528*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
529*4bdc9457SAndroid Build Coastguard Worker o += 16;
530*4bdc9457SAndroid Build Coastguard Worker }
531*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
532*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
533*4bdc9457SAndroid Build Coastguard Worker
534*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
535*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
536*4bdc9457SAndroid Build Coastguard Worker
537*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
538*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
539*4bdc9457SAndroid Build Coastguard Worker
540*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
541*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
542*4bdc9457SAndroid Build Coastguard Worker
543*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
544*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
545*4bdc9457SAndroid Build Coastguard Worker
546*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
547*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
548*4bdc9457SAndroid Build Coastguard Worker
549*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
550*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
551*4bdc9457SAndroid Build Coastguard Worker
552*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
553*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
554*4bdc9457SAndroid Build Coastguard Worker
555*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
556*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
557*4bdc9457SAndroid Build Coastguard Worker
558*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
559*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
560*4bdc9457SAndroid Build Coastguard Worker
561*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
562*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
563*4bdc9457SAndroid Build Coastguard Worker
564*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
565*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
566*4bdc9457SAndroid Build Coastguard Worker
567*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
568*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
569*4bdc9457SAndroid Build Coastguard Worker
570*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
571*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
572*4bdc9457SAndroid Build Coastguard Worker
573*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
574*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
575*4bdc9457SAndroid Build Coastguard Worker
576*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
577*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
578*4bdc9457SAndroid Build Coastguard Worker
579*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
580*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
581*4bdc9457SAndroid Build Coastguard Worker
582*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
583*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
584*4bdc9457SAndroid Build Coastguard Worker
585*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
586*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
587*4bdc9457SAndroid Build Coastguard Worker
588*4bdc9457SAndroid Build Coastguard Worker w += 8;
589*4bdc9457SAndroid Build Coastguard Worker
590*4bdc9457SAndroid Build Coastguard Worker
591*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
592*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
593*4bdc9457SAndroid Build Coastguard Worker
594*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
595*4bdc9457SAndroid Build Coastguard Worker o += 8;
596*4bdc9457SAndroid Build Coastguard Worker }
597*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
598*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
599*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
600*4bdc9457SAndroid Build Coastguard Worker
601*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
602*4bdc9457SAndroid Build Coastguard Worker
603*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
604*4bdc9457SAndroid Build Coastguard Worker
605*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
606*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
607*4bdc9457SAndroid Build Coastguard Worker
608*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
609*4bdc9457SAndroid Build Coastguard Worker
610*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
611*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
612*4bdc9457SAndroid Build Coastguard Worker
613*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
614*4bdc9457SAndroid Build Coastguard Worker
615*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
616*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
617*4bdc9457SAndroid Build Coastguard Worker
618*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
619*4bdc9457SAndroid Build Coastguard Worker
620*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
621*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
622*4bdc9457SAndroid Build Coastguard Worker
623*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
624*4bdc9457SAndroid Build Coastguard Worker
625*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
626*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
627*4bdc9457SAndroid Build Coastguard Worker
628*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
629*4bdc9457SAndroid Build Coastguard Worker
630*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
631*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
632*4bdc9457SAndroid Build Coastguard Worker
633*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
634*4bdc9457SAndroid Build Coastguard Worker
635*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
636*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
637*4bdc9457SAndroid Build Coastguard Worker
638*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
639*4bdc9457SAndroid Build Coastguard Worker
640*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
641*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
642*4bdc9457SAndroid Build Coastguard Worker
643*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
644*4bdc9457SAndroid Build Coastguard Worker
645*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
646*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
647*4bdc9457SAndroid Build Coastguard Worker
648*4bdc9457SAndroid Build Coastguard Worker
649*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
650*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
651*4bdc9457SAndroid Build Coastguard Worker
652*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
653*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
654*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
655*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
656*4bdc9457SAndroid Build Coastguard Worker o += 4;
657*4bdc9457SAndroid Build Coastguard Worker }
658*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
659*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
660*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
661*4bdc9457SAndroid Build Coastguard Worker o += 2;
662*4bdc9457SAndroid Build Coastguard Worker }
663*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
664*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
665*4bdc9457SAndroid Build Coastguard Worker o += 1;
666*4bdc9457SAndroid Build Coastguard Worker }
667*4bdc9457SAndroid Build Coastguard Worker }
668*4bdc9457SAndroid Build Coastguard Worker
669*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
670*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
671*4bdc9457SAndroid Build Coastguard Worker }
672*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2(size_t channels,size_t output_width,const void ** input,const void * weights,void * output,size_t input_stride,size_t output_increment,size_t input_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])673*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2(
674*4bdc9457SAndroid Build Coastguard Worker size_t channels,
675*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
676*4bdc9457SAndroid Build Coastguard Worker const void** input,
677*4bdc9457SAndroid Build Coastguard Worker const void* weights,
678*4bdc9457SAndroid Build Coastguard Worker void* output,
679*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
680*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
681*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
682*4bdc9457SAndroid Build Coastguard Worker const void* zero,
683*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
684*4bdc9457SAndroid Build Coastguard Worker {
685*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
686*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
687*4bdc9457SAndroid Build Coastguard Worker
688*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
689*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
690*4bdc9457SAndroid Build Coastguard Worker
691*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
692*4bdc9457SAndroid Build Coastguard Worker do {
693*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input[0];
694*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
695*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
696*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
697*4bdc9457SAndroid Build Coastguard Worker }
698*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = input[1];
699*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
700*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
701*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
702*4bdc9457SAndroid Build Coastguard Worker }
703*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = input[2];
704*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
705*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
706*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
707*4bdc9457SAndroid Build Coastguard Worker }
708*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = input[3];
709*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
710*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
711*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
712*4bdc9457SAndroid Build Coastguard Worker }
713*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = input[4];
714*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
715*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
716*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
717*4bdc9457SAndroid Build Coastguard Worker }
718*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = input[5];
719*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
720*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
721*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
722*4bdc9457SAndroid Build Coastguard Worker }
723*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = input[6];
724*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
725*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
726*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
727*4bdc9457SAndroid Build Coastguard Worker }
728*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = input[7];
729*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
730*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
731*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
732*4bdc9457SAndroid Build Coastguard Worker }
733*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i8 = input[8];
734*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
735*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
736*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
737*4bdc9457SAndroid Build Coastguard Worker }
738*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i9 = input[9];
739*4bdc9457SAndroid Build Coastguard Worker assert(i9 != NULL);
740*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i9 != zero) {
741*4bdc9457SAndroid Build Coastguard Worker i9 = (const uint16_t*) ((uintptr_t) i9 + input_offset);
742*4bdc9457SAndroid Build Coastguard Worker }
743*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i10 = input[10];
744*4bdc9457SAndroid Build Coastguard Worker assert(i10 != NULL);
745*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i10 != zero) {
746*4bdc9457SAndroid Build Coastguard Worker i10 = (const uint16_t*) ((uintptr_t) i10 + input_offset);
747*4bdc9457SAndroid Build Coastguard Worker }
748*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i11 = input[11];
749*4bdc9457SAndroid Build Coastguard Worker assert(i11 != NULL);
750*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i11 != zero) {
751*4bdc9457SAndroid Build Coastguard Worker i11 = (const uint16_t*) ((uintptr_t) i11 + input_offset);
752*4bdc9457SAndroid Build Coastguard Worker }
753*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i12 = input[12];
754*4bdc9457SAndroid Build Coastguard Worker assert(i12 != NULL);
755*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i12 != zero) {
756*4bdc9457SAndroid Build Coastguard Worker i12 = (const uint16_t*) ((uintptr_t) i12 + input_offset);
757*4bdc9457SAndroid Build Coastguard Worker }
758*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i13 = input[13];
759*4bdc9457SAndroid Build Coastguard Worker assert(i13 != NULL);
760*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i13 != zero) {
761*4bdc9457SAndroid Build Coastguard Worker i13 = (const uint16_t*) ((uintptr_t) i13 + input_offset);
762*4bdc9457SAndroid Build Coastguard Worker }
763*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i14 = input[14];
764*4bdc9457SAndroid Build Coastguard Worker assert(i14 != NULL);
765*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i14 != zero) {
766*4bdc9457SAndroid Build Coastguard Worker i14 = (const uint16_t*) ((uintptr_t) i14 + input_offset);
767*4bdc9457SAndroid Build Coastguard Worker }
768*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i15 = input[15];
769*4bdc9457SAndroid Build Coastguard Worker assert(i15 != NULL);
770*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i15 != zero) {
771*4bdc9457SAndroid Build Coastguard Worker i15 = (const uint16_t*) ((uintptr_t) i15 + input_offset);
772*4bdc9457SAndroid Build Coastguard Worker }
773*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i16 = input[16];
774*4bdc9457SAndroid Build Coastguard Worker assert(i16 != NULL);
775*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i16 != zero) {
776*4bdc9457SAndroid Build Coastguard Worker i16 = (const uint16_t*) ((uintptr_t) i16 + input_offset);
777*4bdc9457SAndroid Build Coastguard Worker }
778*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i17 = input[17];
779*4bdc9457SAndroid Build Coastguard Worker assert(i17 != NULL);
780*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i17 != zero) {
781*4bdc9457SAndroid Build Coastguard Worker i17 = (const uint16_t*) ((uintptr_t) i17 + input_offset);
782*4bdc9457SAndroid Build Coastguard Worker }
783*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i18 = input[18];
784*4bdc9457SAndroid Build Coastguard Worker assert(i18 != NULL);
785*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i18 != zero) {
786*4bdc9457SAndroid Build Coastguard Worker i18 = (const uint16_t*) ((uintptr_t) i18 + input_offset);
787*4bdc9457SAndroid Build Coastguard Worker }
788*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i19 = input[19];
789*4bdc9457SAndroid Build Coastguard Worker assert(i19 != NULL);
790*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i19 != zero) {
791*4bdc9457SAndroid Build Coastguard Worker i19 = (const uint16_t*) ((uintptr_t) i19 + input_offset);
792*4bdc9457SAndroid Build Coastguard Worker }
793*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i20 = input[20];
794*4bdc9457SAndroid Build Coastguard Worker assert(i20 != NULL);
795*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i20 != zero) {
796*4bdc9457SAndroid Build Coastguard Worker i20 = (const uint16_t*) ((uintptr_t) i20 + input_offset);
797*4bdc9457SAndroid Build Coastguard Worker }
798*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i21 = input[21];
799*4bdc9457SAndroid Build Coastguard Worker assert(i21 != NULL);
800*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i21 != zero) {
801*4bdc9457SAndroid Build Coastguard Worker i21 = (const uint16_t*) ((uintptr_t) i21 + input_offset);
802*4bdc9457SAndroid Build Coastguard Worker }
803*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i22 = input[22];
804*4bdc9457SAndroid Build Coastguard Worker assert(i22 != NULL);
805*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i22 != zero) {
806*4bdc9457SAndroid Build Coastguard Worker i22 = (const uint16_t*) ((uintptr_t) i22 + input_offset);
807*4bdc9457SAndroid Build Coastguard Worker }
808*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i23 = input[23];
809*4bdc9457SAndroid Build Coastguard Worker assert(i23 != NULL);
810*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i23 != zero) {
811*4bdc9457SAndroid Build Coastguard Worker i23 = (const uint16_t*) ((uintptr_t) i23 + input_offset);
812*4bdc9457SAndroid Build Coastguard Worker }
813*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i24 = input[24];
814*4bdc9457SAndroid Build Coastguard Worker assert(i24 != NULL);
815*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i24 != zero) {
816*4bdc9457SAndroid Build Coastguard Worker i24 = (const uint16_t*) ((uintptr_t) i24 + input_offset);
817*4bdc9457SAndroid Build Coastguard Worker }
818*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_stride);
819*4bdc9457SAndroid Build Coastguard Worker
820*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
821*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = weights;
822*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
823*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
824*4bdc9457SAndroid Build Coastguard Worker
825*4bdc9457SAndroid Build Coastguard Worker
826*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
827*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
828*4bdc9457SAndroid Build Coastguard Worker
829*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
830*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
831*4bdc9457SAndroid Build Coastguard Worker
832*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
833*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
834*4bdc9457SAndroid Build Coastguard Worker
835*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
836*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
837*4bdc9457SAndroid Build Coastguard Worker
838*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
839*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
840*4bdc9457SAndroid Build Coastguard Worker
841*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
842*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
843*4bdc9457SAndroid Build Coastguard Worker
844*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
845*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
846*4bdc9457SAndroid Build Coastguard Worker
847*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
848*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
849*4bdc9457SAndroid Build Coastguard Worker
850*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
851*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
852*4bdc9457SAndroid Build Coastguard Worker
853*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
854*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
855*4bdc9457SAndroid Build Coastguard Worker
856*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
857*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
858*4bdc9457SAndroid Build Coastguard Worker
859*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
860*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
861*4bdc9457SAndroid Build Coastguard Worker
862*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
863*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
864*4bdc9457SAndroid Build Coastguard Worker
865*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
866*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
867*4bdc9457SAndroid Build Coastguard Worker
868*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
869*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
870*4bdc9457SAndroid Build Coastguard Worker
871*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
872*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
873*4bdc9457SAndroid Build Coastguard Worker
874*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
875*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
876*4bdc9457SAndroid Build Coastguard Worker
877*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
878*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
879*4bdc9457SAndroid Build Coastguard Worker
880*4bdc9457SAndroid Build Coastguard Worker const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
881*4bdc9457SAndroid Build Coastguard Worker i9 += 8;
882*4bdc9457SAndroid Build Coastguard Worker
883*4bdc9457SAndroid Build Coastguard Worker const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
884*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
885*4bdc9457SAndroid Build Coastguard Worker
886*4bdc9457SAndroid Build Coastguard Worker const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
887*4bdc9457SAndroid Build Coastguard Worker i10 += 8;
888*4bdc9457SAndroid Build Coastguard Worker
889*4bdc9457SAndroid Build Coastguard Worker const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
890*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
891*4bdc9457SAndroid Build Coastguard Worker
892*4bdc9457SAndroid Build Coastguard Worker const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
893*4bdc9457SAndroid Build Coastguard Worker i11 += 8;
894*4bdc9457SAndroid Build Coastguard Worker
895*4bdc9457SAndroid Build Coastguard Worker const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
896*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
897*4bdc9457SAndroid Build Coastguard Worker
898*4bdc9457SAndroid Build Coastguard Worker const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
899*4bdc9457SAndroid Build Coastguard Worker i12 += 8;
900*4bdc9457SAndroid Build Coastguard Worker
901*4bdc9457SAndroid Build Coastguard Worker const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
902*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
903*4bdc9457SAndroid Build Coastguard Worker
904*4bdc9457SAndroid Build Coastguard Worker const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
905*4bdc9457SAndroid Build Coastguard Worker i13 += 8;
906*4bdc9457SAndroid Build Coastguard Worker
907*4bdc9457SAndroid Build Coastguard Worker const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
908*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
909*4bdc9457SAndroid Build Coastguard Worker
910*4bdc9457SAndroid Build Coastguard Worker const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
911*4bdc9457SAndroid Build Coastguard Worker i14 += 8;
912*4bdc9457SAndroid Build Coastguard Worker
913*4bdc9457SAndroid Build Coastguard Worker const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
914*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
915*4bdc9457SAndroid Build Coastguard Worker
916*4bdc9457SAndroid Build Coastguard Worker const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
917*4bdc9457SAndroid Build Coastguard Worker i15 += 8;
918*4bdc9457SAndroid Build Coastguard Worker
919*4bdc9457SAndroid Build Coastguard Worker const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
920*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
921*4bdc9457SAndroid Build Coastguard Worker
922*4bdc9457SAndroid Build Coastguard Worker const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
923*4bdc9457SAndroid Build Coastguard Worker i16 += 8;
924*4bdc9457SAndroid Build Coastguard Worker
925*4bdc9457SAndroid Build Coastguard Worker const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
926*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
927*4bdc9457SAndroid Build Coastguard Worker
928*4bdc9457SAndroid Build Coastguard Worker const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
929*4bdc9457SAndroid Build Coastguard Worker i17 += 8;
930*4bdc9457SAndroid Build Coastguard Worker
931*4bdc9457SAndroid Build Coastguard Worker const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
932*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
933*4bdc9457SAndroid Build Coastguard Worker
934*4bdc9457SAndroid Build Coastguard Worker const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
935*4bdc9457SAndroid Build Coastguard Worker i18 += 8;
936*4bdc9457SAndroid Build Coastguard Worker
937*4bdc9457SAndroid Build Coastguard Worker const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
938*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
939*4bdc9457SAndroid Build Coastguard Worker
940*4bdc9457SAndroid Build Coastguard Worker const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
941*4bdc9457SAndroid Build Coastguard Worker i19 += 8;
942*4bdc9457SAndroid Build Coastguard Worker
943*4bdc9457SAndroid Build Coastguard Worker const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 160)));
944*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
945*4bdc9457SAndroid Build Coastguard Worker
946*4bdc9457SAndroid Build Coastguard Worker const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
947*4bdc9457SAndroid Build Coastguard Worker i20 += 8;
948*4bdc9457SAndroid Build Coastguard Worker
949*4bdc9457SAndroid Build Coastguard Worker const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 168)));
950*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
951*4bdc9457SAndroid Build Coastguard Worker
952*4bdc9457SAndroid Build Coastguard Worker const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
953*4bdc9457SAndroid Build Coastguard Worker i21 += 8;
954*4bdc9457SAndroid Build Coastguard Worker
955*4bdc9457SAndroid Build Coastguard Worker const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 176)));
956*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
957*4bdc9457SAndroid Build Coastguard Worker
958*4bdc9457SAndroid Build Coastguard Worker const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
959*4bdc9457SAndroid Build Coastguard Worker i22 += 8;
960*4bdc9457SAndroid Build Coastguard Worker
961*4bdc9457SAndroid Build Coastguard Worker const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 184)));
962*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
963*4bdc9457SAndroid Build Coastguard Worker
964*4bdc9457SAndroid Build Coastguard Worker const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
965*4bdc9457SAndroid Build Coastguard Worker i23 += 8;
966*4bdc9457SAndroid Build Coastguard Worker
967*4bdc9457SAndroid Build Coastguard Worker const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 192)));
968*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
969*4bdc9457SAndroid Build Coastguard Worker
970*4bdc9457SAndroid Build Coastguard Worker const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
971*4bdc9457SAndroid Build Coastguard Worker i24 += 8;
972*4bdc9457SAndroid Build Coastguard Worker
973*4bdc9457SAndroid Build Coastguard Worker const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 200)));
974*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
975*4bdc9457SAndroid Build Coastguard Worker
976*4bdc9457SAndroid Build Coastguard Worker w += 208;
977*4bdc9457SAndroid Build Coastguard Worker
978*4bdc9457SAndroid Build Coastguard Worker // Add up all accumulators to vacc01234567p0
979*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
980*4bdc9457SAndroid Build Coastguard Worker
981*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
982*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
983*4bdc9457SAndroid Build Coastguard Worker
984*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
985*4bdc9457SAndroid Build Coastguard Worker o += 8;
986*4bdc9457SAndroid Build Coastguard Worker }
987*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
988*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
989*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
990*4bdc9457SAndroid Build Coastguard Worker
991*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
992*4bdc9457SAndroid Build Coastguard Worker
993*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
994*4bdc9457SAndroid Build Coastguard Worker
995*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
996*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
997*4bdc9457SAndroid Build Coastguard Worker
998*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
999*4bdc9457SAndroid Build Coastguard Worker
1000*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
1001*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
1002*4bdc9457SAndroid Build Coastguard Worker
1003*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1004*4bdc9457SAndroid Build Coastguard Worker
1005*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 24)));
1006*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1007*4bdc9457SAndroid Build Coastguard Worker
1008*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1009*4bdc9457SAndroid Build Coastguard Worker
1010*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
1011*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1012*4bdc9457SAndroid Build Coastguard Worker
1013*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1014*4bdc9457SAndroid Build Coastguard Worker
1015*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 40)));
1016*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1017*4bdc9457SAndroid Build Coastguard Worker
1018*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1019*4bdc9457SAndroid Build Coastguard Worker
1020*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
1021*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1022*4bdc9457SAndroid Build Coastguard Worker
1023*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1024*4bdc9457SAndroid Build Coastguard Worker
1025*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 56)));
1026*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1027*4bdc9457SAndroid Build Coastguard Worker
1028*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1029*4bdc9457SAndroid Build Coastguard Worker
1030*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
1031*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1032*4bdc9457SAndroid Build Coastguard Worker
1033*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
1034*4bdc9457SAndroid Build Coastguard Worker
1035*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 72)));
1036*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1037*4bdc9457SAndroid Build Coastguard Worker
1038*4bdc9457SAndroid Build Coastguard Worker const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
1039*4bdc9457SAndroid Build Coastguard Worker
1040*4bdc9457SAndroid Build Coastguard Worker const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
1041*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1042*4bdc9457SAndroid Build Coastguard Worker
1043*4bdc9457SAndroid Build Coastguard Worker const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
1044*4bdc9457SAndroid Build Coastguard Worker
1045*4bdc9457SAndroid Build Coastguard Worker const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 88)));
1046*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1047*4bdc9457SAndroid Build Coastguard Worker
1048*4bdc9457SAndroid Build Coastguard Worker const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
1049*4bdc9457SAndroid Build Coastguard Worker
1050*4bdc9457SAndroid Build Coastguard Worker const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
1051*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1052*4bdc9457SAndroid Build Coastguard Worker
1053*4bdc9457SAndroid Build Coastguard Worker const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
1054*4bdc9457SAndroid Build Coastguard Worker
1055*4bdc9457SAndroid Build Coastguard Worker const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 104)));
1056*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1057*4bdc9457SAndroid Build Coastguard Worker
1058*4bdc9457SAndroid Build Coastguard Worker const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
1059*4bdc9457SAndroid Build Coastguard Worker
1060*4bdc9457SAndroid Build Coastguard Worker const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
1061*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1062*4bdc9457SAndroid Build Coastguard Worker
1063*4bdc9457SAndroid Build Coastguard Worker const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
1064*4bdc9457SAndroid Build Coastguard Worker
1065*4bdc9457SAndroid Build Coastguard Worker const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 120)));
1066*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1067*4bdc9457SAndroid Build Coastguard Worker
1068*4bdc9457SAndroid Build Coastguard Worker const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
1069*4bdc9457SAndroid Build Coastguard Worker
1070*4bdc9457SAndroid Build Coastguard Worker const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
1071*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1072*4bdc9457SAndroid Build Coastguard Worker
1073*4bdc9457SAndroid Build Coastguard Worker const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
1074*4bdc9457SAndroid Build Coastguard Worker
1075*4bdc9457SAndroid Build Coastguard Worker const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 136)));
1076*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1077*4bdc9457SAndroid Build Coastguard Worker
1078*4bdc9457SAndroid Build Coastguard Worker const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
1079*4bdc9457SAndroid Build Coastguard Worker
1080*4bdc9457SAndroid Build Coastguard Worker const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
1081*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1082*4bdc9457SAndroid Build Coastguard Worker
1083*4bdc9457SAndroid Build Coastguard Worker const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
1084*4bdc9457SAndroid Build Coastguard Worker
1085*4bdc9457SAndroid Build Coastguard Worker const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 152)));
1086*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1087*4bdc9457SAndroid Build Coastguard Worker
1088*4bdc9457SAndroid Build Coastguard Worker const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
1089*4bdc9457SAndroid Build Coastguard Worker
1090*4bdc9457SAndroid Build Coastguard Worker const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 160)));
1091*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1092*4bdc9457SAndroid Build Coastguard Worker
1093*4bdc9457SAndroid Build Coastguard Worker const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
1094*4bdc9457SAndroid Build Coastguard Worker
1095*4bdc9457SAndroid Build Coastguard Worker const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 168)));
1096*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1097*4bdc9457SAndroid Build Coastguard Worker
1098*4bdc9457SAndroid Build Coastguard Worker const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
1099*4bdc9457SAndroid Build Coastguard Worker
1100*4bdc9457SAndroid Build Coastguard Worker const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 176)));
1101*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1102*4bdc9457SAndroid Build Coastguard Worker
1103*4bdc9457SAndroid Build Coastguard Worker const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
1104*4bdc9457SAndroid Build Coastguard Worker
1105*4bdc9457SAndroid Build Coastguard Worker const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 184)));
1106*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1107*4bdc9457SAndroid Build Coastguard Worker
1108*4bdc9457SAndroid Build Coastguard Worker const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
1109*4bdc9457SAndroid Build Coastguard Worker
1110*4bdc9457SAndroid Build Coastguard Worker const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 192)));
1111*4bdc9457SAndroid Build Coastguard Worker vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
1112*4bdc9457SAndroid Build Coastguard Worker
1113*4bdc9457SAndroid Build Coastguard Worker const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
1114*4bdc9457SAndroid Build Coastguard Worker
1115*4bdc9457SAndroid Build Coastguard Worker const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 200)));
1116*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
1117*4bdc9457SAndroid Build Coastguard Worker
1118*4bdc9457SAndroid Build Coastguard Worker // Add up all accumulators to vacc01234567p0
1119*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
1120*4bdc9457SAndroid Build Coastguard Worker
1121*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1122*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1123*4bdc9457SAndroid Build Coastguard Worker
1124*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
1125*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1126*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
1127*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
1128*4bdc9457SAndroid Build Coastguard Worker o += 4;
1129*4bdc9457SAndroid Build Coastguard Worker }
1130*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1131*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
1132*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
1133*4bdc9457SAndroid Build Coastguard Worker o += 2;
1134*4bdc9457SAndroid Build Coastguard Worker }
1135*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1136*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
1137*4bdc9457SAndroid Build Coastguard Worker o += 1;
1138*4bdc9457SAndroid Build Coastguard Worker }
1139*4bdc9457SAndroid Build Coastguard Worker }
1140*4bdc9457SAndroid Build Coastguard Worker
1141*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
1142*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1143*4bdc9457SAndroid Build Coastguard Worker }
1144*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_ibilinear_ukernel__fma3_c8(size_t output_pixels,size_t channels,const void ** restrict input,size_t input_offset,const void * restrict weights,void * restrict output,size_t output_increment)1145*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_ibilinear_ukernel__fma3_c8(
1146*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels,
1147*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1148*4bdc9457SAndroid Build Coastguard Worker const void**restrict input,
1149*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1150*4bdc9457SAndroid Build Coastguard Worker const void*restrict weights,
1151*4bdc9457SAndroid Build Coastguard Worker void*restrict output,
1152*4bdc9457SAndroid Build Coastguard Worker size_t output_increment) XNN_OOB_READS
1153*4bdc9457SAndroid Build Coastguard Worker {
1154*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0);
1155*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1156*4bdc9457SAndroid Build Coastguard Worker assert(channels % sizeof(uint16_t) == 0);
1157*4bdc9457SAndroid Build Coastguard Worker
1158*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
1159*4bdc9457SAndroid Build Coastguard Worker do {
1160*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) ((uintptr_t) input[0] + input_offset);
1161*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) ((uintptr_t) input[1] + input_offset);
1162*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) ((uintptr_t) input[2] + input_offset);
1163*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) ((uintptr_t) input[3] + input_offset);
1164*4bdc9457SAndroid Build Coastguard Worker input += 4;
1165*4bdc9457SAndroid Build Coastguard Worker
1166*4bdc9457SAndroid Build Coastguard Worker const __m256 valphahv = _mm256_cvtph_ps(_mm_castps_si128(_mm_broadcast_ss(weights)));
1167*4bdc9457SAndroid Build Coastguard Worker const __m256 valphah = _mm256_permute_ps(valphahv, _MM_SHUFFLE(2, 0, 2, 0));
1168*4bdc9457SAndroid Build Coastguard Worker const __m256 valphav = _mm256_permute_ps(valphahv, _MM_SHUFFLE(3, 1, 3, 1));
1169*4bdc9457SAndroid Build Coastguard Worker weights = (const uint16_t*) weights + 2;
1170*4bdc9457SAndroid Build Coastguard Worker
1171*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1172*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
1173*4bdc9457SAndroid Build Coastguard Worker const __m256 vtl = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1174*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1175*4bdc9457SAndroid Build Coastguard Worker const __m256 vtr = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1176*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1177*4bdc9457SAndroid Build Coastguard Worker const __m256 vbl = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1178*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1179*4bdc9457SAndroid Build Coastguard Worker const __m256 vbr = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1180*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
1181*4bdc9457SAndroid Build Coastguard Worker
1182*4bdc9457SAndroid Build Coastguard Worker const __m256 vtd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vtr, vtl), _MM_FROUND_NO_EXC));
1183*4bdc9457SAndroid Build Coastguard Worker const __m256 vbd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vbr, vbl), _MM_FROUND_NO_EXC));
1184*4bdc9457SAndroid Build Coastguard Worker
1185*4bdc9457SAndroid Build Coastguard Worker const __m256 vt = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vtd, valphah, vtl), _MM_FROUND_NO_EXC));
1186*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vbd, valphah, vbl), _MM_FROUND_NO_EXC));
1187*4bdc9457SAndroid Build Coastguard Worker
1188*4bdc9457SAndroid Build Coastguard Worker const __m256 vd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, vt), _MM_FROUND_NO_EXC));
1189*4bdc9457SAndroid Build Coastguard Worker
1190*4bdc9457SAndroid Build Coastguard Worker const __m128i vo = _mm256_cvtps_ph(_mm256_fmadd_ps(vd, valphav, vt), _MM_FROUND_NO_EXC);
1191*4bdc9457SAndroid Build Coastguard Worker
1192*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, vo);
1193*4bdc9457SAndroid Build Coastguard Worker o += 8;
1194*4bdc9457SAndroid Build Coastguard Worker }
1195*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1196*4bdc9457SAndroid Build Coastguard Worker const __m256 vtl = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1197*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1198*4bdc9457SAndroid Build Coastguard Worker const __m256 vtr = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1199*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1200*4bdc9457SAndroid Build Coastguard Worker const __m256 vbl = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1201*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1202*4bdc9457SAndroid Build Coastguard Worker const __m256 vbr = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1203*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
1204*4bdc9457SAndroid Build Coastguard Worker
1205*4bdc9457SAndroid Build Coastguard Worker const __m256 vtd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vtr, vtl), _MM_FROUND_NO_EXC));
1206*4bdc9457SAndroid Build Coastguard Worker const __m256 vbd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vbr, vbl), _MM_FROUND_NO_EXC));
1207*4bdc9457SAndroid Build Coastguard Worker
1208*4bdc9457SAndroid Build Coastguard Worker const __m256 vt = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vtd, valphah, vtl), _MM_FROUND_NO_EXC));
1209*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vbd, valphah, vbl), _MM_FROUND_NO_EXC));
1210*4bdc9457SAndroid Build Coastguard Worker
1211*4bdc9457SAndroid Build Coastguard Worker const __m256 vd = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, vt), _MM_FROUND_NO_EXC));
1212*4bdc9457SAndroid Build Coastguard Worker
1213*4bdc9457SAndroid Build Coastguard Worker __m128i vo = _mm256_cvtps_ph(_mm256_fmadd_ps(vd, valphav, vt), _MM_FROUND_NO_EXC);
1214*4bdc9457SAndroid Build Coastguard Worker if (c & (4 * sizeof(uint16_t))) {
1215*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vo);
1216*4bdc9457SAndroid Build Coastguard Worker vo = _mm_unpackhi_epi64(vo, vo);
1217*4bdc9457SAndroid Build Coastguard Worker o += 4;
1218*4bdc9457SAndroid Build Coastguard Worker }
1219*4bdc9457SAndroid Build Coastguard Worker if (c & (2 * sizeof(uint16_t))) {
1220*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vo);
1221*4bdc9457SAndroid Build Coastguard Worker vo = _mm_srli_epi64(vo, 32);
1222*4bdc9457SAndroid Build Coastguard Worker o += 2;
1223*4bdc9457SAndroid Build Coastguard Worker }
1224*4bdc9457SAndroid Build Coastguard Worker if (c & (1 * sizeof(uint16_t))) {
1225*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vo, 0);
1226*4bdc9457SAndroid Build Coastguard Worker o += 1;
1227*4bdc9457SAndroid Build Coastguard Worker }
1228*4bdc9457SAndroid Build Coastguard Worker }
1229*4bdc9457SAndroid Build Coastguard Worker
1230*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
1231*4bdc9457SAndroid Build Coastguard Worker } while (--output_pixels != 0);
1232*4bdc9457SAndroid Build Coastguard Worker }
1233*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x(size_t rows,size_t channels,const void * restrict input,size_t input_stride,const void * restrict weights,void * restrict output,size_t output_stride,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1234*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x(
1235*4bdc9457SAndroid Build Coastguard Worker size_t rows,
1236*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1237*4bdc9457SAndroid Build Coastguard Worker const void*restrict input,
1238*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1239*4bdc9457SAndroid Build Coastguard Worker const void*restrict weights,
1240*4bdc9457SAndroid Build Coastguard Worker void*restrict output,
1241*4bdc9457SAndroid Build Coastguard Worker size_t output_stride,
1242*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1243*4bdc9457SAndroid Build Coastguard Worker {
1244*4bdc9457SAndroid Build Coastguard Worker assert(rows != 0);
1245*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1246*4bdc9457SAndroid Build Coastguard Worker assert(channels % sizeof(uint16_t) == 0);
1247*4bdc9457SAndroid Build Coastguard Worker
1248*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) input;
1249*4bdc9457SAndroid Build Coastguard Worker uint16_t* o0 = (uint16_t*) output;
1250*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
1251*4bdc9457SAndroid Build Coastguard Worker uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
1252*4bdc9457SAndroid Build Coastguard Worker
1253*4bdc9457SAndroid Build Coastguard Worker const size_t input_increment = input_stride * 2 - channels;
1254*4bdc9457SAndroid Build Coastguard Worker const size_t output_increment = output_stride * 2 - channels;
1255*4bdc9457SAndroid Build Coastguard Worker
1256*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
1257*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
1258*4bdc9457SAndroid Build Coastguard Worker do {
1259*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 2) {
1260*4bdc9457SAndroid Build Coastguard Worker i1 = i0;
1261*4bdc9457SAndroid Build Coastguard Worker o1 = o0;
1262*4bdc9457SAndroid Build Coastguard Worker }
1263*4bdc9457SAndroid Build Coastguard Worker
1264*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = (const uint16_t*) weights;
1265*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1266*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
1267*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1268*4bdc9457SAndroid Build Coastguard Worker
1269*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1270*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1271*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1272*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1273*4bdc9457SAndroid Build Coastguard Worker
1274*4bdc9457SAndroid Build Coastguard Worker const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
1275*4bdc9457SAndroid Build Coastguard Worker w += 16;
1276*4bdc9457SAndroid Build Coastguard Worker
1277*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
1278*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
1279*4bdc9457SAndroid Build Coastguard Worker
1280*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_max_ps(vacc0, vmin);
1281*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_max_ps(vacc1, vmin);
1282*4bdc9457SAndroid Build Coastguard Worker
1283*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_min_ps(vacc0, vmax);
1284*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_min_ps(vacc1, vmax);
1285*4bdc9457SAndroid Build Coastguard Worker
1286*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
1287*4bdc9457SAndroid Build Coastguard Worker o0 += 8;
1288*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
1289*4bdc9457SAndroid Build Coastguard Worker o1 += 8;
1290*4bdc9457SAndroid Build Coastguard Worker }
1291*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1292*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1293*4bdc9457SAndroid Build Coastguard Worker
1294*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1295*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + c);
1296*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1297*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + c);
1298*4bdc9457SAndroid Build Coastguard Worker
1299*4bdc9457SAndroid Build Coastguard Worker const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
1300*4bdc9457SAndroid Build Coastguard Worker
1301*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
1302*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
1303*4bdc9457SAndroid Build Coastguard Worker
1304*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_max_ps(vacc0, vmin);
1305*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_max_ps(vacc1, vmin);
1306*4bdc9457SAndroid Build Coastguard Worker
1307*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_min_ps(vacc0, vmax);
1308*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_min_ps(vacc1, vmax);
1309*4bdc9457SAndroid Build Coastguard Worker
1310*4bdc9457SAndroid Build Coastguard Worker __m128i vh0 = _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC);
1311*4bdc9457SAndroid Build Coastguard Worker __m128i vh1 = _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC);
1312*4bdc9457SAndroid Build Coastguard Worker
1313*4bdc9457SAndroid Build Coastguard Worker if (c & (4 * sizeof(uint16_t))) {
1314*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o0, vh0);
1315*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o1, vh1);
1316*4bdc9457SAndroid Build Coastguard Worker
1317*4bdc9457SAndroid Build Coastguard Worker vh0 = _mm_unpackhi_epi64(vh0, vh0);
1318*4bdc9457SAndroid Build Coastguard Worker vh1 = _mm_unpackhi_epi64(vh1, vh1);
1319*4bdc9457SAndroid Build Coastguard Worker
1320*4bdc9457SAndroid Build Coastguard Worker o0 += 4;
1321*4bdc9457SAndroid Build Coastguard Worker o1 += 4;
1322*4bdc9457SAndroid Build Coastguard Worker }
1323*4bdc9457SAndroid Build Coastguard Worker if (c & (2 * sizeof(uint16_t))) {
1324*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o0, vh0);
1325*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o1, vh1);
1326*4bdc9457SAndroid Build Coastguard Worker
1327*4bdc9457SAndroid Build Coastguard Worker vh0 = _mm_srli_epi64(vh0, 32);
1328*4bdc9457SAndroid Build Coastguard Worker vh1 = _mm_srli_epi64(vh1, 32);
1329*4bdc9457SAndroid Build Coastguard Worker
1330*4bdc9457SAndroid Build Coastguard Worker o0 += 2;
1331*4bdc9457SAndroid Build Coastguard Worker o1 += 2;
1332*4bdc9457SAndroid Build Coastguard Worker }
1333*4bdc9457SAndroid Build Coastguard Worker if (c & (1 * sizeof(uint16_t))) {
1334*4bdc9457SAndroid Build Coastguard Worker *o0 = (uint16_t) _mm_extract_epi16(vh0, 0);
1335*4bdc9457SAndroid Build Coastguard Worker *o1 = (uint16_t) _mm_extract_epi16(vh1, 0);
1336*4bdc9457SAndroid Build Coastguard Worker
1337*4bdc9457SAndroid Build Coastguard Worker o0 += 1;
1338*4bdc9457SAndroid Build Coastguard Worker o1 += 1;
1339*4bdc9457SAndroid Build Coastguard Worker }
1340*4bdc9457SAndroid Build Coastguard Worker }
1341*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
1342*4bdc9457SAndroid Build Coastguard Worker o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
1343*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
1344*4bdc9457SAndroid Build Coastguard Worker o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
1345*4bdc9457SAndroid Build Coastguard Worker rows = doz(rows, 2);
1346*4bdc9457SAndroid Build Coastguard Worker } while (rows != 0);
1347*4bdc9457SAndroid Build Coastguard Worker }
1348*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_dwconv_minmax_ukernel_up16x3__fma3(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1349*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up16x3__fma3(
1350*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1351*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1352*4bdc9457SAndroid Build Coastguard Worker const float** input,
1353*4bdc9457SAndroid Build Coastguard Worker const float* weights,
1354*4bdc9457SAndroid Build Coastguard Worker float* output,
1355*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1356*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1357*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1358*4bdc9457SAndroid Build Coastguard Worker const float* zero,
1359*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1360*4bdc9457SAndroid Build Coastguard Worker {
1361*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1362*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1363*4bdc9457SAndroid Build Coastguard Worker
1364*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
1365*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
1366*4bdc9457SAndroid Build Coastguard Worker do {
1367*4bdc9457SAndroid Build Coastguard Worker const float* i0 = input[0];
1368*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
1369*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
1370*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i0 + input_offset);
1371*4bdc9457SAndroid Build Coastguard Worker }
1372*4bdc9457SAndroid Build Coastguard Worker const float* i1 = input[1];
1373*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
1374*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
1375*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i1 + input_offset);
1376*4bdc9457SAndroid Build Coastguard Worker }
1377*4bdc9457SAndroid Build Coastguard Worker const float* i2 = input[2];
1378*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
1379*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
1380*4bdc9457SAndroid Build Coastguard Worker i2 = (const float*) ((uintptr_t) i2 + input_offset);
1381*4bdc9457SAndroid Build Coastguard Worker }
1382*4bdc9457SAndroid Build Coastguard Worker input = (const float**) ((uintptr_t) input + input_stride);
1383*4bdc9457SAndroid Build Coastguard Worker
1384*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1385*4bdc9457SAndroid Build Coastguard Worker const float* w = weights;
1386*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
1387*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1388*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
1389*4bdc9457SAndroid Build Coastguard Worker
1390*4bdc9457SAndroid Build Coastguard Worker
1391*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1392*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
1393*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1394*4bdc9457SAndroid Build Coastguard Worker
1395*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1396*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
1397*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1398*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0);
1399*4bdc9457SAndroid Build Coastguard Worker
1400*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1401*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
1402*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1403*4bdc9457SAndroid Build Coastguard Worker
1404*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1405*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
1406*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1407*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0);
1408*4bdc9457SAndroid Build Coastguard Worker
1409*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1410*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
1411*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
1412*4bdc9457SAndroid Build Coastguard Worker
1413*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1414*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
1415*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1416*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0);
1417*4bdc9457SAndroid Build Coastguard Worker
1418*4bdc9457SAndroid Build Coastguard Worker w += 64;
1419*4bdc9457SAndroid Build Coastguard Worker
1420*4bdc9457SAndroid Build Coastguard Worker
1421*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1422*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
1423*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1424*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
1425*4bdc9457SAndroid Build Coastguard Worker
1426*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1427*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output + 8, vacc89ABCDEF);
1428*4bdc9457SAndroid Build Coastguard Worker output += 16;
1429*4bdc9457SAndroid Build Coastguard Worker }
1430*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
1431*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1432*4bdc9457SAndroid Build Coastguard Worker
1433*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1434*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1435*4bdc9457SAndroid Build Coastguard Worker
1436*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1437*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1438*4bdc9457SAndroid Build Coastguard Worker
1439*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1440*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1441*4bdc9457SAndroid Build Coastguard Worker
1442*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1443*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1444*4bdc9457SAndroid Build Coastguard Worker
1445*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1446*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1447*4bdc9457SAndroid Build Coastguard Worker
1448*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1449*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1450*4bdc9457SAndroid Build Coastguard Worker
1451*4bdc9457SAndroid Build Coastguard Worker w += 8;
1452*4bdc9457SAndroid Build Coastguard Worker
1453*4bdc9457SAndroid Build Coastguard Worker
1454*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1455*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1456*4bdc9457SAndroid Build Coastguard Worker
1457*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1458*4bdc9457SAndroid Build Coastguard Worker output += 8;
1459*4bdc9457SAndroid Build Coastguard Worker }
1460*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1461*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
1462*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
1463*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) ¶ms->avx.mask_table[7 - c]);
1464*4bdc9457SAndroid Build Coastguard Worker
1465*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1466*4bdc9457SAndroid Build Coastguard Worker
1467*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
1468*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1469*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1470*4bdc9457SAndroid Build Coastguard Worker
1471*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
1472*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1473*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1474*4bdc9457SAndroid Build Coastguard Worker
1475*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
1476*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1477*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1478*4bdc9457SAndroid Build Coastguard Worker
1479*4bdc9457SAndroid Build Coastguard Worker
1480*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1481*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1482*4bdc9457SAndroid Build Coastguard Worker
1483*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
1484*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1485*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(output, vacc0123);
1486*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
1487*4bdc9457SAndroid Build Coastguard Worker output += 4;
1488*4bdc9457SAndroid Build Coastguard Worker }
1489*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1490*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) output, vacc0123);
1491*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1492*4bdc9457SAndroid Build Coastguard Worker output += 2;
1493*4bdc9457SAndroid Build Coastguard Worker }
1494*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1495*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(output, vacc0123);
1496*4bdc9457SAndroid Build Coastguard Worker output += 1;
1497*4bdc9457SAndroid Build Coastguard Worker }
1498*4bdc9457SAndroid Build Coastguard Worker }
1499*4bdc9457SAndroid Build Coastguard Worker
1500*4bdc9457SAndroid Build Coastguard Worker output = (float*) ((uintptr_t) output + output_increment);
1501*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1502*4bdc9457SAndroid Build Coastguard Worker }
1503*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_dwconv_minmax_ukernel_up16x4__fma3(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1504*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up16x4__fma3(
1505*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1506*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1507*4bdc9457SAndroid Build Coastguard Worker const float** input,
1508*4bdc9457SAndroid Build Coastguard Worker const float* weights,
1509*4bdc9457SAndroid Build Coastguard Worker float* output,
1510*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1511*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1512*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1513*4bdc9457SAndroid Build Coastguard Worker const float* zero,
1514*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1515*4bdc9457SAndroid Build Coastguard Worker {
1516*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1517*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1518*4bdc9457SAndroid Build Coastguard Worker
1519*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
1520*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
1521*4bdc9457SAndroid Build Coastguard Worker do {
1522*4bdc9457SAndroid Build Coastguard Worker const float* i0 = input[0];
1523*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
1524*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
1525*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i0 + input_offset);
1526*4bdc9457SAndroid Build Coastguard Worker }
1527*4bdc9457SAndroid Build Coastguard Worker const float* i1 = input[1];
1528*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
1529*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
1530*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i1 + input_offset);
1531*4bdc9457SAndroid Build Coastguard Worker }
1532*4bdc9457SAndroid Build Coastguard Worker const float* i2 = input[2];
1533*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
1534*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
1535*4bdc9457SAndroid Build Coastguard Worker i2 = (const float*) ((uintptr_t) i2 + input_offset);
1536*4bdc9457SAndroid Build Coastguard Worker }
1537*4bdc9457SAndroid Build Coastguard Worker const float* i3 = input[3];
1538*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
1539*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
1540*4bdc9457SAndroid Build Coastguard Worker i3 = (const float*) ((uintptr_t) i3 + input_offset);
1541*4bdc9457SAndroid Build Coastguard Worker }
1542*4bdc9457SAndroid Build Coastguard Worker input = (const float**) ((uintptr_t) input + input_stride);
1543*4bdc9457SAndroid Build Coastguard Worker
1544*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1545*4bdc9457SAndroid Build Coastguard Worker const float* w = weights;
1546*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
1547*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1548*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
1549*4bdc9457SAndroid Build Coastguard Worker
1550*4bdc9457SAndroid Build Coastguard Worker
1551*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1552*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
1553*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1554*4bdc9457SAndroid Build Coastguard Worker
1555*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1556*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
1557*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1558*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0);
1559*4bdc9457SAndroid Build Coastguard Worker
1560*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1561*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
1562*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1563*4bdc9457SAndroid Build Coastguard Worker
1564*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1565*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
1566*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1567*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0);
1568*4bdc9457SAndroid Build Coastguard Worker
1569*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1570*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
1571*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
1572*4bdc9457SAndroid Build Coastguard Worker
1573*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1574*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
1575*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1576*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0);
1577*4bdc9457SAndroid Build Coastguard Worker
1578*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
1579*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
1580*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
1581*4bdc9457SAndroid Build Coastguard Worker
1582*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1583*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
1584*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1585*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0);
1586*4bdc9457SAndroid Build Coastguard Worker
1587*4bdc9457SAndroid Build Coastguard Worker w += 80;
1588*4bdc9457SAndroid Build Coastguard Worker
1589*4bdc9457SAndroid Build Coastguard Worker
1590*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1591*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
1592*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1593*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
1594*4bdc9457SAndroid Build Coastguard Worker
1595*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1596*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output + 8, vacc89ABCDEF);
1597*4bdc9457SAndroid Build Coastguard Worker output += 16;
1598*4bdc9457SAndroid Build Coastguard Worker }
1599*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
1600*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1601*4bdc9457SAndroid Build Coastguard Worker
1602*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1603*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1604*4bdc9457SAndroid Build Coastguard Worker
1605*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1606*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1607*4bdc9457SAndroid Build Coastguard Worker
1608*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1609*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1610*4bdc9457SAndroid Build Coastguard Worker
1611*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1612*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1613*4bdc9457SAndroid Build Coastguard Worker
1614*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1615*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1616*4bdc9457SAndroid Build Coastguard Worker
1617*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1618*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1619*4bdc9457SAndroid Build Coastguard Worker
1620*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
1621*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
1622*4bdc9457SAndroid Build Coastguard Worker
1623*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1624*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1625*4bdc9457SAndroid Build Coastguard Worker
1626*4bdc9457SAndroid Build Coastguard Worker w += 8;
1627*4bdc9457SAndroid Build Coastguard Worker
1628*4bdc9457SAndroid Build Coastguard Worker
1629*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1630*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1631*4bdc9457SAndroid Build Coastguard Worker
1632*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1633*4bdc9457SAndroid Build Coastguard Worker output += 8;
1634*4bdc9457SAndroid Build Coastguard Worker }
1635*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1636*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
1637*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
1638*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) ¶ms->avx.mask_table[7 - c]);
1639*4bdc9457SAndroid Build Coastguard Worker
1640*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1641*4bdc9457SAndroid Build Coastguard Worker
1642*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
1643*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1644*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1645*4bdc9457SAndroid Build Coastguard Worker
1646*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
1647*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1648*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1649*4bdc9457SAndroid Build Coastguard Worker
1650*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
1651*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1652*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1653*4bdc9457SAndroid Build Coastguard Worker
1654*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
1655*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1656*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1657*4bdc9457SAndroid Build Coastguard Worker
1658*4bdc9457SAndroid Build Coastguard Worker
1659*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1660*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1661*4bdc9457SAndroid Build Coastguard Worker
1662*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
1663*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1664*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(output, vacc0123);
1665*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
1666*4bdc9457SAndroid Build Coastguard Worker output += 4;
1667*4bdc9457SAndroid Build Coastguard Worker }
1668*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1669*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) output, vacc0123);
1670*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1671*4bdc9457SAndroid Build Coastguard Worker output += 2;
1672*4bdc9457SAndroid Build Coastguard Worker }
1673*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1674*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(output, vacc0123);
1675*4bdc9457SAndroid Build Coastguard Worker output += 1;
1676*4bdc9457SAndroid Build Coastguard Worker }
1677*4bdc9457SAndroid Build Coastguard Worker }
1678*4bdc9457SAndroid Build Coastguard Worker
1679*4bdc9457SAndroid Build Coastguard Worker output = (float*) ((uintptr_t) output + output_increment);
1680*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1681*4bdc9457SAndroid Build Coastguard Worker }
1682*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_dwconv_minmax_ukernel_up16x9__fma3(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1683*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up16x9__fma3(
1684*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1685*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1686*4bdc9457SAndroid Build Coastguard Worker const float** input,
1687*4bdc9457SAndroid Build Coastguard Worker const float* weights,
1688*4bdc9457SAndroid Build Coastguard Worker float* output,
1689*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1690*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1691*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1692*4bdc9457SAndroid Build Coastguard Worker const float* zero,
1693*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1694*4bdc9457SAndroid Build Coastguard Worker {
1695*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1696*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1697*4bdc9457SAndroid Build Coastguard Worker
1698*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
1699*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
1700*4bdc9457SAndroid Build Coastguard Worker do {
1701*4bdc9457SAndroid Build Coastguard Worker const float* i0 = input[0];
1702*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
1703*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
1704*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i0 + input_offset);
1705*4bdc9457SAndroid Build Coastguard Worker }
1706*4bdc9457SAndroid Build Coastguard Worker const float* i1 = input[1];
1707*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
1708*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
1709*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i1 + input_offset);
1710*4bdc9457SAndroid Build Coastguard Worker }
1711*4bdc9457SAndroid Build Coastguard Worker const float* i2 = input[2];
1712*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
1713*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
1714*4bdc9457SAndroid Build Coastguard Worker i2 = (const float*) ((uintptr_t) i2 + input_offset);
1715*4bdc9457SAndroid Build Coastguard Worker }
1716*4bdc9457SAndroid Build Coastguard Worker const float* i3 = input[3];
1717*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
1718*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
1719*4bdc9457SAndroid Build Coastguard Worker i3 = (const float*) ((uintptr_t) i3 + input_offset);
1720*4bdc9457SAndroid Build Coastguard Worker }
1721*4bdc9457SAndroid Build Coastguard Worker const float* i4 = input[4];
1722*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
1723*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
1724*4bdc9457SAndroid Build Coastguard Worker i4 = (const float*) ((uintptr_t) i4 + input_offset);
1725*4bdc9457SAndroid Build Coastguard Worker }
1726*4bdc9457SAndroid Build Coastguard Worker const float* i5 = input[5];
1727*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
1728*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
1729*4bdc9457SAndroid Build Coastguard Worker i5 = (const float*) ((uintptr_t) i5 + input_offset);
1730*4bdc9457SAndroid Build Coastguard Worker }
1731*4bdc9457SAndroid Build Coastguard Worker const float* i6 = input[6];
1732*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
1733*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
1734*4bdc9457SAndroid Build Coastguard Worker i6 = (const float*) ((uintptr_t) i6 + input_offset);
1735*4bdc9457SAndroid Build Coastguard Worker }
1736*4bdc9457SAndroid Build Coastguard Worker const float* i7 = input[7];
1737*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
1738*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
1739*4bdc9457SAndroid Build Coastguard Worker i7 = (const float*) ((uintptr_t) i7 + input_offset);
1740*4bdc9457SAndroid Build Coastguard Worker }
1741*4bdc9457SAndroid Build Coastguard Worker const float* i8 = input[8];
1742*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
1743*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
1744*4bdc9457SAndroid Build Coastguard Worker i8 = (const float*) ((uintptr_t) i8 + input_offset);
1745*4bdc9457SAndroid Build Coastguard Worker }
1746*4bdc9457SAndroid Build Coastguard Worker input = (const float**) ((uintptr_t) input + input_stride);
1747*4bdc9457SAndroid Build Coastguard Worker
1748*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1749*4bdc9457SAndroid Build Coastguard Worker const float* w = weights;
1750*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16; c -= 16) {
1751*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1752*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
1753*4bdc9457SAndroid Build Coastguard Worker
1754*4bdc9457SAndroid Build Coastguard Worker
1755*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1756*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
1757*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1758*4bdc9457SAndroid Build Coastguard Worker
1759*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1760*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
1761*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1762*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0);
1763*4bdc9457SAndroid Build Coastguard Worker
1764*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1765*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
1766*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1767*4bdc9457SAndroid Build Coastguard Worker
1768*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1769*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
1770*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1771*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0);
1772*4bdc9457SAndroid Build Coastguard Worker
1773*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1774*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
1775*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
1776*4bdc9457SAndroid Build Coastguard Worker
1777*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1778*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
1779*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1780*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0);
1781*4bdc9457SAndroid Build Coastguard Worker
1782*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
1783*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
1784*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
1785*4bdc9457SAndroid Build Coastguard Worker
1786*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1787*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
1788*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1789*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0);
1790*4bdc9457SAndroid Build Coastguard Worker
1791*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
1792*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x89ABCDEF = _mm256_loadu_ps(i4 + 8);
1793*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
1794*4bdc9457SAndroid Build Coastguard Worker
1795*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
1796*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x89ABCDEF = _mm256_load_ps(w + 88);
1797*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);
1798*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi4x89ABCDEF, vk4x89ABCDEF, vacc89ABCDEFp0);
1799*4bdc9457SAndroid Build Coastguard Worker
1800*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
1801*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x89ABCDEF = _mm256_loadu_ps(i5 + 8);
1802*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
1803*4bdc9457SAndroid Build Coastguard Worker
1804*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
1805*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x89ABCDEF = _mm256_load_ps(w + 104);
1806*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0);
1807*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi5x89ABCDEF, vk5x89ABCDEF, vacc89ABCDEFp0);
1808*4bdc9457SAndroid Build Coastguard Worker
1809*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
1810*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x89ABCDEF = _mm256_loadu_ps(i6 + 8);
1811*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
1812*4bdc9457SAndroid Build Coastguard Worker
1813*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
1814*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x89ABCDEF = _mm256_load_ps(w + 120);
1815*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0);
1816*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi6x89ABCDEF, vk6x89ABCDEF, vacc89ABCDEFp0);
1817*4bdc9457SAndroid Build Coastguard Worker
1818*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
1819*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x89ABCDEF = _mm256_loadu_ps(i7 + 8);
1820*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
1821*4bdc9457SAndroid Build Coastguard Worker
1822*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
1823*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x89ABCDEF = _mm256_load_ps(w + 136);
1824*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0);
1825*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi7x89ABCDEF, vk7x89ABCDEF, vacc89ABCDEFp0);
1826*4bdc9457SAndroid Build Coastguard Worker
1827*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
1828*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x89ABCDEF = _mm256_loadu_ps(i8 + 8);
1829*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
1830*4bdc9457SAndroid Build Coastguard Worker
1831*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
1832*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x89ABCDEF = _mm256_load_ps(w + 152);
1833*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0);
1834*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEFp0 = _mm256_fmadd_ps(vi8x89ABCDEF, vk8x89ABCDEF, vacc89ABCDEFp0);
1835*4bdc9457SAndroid Build Coastguard Worker
1836*4bdc9457SAndroid Build Coastguard Worker w += 160;
1837*4bdc9457SAndroid Build Coastguard Worker
1838*4bdc9457SAndroid Build Coastguard Worker
1839*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1840*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
1841*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1842*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
1843*4bdc9457SAndroid Build Coastguard Worker
1844*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1845*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output + 8, vacc89ABCDEF);
1846*4bdc9457SAndroid Build Coastguard Worker output += 16;
1847*4bdc9457SAndroid Build Coastguard Worker }
1848*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
1849*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1850*4bdc9457SAndroid Build Coastguard Worker
1851*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
1852*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1853*4bdc9457SAndroid Build Coastguard Worker
1854*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1855*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1856*4bdc9457SAndroid Build Coastguard Worker
1857*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
1858*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1859*4bdc9457SAndroid Build Coastguard Worker
1860*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1861*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1862*4bdc9457SAndroid Build Coastguard Worker
1863*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
1864*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1865*4bdc9457SAndroid Build Coastguard Worker
1866*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1867*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1868*4bdc9457SAndroid Build Coastguard Worker
1869*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
1870*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
1871*4bdc9457SAndroid Build Coastguard Worker
1872*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1873*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1874*4bdc9457SAndroid Build Coastguard Worker
1875*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
1876*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
1877*4bdc9457SAndroid Build Coastguard Worker
1878*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
1879*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);
1880*4bdc9457SAndroid Build Coastguard Worker
1881*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
1882*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
1883*4bdc9457SAndroid Build Coastguard Worker
1884*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
1885*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0);
1886*4bdc9457SAndroid Build Coastguard Worker
1887*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
1888*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
1889*4bdc9457SAndroid Build Coastguard Worker
1890*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
1891*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0);
1892*4bdc9457SAndroid Build Coastguard Worker
1893*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
1894*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
1895*4bdc9457SAndroid Build Coastguard Worker
1896*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
1897*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0);
1898*4bdc9457SAndroid Build Coastguard Worker
1899*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
1900*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
1901*4bdc9457SAndroid Build Coastguard Worker
1902*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
1903*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0);
1904*4bdc9457SAndroid Build Coastguard Worker
1905*4bdc9457SAndroid Build Coastguard Worker w += 8;
1906*4bdc9457SAndroid Build Coastguard Worker
1907*4bdc9457SAndroid Build Coastguard Worker
1908*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1909*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1910*4bdc9457SAndroid Build Coastguard Worker
1911*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
1912*4bdc9457SAndroid Build Coastguard Worker output += 8;
1913*4bdc9457SAndroid Build Coastguard Worker }
1914*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1915*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
1916*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
1917*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
1918*4bdc9457SAndroid Build Coastguard Worker
1919*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
1920*4bdc9457SAndroid Build Coastguard Worker
1921*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
1922*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
1923*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
1924*4bdc9457SAndroid Build Coastguard Worker
1925*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
1926*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
1927*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
1928*4bdc9457SAndroid Build Coastguard Worker
1929*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
1930*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
1931*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
1932*4bdc9457SAndroid Build Coastguard Worker
1933*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
1934*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
1935*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
1936*4bdc9457SAndroid Build Coastguard Worker
1937*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
1938*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
1939*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);
1940*4bdc9457SAndroid Build Coastguard Worker
1941*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
1942*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
1943*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0);
1944*4bdc9457SAndroid Build Coastguard Worker
1945*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
1946*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
1947*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0);
1948*4bdc9457SAndroid Build Coastguard Worker
1949*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
1950*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
1951*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0);
1952*4bdc9457SAndroid Build Coastguard Worker
1953*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
1954*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
1955*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0);
1956*4bdc9457SAndroid Build Coastguard Worker
1957*4bdc9457SAndroid Build Coastguard Worker
1958*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1959*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1960*4bdc9457SAndroid Build Coastguard Worker
1961*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
1962*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1963*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(output, vacc0123);
1964*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
1965*4bdc9457SAndroid Build Coastguard Worker output += 4;
1966*4bdc9457SAndroid Build Coastguard Worker }
1967*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1968*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) output, vacc0123);
1969*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1970*4bdc9457SAndroid Build Coastguard Worker output += 2;
1971*4bdc9457SAndroid Build Coastguard Worker }
1972*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1973*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(output, vacc0123);
1974*4bdc9457SAndroid Build Coastguard Worker output += 1;
1975*4bdc9457SAndroid Build Coastguard Worker }
1976*4bdc9457SAndroid Build Coastguard Worker }
1977*4bdc9457SAndroid Build Coastguard Worker
1978*4bdc9457SAndroid Build Coastguard Worker output = (float*) ((uintptr_t) output + output_increment);
1979*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1980*4bdc9457SAndroid Build Coastguard Worker }
1981*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_dwconv_minmax_ukernel_up8x25__fma3(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,size_t input_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1982*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv_minmax_ukernel_up8x25__fma3(
1983*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1984*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1985*4bdc9457SAndroid Build Coastguard Worker const float** input,
1986*4bdc9457SAndroid Build Coastguard Worker const float* weights,
1987*4bdc9457SAndroid Build Coastguard Worker float* output,
1988*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1989*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1990*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1991*4bdc9457SAndroid Build Coastguard Worker const float* zero,
1992*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1993*4bdc9457SAndroid Build Coastguard Worker {
1994*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1995*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1996*4bdc9457SAndroid Build Coastguard Worker
1997*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
1998*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
1999*4bdc9457SAndroid Build Coastguard Worker do {
2000*4bdc9457SAndroid Build Coastguard Worker const float* i0 = input[0];
2001*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
2002*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
2003*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i0 + input_offset);
2004*4bdc9457SAndroid Build Coastguard Worker }
2005*4bdc9457SAndroid Build Coastguard Worker const float* i1 = input[1];
2006*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
2007*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
2008*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i1 + input_offset);
2009*4bdc9457SAndroid Build Coastguard Worker }
2010*4bdc9457SAndroid Build Coastguard Worker const float* i2 = input[2];
2011*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
2012*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
2013*4bdc9457SAndroid Build Coastguard Worker i2 = (const float*) ((uintptr_t) i2 + input_offset);
2014*4bdc9457SAndroid Build Coastguard Worker }
2015*4bdc9457SAndroid Build Coastguard Worker const float* i3 = input[3];
2016*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
2017*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
2018*4bdc9457SAndroid Build Coastguard Worker i3 = (const float*) ((uintptr_t) i3 + input_offset);
2019*4bdc9457SAndroid Build Coastguard Worker }
2020*4bdc9457SAndroid Build Coastguard Worker const float* i4 = input[4];
2021*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
2022*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
2023*4bdc9457SAndroid Build Coastguard Worker i4 = (const float*) ((uintptr_t) i4 + input_offset);
2024*4bdc9457SAndroid Build Coastguard Worker }
2025*4bdc9457SAndroid Build Coastguard Worker const float* i5 = input[5];
2026*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
2027*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
2028*4bdc9457SAndroid Build Coastguard Worker i5 = (const float*) ((uintptr_t) i5 + input_offset);
2029*4bdc9457SAndroid Build Coastguard Worker }
2030*4bdc9457SAndroid Build Coastguard Worker const float* i6 = input[6];
2031*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
2032*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
2033*4bdc9457SAndroid Build Coastguard Worker i6 = (const float*) ((uintptr_t) i6 + input_offset);
2034*4bdc9457SAndroid Build Coastguard Worker }
2035*4bdc9457SAndroid Build Coastguard Worker const float* i7 = input[7];
2036*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
2037*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
2038*4bdc9457SAndroid Build Coastguard Worker i7 = (const float*) ((uintptr_t) i7 + input_offset);
2039*4bdc9457SAndroid Build Coastguard Worker }
2040*4bdc9457SAndroid Build Coastguard Worker const float* i8 = input[8];
2041*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
2042*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
2043*4bdc9457SAndroid Build Coastguard Worker i8 = (const float*) ((uintptr_t) i8 + input_offset);
2044*4bdc9457SAndroid Build Coastguard Worker }
2045*4bdc9457SAndroid Build Coastguard Worker const float* i9 = input[9];
2046*4bdc9457SAndroid Build Coastguard Worker assert(i9 != NULL);
2047*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i9 != zero) {
2048*4bdc9457SAndroid Build Coastguard Worker i9 = (const float*) ((uintptr_t) i9 + input_offset);
2049*4bdc9457SAndroid Build Coastguard Worker }
2050*4bdc9457SAndroid Build Coastguard Worker const float* i10 = input[10];
2051*4bdc9457SAndroid Build Coastguard Worker assert(i10 != NULL);
2052*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i10 != zero) {
2053*4bdc9457SAndroid Build Coastguard Worker i10 = (const float*) ((uintptr_t) i10 + input_offset);
2054*4bdc9457SAndroid Build Coastguard Worker }
2055*4bdc9457SAndroid Build Coastguard Worker const float* i11 = input[11];
2056*4bdc9457SAndroid Build Coastguard Worker assert(i11 != NULL);
2057*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i11 != zero) {
2058*4bdc9457SAndroid Build Coastguard Worker i11 = (const float*) ((uintptr_t) i11 + input_offset);
2059*4bdc9457SAndroid Build Coastguard Worker }
2060*4bdc9457SAndroid Build Coastguard Worker const float* i12 = input[12];
2061*4bdc9457SAndroid Build Coastguard Worker assert(i12 != NULL);
2062*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i12 != zero) {
2063*4bdc9457SAndroid Build Coastguard Worker i12 = (const float*) ((uintptr_t) i12 + input_offset);
2064*4bdc9457SAndroid Build Coastguard Worker }
2065*4bdc9457SAndroid Build Coastguard Worker const float* i13 = input[13];
2066*4bdc9457SAndroid Build Coastguard Worker assert(i13 != NULL);
2067*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i13 != zero) {
2068*4bdc9457SAndroid Build Coastguard Worker i13 = (const float*) ((uintptr_t) i13 + input_offset);
2069*4bdc9457SAndroid Build Coastguard Worker }
2070*4bdc9457SAndroid Build Coastguard Worker const float* i14 = input[14];
2071*4bdc9457SAndroid Build Coastguard Worker assert(i14 != NULL);
2072*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i14 != zero) {
2073*4bdc9457SAndroid Build Coastguard Worker i14 = (const float*) ((uintptr_t) i14 + input_offset);
2074*4bdc9457SAndroid Build Coastguard Worker }
2075*4bdc9457SAndroid Build Coastguard Worker const float* i15 = input[15];
2076*4bdc9457SAndroid Build Coastguard Worker assert(i15 != NULL);
2077*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i15 != zero) {
2078*4bdc9457SAndroid Build Coastguard Worker i15 = (const float*) ((uintptr_t) i15 + input_offset);
2079*4bdc9457SAndroid Build Coastguard Worker }
2080*4bdc9457SAndroid Build Coastguard Worker const float* i16 = input[16];
2081*4bdc9457SAndroid Build Coastguard Worker assert(i16 != NULL);
2082*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i16 != zero) {
2083*4bdc9457SAndroid Build Coastguard Worker i16 = (const float*) ((uintptr_t) i16 + input_offset);
2084*4bdc9457SAndroid Build Coastguard Worker }
2085*4bdc9457SAndroid Build Coastguard Worker const float* i17 = input[17];
2086*4bdc9457SAndroid Build Coastguard Worker assert(i17 != NULL);
2087*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i17 != zero) {
2088*4bdc9457SAndroid Build Coastguard Worker i17 = (const float*) ((uintptr_t) i17 + input_offset);
2089*4bdc9457SAndroid Build Coastguard Worker }
2090*4bdc9457SAndroid Build Coastguard Worker const float* i18 = input[18];
2091*4bdc9457SAndroid Build Coastguard Worker assert(i18 != NULL);
2092*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i18 != zero) {
2093*4bdc9457SAndroid Build Coastguard Worker i18 = (const float*) ((uintptr_t) i18 + input_offset);
2094*4bdc9457SAndroid Build Coastguard Worker }
2095*4bdc9457SAndroid Build Coastguard Worker const float* i19 = input[19];
2096*4bdc9457SAndroid Build Coastguard Worker assert(i19 != NULL);
2097*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i19 != zero) {
2098*4bdc9457SAndroid Build Coastguard Worker i19 = (const float*) ((uintptr_t) i19 + input_offset);
2099*4bdc9457SAndroid Build Coastguard Worker }
2100*4bdc9457SAndroid Build Coastguard Worker const float* i20 = input[20];
2101*4bdc9457SAndroid Build Coastguard Worker assert(i20 != NULL);
2102*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i20 != zero) {
2103*4bdc9457SAndroid Build Coastguard Worker i20 = (const float*) ((uintptr_t) i20 + input_offset);
2104*4bdc9457SAndroid Build Coastguard Worker }
2105*4bdc9457SAndroid Build Coastguard Worker const float* i21 = input[21];
2106*4bdc9457SAndroid Build Coastguard Worker assert(i21 != NULL);
2107*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i21 != zero) {
2108*4bdc9457SAndroid Build Coastguard Worker i21 = (const float*) ((uintptr_t) i21 + input_offset);
2109*4bdc9457SAndroid Build Coastguard Worker }
2110*4bdc9457SAndroid Build Coastguard Worker const float* i22 = input[22];
2111*4bdc9457SAndroid Build Coastguard Worker assert(i22 != NULL);
2112*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i22 != zero) {
2113*4bdc9457SAndroid Build Coastguard Worker i22 = (const float*) ((uintptr_t) i22 + input_offset);
2114*4bdc9457SAndroid Build Coastguard Worker }
2115*4bdc9457SAndroid Build Coastguard Worker const float* i23 = input[23];
2116*4bdc9457SAndroid Build Coastguard Worker assert(i23 != NULL);
2117*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i23 != zero) {
2118*4bdc9457SAndroid Build Coastguard Worker i23 = (const float*) ((uintptr_t) i23 + input_offset);
2119*4bdc9457SAndroid Build Coastguard Worker }
2120*4bdc9457SAndroid Build Coastguard Worker const float* i24 = input[24];
2121*4bdc9457SAndroid Build Coastguard Worker assert(i24 != NULL);
2122*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i24 != zero) {
2123*4bdc9457SAndroid Build Coastguard Worker i24 = (const float*) ((uintptr_t) i24 + input_offset);
2124*4bdc9457SAndroid Build Coastguard Worker }
2125*4bdc9457SAndroid Build Coastguard Worker input = (const float**) ((uintptr_t) input + input_stride);
2126*4bdc9457SAndroid Build Coastguard Worker
2127*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
2128*4bdc9457SAndroid Build Coastguard Worker const float* w = weights;
2129*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
2130*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
2131*4bdc9457SAndroid Build Coastguard Worker
2132*4bdc9457SAndroid Build Coastguard Worker
2133*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
2134*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
2135*4bdc9457SAndroid Build Coastguard Worker
2136*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
2137*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
2138*4bdc9457SAndroid Build Coastguard Worker
2139*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
2140*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
2141*4bdc9457SAndroid Build Coastguard Worker
2142*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
2143*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
2144*4bdc9457SAndroid Build Coastguard Worker
2145*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
2146*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
2147*4bdc9457SAndroid Build Coastguard Worker
2148*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
2149*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
2150*4bdc9457SAndroid Build Coastguard Worker
2151*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
2152*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
2153*4bdc9457SAndroid Build Coastguard Worker
2154*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
2155*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
2156*4bdc9457SAndroid Build Coastguard Worker
2157*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
2158*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
2159*4bdc9457SAndroid Build Coastguard Worker
2160*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
2161*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);
2162*4bdc9457SAndroid Build Coastguard Worker
2163*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
2164*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
2165*4bdc9457SAndroid Build Coastguard Worker
2166*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
2167*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0);
2168*4bdc9457SAndroid Build Coastguard Worker
2169*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
2170*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
2171*4bdc9457SAndroid Build Coastguard Worker
2172*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
2173*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0);
2174*4bdc9457SAndroid Build Coastguard Worker
2175*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
2176*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
2177*4bdc9457SAndroid Build Coastguard Worker
2178*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
2179*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0);
2180*4bdc9457SAndroid Build Coastguard Worker
2181*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
2182*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
2183*4bdc9457SAndroid Build Coastguard Worker
2184*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
2185*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0);
2186*4bdc9457SAndroid Build Coastguard Worker
2187*4bdc9457SAndroid Build Coastguard Worker const __m256 vi9x01234567 = _mm256_loadu_ps(i9);
2188*4bdc9457SAndroid Build Coastguard Worker i9 += 8;
2189*4bdc9457SAndroid Build Coastguard Worker
2190*4bdc9457SAndroid Build Coastguard Worker const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
2191*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p0);
2192*4bdc9457SAndroid Build Coastguard Worker
2193*4bdc9457SAndroid Build Coastguard Worker const __m256 vi10x01234567 = _mm256_loadu_ps(i10);
2194*4bdc9457SAndroid Build Coastguard Worker i10 += 8;
2195*4bdc9457SAndroid Build Coastguard Worker
2196*4bdc9457SAndroid Build Coastguard Worker const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
2197*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0);
2198*4bdc9457SAndroid Build Coastguard Worker
2199*4bdc9457SAndroid Build Coastguard Worker const __m256 vi11x01234567 = _mm256_loadu_ps(i11);
2200*4bdc9457SAndroid Build Coastguard Worker i11 += 8;
2201*4bdc9457SAndroid Build Coastguard Worker
2202*4bdc9457SAndroid Build Coastguard Worker const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
2203*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p0);
2204*4bdc9457SAndroid Build Coastguard Worker
2205*4bdc9457SAndroid Build Coastguard Worker const __m256 vi12x01234567 = _mm256_loadu_ps(i12);
2206*4bdc9457SAndroid Build Coastguard Worker i12 += 8;
2207*4bdc9457SAndroid Build Coastguard Worker
2208*4bdc9457SAndroid Build Coastguard Worker const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
2209*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0);
2210*4bdc9457SAndroid Build Coastguard Worker
2211*4bdc9457SAndroid Build Coastguard Worker const __m256 vi13x01234567 = _mm256_loadu_ps(i13);
2212*4bdc9457SAndroid Build Coastguard Worker i13 += 8;
2213*4bdc9457SAndroid Build Coastguard Worker
2214*4bdc9457SAndroid Build Coastguard Worker const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
2215*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p0);
2216*4bdc9457SAndroid Build Coastguard Worker
2217*4bdc9457SAndroid Build Coastguard Worker const __m256 vi14x01234567 = _mm256_loadu_ps(i14);
2218*4bdc9457SAndroid Build Coastguard Worker i14 += 8;
2219*4bdc9457SAndroid Build Coastguard Worker
2220*4bdc9457SAndroid Build Coastguard Worker const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
2221*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0);
2222*4bdc9457SAndroid Build Coastguard Worker
2223*4bdc9457SAndroid Build Coastguard Worker const __m256 vi15x01234567 = _mm256_loadu_ps(i15);
2224*4bdc9457SAndroid Build Coastguard Worker i15 += 8;
2225*4bdc9457SAndroid Build Coastguard Worker
2226*4bdc9457SAndroid Build Coastguard Worker const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
2227*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p0);
2228*4bdc9457SAndroid Build Coastguard Worker
2229*4bdc9457SAndroid Build Coastguard Worker const __m256 vi16x01234567 = _mm256_loadu_ps(i16);
2230*4bdc9457SAndroid Build Coastguard Worker i16 += 8;
2231*4bdc9457SAndroid Build Coastguard Worker
2232*4bdc9457SAndroid Build Coastguard Worker const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
2233*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0);
2234*4bdc9457SAndroid Build Coastguard Worker
2235*4bdc9457SAndroid Build Coastguard Worker const __m256 vi17x01234567 = _mm256_loadu_ps(i17);
2236*4bdc9457SAndroid Build Coastguard Worker i17 += 8;
2237*4bdc9457SAndroid Build Coastguard Worker
2238*4bdc9457SAndroid Build Coastguard Worker const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
2239*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p0);
2240*4bdc9457SAndroid Build Coastguard Worker
2241*4bdc9457SAndroid Build Coastguard Worker const __m256 vi18x01234567 = _mm256_loadu_ps(i18);
2242*4bdc9457SAndroid Build Coastguard Worker i18 += 8;
2243*4bdc9457SAndroid Build Coastguard Worker
2244*4bdc9457SAndroid Build Coastguard Worker const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
2245*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0);
2246*4bdc9457SAndroid Build Coastguard Worker
2247*4bdc9457SAndroid Build Coastguard Worker const __m256 vi19x01234567 = _mm256_loadu_ps(i19);
2248*4bdc9457SAndroid Build Coastguard Worker i19 += 8;
2249*4bdc9457SAndroid Build Coastguard Worker
2250*4bdc9457SAndroid Build Coastguard Worker const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
2251*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p0);
2252*4bdc9457SAndroid Build Coastguard Worker
2253*4bdc9457SAndroid Build Coastguard Worker const __m256 vi20x01234567 = _mm256_loadu_ps(i20);
2254*4bdc9457SAndroid Build Coastguard Worker i20 += 8;
2255*4bdc9457SAndroid Build Coastguard Worker
2256*4bdc9457SAndroid Build Coastguard Worker const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
2257*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0);
2258*4bdc9457SAndroid Build Coastguard Worker
2259*4bdc9457SAndroid Build Coastguard Worker const __m256 vi21x01234567 = _mm256_loadu_ps(i21);
2260*4bdc9457SAndroid Build Coastguard Worker i21 += 8;
2261*4bdc9457SAndroid Build Coastguard Worker
2262*4bdc9457SAndroid Build Coastguard Worker const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
2263*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p0);
2264*4bdc9457SAndroid Build Coastguard Worker
2265*4bdc9457SAndroid Build Coastguard Worker const __m256 vi22x01234567 = _mm256_loadu_ps(i22);
2266*4bdc9457SAndroid Build Coastguard Worker i22 += 8;
2267*4bdc9457SAndroid Build Coastguard Worker
2268*4bdc9457SAndroid Build Coastguard Worker const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
2269*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0);
2270*4bdc9457SAndroid Build Coastguard Worker
2271*4bdc9457SAndroid Build Coastguard Worker const __m256 vi23x01234567 = _mm256_loadu_ps(i23);
2272*4bdc9457SAndroid Build Coastguard Worker i23 += 8;
2273*4bdc9457SAndroid Build Coastguard Worker
2274*4bdc9457SAndroid Build Coastguard Worker const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
2275*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p0);
2276*4bdc9457SAndroid Build Coastguard Worker
2277*4bdc9457SAndroid Build Coastguard Worker const __m256 vi24x01234567 = _mm256_loadu_ps(i24);
2278*4bdc9457SAndroid Build Coastguard Worker i24 += 8;
2279*4bdc9457SAndroid Build Coastguard Worker
2280*4bdc9457SAndroid Build Coastguard Worker const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
2281*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0);
2282*4bdc9457SAndroid Build Coastguard Worker
2283*4bdc9457SAndroid Build Coastguard Worker w += 208;
2284*4bdc9457SAndroid Build Coastguard Worker
2285*4bdc9457SAndroid Build Coastguard Worker
2286*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
2287*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
2288*4bdc9457SAndroid Build Coastguard Worker
2289*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc01234567);
2290*4bdc9457SAndroid Build Coastguard Worker output += 8;
2291*4bdc9457SAndroid Build Coastguard Worker }
2292*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
2293*4bdc9457SAndroid Build Coastguard Worker assert(c >= 1);
2294*4bdc9457SAndroid Build Coastguard Worker assert(c <= 7);
2295*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) ¶ms->avx.mask_table[7 - c]);
2296*4bdc9457SAndroid Build Coastguard Worker
2297*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567p0 = _mm256_load_ps(w);
2298*4bdc9457SAndroid Build Coastguard Worker
2299*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
2300*4bdc9457SAndroid Build Coastguard Worker const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
2301*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0);
2302*4bdc9457SAndroid Build Coastguard Worker
2303*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
2304*4bdc9457SAndroid Build Coastguard Worker const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
2305*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0);
2306*4bdc9457SAndroid Build Coastguard Worker
2307*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
2308*4bdc9457SAndroid Build Coastguard Worker const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
2309*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0);
2310*4bdc9457SAndroid Build Coastguard Worker
2311*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
2312*4bdc9457SAndroid Build Coastguard Worker const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
2313*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0);
2314*4bdc9457SAndroid Build Coastguard Worker
2315*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
2316*4bdc9457SAndroid Build Coastguard Worker const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
2317*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);
2318*4bdc9457SAndroid Build Coastguard Worker
2319*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
2320*4bdc9457SAndroid Build Coastguard Worker const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
2321*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0);
2322*4bdc9457SAndroid Build Coastguard Worker
2323*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
2324*4bdc9457SAndroid Build Coastguard Worker const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
2325*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0);
2326*4bdc9457SAndroid Build Coastguard Worker
2327*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
2328*4bdc9457SAndroid Build Coastguard Worker const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
2329*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0);
2330*4bdc9457SAndroid Build Coastguard Worker
2331*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
2332*4bdc9457SAndroid Build Coastguard Worker const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
2333*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0);
2334*4bdc9457SAndroid Build Coastguard Worker
2335*4bdc9457SAndroid Build Coastguard Worker const __m256 vi9x01234567 = _mm256_maskload_ps(i9, vmask);
2336*4bdc9457SAndroid Build Coastguard Worker const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
2337*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p0);
2338*4bdc9457SAndroid Build Coastguard Worker
2339*4bdc9457SAndroid Build Coastguard Worker const __m256 vi10x01234567 = _mm256_maskload_ps(i10, vmask);
2340*4bdc9457SAndroid Build Coastguard Worker const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
2341*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0);
2342*4bdc9457SAndroid Build Coastguard Worker
2343*4bdc9457SAndroid Build Coastguard Worker const __m256 vi11x01234567 = _mm256_maskload_ps(i11, vmask);
2344*4bdc9457SAndroid Build Coastguard Worker const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
2345*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p0);
2346*4bdc9457SAndroid Build Coastguard Worker
2347*4bdc9457SAndroid Build Coastguard Worker const __m256 vi12x01234567 = _mm256_maskload_ps(i12, vmask);
2348*4bdc9457SAndroid Build Coastguard Worker const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
2349*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0);
2350*4bdc9457SAndroid Build Coastguard Worker
2351*4bdc9457SAndroid Build Coastguard Worker const __m256 vi13x01234567 = _mm256_maskload_ps(i13, vmask);
2352*4bdc9457SAndroid Build Coastguard Worker const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
2353*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p0);
2354*4bdc9457SAndroid Build Coastguard Worker
2355*4bdc9457SAndroid Build Coastguard Worker const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask);
2356*4bdc9457SAndroid Build Coastguard Worker const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
2357*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0);
2358*4bdc9457SAndroid Build Coastguard Worker
2359*4bdc9457SAndroid Build Coastguard Worker const __m256 vi15x01234567 = _mm256_maskload_ps(i15, vmask);
2360*4bdc9457SAndroid Build Coastguard Worker const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
2361*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p0);
2362*4bdc9457SAndroid Build Coastguard Worker
2363*4bdc9457SAndroid Build Coastguard Worker const __m256 vi16x01234567 = _mm256_maskload_ps(i16, vmask);
2364*4bdc9457SAndroid Build Coastguard Worker const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
2365*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0);
2366*4bdc9457SAndroid Build Coastguard Worker
2367*4bdc9457SAndroid Build Coastguard Worker const __m256 vi17x01234567 = _mm256_maskload_ps(i17, vmask);
2368*4bdc9457SAndroid Build Coastguard Worker const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
2369*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p0);
2370*4bdc9457SAndroid Build Coastguard Worker
2371*4bdc9457SAndroid Build Coastguard Worker const __m256 vi18x01234567 = _mm256_maskload_ps(i18, vmask);
2372*4bdc9457SAndroid Build Coastguard Worker const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
2373*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0);
2374*4bdc9457SAndroid Build Coastguard Worker
2375*4bdc9457SAndroid Build Coastguard Worker const __m256 vi19x01234567 = _mm256_maskload_ps(i19, vmask);
2376*4bdc9457SAndroid Build Coastguard Worker const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
2377*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p0);
2378*4bdc9457SAndroid Build Coastguard Worker
2379*4bdc9457SAndroid Build Coastguard Worker const __m256 vi20x01234567 = _mm256_maskload_ps(i20, vmask);
2380*4bdc9457SAndroid Build Coastguard Worker const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
2381*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0);
2382*4bdc9457SAndroid Build Coastguard Worker
2383*4bdc9457SAndroid Build Coastguard Worker const __m256 vi21x01234567 = _mm256_maskload_ps(i21, vmask);
2384*4bdc9457SAndroid Build Coastguard Worker const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
2385*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p0);
2386*4bdc9457SAndroid Build Coastguard Worker
2387*4bdc9457SAndroid Build Coastguard Worker const __m256 vi22x01234567 = _mm256_maskload_ps(i22, vmask);
2388*4bdc9457SAndroid Build Coastguard Worker const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
2389*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0);
2390*4bdc9457SAndroid Build Coastguard Worker
2391*4bdc9457SAndroid Build Coastguard Worker const __m256 vi23x01234567 = _mm256_maskload_ps(i23, vmask);
2392*4bdc9457SAndroid Build Coastguard Worker const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
2393*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p0);
2394*4bdc9457SAndroid Build Coastguard Worker
2395*4bdc9457SAndroid Build Coastguard Worker const __m256 vi24x01234567 = _mm256_maskload_ps(i24, vmask);
2396*4bdc9457SAndroid Build Coastguard Worker const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
2397*4bdc9457SAndroid Build Coastguard Worker vacc01234567p0 = _mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0);
2398*4bdc9457SAndroid Build Coastguard Worker
2399*4bdc9457SAndroid Build Coastguard Worker
2400*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
2401*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
2402*4bdc9457SAndroid Build Coastguard Worker
2403*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
2404*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
2405*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(output, vacc0123);
2406*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
2407*4bdc9457SAndroid Build Coastguard Worker output += 4;
2408*4bdc9457SAndroid Build Coastguard Worker }
2409*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
2410*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) output, vacc0123);
2411*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
2412*4bdc9457SAndroid Build Coastguard Worker output += 2;
2413*4bdc9457SAndroid Build Coastguard Worker }
2414*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
2415*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(output, vacc0123);
2416*4bdc9457SAndroid Build Coastguard Worker output += 1;
2417*4bdc9457SAndroid Build Coastguard Worker }
2418*4bdc9457SAndroid Build Coastguard Worker }
2419*4bdc9457SAndroid Build Coastguard Worker
2420*4bdc9457SAndroid Build Coastguard Worker output = (float*) ((uintptr_t) output + output_increment);
2421*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
2422*4bdc9457SAndroid Build Coastguard Worker }
2423*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2424*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast(
2425*4bdc9457SAndroid Build Coastguard Worker size_t mr,
2426*4bdc9457SAndroid Build Coastguard Worker size_t nc,
2427*4bdc9457SAndroid Build Coastguard Worker size_t kc,
2428*4bdc9457SAndroid Build Coastguard Worker const float*restrict a,
2429*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
2430*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
2431*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
2432*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
2433*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
2434*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2435*4bdc9457SAndroid Build Coastguard Worker {
2436*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
2437*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
2438*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
2439*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
2440*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
2441*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
2442*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
2443*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
2444*4bdc9457SAndroid Build Coastguard Worker
2445*4bdc9457SAndroid Build Coastguard Worker const float* a0 = a;
2446*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
2447*4bdc9457SAndroid Build Coastguard Worker
2448*4bdc9457SAndroid Build Coastguard Worker do {
2449*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
2450*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
2451*4bdc9457SAndroid Build Coastguard Worker w += 16;
2452*4bdc9457SAndroid Build Coastguard Worker
2453*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
2454*4bdc9457SAndroid Build Coastguard Worker do {
2455*4bdc9457SAndroid Build Coastguard Worker const __m256 va0 = _mm256_broadcast_ss(a0);
2456*4bdc9457SAndroid Build Coastguard Worker a0 += 1;
2457*4bdc9457SAndroid Build Coastguard Worker
2458*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_load_ps(w);
2459*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
2460*4bdc9457SAndroid Build Coastguard Worker w += 16;
2461*4bdc9457SAndroid Build Coastguard Worker
2462*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
2463*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF);
2464*4bdc9457SAndroid Build Coastguard Worker
2465*4bdc9457SAndroid Build Coastguard Worker k -= sizeof(float);
2466*4bdc9457SAndroid Build Coastguard Worker } while (k != 0);
2467*4bdc9457SAndroid Build Coastguard Worker
2468*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
2469*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
2470*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
2471*4bdc9457SAndroid Build Coastguard Worker
2472*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
2473*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
2474*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
2475*4bdc9457SAndroid Build Coastguard Worker
2476*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
2477*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2478*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
2479*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
2480*4bdc9457SAndroid Build Coastguard Worker
2481*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 - kc);
2482*4bdc9457SAndroid Build Coastguard Worker
2483*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
2484*4bdc9457SAndroid Build Coastguard Worker } else {
2485*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
2486*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2487*4bdc9457SAndroid Build Coastguard Worker
2488*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
2489*4bdc9457SAndroid Build Coastguard Worker
2490*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
2491*4bdc9457SAndroid Build Coastguard Worker }
2492*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
2493*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
2494*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
2495*4bdc9457SAndroid Build Coastguard Worker
2496*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
2497*4bdc9457SAndroid Build Coastguard Worker
2498*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
2499*4bdc9457SAndroid Build Coastguard Worker }
2500*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
2501*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
2502*4bdc9457SAndroid Build Coastguard Worker
2503*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
2504*4bdc9457SAndroid Build Coastguard Worker
2505*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
2506*4bdc9457SAndroid Build Coastguard Worker }
2507*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
2508*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
2509*4bdc9457SAndroid Build Coastguard Worker }
2510*4bdc9457SAndroid Build Coastguard Worker
2511*4bdc9457SAndroid Build Coastguard Worker nc = 0;
2512*4bdc9457SAndroid Build Coastguard Worker }
2513*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
2514*4bdc9457SAndroid Build Coastguard Worker }
2515*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2516*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast(
2517*4bdc9457SAndroid Build Coastguard Worker size_t mr,
2518*4bdc9457SAndroid Build Coastguard Worker size_t nc,
2519*4bdc9457SAndroid Build Coastguard Worker size_t kc,
2520*4bdc9457SAndroid Build Coastguard Worker const float*restrict a,
2521*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
2522*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
2523*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
2524*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
2525*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
2526*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2527*4bdc9457SAndroid Build Coastguard Worker {
2528*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
2529*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
2530*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
2531*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
2532*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
2533*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
2534*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
2535*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
2536*4bdc9457SAndroid Build Coastguard Worker
2537*4bdc9457SAndroid Build Coastguard Worker const float* a0 = a;
2538*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
2539*4bdc9457SAndroid Build Coastguard Worker
2540*4bdc9457SAndroid Build Coastguard Worker do {
2541*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
2542*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
2543*4bdc9457SAndroid Build Coastguard Worker w += 16;
2544*4bdc9457SAndroid Build Coastguard Worker
2545*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
2546*4bdc9457SAndroid Build Coastguard Worker while (k >= 4 * sizeof(float)) {
2547*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
2548*4bdc9457SAndroid Build Coastguard Worker a0 += 4;
2549*4bdc9457SAndroid Build Coastguard Worker
2550*4bdc9457SAndroid Build Coastguard Worker
2551*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
2552*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
2553*4bdc9457SAndroid Build Coastguard Worker
2554*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
2555*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
2556*4bdc9457SAndroid Build Coastguard Worker
2557*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2558*4bdc9457SAndroid Build Coastguard Worker
2559*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
2560*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
2561*4bdc9457SAndroid Build Coastguard Worker
2562*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
2563*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
2564*4bdc9457SAndroid Build Coastguard Worker
2565*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2566*4bdc9457SAndroid Build Coastguard Worker
2567*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
2568*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
2569*4bdc9457SAndroid Build Coastguard Worker
2570*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
2571*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
2572*4bdc9457SAndroid Build Coastguard Worker
2573*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2574*4bdc9457SAndroid Build Coastguard Worker
2575*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
2576*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
2577*4bdc9457SAndroid Build Coastguard Worker
2578*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
2579*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
2580*4bdc9457SAndroid Build Coastguard Worker
2581*4bdc9457SAndroid Build Coastguard Worker
2582*4bdc9457SAndroid Build Coastguard Worker w += 64;
2583*4bdc9457SAndroid Build Coastguard Worker k -= 4 * sizeof(float);
2584*4bdc9457SAndroid Build Coastguard Worker }
2585*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(k != 0) {
2586*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
2587*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + k);
2588*4bdc9457SAndroid Build Coastguard Worker
2589*4bdc9457SAndroid Build Coastguard Worker const __m256 vzero = _mm256_setzero_ps();
2590*4bdc9457SAndroid Build Coastguard Worker
2591*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
2592*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
2593*4bdc9457SAndroid Build Coastguard Worker
2594*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc0x01234567);
2595*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc0x89ABCDEF);
2596*4bdc9457SAndroid Build Coastguard Worker
2597*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2598*4bdc9457SAndroid Build Coastguard Worker
2599*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
2600*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
2601*4bdc9457SAndroid Build Coastguard Worker
2602*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc0x01234567);
2603*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc0x89ABCDEF);
2604*4bdc9457SAndroid Build Coastguard Worker
2605*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2606*4bdc9457SAndroid Build Coastguard Worker
2607*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
2608*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
2609*4bdc9457SAndroid Build Coastguard Worker
2610*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc0x01234567);
2611*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc0x89ABCDEF);
2612*4bdc9457SAndroid Build Coastguard Worker
2613*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2614*4bdc9457SAndroid Build Coastguard Worker
2615*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
2616*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
2617*4bdc9457SAndroid Build Coastguard Worker
2618*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc0x01234567);
2619*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc0x89ABCDEF);
2620*4bdc9457SAndroid Build Coastguard Worker
2621*4bdc9457SAndroid Build Coastguard Worker
2622*4bdc9457SAndroid Build Coastguard Worker w += 64;
2623*4bdc9457SAndroid Build Coastguard Worker }
2624*4bdc9457SAndroid Build Coastguard Worker
2625*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
2626*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
2627*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
2628*4bdc9457SAndroid Build Coastguard Worker
2629*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
2630*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
2631*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
2632*4bdc9457SAndroid Build Coastguard Worker
2633*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
2634*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2635*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
2636*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
2637*4bdc9457SAndroid Build Coastguard Worker
2638*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 - kc);
2639*4bdc9457SAndroid Build Coastguard Worker
2640*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
2641*4bdc9457SAndroid Build Coastguard Worker } else {
2642*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
2643*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2644*4bdc9457SAndroid Build Coastguard Worker
2645*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
2646*4bdc9457SAndroid Build Coastguard Worker
2647*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
2648*4bdc9457SAndroid Build Coastguard Worker }
2649*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
2650*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
2651*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
2652*4bdc9457SAndroid Build Coastguard Worker
2653*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
2654*4bdc9457SAndroid Build Coastguard Worker
2655*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
2656*4bdc9457SAndroid Build Coastguard Worker }
2657*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
2658*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
2659*4bdc9457SAndroid Build Coastguard Worker
2660*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
2661*4bdc9457SAndroid Build Coastguard Worker
2662*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
2663*4bdc9457SAndroid Build Coastguard Worker }
2664*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
2665*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
2666*4bdc9457SAndroid Build Coastguard Worker }
2667*4bdc9457SAndroid Build Coastguard Worker
2668*4bdc9457SAndroid Build Coastguard Worker nc = 0;
2669*4bdc9457SAndroid Build Coastguard Worker }
2670*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
2671*4bdc9457SAndroid Build Coastguard Worker }
2672*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2673*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast(
2674*4bdc9457SAndroid Build Coastguard Worker size_t mr,
2675*4bdc9457SAndroid Build Coastguard Worker size_t nc,
2676*4bdc9457SAndroid Build Coastguard Worker size_t kc,
2677*4bdc9457SAndroid Build Coastguard Worker const float*restrict a,
2678*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
2679*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
2680*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
2681*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
2682*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
2683*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2684*4bdc9457SAndroid Build Coastguard Worker {
2685*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
2686*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
2687*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
2688*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
2689*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
2690*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
2691*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
2692*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
2693*4bdc9457SAndroid Build Coastguard Worker
2694*4bdc9457SAndroid Build Coastguard Worker const float* a0 = a;
2695*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
2696*4bdc9457SAndroid Build Coastguard Worker const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
2697*4bdc9457SAndroid Build Coastguard Worker float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
2698*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
2699*4bdc9457SAndroid Build Coastguard Worker a1 = a0;
2700*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
2701*4bdc9457SAndroid Build Coastguard Worker }
2702*4bdc9457SAndroid Build Coastguard Worker const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
2703*4bdc9457SAndroid Build Coastguard Worker float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
2704*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
2705*4bdc9457SAndroid Build Coastguard Worker a2 = a1;
2706*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
2707*4bdc9457SAndroid Build Coastguard Worker }
2708*4bdc9457SAndroid Build Coastguard Worker const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
2709*4bdc9457SAndroid Build Coastguard Worker float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
2710*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
2711*4bdc9457SAndroid Build Coastguard Worker a3 = a2;
2712*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
2713*4bdc9457SAndroid Build Coastguard Worker }
2714*4bdc9457SAndroid Build Coastguard Worker
2715*4bdc9457SAndroid Build Coastguard Worker do {
2716*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
2717*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
2718*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = vacc0x01234567;
2719*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
2720*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x01234567 = vacc0x01234567;
2721*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
2722*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x01234567 = vacc0x01234567;
2723*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
2724*4bdc9457SAndroid Build Coastguard Worker w += 16;
2725*4bdc9457SAndroid Build Coastguard Worker
2726*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
2727*4bdc9457SAndroid Build Coastguard Worker while (k >= 4 * sizeof(float)) {
2728*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
2729*4bdc9457SAndroid Build Coastguard Worker a0 += 4;
2730*4bdc9457SAndroid Build Coastguard Worker __m256 va1 = _mm256_broadcast_ps((const __m128*) a1);
2731*4bdc9457SAndroid Build Coastguard Worker a1 += 4;
2732*4bdc9457SAndroid Build Coastguard Worker __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);
2733*4bdc9457SAndroid Build Coastguard Worker a2 += 4;
2734*4bdc9457SAndroid Build Coastguard Worker __m256 va3 = _mm256_broadcast_ps((const __m128*) a3);
2735*4bdc9457SAndroid Build Coastguard Worker a3 += 4;
2736*4bdc9457SAndroid Build Coastguard Worker
2737*4bdc9457SAndroid Build Coastguard Worker
2738*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
2739*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
2740*4bdc9457SAndroid Build Coastguard Worker
2741*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
2742*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567);
2743*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
2744*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567);
2745*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
2746*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF);
2747*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
2748*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF);
2749*4bdc9457SAndroid Build Coastguard Worker
2750*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2751*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2752*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2753*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2754*4bdc9457SAndroid Build Coastguard Worker
2755*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
2756*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
2757*4bdc9457SAndroid Build Coastguard Worker
2758*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
2759*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c1, vacc1x01234567);
2760*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
2761*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c1, vacc3x01234567);
2762*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
2763*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc1, vacc1x89ABCDEF);
2764*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
2765*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF);
2766*4bdc9457SAndroid Build Coastguard Worker
2767*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2768*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2769*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2770*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2771*4bdc9457SAndroid Build Coastguard Worker
2772*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
2773*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
2774*4bdc9457SAndroid Build Coastguard Worker
2775*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
2776*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
2777*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
2778*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
2779*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
2780*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc2, vacc1x89ABCDEF);
2781*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
2782*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF);
2783*4bdc9457SAndroid Build Coastguard Worker
2784*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2785*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2786*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2787*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2788*4bdc9457SAndroid Build Coastguard Worker
2789*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
2790*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
2791*4bdc9457SAndroid Build Coastguard Worker
2792*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
2793*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
2794*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
2795*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
2796*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
2797*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
2798*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
2799*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
2800*4bdc9457SAndroid Build Coastguard Worker
2801*4bdc9457SAndroid Build Coastguard Worker
2802*4bdc9457SAndroid Build Coastguard Worker w += 64;
2803*4bdc9457SAndroid Build Coastguard Worker k -= 4 * sizeof(float);
2804*4bdc9457SAndroid Build Coastguard Worker }
2805*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(k != 0) {
2806*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
2807*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + k);
2808*4bdc9457SAndroid Build Coastguard Worker __m256 va1 = _mm256_broadcast_ps((const __m128*) a1);
2809*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 + k);
2810*4bdc9457SAndroid Build Coastguard Worker __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);
2811*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 + k);
2812*4bdc9457SAndroid Build Coastguard Worker __m256 va3 = _mm256_broadcast_ps((const __m128*) a3);
2813*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 + k);
2814*4bdc9457SAndroid Build Coastguard Worker
2815*4bdc9457SAndroid Build Coastguard Worker const __m256 vzero = _mm256_setzero_ps();
2816*4bdc9457SAndroid Build Coastguard Worker
2817*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
2818*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
2819*4bdc9457SAndroid Build Coastguard Worker
2820*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc0x01234567);
2821*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc1x01234567);
2822*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc2x01234567);
2823*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc3x01234567);
2824*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc0x89ABCDEF);
2825*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc1x89ABCDEF);
2826*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc2x89ABCDEF);
2827*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc3x89ABCDEF);
2828*4bdc9457SAndroid Build Coastguard Worker
2829*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2830*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2831*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2832*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2833*4bdc9457SAndroid Build Coastguard Worker
2834*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
2835*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
2836*4bdc9457SAndroid Build Coastguard Worker
2837*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc0x01234567);
2838*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc1x01234567);
2839*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc2x01234567);
2840*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc3x01234567);
2841*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc0x89ABCDEF);
2842*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc1x89ABCDEF);
2843*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc2x89ABCDEF);
2844*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc3x89ABCDEF);
2845*4bdc9457SAndroid Build Coastguard Worker
2846*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2847*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2848*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2849*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2850*4bdc9457SAndroid Build Coastguard Worker
2851*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
2852*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
2853*4bdc9457SAndroid Build Coastguard Worker
2854*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc0x01234567);
2855*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc1x01234567);
2856*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc2x01234567);
2857*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc3x01234567);
2858*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc0x89ABCDEF);
2859*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc1x89ABCDEF);
2860*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc2x89ABCDEF);
2861*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc3x89ABCDEF);
2862*4bdc9457SAndroid Build Coastguard Worker
2863*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
2864*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
2865*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
2866*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
2867*4bdc9457SAndroid Build Coastguard Worker
2868*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
2869*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
2870*4bdc9457SAndroid Build Coastguard Worker
2871*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc0x01234567);
2872*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc1x01234567);
2873*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc2x01234567);
2874*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc3x01234567);
2875*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc0x89ABCDEF);
2876*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc1x89ABCDEF);
2877*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc2x89ABCDEF);
2878*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc3x89ABCDEF);
2879*4bdc9457SAndroid Build Coastguard Worker
2880*4bdc9457SAndroid Build Coastguard Worker
2881*4bdc9457SAndroid Build Coastguard Worker w += 64;
2882*4bdc9457SAndroid Build Coastguard Worker }
2883*4bdc9457SAndroid Build Coastguard Worker
2884*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
2885*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
2886*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
2887*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
2888*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
2889*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
2890*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
2891*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
2892*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
2893*4bdc9457SAndroid Build Coastguard Worker
2894*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
2895*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
2896*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
2897*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
2898*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
2899*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
2900*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
2901*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
2902*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
2903*4bdc9457SAndroid Build Coastguard Worker
2904*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
2905*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
2906*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
2907*4bdc9457SAndroid Build Coastguard Worker c3 = (float*) ((uintptr_t) c3 + cn_stride);
2908*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
2909*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
2910*4bdc9457SAndroid Build Coastguard Worker c2 = (float*) ((uintptr_t) c2 + cn_stride);
2911*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
2912*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
2913*4bdc9457SAndroid Build Coastguard Worker c1 = (float*) ((uintptr_t) c1 + cn_stride);
2914*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2915*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
2916*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
2917*4bdc9457SAndroid Build Coastguard Worker
2918*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 - kc);
2919*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 - kc);
2920*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 - kc);
2921*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 - kc);
2922*4bdc9457SAndroid Build Coastguard Worker
2923*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
2924*4bdc9457SAndroid Build Coastguard Worker } else {
2925*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
2926*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
2927*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
2928*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
2929*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
2930*4bdc9457SAndroid Build Coastguard Worker
2931*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = vacc3x89ABCDEF;
2932*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = vacc2x89ABCDEF;
2933*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = vacc1x89ABCDEF;
2934*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
2935*4bdc9457SAndroid Build Coastguard Worker
2936*4bdc9457SAndroid Build Coastguard Worker c3 += 8;
2937*4bdc9457SAndroid Build Coastguard Worker c2 += 8;
2938*4bdc9457SAndroid Build Coastguard Worker c1 += 8;
2939*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
2940*4bdc9457SAndroid Build Coastguard Worker }
2941*4bdc9457SAndroid Build Coastguard Worker __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
2942*4bdc9457SAndroid Build Coastguard Worker __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
2943*4bdc9457SAndroid Build Coastguard Worker __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
2944*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
2945*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
2946*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c3, vacc3x0123);
2947*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c2, vacc2x0123);
2948*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c1, vacc1x0123);
2949*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
2950*4bdc9457SAndroid Build Coastguard Worker
2951*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
2952*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
2953*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
2954*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
2955*4bdc9457SAndroid Build Coastguard Worker
2956*4bdc9457SAndroid Build Coastguard Worker c3 += 4;
2957*4bdc9457SAndroid Build Coastguard Worker c2 += 4;
2958*4bdc9457SAndroid Build Coastguard Worker c1 += 4;
2959*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
2960*4bdc9457SAndroid Build Coastguard Worker }
2961*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
2962*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c3, vacc3x0123);
2963*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c2, vacc2x0123);
2964*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c1, vacc1x0123);
2965*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
2966*4bdc9457SAndroid Build Coastguard Worker
2967*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
2968*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
2969*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
2970*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
2971*4bdc9457SAndroid Build Coastguard Worker
2972*4bdc9457SAndroid Build Coastguard Worker c3 += 2;
2973*4bdc9457SAndroid Build Coastguard Worker c2 += 2;
2974*4bdc9457SAndroid Build Coastguard Worker c1 += 2;
2975*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
2976*4bdc9457SAndroid Build Coastguard Worker }
2977*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
2978*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c3, vacc3x0123);
2979*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c2, vacc2x0123);
2980*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c1, vacc1x0123);
2981*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
2982*4bdc9457SAndroid Build Coastguard Worker }
2983*4bdc9457SAndroid Build Coastguard Worker
2984*4bdc9457SAndroid Build Coastguard Worker nc = 0;
2985*4bdc9457SAndroid Build Coastguard Worker }
2986*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
2987*4bdc9457SAndroid Build Coastguard Worker }
2988*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2989*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast(
2990*4bdc9457SAndroid Build Coastguard Worker size_t mr,
2991*4bdc9457SAndroid Build Coastguard Worker size_t nc,
2992*4bdc9457SAndroid Build Coastguard Worker size_t kc,
2993*4bdc9457SAndroid Build Coastguard Worker const float*restrict a,
2994*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
2995*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
2996*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
2997*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
2998*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
2999*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3000*4bdc9457SAndroid Build Coastguard Worker {
3001*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3002*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 5);
3003*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3004*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3005*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
3006*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3007*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3008*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3009*4bdc9457SAndroid Build Coastguard Worker
3010*4bdc9457SAndroid Build Coastguard Worker const float* a0 = a;
3011*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
3012*4bdc9457SAndroid Build Coastguard Worker const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
3013*4bdc9457SAndroid Build Coastguard Worker float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
3014*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
3015*4bdc9457SAndroid Build Coastguard Worker a1 = a0;
3016*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
3017*4bdc9457SAndroid Build Coastguard Worker }
3018*4bdc9457SAndroid Build Coastguard Worker const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
3019*4bdc9457SAndroid Build Coastguard Worker float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
3020*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
3021*4bdc9457SAndroid Build Coastguard Worker a2 = a1;
3022*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
3023*4bdc9457SAndroid Build Coastguard Worker }
3024*4bdc9457SAndroid Build Coastguard Worker const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
3025*4bdc9457SAndroid Build Coastguard Worker float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
3026*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 4) {
3027*4bdc9457SAndroid Build Coastguard Worker a3 = a2;
3028*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
3029*4bdc9457SAndroid Build Coastguard Worker }
3030*4bdc9457SAndroid Build Coastguard Worker const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
3031*4bdc9457SAndroid Build Coastguard Worker float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
3032*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 4) {
3033*4bdc9457SAndroid Build Coastguard Worker a4 = a3;
3034*4bdc9457SAndroid Build Coastguard Worker c4 = c3;
3035*4bdc9457SAndroid Build Coastguard Worker }
3036*4bdc9457SAndroid Build Coastguard Worker
3037*4bdc9457SAndroid Build Coastguard Worker do {
3038*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
3039*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
3040*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = vacc0x01234567;
3041*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
3042*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x01234567 = vacc0x01234567;
3043*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
3044*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x01234567 = vacc0x01234567;
3045*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
3046*4bdc9457SAndroid Build Coastguard Worker __m256 vacc4x01234567 = vacc0x01234567;
3047*4bdc9457SAndroid Build Coastguard Worker __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
3048*4bdc9457SAndroid Build Coastguard Worker w += 16;
3049*4bdc9457SAndroid Build Coastguard Worker
3050*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
3051*4bdc9457SAndroid Build Coastguard Worker do {
3052*4bdc9457SAndroid Build Coastguard Worker const __m256 va0 = _mm256_broadcast_ss(a0);
3053*4bdc9457SAndroid Build Coastguard Worker a0 += 1;
3054*4bdc9457SAndroid Build Coastguard Worker const __m256 va1 = _mm256_broadcast_ss(a1);
3055*4bdc9457SAndroid Build Coastguard Worker a1 += 1;
3056*4bdc9457SAndroid Build Coastguard Worker const __m256 va2 = _mm256_broadcast_ss(a2);
3057*4bdc9457SAndroid Build Coastguard Worker a2 += 1;
3058*4bdc9457SAndroid Build Coastguard Worker const __m256 va3 = _mm256_broadcast_ss(a3);
3059*4bdc9457SAndroid Build Coastguard Worker a3 += 1;
3060*4bdc9457SAndroid Build Coastguard Worker const __m256 va4 = _mm256_broadcast_ss(a4);
3061*4bdc9457SAndroid Build Coastguard Worker a4 += 1;
3062*4bdc9457SAndroid Build Coastguard Worker
3063*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_load_ps(w);
3064*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
3065*4bdc9457SAndroid Build Coastguard Worker w += 16;
3066*4bdc9457SAndroid Build Coastguard Worker
3067*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
3068*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
3069*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
3070*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
3071*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
3072*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF);
3073*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF);
3074*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF);
3075*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF);
3076*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEF, vacc4x89ABCDEF);
3077*4bdc9457SAndroid Build Coastguard Worker
3078*4bdc9457SAndroid Build Coastguard Worker k -= sizeof(float);
3079*4bdc9457SAndroid Build Coastguard Worker } while (k != 0);
3080*4bdc9457SAndroid Build Coastguard Worker
3081*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
3082*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
3083*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
3084*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
3085*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
3086*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
3087*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
3088*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
3089*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
3090*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
3091*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
3092*4bdc9457SAndroid Build Coastguard Worker
3093*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
3094*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
3095*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
3096*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
3097*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
3098*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
3099*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
3100*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
3101*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
3102*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
3103*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
3104*4bdc9457SAndroid Build Coastguard Worker
3105*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
3106*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4, vacc4x01234567);
3107*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
3108*4bdc9457SAndroid Build Coastguard Worker c4 = (float*) ((uintptr_t) c4 + cn_stride);
3109*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3110*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
3111*4bdc9457SAndroid Build Coastguard Worker c3 = (float*) ((uintptr_t) c3 + cn_stride);
3112*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3113*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
3114*4bdc9457SAndroid Build Coastguard Worker c2 = (float*) ((uintptr_t) c2 + cn_stride);
3115*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3116*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
3117*4bdc9457SAndroid Build Coastguard Worker c1 = (float*) ((uintptr_t) c1 + cn_stride);
3118*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3119*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
3120*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
3121*4bdc9457SAndroid Build Coastguard Worker
3122*4bdc9457SAndroid Build Coastguard Worker a4 = (const float*) ((uintptr_t) a4 - kc);
3123*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 - kc);
3124*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 - kc);
3125*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 - kc);
3126*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 - kc);
3127*4bdc9457SAndroid Build Coastguard Worker
3128*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3129*4bdc9457SAndroid Build Coastguard Worker } else {
3130*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
3131*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4, vacc4x01234567);
3132*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3133*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3134*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3135*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3136*4bdc9457SAndroid Build Coastguard Worker
3137*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = vacc4x89ABCDEF;
3138*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = vacc3x89ABCDEF;
3139*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = vacc2x89ABCDEF;
3140*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = vacc1x89ABCDEF;
3141*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
3142*4bdc9457SAndroid Build Coastguard Worker
3143*4bdc9457SAndroid Build Coastguard Worker c4 += 8;
3144*4bdc9457SAndroid Build Coastguard Worker c3 += 8;
3145*4bdc9457SAndroid Build Coastguard Worker c2 += 8;
3146*4bdc9457SAndroid Build Coastguard Worker c1 += 8;
3147*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
3148*4bdc9457SAndroid Build Coastguard Worker }
3149*4bdc9457SAndroid Build Coastguard Worker __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
3150*4bdc9457SAndroid Build Coastguard Worker __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
3151*4bdc9457SAndroid Build Coastguard Worker __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
3152*4bdc9457SAndroid Build Coastguard Worker __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
3153*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
3154*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
3155*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c4, vacc4x0123);
3156*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c3, vacc3x0123);
3157*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c2, vacc2x0123);
3158*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c1, vacc1x0123);
3159*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
3160*4bdc9457SAndroid Build Coastguard Worker
3161*4bdc9457SAndroid Build Coastguard Worker vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
3162*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
3163*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
3164*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
3165*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
3166*4bdc9457SAndroid Build Coastguard Worker
3167*4bdc9457SAndroid Build Coastguard Worker c4 += 4;
3168*4bdc9457SAndroid Build Coastguard Worker c3 += 4;
3169*4bdc9457SAndroid Build Coastguard Worker c2 += 4;
3170*4bdc9457SAndroid Build Coastguard Worker c1 += 4;
3171*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
3172*4bdc9457SAndroid Build Coastguard Worker }
3173*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
3174*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c4, vacc4x0123);
3175*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c3, vacc3x0123);
3176*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c2, vacc2x0123);
3177*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c1, vacc1x0123);
3178*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
3179*4bdc9457SAndroid Build Coastguard Worker
3180*4bdc9457SAndroid Build Coastguard Worker vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
3181*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
3182*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
3183*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
3184*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
3185*4bdc9457SAndroid Build Coastguard Worker
3186*4bdc9457SAndroid Build Coastguard Worker c4 += 2;
3187*4bdc9457SAndroid Build Coastguard Worker c3 += 2;
3188*4bdc9457SAndroid Build Coastguard Worker c2 += 2;
3189*4bdc9457SAndroid Build Coastguard Worker c1 += 2;
3190*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
3191*4bdc9457SAndroid Build Coastguard Worker }
3192*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
3193*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c4, vacc4x0123);
3194*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c3, vacc3x0123);
3195*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c2, vacc2x0123);
3196*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c1, vacc1x0123);
3197*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
3198*4bdc9457SAndroid Build Coastguard Worker }
3199*4bdc9457SAndroid Build Coastguard Worker
3200*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3201*4bdc9457SAndroid Build Coastguard Worker }
3202*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3203*4bdc9457SAndroid Build Coastguard Worker }
3204*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3205*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast(
3206*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3207*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3208*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3209*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3210*4bdc9457SAndroid Build Coastguard Worker const float**restrict a,
3211*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
3212*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
3213*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3214*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3215*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3216*4bdc9457SAndroid Build Coastguard Worker const float* zero,
3217*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3218*4bdc9457SAndroid Build Coastguard Worker {
3219*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3220*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
3221*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3222*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3223*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
3224*4bdc9457SAndroid Build Coastguard Worker assert(ks != 0);
3225*4bdc9457SAndroid Build Coastguard Worker assert(ks % (1 * sizeof(void*)) == 0);
3226*4bdc9457SAndroid Build Coastguard Worker assert(a_offset % sizeof(float) == 0);
3227*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3228*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3229*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3230*4bdc9457SAndroid Build Coastguard Worker
3231*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
3232*4bdc9457SAndroid Build Coastguard Worker
3233*4bdc9457SAndroid Build Coastguard Worker do {
3234*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w);
3235*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
3236*4bdc9457SAndroid Build Coastguard Worker w += 16;
3237*4bdc9457SAndroid Build Coastguard Worker
3238*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3239*4bdc9457SAndroid Build Coastguard Worker do {
3240*4bdc9457SAndroid Build Coastguard Worker const float* restrict a0 = a[0];
3241*4bdc9457SAndroid Build Coastguard Worker assert(a0 != NULL);
3242*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3243*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + a_offset);
3244*4bdc9457SAndroid Build Coastguard Worker }
3245*4bdc9457SAndroid Build Coastguard Worker a += 1;
3246*4bdc9457SAndroid Build Coastguard Worker
3247*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
3248*4bdc9457SAndroid Build Coastguard Worker do {
3249*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_load_ps(w);
3250*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
3251*4bdc9457SAndroid Build Coastguard Worker w += 16;
3252*4bdc9457SAndroid Build Coastguard Worker
3253*4bdc9457SAndroid Build Coastguard Worker const __m256 va0 = _mm256_broadcast_ss(a0);
3254*4bdc9457SAndroid Build Coastguard Worker a0 += 1;
3255*4bdc9457SAndroid Build Coastguard Worker
3256*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
3257*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF);
3258*4bdc9457SAndroid Build Coastguard Worker k -= sizeof(float);
3259*4bdc9457SAndroid Build Coastguard Worker } while (k != 0);
3260*4bdc9457SAndroid Build Coastguard Worker p -= 1 * sizeof(void*);
3261*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3262*4bdc9457SAndroid Build Coastguard Worker
3263*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
3264*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
3265*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
3266*4bdc9457SAndroid Build Coastguard Worker
3267*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
3268*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
3269*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
3270*4bdc9457SAndroid Build Coastguard Worker
3271*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
3272*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3273*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
3274*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
3275*4bdc9457SAndroid Build Coastguard Worker
3276*4bdc9457SAndroid Build Coastguard Worker a = (const float**restrict) ((uintptr_t) a - ks);
3277*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3278*4bdc9457SAndroid Build Coastguard Worker } else {
3279*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
3280*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3281*4bdc9457SAndroid Build Coastguard Worker
3282*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
3283*4bdc9457SAndroid Build Coastguard Worker
3284*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
3285*4bdc9457SAndroid Build Coastguard Worker }
3286*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
3287*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
3288*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
3289*4bdc9457SAndroid Build Coastguard Worker
3290*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
3291*4bdc9457SAndroid Build Coastguard Worker
3292*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
3293*4bdc9457SAndroid Build Coastguard Worker }
3294*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
3295*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
3296*4bdc9457SAndroid Build Coastguard Worker
3297*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
3298*4bdc9457SAndroid Build Coastguard Worker
3299*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
3300*4bdc9457SAndroid Build Coastguard Worker }
3301*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
3302*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
3303*4bdc9457SAndroid Build Coastguard Worker }
3304*4bdc9457SAndroid Build Coastguard Worker
3305*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3306*4bdc9457SAndroid Build Coastguard Worker }
3307*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3308*4bdc9457SAndroid Build Coastguard Worker }
3309*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3310*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast(
3311*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3312*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3313*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3314*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3315*4bdc9457SAndroid Build Coastguard Worker const float**restrict a,
3316*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
3317*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
3318*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3319*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3320*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3321*4bdc9457SAndroid Build Coastguard Worker const float* zero,
3322*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3323*4bdc9457SAndroid Build Coastguard Worker {
3324*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3325*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
3326*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3327*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3328*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
3329*4bdc9457SAndroid Build Coastguard Worker assert(ks != 0);
3330*4bdc9457SAndroid Build Coastguard Worker assert(ks % (1 * sizeof(void*)) == 0);
3331*4bdc9457SAndroid Build Coastguard Worker assert(a_offset % sizeof(float) == 0);
3332*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3333*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3334*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3335*4bdc9457SAndroid Build Coastguard Worker
3336*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
3337*4bdc9457SAndroid Build Coastguard Worker
3338*4bdc9457SAndroid Build Coastguard Worker do {
3339*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w);
3340*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
3341*4bdc9457SAndroid Build Coastguard Worker w += 16;
3342*4bdc9457SAndroid Build Coastguard Worker
3343*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3344*4bdc9457SAndroid Build Coastguard Worker do {
3345*4bdc9457SAndroid Build Coastguard Worker const float* restrict a0 = a[0];
3346*4bdc9457SAndroid Build Coastguard Worker assert(a0 != NULL);
3347*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3348*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + a_offset);
3349*4bdc9457SAndroid Build Coastguard Worker }
3350*4bdc9457SAndroid Build Coastguard Worker a += 1;
3351*4bdc9457SAndroid Build Coastguard Worker
3352*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
3353*4bdc9457SAndroid Build Coastguard Worker while (k >= 4 * sizeof(float)) {
3354*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
3355*4bdc9457SAndroid Build Coastguard Worker a0 += 4;
3356*4bdc9457SAndroid Build Coastguard Worker
3357*4bdc9457SAndroid Build Coastguard Worker
3358*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
3359*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
3360*4bdc9457SAndroid Build Coastguard Worker
3361*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
3362*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
3363*4bdc9457SAndroid Build Coastguard Worker
3364*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3365*4bdc9457SAndroid Build Coastguard Worker
3366*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
3367*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
3368*4bdc9457SAndroid Build Coastguard Worker
3369*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
3370*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
3371*4bdc9457SAndroid Build Coastguard Worker
3372*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3373*4bdc9457SAndroid Build Coastguard Worker
3374*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
3375*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
3376*4bdc9457SAndroid Build Coastguard Worker
3377*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
3378*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
3379*4bdc9457SAndroid Build Coastguard Worker
3380*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3381*4bdc9457SAndroid Build Coastguard Worker
3382*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
3383*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
3384*4bdc9457SAndroid Build Coastguard Worker
3385*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
3386*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
3387*4bdc9457SAndroid Build Coastguard Worker
3388*4bdc9457SAndroid Build Coastguard Worker
3389*4bdc9457SAndroid Build Coastguard Worker w += 64;
3390*4bdc9457SAndroid Build Coastguard Worker k -= 4 * sizeof(float);
3391*4bdc9457SAndroid Build Coastguard Worker }
3392*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(k != 0) {
3393*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
3394*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + k);
3395*4bdc9457SAndroid Build Coastguard Worker
3396*4bdc9457SAndroid Build Coastguard Worker const __m256 vzero = _mm256_setzero_ps();
3397*4bdc9457SAndroid Build Coastguard Worker
3398*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
3399*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
3400*4bdc9457SAndroid Build Coastguard Worker
3401*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc0x01234567);
3402*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc0x89ABCDEF);
3403*4bdc9457SAndroid Build Coastguard Worker
3404*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3405*4bdc9457SAndroid Build Coastguard Worker
3406*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
3407*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
3408*4bdc9457SAndroid Build Coastguard Worker
3409*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc0x01234567);
3410*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc0x89ABCDEF);
3411*4bdc9457SAndroid Build Coastguard Worker
3412*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3413*4bdc9457SAndroid Build Coastguard Worker
3414*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
3415*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
3416*4bdc9457SAndroid Build Coastguard Worker
3417*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc0x01234567);
3418*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc0x89ABCDEF);
3419*4bdc9457SAndroid Build Coastguard Worker
3420*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3421*4bdc9457SAndroid Build Coastguard Worker
3422*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
3423*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
3424*4bdc9457SAndroid Build Coastguard Worker
3425*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc0x01234567);
3426*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc0x89ABCDEF);
3427*4bdc9457SAndroid Build Coastguard Worker
3428*4bdc9457SAndroid Build Coastguard Worker
3429*4bdc9457SAndroid Build Coastguard Worker w += 64;
3430*4bdc9457SAndroid Build Coastguard Worker }
3431*4bdc9457SAndroid Build Coastguard Worker p -= 1 * sizeof(void*);
3432*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3433*4bdc9457SAndroid Build Coastguard Worker
3434*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
3435*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
3436*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
3437*4bdc9457SAndroid Build Coastguard Worker
3438*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
3439*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
3440*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
3441*4bdc9457SAndroid Build Coastguard Worker
3442*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
3443*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3444*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
3445*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
3446*4bdc9457SAndroid Build Coastguard Worker
3447*4bdc9457SAndroid Build Coastguard Worker a = (const float**restrict) ((uintptr_t) a - ks);
3448*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3449*4bdc9457SAndroid Build Coastguard Worker } else {
3450*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
3451*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3452*4bdc9457SAndroid Build Coastguard Worker
3453*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
3454*4bdc9457SAndroid Build Coastguard Worker
3455*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
3456*4bdc9457SAndroid Build Coastguard Worker }
3457*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
3458*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
3459*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
3460*4bdc9457SAndroid Build Coastguard Worker
3461*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
3462*4bdc9457SAndroid Build Coastguard Worker
3463*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
3464*4bdc9457SAndroid Build Coastguard Worker }
3465*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
3466*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
3467*4bdc9457SAndroid Build Coastguard Worker
3468*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
3469*4bdc9457SAndroid Build Coastguard Worker
3470*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
3471*4bdc9457SAndroid Build Coastguard Worker }
3472*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
3473*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
3474*4bdc9457SAndroid Build Coastguard Worker }
3475*4bdc9457SAndroid Build Coastguard Worker
3476*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3477*4bdc9457SAndroid Build Coastguard Worker }
3478*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3479*4bdc9457SAndroid Build Coastguard Worker }
3480*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3481*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast(
3482*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3483*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3484*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3485*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3486*4bdc9457SAndroid Build Coastguard Worker const float**restrict a,
3487*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
3488*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
3489*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3490*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3491*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3492*4bdc9457SAndroid Build Coastguard Worker const float* zero,
3493*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3494*4bdc9457SAndroid Build Coastguard Worker {
3495*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3496*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
3497*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3498*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3499*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
3500*4bdc9457SAndroid Build Coastguard Worker assert(ks != 0);
3501*4bdc9457SAndroid Build Coastguard Worker assert(ks % (4 * sizeof(void*)) == 0);
3502*4bdc9457SAndroid Build Coastguard Worker assert(a_offset % sizeof(float) == 0);
3503*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3504*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3505*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3506*4bdc9457SAndroid Build Coastguard Worker
3507*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
3508*4bdc9457SAndroid Build Coastguard Worker float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
3509*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
3510*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
3511*4bdc9457SAndroid Build Coastguard Worker }
3512*4bdc9457SAndroid Build Coastguard Worker float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
3513*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
3514*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
3515*4bdc9457SAndroid Build Coastguard Worker }
3516*4bdc9457SAndroid Build Coastguard Worker float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
3517*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
3518*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
3519*4bdc9457SAndroid Build Coastguard Worker }
3520*4bdc9457SAndroid Build Coastguard Worker
3521*4bdc9457SAndroid Build Coastguard Worker do {
3522*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w);
3523*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
3524*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = vacc0x01234567;
3525*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
3526*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x01234567 = vacc0x01234567;
3527*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
3528*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x01234567 = vacc0x01234567;
3529*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
3530*4bdc9457SAndroid Build Coastguard Worker w += 16;
3531*4bdc9457SAndroid Build Coastguard Worker
3532*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3533*4bdc9457SAndroid Build Coastguard Worker do {
3534*4bdc9457SAndroid Build Coastguard Worker const float* restrict a0 = a[0];
3535*4bdc9457SAndroid Build Coastguard Worker assert(a0 != NULL);
3536*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3537*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + a_offset);
3538*4bdc9457SAndroid Build Coastguard Worker }
3539*4bdc9457SAndroid Build Coastguard Worker const float* restrict a1 = a[1];
3540*4bdc9457SAndroid Build Coastguard Worker assert(a1 != NULL);
3541*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a1 != zero) {
3542*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 + a_offset);
3543*4bdc9457SAndroid Build Coastguard Worker }
3544*4bdc9457SAndroid Build Coastguard Worker const float* restrict a2 = a[2];
3545*4bdc9457SAndroid Build Coastguard Worker assert(a2 != NULL);
3546*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a2 != zero) {
3547*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 + a_offset);
3548*4bdc9457SAndroid Build Coastguard Worker }
3549*4bdc9457SAndroid Build Coastguard Worker const float* restrict a3 = a[3];
3550*4bdc9457SAndroid Build Coastguard Worker assert(a3 != NULL);
3551*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a3 != zero) {
3552*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 + a_offset);
3553*4bdc9457SAndroid Build Coastguard Worker }
3554*4bdc9457SAndroid Build Coastguard Worker a += 4;
3555*4bdc9457SAndroid Build Coastguard Worker
3556*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
3557*4bdc9457SAndroid Build Coastguard Worker while (k >= 4 * sizeof(float)) {
3558*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
3559*4bdc9457SAndroid Build Coastguard Worker a0 += 4;
3560*4bdc9457SAndroid Build Coastguard Worker __m256 va1 = _mm256_broadcast_ps((const __m128*) a1);
3561*4bdc9457SAndroid Build Coastguard Worker a1 += 4;
3562*4bdc9457SAndroid Build Coastguard Worker __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);
3563*4bdc9457SAndroid Build Coastguard Worker a2 += 4;
3564*4bdc9457SAndroid Build Coastguard Worker __m256 va3 = _mm256_broadcast_ps((const __m128*) a3);
3565*4bdc9457SAndroid Build Coastguard Worker a3 += 4;
3566*4bdc9457SAndroid Build Coastguard Worker
3567*4bdc9457SAndroid Build Coastguard Worker
3568*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
3569*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
3570*4bdc9457SAndroid Build Coastguard Worker
3571*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
3572*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567);
3573*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
3574*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567);
3575*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
3576*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF);
3577*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF);
3578*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF);
3579*4bdc9457SAndroid Build Coastguard Worker
3580*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3581*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3582*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3583*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3584*4bdc9457SAndroid Build Coastguard Worker
3585*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
3586*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
3587*4bdc9457SAndroid Build Coastguard Worker
3588*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
3589*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c1, vacc1x01234567);
3590*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
3591*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c1, vacc3x01234567);
3592*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
3593*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc1, vacc1x89ABCDEF);
3594*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF);
3595*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF);
3596*4bdc9457SAndroid Build Coastguard Worker
3597*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3598*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3599*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3600*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3601*4bdc9457SAndroid Build Coastguard Worker
3602*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
3603*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
3604*4bdc9457SAndroid Build Coastguard Worker
3605*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
3606*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
3607*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
3608*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
3609*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
3610*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc2, vacc1x89ABCDEF);
3611*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF);
3612*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF);
3613*4bdc9457SAndroid Build Coastguard Worker
3614*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3615*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3616*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3617*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3618*4bdc9457SAndroid Build Coastguard Worker
3619*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
3620*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
3621*4bdc9457SAndroid Build Coastguard Worker
3622*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
3623*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
3624*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
3625*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
3626*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
3627*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
3628*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
3629*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
3630*4bdc9457SAndroid Build Coastguard Worker
3631*4bdc9457SAndroid Build Coastguard Worker
3632*4bdc9457SAndroid Build Coastguard Worker w += 64;
3633*4bdc9457SAndroid Build Coastguard Worker k -= 4 * sizeof(float);
3634*4bdc9457SAndroid Build Coastguard Worker }
3635*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(k != 0) {
3636*4bdc9457SAndroid Build Coastguard Worker __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);
3637*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + k);
3638*4bdc9457SAndroid Build Coastguard Worker __m256 va1 = _mm256_broadcast_ps((const __m128*) a1);
3639*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 + k);
3640*4bdc9457SAndroid Build Coastguard Worker __m256 va2 = _mm256_broadcast_ps((const __m128*) a2);
3641*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 + k);
3642*4bdc9457SAndroid Build Coastguard Worker __m256 va3 = _mm256_broadcast_ps((const __m128*) a3);
3643*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 + k);
3644*4bdc9457SAndroid Build Coastguard Worker
3645*4bdc9457SAndroid Build Coastguard Worker const __m256 vzero = _mm256_setzero_ps();
3646*4bdc9457SAndroid Build Coastguard Worker
3647*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c0 = _mm256_load_ps(w + 0);
3648*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8);
3649*4bdc9457SAndroid Build Coastguard Worker
3650*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc0x01234567);
3651*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc1x01234567);
3652*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc2x01234567);
3653*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ)), vb01234567c0, vacc3x01234567);
3654*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc0x89ABCDEF);
3655*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc1x89ABCDEF);
3656*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc2x89ABCDEF);
3657*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc0, vacc3x89ABCDEF);
3658*4bdc9457SAndroid Build Coastguard Worker
3659*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3660*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3661*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3662*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3663*4bdc9457SAndroid Build Coastguard Worker
3664*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c1 = _mm256_load_ps(w + 16);
3665*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc1 = _mm256_load_ps(w + 24);
3666*4bdc9457SAndroid Build Coastguard Worker
3667*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc0x01234567);
3668*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc1x01234567);
3669*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc2x01234567);
3670*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ)), vb01234567c1, vacc3x01234567);
3671*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc0x89ABCDEF);
3672*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc1x89ABCDEF);
3673*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc2x89ABCDEF);
3674*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc1, vacc3x89ABCDEF);
3675*4bdc9457SAndroid Build Coastguard Worker
3676*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3677*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3678*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3679*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3680*4bdc9457SAndroid Build Coastguard Worker
3681*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c2 = _mm256_load_ps(w + 32);
3682*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc2 = _mm256_load_ps(w + 40);
3683*4bdc9457SAndroid Build Coastguard Worker
3684*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc0x01234567);
3685*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc1x01234567);
3686*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc2x01234567);
3687*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567c2, vacc3x01234567);
3688*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc0x89ABCDEF);
3689*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc1x89ABCDEF);
3690*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc2x89ABCDEF);
3691*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc2, vacc3x89ABCDEF);
3692*4bdc9457SAndroid Build Coastguard Worker
3693*4bdc9457SAndroid Build Coastguard Worker va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
3694*4bdc9457SAndroid Build Coastguard Worker va1 = _mm256_permute_ps(va1, _MM_SHUFFLE(0, 3, 2, 1));
3695*4bdc9457SAndroid Build Coastguard Worker va2 = _mm256_permute_ps(va2, _MM_SHUFFLE(0, 3, 2, 1));
3696*4bdc9457SAndroid Build Coastguard Worker va3 = _mm256_permute_ps(va3, _MM_SHUFFLE(0, 3, 2, 1));
3697*4bdc9457SAndroid Build Coastguard Worker
3698*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567c3 = _mm256_load_ps(w + 48);
3699*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);
3700*4bdc9457SAndroid Build Coastguard Worker
3701*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc0x01234567);
3702*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc1x01234567);
3703*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc2x01234567);
3704*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ)), vb01234567c3, vacc3x01234567);
3705*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc0x89ABCDEF);
3706*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc1x89ABCDEF);
3707*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc2x89ABCDEF);
3708*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ)), vb89ABCDEFc3, vacc3x89ABCDEF);
3709*4bdc9457SAndroid Build Coastguard Worker
3710*4bdc9457SAndroid Build Coastguard Worker
3711*4bdc9457SAndroid Build Coastguard Worker w += 64;
3712*4bdc9457SAndroid Build Coastguard Worker }
3713*4bdc9457SAndroid Build Coastguard Worker p -= 4 * sizeof(void*);
3714*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3715*4bdc9457SAndroid Build Coastguard Worker
3716*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
3717*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
3718*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
3719*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
3720*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
3721*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
3722*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
3723*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
3724*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
3725*4bdc9457SAndroid Build Coastguard Worker
3726*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
3727*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
3728*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
3729*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
3730*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
3731*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
3732*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
3733*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
3734*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
3735*4bdc9457SAndroid Build Coastguard Worker
3736*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
3737*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3738*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
3739*4bdc9457SAndroid Build Coastguard Worker c3 = (float*) ((uintptr_t) c3 + cn_stride);
3740*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3741*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
3742*4bdc9457SAndroid Build Coastguard Worker c2 = (float*) ((uintptr_t) c2 + cn_stride);
3743*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3744*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
3745*4bdc9457SAndroid Build Coastguard Worker c1 = (float*) ((uintptr_t) c1 + cn_stride);
3746*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3747*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
3748*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
3749*4bdc9457SAndroid Build Coastguard Worker
3750*4bdc9457SAndroid Build Coastguard Worker a = (const float**restrict) ((uintptr_t) a - ks);
3751*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3752*4bdc9457SAndroid Build Coastguard Worker } else {
3753*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
3754*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3755*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3756*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3757*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3758*4bdc9457SAndroid Build Coastguard Worker
3759*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = vacc3x89ABCDEF;
3760*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = vacc2x89ABCDEF;
3761*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = vacc1x89ABCDEF;
3762*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
3763*4bdc9457SAndroid Build Coastguard Worker
3764*4bdc9457SAndroid Build Coastguard Worker c3 += 8;
3765*4bdc9457SAndroid Build Coastguard Worker c2 += 8;
3766*4bdc9457SAndroid Build Coastguard Worker c1 += 8;
3767*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
3768*4bdc9457SAndroid Build Coastguard Worker }
3769*4bdc9457SAndroid Build Coastguard Worker __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
3770*4bdc9457SAndroid Build Coastguard Worker __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
3771*4bdc9457SAndroid Build Coastguard Worker __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
3772*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
3773*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
3774*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c3, vacc3x0123);
3775*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c2, vacc2x0123);
3776*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c1, vacc1x0123);
3777*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
3778*4bdc9457SAndroid Build Coastguard Worker
3779*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
3780*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
3781*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
3782*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
3783*4bdc9457SAndroid Build Coastguard Worker
3784*4bdc9457SAndroid Build Coastguard Worker c3 += 4;
3785*4bdc9457SAndroid Build Coastguard Worker c2 += 4;
3786*4bdc9457SAndroid Build Coastguard Worker c1 += 4;
3787*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
3788*4bdc9457SAndroid Build Coastguard Worker }
3789*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
3790*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c3, vacc3x0123);
3791*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c2, vacc2x0123);
3792*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c1, vacc1x0123);
3793*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
3794*4bdc9457SAndroid Build Coastguard Worker
3795*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
3796*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
3797*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
3798*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
3799*4bdc9457SAndroid Build Coastguard Worker
3800*4bdc9457SAndroid Build Coastguard Worker c3 += 2;
3801*4bdc9457SAndroid Build Coastguard Worker c2 += 2;
3802*4bdc9457SAndroid Build Coastguard Worker c1 += 2;
3803*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
3804*4bdc9457SAndroid Build Coastguard Worker }
3805*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
3806*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c3, vacc3x0123);
3807*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c2, vacc2x0123);
3808*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c1, vacc1x0123);
3809*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
3810*4bdc9457SAndroid Build Coastguard Worker }
3811*4bdc9457SAndroid Build Coastguard Worker
3812*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3813*4bdc9457SAndroid Build Coastguard Worker }
3814*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3815*4bdc9457SAndroid Build Coastguard Worker }
3816*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3817*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast(
3818*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3819*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3820*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3821*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3822*4bdc9457SAndroid Build Coastguard Worker const float**restrict a,
3823*4bdc9457SAndroid Build Coastguard Worker const float*restrict w,
3824*4bdc9457SAndroid Build Coastguard Worker float*restrict c,
3825*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3826*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3827*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3828*4bdc9457SAndroid Build Coastguard Worker const float* zero,
3829*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3830*4bdc9457SAndroid Build Coastguard Worker {
3831*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3832*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 5);
3833*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3834*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3835*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(float) == 0);
3836*4bdc9457SAndroid Build Coastguard Worker assert(ks != 0);
3837*4bdc9457SAndroid Build Coastguard Worker assert(ks % (5 * sizeof(void*)) == 0);
3838*4bdc9457SAndroid Build Coastguard Worker assert(a_offset % sizeof(float) == 0);
3839*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3840*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3841*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3842*4bdc9457SAndroid Build Coastguard Worker
3843*4bdc9457SAndroid Build Coastguard Worker float* c0 = c;
3844*4bdc9457SAndroid Build Coastguard Worker float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
3845*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
3846*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
3847*4bdc9457SAndroid Build Coastguard Worker }
3848*4bdc9457SAndroid Build Coastguard Worker float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
3849*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
3850*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
3851*4bdc9457SAndroid Build Coastguard Worker }
3852*4bdc9457SAndroid Build Coastguard Worker float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
3853*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 4) {
3854*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
3855*4bdc9457SAndroid Build Coastguard Worker }
3856*4bdc9457SAndroid Build Coastguard Worker float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
3857*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 4) {
3858*4bdc9457SAndroid Build Coastguard Worker c4 = c3;
3859*4bdc9457SAndroid Build Coastguard Worker }
3860*4bdc9457SAndroid Build Coastguard Worker
3861*4bdc9457SAndroid Build Coastguard Worker do {
3862*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_load_ps(w);
3863*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
3864*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = vacc0x01234567;
3865*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
3866*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x01234567 = vacc0x01234567;
3867*4bdc9457SAndroid Build Coastguard Worker __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
3868*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x01234567 = vacc0x01234567;
3869*4bdc9457SAndroid Build Coastguard Worker __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
3870*4bdc9457SAndroid Build Coastguard Worker __m256 vacc4x01234567 = vacc0x01234567;
3871*4bdc9457SAndroid Build Coastguard Worker __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
3872*4bdc9457SAndroid Build Coastguard Worker w += 16;
3873*4bdc9457SAndroid Build Coastguard Worker
3874*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3875*4bdc9457SAndroid Build Coastguard Worker do {
3876*4bdc9457SAndroid Build Coastguard Worker const float* restrict a0 = a[0];
3877*4bdc9457SAndroid Build Coastguard Worker assert(a0 != NULL);
3878*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3879*4bdc9457SAndroid Build Coastguard Worker a0 = (const float*) ((uintptr_t) a0 + a_offset);
3880*4bdc9457SAndroid Build Coastguard Worker }
3881*4bdc9457SAndroid Build Coastguard Worker const float* restrict a1 = a[1];
3882*4bdc9457SAndroid Build Coastguard Worker assert(a1 != NULL);
3883*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a1 != zero) {
3884*4bdc9457SAndroid Build Coastguard Worker a1 = (const float*) ((uintptr_t) a1 + a_offset);
3885*4bdc9457SAndroid Build Coastguard Worker }
3886*4bdc9457SAndroid Build Coastguard Worker const float* restrict a2 = a[2];
3887*4bdc9457SAndroid Build Coastguard Worker assert(a2 != NULL);
3888*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a2 != zero) {
3889*4bdc9457SAndroid Build Coastguard Worker a2 = (const float*) ((uintptr_t) a2 + a_offset);
3890*4bdc9457SAndroid Build Coastguard Worker }
3891*4bdc9457SAndroid Build Coastguard Worker const float* restrict a3 = a[3];
3892*4bdc9457SAndroid Build Coastguard Worker assert(a3 != NULL);
3893*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a3 != zero) {
3894*4bdc9457SAndroid Build Coastguard Worker a3 = (const float*) ((uintptr_t) a3 + a_offset);
3895*4bdc9457SAndroid Build Coastguard Worker }
3896*4bdc9457SAndroid Build Coastguard Worker const float* restrict a4 = a[4];
3897*4bdc9457SAndroid Build Coastguard Worker assert(a4 != NULL);
3898*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a4 != zero) {
3899*4bdc9457SAndroid Build Coastguard Worker a4 = (const float*) ((uintptr_t) a4 + a_offset);
3900*4bdc9457SAndroid Build Coastguard Worker }
3901*4bdc9457SAndroid Build Coastguard Worker a += 5;
3902*4bdc9457SAndroid Build Coastguard Worker
3903*4bdc9457SAndroid Build Coastguard Worker size_t k = kc;
3904*4bdc9457SAndroid Build Coastguard Worker do {
3905*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_load_ps(w);
3906*4bdc9457SAndroid Build Coastguard Worker const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
3907*4bdc9457SAndroid Build Coastguard Worker w += 16;
3908*4bdc9457SAndroid Build Coastguard Worker
3909*4bdc9457SAndroid Build Coastguard Worker const __m256 va0 = _mm256_broadcast_ss(a0);
3910*4bdc9457SAndroid Build Coastguard Worker a0 += 1;
3911*4bdc9457SAndroid Build Coastguard Worker const __m256 va1 = _mm256_broadcast_ss(a1);
3912*4bdc9457SAndroid Build Coastguard Worker a1 += 1;
3913*4bdc9457SAndroid Build Coastguard Worker const __m256 va2 = _mm256_broadcast_ss(a2);
3914*4bdc9457SAndroid Build Coastguard Worker a2 += 1;
3915*4bdc9457SAndroid Build Coastguard Worker const __m256 va3 = _mm256_broadcast_ss(a3);
3916*4bdc9457SAndroid Build Coastguard Worker a3 += 1;
3917*4bdc9457SAndroid Build Coastguard Worker const __m256 va4 = _mm256_broadcast_ss(a4);
3918*4bdc9457SAndroid Build Coastguard Worker a4 += 1;
3919*4bdc9457SAndroid Build Coastguard Worker
3920*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
3921*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF);
3922*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
3923*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF);
3924*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
3925*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF);
3926*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
3927*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF);
3928*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
3929*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEF, vacc4x89ABCDEF);
3930*4bdc9457SAndroid Build Coastguard Worker k -= sizeof(float);
3931*4bdc9457SAndroid Build Coastguard Worker } while (k != 0);
3932*4bdc9457SAndroid Build Coastguard Worker p -= 5 * sizeof(void*);
3933*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3934*4bdc9457SAndroid Build Coastguard Worker
3935*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
3936*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
3937*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
3938*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
3939*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
3940*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
3941*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
3942*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
3943*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
3944*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
3945*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
3946*4bdc9457SAndroid Build Coastguard Worker
3947*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
3948*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
3949*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
3950*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
3951*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
3952*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
3953*4bdc9457SAndroid Build Coastguard Worker vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
3954*4bdc9457SAndroid Build Coastguard Worker vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
3955*4bdc9457SAndroid Build Coastguard Worker vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
3956*4bdc9457SAndroid Build Coastguard Worker vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
3957*4bdc9457SAndroid Build Coastguard Worker vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
3958*4bdc9457SAndroid Build Coastguard Worker
3959*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nc >= 16) {
3960*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4, vacc4x01234567);
3961*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
3962*4bdc9457SAndroid Build Coastguard Worker c4 = (float*) ((uintptr_t) c4 + cn_stride);
3963*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3964*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
3965*4bdc9457SAndroid Build Coastguard Worker c3 = (float*) ((uintptr_t) c3 + cn_stride);
3966*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3967*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
3968*4bdc9457SAndroid Build Coastguard Worker c2 = (float*) ((uintptr_t) c2 + cn_stride);
3969*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3970*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
3971*4bdc9457SAndroid Build Coastguard Worker c1 = (float*) ((uintptr_t) c1 + cn_stride);
3972*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3973*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
3974*4bdc9457SAndroid Build Coastguard Worker c0 = (float*) ((uintptr_t) c0 + cn_stride);
3975*4bdc9457SAndroid Build Coastguard Worker
3976*4bdc9457SAndroid Build Coastguard Worker a = (const float**restrict) ((uintptr_t) a - ks);
3977*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3978*4bdc9457SAndroid Build Coastguard Worker } else {
3979*4bdc9457SAndroid Build Coastguard Worker if (nc & 8) {
3980*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c4, vacc4x01234567);
3981*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c3, vacc3x01234567);
3982*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c2, vacc2x01234567);
3983*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c1, vacc1x01234567);
3984*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(c0, vacc0x01234567);
3985*4bdc9457SAndroid Build Coastguard Worker
3986*4bdc9457SAndroid Build Coastguard Worker vacc4x01234567 = vacc4x89ABCDEF;
3987*4bdc9457SAndroid Build Coastguard Worker vacc3x01234567 = vacc3x89ABCDEF;
3988*4bdc9457SAndroid Build Coastguard Worker vacc2x01234567 = vacc2x89ABCDEF;
3989*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = vacc1x89ABCDEF;
3990*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = vacc0x89ABCDEF;
3991*4bdc9457SAndroid Build Coastguard Worker
3992*4bdc9457SAndroid Build Coastguard Worker c4 += 8;
3993*4bdc9457SAndroid Build Coastguard Worker c3 += 8;
3994*4bdc9457SAndroid Build Coastguard Worker c2 += 8;
3995*4bdc9457SAndroid Build Coastguard Worker c1 += 8;
3996*4bdc9457SAndroid Build Coastguard Worker c0 += 8;
3997*4bdc9457SAndroid Build Coastguard Worker }
3998*4bdc9457SAndroid Build Coastguard Worker __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
3999*4bdc9457SAndroid Build Coastguard Worker __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
4000*4bdc9457SAndroid Build Coastguard Worker __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
4001*4bdc9457SAndroid Build Coastguard Worker __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
4002*4bdc9457SAndroid Build Coastguard Worker __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
4003*4bdc9457SAndroid Build Coastguard Worker if (nc & 4) {
4004*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c4, vacc4x0123);
4005*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c3, vacc3x0123);
4006*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c2, vacc2x0123);
4007*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c1, vacc1x0123);
4008*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(c0, vacc0x0123);
4009*4bdc9457SAndroid Build Coastguard Worker
4010*4bdc9457SAndroid Build Coastguard Worker vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
4011*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
4012*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
4013*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
4014*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
4015*4bdc9457SAndroid Build Coastguard Worker
4016*4bdc9457SAndroid Build Coastguard Worker c4 += 4;
4017*4bdc9457SAndroid Build Coastguard Worker c3 += 4;
4018*4bdc9457SAndroid Build Coastguard Worker c2 += 4;
4019*4bdc9457SAndroid Build Coastguard Worker c1 += 4;
4020*4bdc9457SAndroid Build Coastguard Worker c0 += 4;
4021*4bdc9457SAndroid Build Coastguard Worker }
4022*4bdc9457SAndroid Build Coastguard Worker if (nc & 2) {
4023*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c4, vacc4x0123);
4024*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c3, vacc3x0123);
4025*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c2, vacc2x0123);
4026*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c1, vacc1x0123);
4027*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) c0, vacc0x0123);
4028*4bdc9457SAndroid Build Coastguard Worker
4029*4bdc9457SAndroid Build Coastguard Worker vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
4030*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
4031*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
4032*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
4033*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
4034*4bdc9457SAndroid Build Coastguard Worker
4035*4bdc9457SAndroid Build Coastguard Worker c4 += 2;
4036*4bdc9457SAndroid Build Coastguard Worker c3 += 2;
4037*4bdc9457SAndroid Build Coastguard Worker c2 += 2;
4038*4bdc9457SAndroid Build Coastguard Worker c1 += 2;
4039*4bdc9457SAndroid Build Coastguard Worker c0 += 2;
4040*4bdc9457SAndroid Build Coastguard Worker }
4041*4bdc9457SAndroid Build Coastguard Worker if (nc & 1) {
4042*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c4, vacc4x0123);
4043*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c3, vacc3x0123);
4044*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c2, vacc2x0123);
4045*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c1, vacc1x0123);
4046*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(c0, vacc0x0123);
4047*4bdc9457SAndroid Build Coastguard Worker }
4048*4bdc9457SAndroid Build Coastguard Worker
4049*4bdc9457SAndroid Build Coastguard Worker nc = 0;
4050*4bdc9457SAndroid Build Coastguard Worker }
4051*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
4052*4bdc9457SAndroid Build Coastguard Worker }
4053*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_vhswish_ukernel__fma3_x16(size_t n,const float * x,float * y,const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS (1)])4054*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vhswish_ukernel__fma3_x16(
4055*4bdc9457SAndroid Build Coastguard Worker size_t n,
4056*4bdc9457SAndroid Build Coastguard Worker const float* x,
4057*4bdc9457SAndroid Build Coastguard Worker float* y,
4058*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)])
4059*4bdc9457SAndroid Build Coastguard Worker {
4060*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
4061*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(float) == 0);
4062*4bdc9457SAndroid Build Coastguard Worker
4063*4bdc9457SAndroid Build Coastguard Worker const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
4064*4bdc9457SAndroid Build Coastguard Worker const __m256 vhalf = _mm256_load_ps(params->avx.half);
4065*4bdc9457SAndroid Build Coastguard Worker const __m256 vone = _mm256_load_ps(params->avx.one);
4066*4bdc9457SAndroid Build Coastguard Worker const __m256 vzero = _mm256_setzero_ps();
4067*4bdc9457SAndroid Build Coastguard Worker
4068*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4069*4bdc9457SAndroid Build Coastguard Worker const __m256 vx01234567 = _mm256_loadu_ps(x);
4070*4bdc9457SAndroid Build Coastguard Worker const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4071*4bdc9457SAndroid Build Coastguard Worker x += 16;
4072*4bdc9457SAndroid Build Coastguard Worker
4073*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_fmadd_ps(vx01234567, vsixth, vhalf);
4074*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_fmadd_ps(vx89ABCDEF, vsixth, vhalf);
4075*4bdc9457SAndroid Build Coastguard Worker
4076*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
4077*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
4078*4bdc9457SAndroid Build Coastguard Worker
4079*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vone);
4080*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vone);
4081*4bdc9457SAndroid Build Coastguard Worker
4082*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
4083*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_mul_ps(vacc89ABCDEF, vx89ABCDEF);
4084*4bdc9457SAndroid Build Coastguard Worker
4085*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(y, vacc01234567);
4086*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(y + 8, vacc89ABCDEF);
4087*4bdc9457SAndroid Build Coastguard Worker y += 16;
4088*4bdc9457SAndroid Build Coastguard Worker }
4089*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4090*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_loadu_ps(x);
4091*4bdc9457SAndroid Build Coastguard Worker x += 8;
4092*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
4093*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_max_ps(vacc, vzero);
4094*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_min_ps(vacc, vone);
4095*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_mul_ps(vacc, vx);
4096*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(y, vacc);
4097*4bdc9457SAndroid Build Coastguard Worker y += 8;
4098*4bdc9457SAndroid Build Coastguard Worker }
4099*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
4100*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(float));
4101*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(float));
4102*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->avx.mask_table[7] - n));
4103*4bdc9457SAndroid Build Coastguard Worker
4104*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_maskload_ps(x, vmask);
4105*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
4106*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_max_ps(vacc, vzero);
4107*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_min_ps(vacc, vone);
4108*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_mul_ps(vacc, vx);
4109*4bdc9457SAndroid Build Coastguard Worker
4110*4bdc9457SAndroid Build Coastguard Worker __m128 vacc_lo = _mm256_castps256_ps128(vacc);
4111*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(float))) {
4112*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(y, vacc_lo);
4113*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm256_extractf128_ps(vacc, 1);
4114*4bdc9457SAndroid Build Coastguard Worker y += 4;
4115*4bdc9457SAndroid Build Coastguard Worker }
4116*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(float))) {
4117*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) y, vacc_lo);
4118*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
4119*4bdc9457SAndroid Build Coastguard Worker y += 2;
4120*4bdc9457SAndroid Build Coastguard Worker }
4121*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(float))) {
4122*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(y, vacc_lo);
4123*4bdc9457SAndroid Build Coastguard Worker }
4124*4bdc9457SAndroid Build Coastguard Worker }
4125*4bdc9457SAndroid Build Coastguard Worker }
4126