1*4bdc9457SAndroid Build Coastguard Worker // Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker //
3*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
6*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
7*4bdc9457SAndroid Build Coastguard Worker
8*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
9*4bdc9457SAndroid Build Coastguard Worker
10*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/dwconv.h>
12*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/gemm.h>
13*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/igemm.h>
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/intrinsics-polyfill.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/lut.h>
16*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
17*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vadd.h>
18*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vcvt.h>
19*4bdc9457SAndroid Build Coastguard Worker
20*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_f32_vcvt_ukernel__avx512skx_x16(size_t n,const void * input,float * output,const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])21*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_f32_vcvt_ukernel__avx512skx_x16(
22*4bdc9457SAndroid Build Coastguard Worker size_t n,
23*4bdc9457SAndroid Build Coastguard Worker const void* input,
24*4bdc9457SAndroid Build Coastguard Worker float* output,
25*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
26*4bdc9457SAndroid Build Coastguard Worker {
27*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
28*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
29*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
30*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
31*4bdc9457SAndroid Build Coastguard Worker
32*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
33*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
34*4bdc9457SAndroid Build Coastguard Worker const __m512 vacc = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*) i));
35*4bdc9457SAndroid Build Coastguard Worker i += 16;
36*4bdc9457SAndroid Build Coastguard Worker
37*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(output, vacc);
38*4bdc9457SAndroid Build Coastguard Worker output += 16;
39*4bdc9457SAndroid Build Coastguard Worker }
40*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
41*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
42*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(uint16_t));
43*4bdc9457SAndroid Build Coastguard Worker
44*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 32-bit elements (depends on n).
45*4bdc9457SAndroid Build Coastguard Worker n >>= 1 /* log2(sizeof(uint16_t)) */;
46*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
47*4bdc9457SAndroid Build Coastguard Worker
48*4bdc9457SAndroid Build Coastguard Worker const __m512 vacc = _mm512_cvtph_ps(_mm256_maskz_loadu_epi16(vmask, i));
49*4bdc9457SAndroid Build Coastguard Worker
50*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_ps(output, vmask, vacc);
51*4bdc9457SAndroid Build Coastguard Worker }
52*4bdc9457SAndroid Build Coastguard Worker }
53*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_f16_vcvt_ukernel__avx512skx_x16(size_t n,const float * input,void * output,const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])54*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_f16_vcvt_ukernel__avx512skx_x16(
55*4bdc9457SAndroid Build Coastguard Worker size_t n,
56*4bdc9457SAndroid Build Coastguard Worker const float* input,
57*4bdc9457SAndroid Build Coastguard Worker void* output,
58*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
59*4bdc9457SAndroid Build Coastguard Worker {
60*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
61*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(float) == 0);
62*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
63*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
64*4bdc9457SAndroid Build Coastguard Worker
65*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
66*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
67*4bdc9457SAndroid Build Coastguard Worker const __m512 vf = _mm512_loadu_ps(input);
68*4bdc9457SAndroid Build Coastguard Worker input += 16;
69*4bdc9457SAndroid Build Coastguard Worker
70*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) o, _mm512_cvtps_ph(vf, _MM_FROUND_NO_EXC));
71*4bdc9457SAndroid Build Coastguard Worker o += 16;
72*4bdc9457SAndroid Build Coastguard Worker }
73*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
74*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(float));
75*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(float));
76*4bdc9457SAndroid Build Coastguard Worker
77*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid elements (depends on n).
78*4bdc9457SAndroid Build Coastguard Worker n >>= 2 /* log2(sizeof(float)) */;
79*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
80*4bdc9457SAndroid Build Coastguard Worker
81*4bdc9457SAndroid Build Coastguard Worker const __m512 vf = _mm512_maskz_loadu_ps(vmask, input);
82*4bdc9457SAndroid Build Coastguard Worker const __m256i vh = _mm512_cvtps_ph(vf, _MM_FROUND_NO_EXC);
83*4bdc9457SAndroid Build Coastguard Worker _mm256_mask_storeu_epi16(o, vmask, vh);
84*4bdc9457SAndroid Build Coastguard Worker }
85*4bdc9457SAndroid Build Coastguard Worker }
86*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_qs8_vcvt_ukernel__avx512skx_x128(size_t n,const float * x,int8_t * y,const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])87*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_qs8_vcvt_ukernel__avx512skx_x128(
88*4bdc9457SAndroid Build Coastguard Worker size_t n,
89*4bdc9457SAndroid Build Coastguard Worker const float* x,
90*4bdc9457SAndroid Build Coastguard Worker int8_t* y,
91*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
92*4bdc9457SAndroid Build Coastguard Worker {
93*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
94*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(float) == 0);
95*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
96*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
97*4bdc9457SAndroid Build Coastguard Worker
98*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->avx2.scale);
99*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->avx512.output_max_less_zero_point);
100*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
101*4bdc9457SAndroid Build Coastguard Worker const __m512i vshuffle512_mask = _mm512_load_si512(params->avx512.shuffle512_mask);
102*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->avx512.output_min);
103*4bdc9457SAndroid Build Coastguard Worker for (; n >= 128 * sizeof(float); n -= 128 * sizeof(float)) {
104*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_loadu_ps(x);
105*4bdc9457SAndroid Build Coastguard Worker __m512 vx4567 = _mm512_loadu_ps(x + 16);
106*4bdc9457SAndroid Build Coastguard Worker __m512 vx89AB = _mm512_loadu_ps(x + 32);
107*4bdc9457SAndroid Build Coastguard Worker __m512 vxCDEF = _mm512_loadu_ps(x + 48);
108*4bdc9457SAndroid Build Coastguard Worker __m512 vxGHIJ = _mm512_loadu_ps(x + 64);
109*4bdc9457SAndroid Build Coastguard Worker __m512 vxKLMN = _mm512_loadu_ps(x + 80);
110*4bdc9457SAndroid Build Coastguard Worker __m512 vxOPQR = _mm512_loadu_ps(x + 96);
111*4bdc9457SAndroid Build Coastguard Worker __m512 vxSTUV = _mm512_loadu_ps(x + 112);
112*4bdc9457SAndroid Build Coastguard Worker x += 128;
113*4bdc9457SAndroid Build Coastguard Worker
114*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
115*4bdc9457SAndroid Build Coastguard Worker vx4567 = _mm512_mul_ps(vx4567, vscale);
116*4bdc9457SAndroid Build Coastguard Worker vx89AB = _mm512_mul_ps(vx89AB, vscale);
117*4bdc9457SAndroid Build Coastguard Worker vxCDEF = _mm512_mul_ps(vxCDEF, vscale);
118*4bdc9457SAndroid Build Coastguard Worker vxGHIJ = _mm512_mul_ps(vxGHIJ, vscale);
119*4bdc9457SAndroid Build Coastguard Worker vxKLMN = _mm512_mul_ps(vxKLMN, vscale);
120*4bdc9457SAndroid Build Coastguard Worker vxOPQR = _mm512_mul_ps(vxOPQR, vscale);
121*4bdc9457SAndroid Build Coastguard Worker vxSTUV = _mm512_mul_ps(vxSTUV, vscale);
122*4bdc9457SAndroid Build Coastguard Worker
123*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
124*4bdc9457SAndroid Build Coastguard Worker vx4567 = _mm512_min_ps(vx4567, voutput_max_less_zero_point);
125*4bdc9457SAndroid Build Coastguard Worker vx89AB = _mm512_min_ps(vx89AB, voutput_max_less_zero_point);
126*4bdc9457SAndroid Build Coastguard Worker vxCDEF = _mm512_min_ps(vxCDEF, voutput_max_less_zero_point);
127*4bdc9457SAndroid Build Coastguard Worker vxGHIJ = _mm512_min_ps(vxGHIJ, voutput_max_less_zero_point);
128*4bdc9457SAndroid Build Coastguard Worker vxKLMN = _mm512_min_ps(vxKLMN, voutput_max_less_zero_point);
129*4bdc9457SAndroid Build Coastguard Worker vxOPQR = _mm512_min_ps(vxOPQR, voutput_max_less_zero_point);
130*4bdc9457SAndroid Build Coastguard Worker vxSTUV = _mm512_min_ps(vxSTUV, voutput_max_less_zero_point);
131*4bdc9457SAndroid Build Coastguard Worker
132*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
133*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc4567 = _mm512_cvtps_epi32(vx4567);
134*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc89AB = _mm512_cvtps_epi32(vx89AB);
135*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccCDEF = _mm512_cvtps_epi32(vxCDEF);
136*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccGHIJ = _mm512_cvtps_epi32(vxGHIJ);
137*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccKLMN = _mm512_cvtps_epi32(vxKLMN);
138*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccOPQR = _mm512_cvtps_epi32(vxOPQR);
139*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccSTUV = _mm512_cvtps_epi32(vxSTUV);
140*4bdc9457SAndroid Build Coastguard Worker
141*4bdc9457SAndroid Build Coastguard Worker __m512i vacc04152637 = _mm512_packs_epi32(vacc0123, vacc4567);
142*4bdc9457SAndroid Build Coastguard Worker __m512i vacc8C9DAEBF = _mm512_packs_epi32(vacc89AB, vaccCDEF);
143*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGKHLIMJN = _mm512_packs_epi32(vaccGHIJ, vaccKLMN);
144*4bdc9457SAndroid Build Coastguard Worker __m512i vaccOSPTQURV = _mm512_packs_epi32(vaccOPQR, vaccSTUV);
145*4bdc9457SAndroid Build Coastguard Worker
146*4bdc9457SAndroid Build Coastguard Worker vacc04152637 = _mm512_adds_epi16(vacc04152637, voutput_zero_point);
147*4bdc9457SAndroid Build Coastguard Worker vacc8C9DAEBF = _mm512_adds_epi16(vacc8C9DAEBF, voutput_zero_point);
148*4bdc9457SAndroid Build Coastguard Worker vaccGKHLIMJN = _mm512_adds_epi16(vaccGKHLIMJN, voutput_zero_point);
149*4bdc9457SAndroid Build Coastguard Worker vaccOSPTQURV = _mm512_adds_epi16(vaccOSPTQURV, voutput_zero_point);
150*4bdc9457SAndroid Build Coastguard Worker
151*4bdc9457SAndroid Build Coastguard Worker __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF);
152*4bdc9457SAndroid Build Coastguard Worker __m512i vyGKOSHLPTIMQUJNRV = _mm512_packs_epi16(vaccGKHLIMJN, vaccOSPTQURV);
153*4bdc9457SAndroid Build Coastguard Worker
154*4bdc9457SAndroid Build Coastguard Worker vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min);
155*4bdc9457SAndroid Build Coastguard Worker vyGKOSHLPTIMQUJNRV = _mm512_max_epi8(vyGKOSHLPTIMQUJNRV, voutput_min);
156*4bdc9457SAndroid Build Coastguard Worker
157*4bdc9457SAndroid Build Coastguard Worker const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF);
158*4bdc9457SAndroid Build Coastguard Worker const __m512i vyGHIJKLMNOPQRSTUV = _mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV);
159*4bdc9457SAndroid Build Coastguard Worker
160*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_si512(y, vy0123456789ABCDEF);
161*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_si512(y + 64, vyGHIJKLMNOPQRSTUV);
162*4bdc9457SAndroid Build Coastguard Worker y += 128;
163*4bdc9457SAndroid Build Coastguard Worker }
164*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
165*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_loadu_ps(x);
166*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
167*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
168*4bdc9457SAndroid Build Coastguard Worker x += 16;
169*4bdc9457SAndroid Build Coastguard Worker
170*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
171*4bdc9457SAndroid Build Coastguard Worker
172*4bdc9457SAndroid Build Coastguard Worker __m256i vacc0213 = _mm256_packs_epi32(_mm512_castsi512_si256(vacc0123), _mm512_extracti32x8_epi32(vacc0123, 1));
173*4bdc9457SAndroid Build Coastguard Worker vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point));
174*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1));
175*4bdc9457SAndroid Build Coastguard Worker __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0));
176*4bdc9457SAndroid Build Coastguard Worker vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min));
177*4bdc9457SAndroid Build Coastguard Worker
178*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0123);
179*4bdc9457SAndroid Build Coastguard Worker y += 16;
180*4bdc9457SAndroid Build Coastguard Worker }
181*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
182*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(float));
183*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(float));
184*4bdc9457SAndroid Build Coastguard Worker
185*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid elements (depends on n).
186*4bdc9457SAndroid Build Coastguard Worker n >>= 2 /* log2(sizeof(float)) */;
187*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
188*4bdc9457SAndroid Build Coastguard Worker
189*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_maskz_loadu_ps(vmask, x);
190*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
191*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
192*4bdc9457SAndroid Build Coastguard Worker
193*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
194*4bdc9457SAndroid Build Coastguard Worker
195*4bdc9457SAndroid Build Coastguard Worker __m256i vacc0213 = _mm256_packs_epi32(_mm512_castsi512_si256(vacc0123), _mm512_extracti32x8_epi32(vacc0123, 1));
196*4bdc9457SAndroid Build Coastguard Worker vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point));
197*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1));
198*4bdc9457SAndroid Build Coastguard Worker __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0));
199*4bdc9457SAndroid Build Coastguard Worker vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min));
200*4bdc9457SAndroid Build Coastguard Worker
201*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(y, vmask, vy0123);
202*4bdc9457SAndroid Build Coastguard Worker }
203*4bdc9457SAndroid Build Coastguard Worker }
204*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_qu8_vcvt_ukernel__avx512skx_x128(size_t n,const float * x,uint8_t * y,const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])205*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_qu8_vcvt_ukernel__avx512skx_x128(
206*4bdc9457SAndroid Build Coastguard Worker size_t n,
207*4bdc9457SAndroid Build Coastguard Worker const float* x,
208*4bdc9457SAndroid Build Coastguard Worker uint8_t* y,
209*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
210*4bdc9457SAndroid Build Coastguard Worker {
211*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
212*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(float) == 0);
213*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
214*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
215*4bdc9457SAndroid Build Coastguard Worker
216*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->avx2.scale);
217*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->avx512.output_max_less_zero_point);
218*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
219*4bdc9457SAndroid Build Coastguard Worker const __m512i vshuffle512_mask = _mm512_load_si512(params->avx512.shuffle512_mask);
220*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->avx512.output_min);
221*4bdc9457SAndroid Build Coastguard Worker for (; n >= 128 * sizeof(float); n -= 128 * sizeof(float)) {
222*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_loadu_ps(x);
223*4bdc9457SAndroid Build Coastguard Worker __m512 vx4567 = _mm512_loadu_ps(x + 16);
224*4bdc9457SAndroid Build Coastguard Worker __m512 vx89AB = _mm512_loadu_ps(x + 32);
225*4bdc9457SAndroid Build Coastguard Worker __m512 vxCDEF = _mm512_loadu_ps(x + 48);
226*4bdc9457SAndroid Build Coastguard Worker __m512 vxGHIJ = _mm512_loadu_ps(x + 64);
227*4bdc9457SAndroid Build Coastguard Worker __m512 vxKLMN = _mm512_loadu_ps(x + 80);
228*4bdc9457SAndroid Build Coastguard Worker __m512 vxOPQR = _mm512_loadu_ps(x + 96);
229*4bdc9457SAndroid Build Coastguard Worker __m512 vxSTUV = _mm512_loadu_ps(x + 112);
230*4bdc9457SAndroid Build Coastguard Worker x += 128;
231*4bdc9457SAndroid Build Coastguard Worker
232*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
233*4bdc9457SAndroid Build Coastguard Worker vx4567 = _mm512_mul_ps(vx4567, vscale);
234*4bdc9457SAndroid Build Coastguard Worker vx89AB = _mm512_mul_ps(vx89AB, vscale);
235*4bdc9457SAndroid Build Coastguard Worker vxCDEF = _mm512_mul_ps(vxCDEF, vscale);
236*4bdc9457SAndroid Build Coastguard Worker vxGHIJ = _mm512_mul_ps(vxGHIJ, vscale);
237*4bdc9457SAndroid Build Coastguard Worker vxKLMN = _mm512_mul_ps(vxKLMN, vscale);
238*4bdc9457SAndroid Build Coastguard Worker vxOPQR = _mm512_mul_ps(vxOPQR, vscale);
239*4bdc9457SAndroid Build Coastguard Worker vxSTUV = _mm512_mul_ps(vxSTUV, vscale);
240*4bdc9457SAndroid Build Coastguard Worker
241*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
242*4bdc9457SAndroid Build Coastguard Worker vx4567 = _mm512_min_ps(vx4567, voutput_max_less_zero_point);
243*4bdc9457SAndroid Build Coastguard Worker vx89AB = _mm512_min_ps(vx89AB, voutput_max_less_zero_point);
244*4bdc9457SAndroid Build Coastguard Worker vxCDEF = _mm512_min_ps(vxCDEF, voutput_max_less_zero_point);
245*4bdc9457SAndroid Build Coastguard Worker vxGHIJ = _mm512_min_ps(vxGHIJ, voutput_max_less_zero_point);
246*4bdc9457SAndroid Build Coastguard Worker vxKLMN = _mm512_min_ps(vxKLMN, voutput_max_less_zero_point);
247*4bdc9457SAndroid Build Coastguard Worker vxOPQR = _mm512_min_ps(vxOPQR, voutput_max_less_zero_point);
248*4bdc9457SAndroid Build Coastguard Worker vxSTUV = _mm512_min_ps(vxSTUV, voutput_max_less_zero_point);
249*4bdc9457SAndroid Build Coastguard Worker
250*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
251*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc4567 = _mm512_cvtps_epi32(vx4567);
252*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc89AB = _mm512_cvtps_epi32(vx89AB);
253*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccCDEF = _mm512_cvtps_epi32(vxCDEF);
254*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccGHIJ = _mm512_cvtps_epi32(vxGHIJ);
255*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccKLMN = _mm512_cvtps_epi32(vxKLMN);
256*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccOPQR = _mm512_cvtps_epi32(vxOPQR);
257*4bdc9457SAndroid Build Coastguard Worker const __m512i vaccSTUV = _mm512_cvtps_epi32(vxSTUV);
258*4bdc9457SAndroid Build Coastguard Worker
259*4bdc9457SAndroid Build Coastguard Worker __m512i vacc04152637 = _mm512_packs_epi32(vacc0123, vacc4567);
260*4bdc9457SAndroid Build Coastguard Worker __m512i vacc8C9DAEBF = _mm512_packs_epi32(vacc89AB, vaccCDEF);
261*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGKHLIMJN = _mm512_packs_epi32(vaccGHIJ, vaccKLMN);
262*4bdc9457SAndroid Build Coastguard Worker __m512i vaccOSPTQURV = _mm512_packs_epi32(vaccOPQR, vaccSTUV);
263*4bdc9457SAndroid Build Coastguard Worker
264*4bdc9457SAndroid Build Coastguard Worker vacc04152637 = _mm512_adds_epi16(vacc04152637, voutput_zero_point);
265*4bdc9457SAndroid Build Coastguard Worker vacc8C9DAEBF = _mm512_adds_epi16(vacc8C9DAEBF, voutput_zero_point);
266*4bdc9457SAndroid Build Coastguard Worker vaccGKHLIMJN = _mm512_adds_epi16(vaccGKHLIMJN, voutput_zero_point);
267*4bdc9457SAndroid Build Coastguard Worker vaccOSPTQURV = _mm512_adds_epi16(vaccOSPTQURV, voutput_zero_point);
268*4bdc9457SAndroid Build Coastguard Worker
269*4bdc9457SAndroid Build Coastguard Worker __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF);
270*4bdc9457SAndroid Build Coastguard Worker __m512i vyGKOSHLPTIMQUJNRV = _mm512_packus_epi16(vaccGKHLIMJN, vaccOSPTQURV);
271*4bdc9457SAndroid Build Coastguard Worker
272*4bdc9457SAndroid Build Coastguard Worker vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min);
273*4bdc9457SAndroid Build Coastguard Worker vyGKOSHLPTIMQUJNRV = _mm512_max_epu8(vyGKOSHLPTIMQUJNRV, voutput_min);
274*4bdc9457SAndroid Build Coastguard Worker
275*4bdc9457SAndroid Build Coastguard Worker const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF);
276*4bdc9457SAndroid Build Coastguard Worker const __m512i vyGHIJKLMNOPQRSTUV = _mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV);
277*4bdc9457SAndroid Build Coastguard Worker
278*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_si512(y, vy0123456789ABCDEF);
279*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_si512(y + 64, vyGHIJKLMNOPQRSTUV);
280*4bdc9457SAndroid Build Coastguard Worker y += 128;
281*4bdc9457SAndroid Build Coastguard Worker }
282*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
283*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_loadu_ps(x);
284*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
285*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
286*4bdc9457SAndroid Build Coastguard Worker x += 16;
287*4bdc9457SAndroid Build Coastguard Worker
288*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
289*4bdc9457SAndroid Build Coastguard Worker
290*4bdc9457SAndroid Build Coastguard Worker __m256i vacc0213 = _mm256_packs_epi32(_mm512_castsi512_si256(vacc0123), _mm512_extracti32x8_epi32(vacc0123, 1));
291*4bdc9457SAndroid Build Coastguard Worker vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point));
292*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1));
293*4bdc9457SAndroid Build Coastguard Worker __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0));
294*4bdc9457SAndroid Build Coastguard Worker vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min));
295*4bdc9457SAndroid Build Coastguard Worker
296*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0123);
297*4bdc9457SAndroid Build Coastguard Worker y += 16;
298*4bdc9457SAndroid Build Coastguard Worker }
299*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
300*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(float));
301*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(float));
302*4bdc9457SAndroid Build Coastguard Worker
303*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid elements (depends on n).
304*4bdc9457SAndroid Build Coastguard Worker n >>= 2 /* log2(sizeof(float)) */;
305*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
306*4bdc9457SAndroid Build Coastguard Worker
307*4bdc9457SAndroid Build Coastguard Worker __m512 vx0123 = _mm512_maskz_loadu_ps(vmask, x);
308*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_mul_ps(vx0123, vscale);
309*4bdc9457SAndroid Build Coastguard Worker vx0123 = _mm512_min_ps(vx0123, voutput_max_less_zero_point);
310*4bdc9457SAndroid Build Coastguard Worker
311*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0123 = _mm512_cvtps_epi32(vx0123);
312*4bdc9457SAndroid Build Coastguard Worker
313*4bdc9457SAndroid Build Coastguard Worker __m256i vacc0213 = _mm256_packs_epi32(_mm512_castsi512_si256(vacc0123), _mm512_extracti32x8_epi32(vacc0123, 1));
314*4bdc9457SAndroid Build Coastguard Worker vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point));
315*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1));
316*4bdc9457SAndroid Build Coastguard Worker __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0));
317*4bdc9457SAndroid Build Coastguard Worker vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min));
318*4bdc9457SAndroid Build Coastguard Worker
319*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(y, vmask, vy0123);
320*4bdc9457SAndroid Build Coastguard Worker }
321*4bdc9457SAndroid Build Coastguard Worker }
322*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])323*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(
324*4bdc9457SAndroid Build Coastguard Worker size_t channels,
325*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
326*4bdc9457SAndroid Build Coastguard Worker const int8_t** input,
327*4bdc9457SAndroid Build Coastguard Worker const void* weights,
328*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
329*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
330*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
331*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
332*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
333*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
334*4bdc9457SAndroid Build Coastguard Worker {
335*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
336*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
337*4bdc9457SAndroid Build Coastguard Worker
338*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
339*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
340*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
341*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
342*4bdc9457SAndroid Build Coastguard Worker
343*4bdc9457SAndroid Build Coastguard Worker do {
344*4bdc9457SAndroid Build Coastguard Worker const int8_t* i0 = input[0];
345*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
346*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
347*4bdc9457SAndroid Build Coastguard Worker i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
348*4bdc9457SAndroid Build Coastguard Worker }
349*4bdc9457SAndroid Build Coastguard Worker const int8_t* i1 = input[1];
350*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
351*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
352*4bdc9457SAndroid Build Coastguard Worker i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
353*4bdc9457SAndroid Build Coastguard Worker }
354*4bdc9457SAndroid Build Coastguard Worker const int8_t* i2 = input[2];
355*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
356*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
357*4bdc9457SAndroid Build Coastguard Worker i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
358*4bdc9457SAndroid Build Coastguard Worker }
359*4bdc9457SAndroid Build Coastguard Worker const int8_t* i3 = input[3];
360*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
361*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
362*4bdc9457SAndroid Build Coastguard Worker i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
363*4bdc9457SAndroid Build Coastguard Worker }
364*4bdc9457SAndroid Build Coastguard Worker const int8_t* i4 = input[4];
365*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
366*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
367*4bdc9457SAndroid Build Coastguard Worker i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
368*4bdc9457SAndroid Build Coastguard Worker }
369*4bdc9457SAndroid Build Coastguard Worker const int8_t* i5 = input[5];
370*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
371*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
372*4bdc9457SAndroid Build Coastguard Worker i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
373*4bdc9457SAndroid Build Coastguard Worker }
374*4bdc9457SAndroid Build Coastguard Worker const int8_t* i6 = input[6];
375*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
376*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
377*4bdc9457SAndroid Build Coastguard Worker i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
378*4bdc9457SAndroid Build Coastguard Worker }
379*4bdc9457SAndroid Build Coastguard Worker const int8_t* i7 = input[7];
380*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
381*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
382*4bdc9457SAndroid Build Coastguard Worker i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
383*4bdc9457SAndroid Build Coastguard Worker }
384*4bdc9457SAndroid Build Coastguard Worker const int8_t* i8 = input[8];
385*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
386*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
387*4bdc9457SAndroid Build Coastguard Worker i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
388*4bdc9457SAndroid Build Coastguard Worker }
389*4bdc9457SAndroid Build Coastguard Worker const int8_t* i9 = input[9];
390*4bdc9457SAndroid Build Coastguard Worker assert(i9 != NULL);
391*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i9 != zero) {
392*4bdc9457SAndroid Build Coastguard Worker i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
393*4bdc9457SAndroid Build Coastguard Worker }
394*4bdc9457SAndroid Build Coastguard Worker const int8_t* i10 = input[10];
395*4bdc9457SAndroid Build Coastguard Worker assert(i10 != NULL);
396*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i10 != zero) {
397*4bdc9457SAndroid Build Coastguard Worker i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
398*4bdc9457SAndroid Build Coastguard Worker }
399*4bdc9457SAndroid Build Coastguard Worker const int8_t* i11 = input[11];
400*4bdc9457SAndroid Build Coastguard Worker assert(i11 != NULL);
401*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i11 != zero) {
402*4bdc9457SAndroid Build Coastguard Worker i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
403*4bdc9457SAndroid Build Coastguard Worker }
404*4bdc9457SAndroid Build Coastguard Worker const int8_t* i12 = input[12];
405*4bdc9457SAndroid Build Coastguard Worker assert(i12 != NULL);
406*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i12 != zero) {
407*4bdc9457SAndroid Build Coastguard Worker i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
408*4bdc9457SAndroid Build Coastguard Worker }
409*4bdc9457SAndroid Build Coastguard Worker const int8_t* i13 = input[13];
410*4bdc9457SAndroid Build Coastguard Worker assert(i13 != NULL);
411*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i13 != zero) {
412*4bdc9457SAndroid Build Coastguard Worker i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
413*4bdc9457SAndroid Build Coastguard Worker }
414*4bdc9457SAndroid Build Coastguard Worker const int8_t* i14 = input[14];
415*4bdc9457SAndroid Build Coastguard Worker assert(i14 != NULL);
416*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i14 != zero) {
417*4bdc9457SAndroid Build Coastguard Worker i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
418*4bdc9457SAndroid Build Coastguard Worker }
419*4bdc9457SAndroid Build Coastguard Worker const int8_t* i15 = input[15];
420*4bdc9457SAndroid Build Coastguard Worker assert(i15 != NULL);
421*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i15 != zero) {
422*4bdc9457SAndroid Build Coastguard Worker i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
423*4bdc9457SAndroid Build Coastguard Worker }
424*4bdc9457SAndroid Build Coastguard Worker const int8_t* i16 = input[16];
425*4bdc9457SAndroid Build Coastguard Worker assert(i16 != NULL);
426*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i16 != zero) {
427*4bdc9457SAndroid Build Coastguard Worker i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
428*4bdc9457SAndroid Build Coastguard Worker }
429*4bdc9457SAndroid Build Coastguard Worker const int8_t* i17 = input[17];
430*4bdc9457SAndroid Build Coastguard Worker assert(i17 != NULL);
431*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i17 != zero) {
432*4bdc9457SAndroid Build Coastguard Worker i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
433*4bdc9457SAndroid Build Coastguard Worker }
434*4bdc9457SAndroid Build Coastguard Worker const int8_t* i18 = input[18];
435*4bdc9457SAndroid Build Coastguard Worker assert(i18 != NULL);
436*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i18 != zero) {
437*4bdc9457SAndroid Build Coastguard Worker i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
438*4bdc9457SAndroid Build Coastguard Worker }
439*4bdc9457SAndroid Build Coastguard Worker const int8_t* i19 = input[19];
440*4bdc9457SAndroid Build Coastguard Worker assert(i19 != NULL);
441*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i19 != zero) {
442*4bdc9457SAndroid Build Coastguard Worker i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
443*4bdc9457SAndroid Build Coastguard Worker }
444*4bdc9457SAndroid Build Coastguard Worker const int8_t* i20 = input[20];
445*4bdc9457SAndroid Build Coastguard Worker assert(i20 != NULL);
446*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i20 != zero) {
447*4bdc9457SAndroid Build Coastguard Worker i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
448*4bdc9457SAndroid Build Coastguard Worker }
449*4bdc9457SAndroid Build Coastguard Worker const int8_t* i21 = input[21];
450*4bdc9457SAndroid Build Coastguard Worker assert(i21 != NULL);
451*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i21 != zero) {
452*4bdc9457SAndroid Build Coastguard Worker i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
453*4bdc9457SAndroid Build Coastguard Worker }
454*4bdc9457SAndroid Build Coastguard Worker const int8_t* i22 = input[22];
455*4bdc9457SAndroid Build Coastguard Worker assert(i22 != NULL);
456*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i22 != zero) {
457*4bdc9457SAndroid Build Coastguard Worker i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
458*4bdc9457SAndroid Build Coastguard Worker }
459*4bdc9457SAndroid Build Coastguard Worker const int8_t* i23 = input[23];
460*4bdc9457SAndroid Build Coastguard Worker assert(i23 != NULL);
461*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i23 != zero) {
462*4bdc9457SAndroid Build Coastguard Worker i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
463*4bdc9457SAndroid Build Coastguard Worker }
464*4bdc9457SAndroid Build Coastguard Worker const int8_t* i24 = input[24];
465*4bdc9457SAndroid Build Coastguard Worker assert(i24 != NULL);
466*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i24 != zero) {
467*4bdc9457SAndroid Build Coastguard Worker i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
468*4bdc9457SAndroid Build Coastguard Worker }
469*4bdc9457SAndroid Build Coastguard Worker input = (const int8_t**) ((uintptr_t) input + input_stride);
470*4bdc9457SAndroid Build Coastguard Worker
471*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
472*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
473*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
474*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
475*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
476*4bdc9457SAndroid Build Coastguard Worker
477*4bdc9457SAndroid Build Coastguard Worker
478*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
479*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
480*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
481*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
482*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
483*4bdc9457SAndroid Build Coastguard Worker
484*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
485*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
486*4bdc9457SAndroid Build Coastguard Worker
487*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
488*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
489*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
490*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
491*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
492*4bdc9457SAndroid Build Coastguard Worker
493*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
494*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
495*4bdc9457SAndroid Build Coastguard Worker
496*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
497*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
498*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
499*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
500*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
501*4bdc9457SAndroid Build Coastguard Worker
502*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
503*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
504*4bdc9457SAndroid Build Coastguard Worker
505*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
506*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t))));
507*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
508*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t))));
509*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
510*4bdc9457SAndroid Build Coastguard Worker
511*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
512*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
513*4bdc9457SAndroid Build Coastguard Worker
514*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
515*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t))));
516*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
517*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t))));
518*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
519*4bdc9457SAndroid Build Coastguard Worker
520*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
521*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
522*4bdc9457SAndroid Build Coastguard Worker
523*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
524*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t))));
525*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
526*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t))));
527*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
528*4bdc9457SAndroid Build Coastguard Worker
529*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
530*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
531*4bdc9457SAndroid Build Coastguard Worker
532*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
533*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t))));
534*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
535*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t))));
536*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
537*4bdc9457SAndroid Build Coastguard Worker
538*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
539*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
540*4bdc9457SAndroid Build Coastguard Worker
541*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
542*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t))));
543*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
544*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t))));
545*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
546*4bdc9457SAndroid Build Coastguard Worker
547*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
548*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
549*4bdc9457SAndroid Build Coastguard Worker
550*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
551*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t))));
552*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
553*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t))));
554*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
555*4bdc9457SAndroid Build Coastguard Worker
556*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
557*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
558*4bdc9457SAndroid Build Coastguard Worker
559*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9));
560*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t))));
561*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i9 + 16)));
562*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 304 * sizeof(int8_t))));
563*4bdc9457SAndroid Build Coastguard Worker i9 += 32;
564*4bdc9457SAndroid Build Coastguard Worker
565*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
566*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi9xGHIJKLMNOPQRSTUV, vk9xGHIJKLMNOPQRSTUV));
567*4bdc9457SAndroid Build Coastguard Worker
568*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10));
569*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 320 * sizeof(int8_t))));
570*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i10 + 16)));
571*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 336 * sizeof(int8_t))));
572*4bdc9457SAndroid Build Coastguard Worker i10 += 32;
573*4bdc9457SAndroid Build Coastguard Worker
574*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
575*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi10xGHIJKLMNOPQRSTUV, vk10xGHIJKLMNOPQRSTUV));
576*4bdc9457SAndroid Build Coastguard Worker
577*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11));
578*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 352 * sizeof(int8_t))));
579*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i11 + 16)));
580*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 368 * sizeof(int8_t))));
581*4bdc9457SAndroid Build Coastguard Worker i11 += 32;
582*4bdc9457SAndroid Build Coastguard Worker
583*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
584*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi11xGHIJKLMNOPQRSTUV, vk11xGHIJKLMNOPQRSTUV));
585*4bdc9457SAndroid Build Coastguard Worker
586*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12));
587*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 384 * sizeof(int8_t))));
588*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i12 + 16)));
589*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 400 * sizeof(int8_t))));
590*4bdc9457SAndroid Build Coastguard Worker i12 += 32;
591*4bdc9457SAndroid Build Coastguard Worker
592*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
593*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi12xGHIJKLMNOPQRSTUV, vk12xGHIJKLMNOPQRSTUV));
594*4bdc9457SAndroid Build Coastguard Worker
595*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13));
596*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 416 * sizeof(int8_t))));
597*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i13 + 16)));
598*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 432 * sizeof(int8_t))));
599*4bdc9457SAndroid Build Coastguard Worker i13 += 32;
600*4bdc9457SAndroid Build Coastguard Worker
601*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
602*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi13xGHIJKLMNOPQRSTUV, vk13xGHIJKLMNOPQRSTUV));
603*4bdc9457SAndroid Build Coastguard Worker
604*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14));
605*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 448 * sizeof(int8_t))));
606*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i14 + 16)));
607*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 464 * sizeof(int8_t))));
608*4bdc9457SAndroid Build Coastguard Worker i14 += 32;
609*4bdc9457SAndroid Build Coastguard Worker
610*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
611*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi14xGHIJKLMNOPQRSTUV, vk14xGHIJKLMNOPQRSTUV));
612*4bdc9457SAndroid Build Coastguard Worker
613*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15));
614*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 480 * sizeof(int8_t))));
615*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i15 + 16)));
616*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 496 * sizeof(int8_t))));
617*4bdc9457SAndroid Build Coastguard Worker i15 += 32;
618*4bdc9457SAndroid Build Coastguard Worker
619*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
620*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi15xGHIJKLMNOPQRSTUV, vk15xGHIJKLMNOPQRSTUV));
621*4bdc9457SAndroid Build Coastguard Worker
622*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16));
623*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 512 * sizeof(int8_t))));
624*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i16 + 16)));
625*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 528 * sizeof(int8_t))));
626*4bdc9457SAndroid Build Coastguard Worker i16 += 32;
627*4bdc9457SAndroid Build Coastguard Worker
628*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
629*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi16xGHIJKLMNOPQRSTUV, vk16xGHIJKLMNOPQRSTUV));
630*4bdc9457SAndroid Build Coastguard Worker
631*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17));
632*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 544 * sizeof(int8_t))));
633*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i17 + 16)));
634*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 560 * sizeof(int8_t))));
635*4bdc9457SAndroid Build Coastguard Worker i17 += 32;
636*4bdc9457SAndroid Build Coastguard Worker
637*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
638*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi17xGHIJKLMNOPQRSTUV, vk17xGHIJKLMNOPQRSTUV));
639*4bdc9457SAndroid Build Coastguard Worker
640*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18));
641*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 576 * sizeof(int8_t))));
642*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i18 + 16)));
643*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 592 * sizeof(int8_t))));
644*4bdc9457SAndroid Build Coastguard Worker i18 += 32;
645*4bdc9457SAndroid Build Coastguard Worker
646*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
647*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi18xGHIJKLMNOPQRSTUV, vk18xGHIJKLMNOPQRSTUV));
648*4bdc9457SAndroid Build Coastguard Worker
649*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19));
650*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 608 * sizeof(int8_t))));
651*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i19 + 16)));
652*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 624 * sizeof(int8_t))));
653*4bdc9457SAndroid Build Coastguard Worker i19 += 32;
654*4bdc9457SAndroid Build Coastguard Worker
655*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
656*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi19xGHIJKLMNOPQRSTUV, vk19xGHIJKLMNOPQRSTUV));
657*4bdc9457SAndroid Build Coastguard Worker
658*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20));
659*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 640 * sizeof(int8_t))));
660*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i20 + 16)));
661*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 656 * sizeof(int8_t))));
662*4bdc9457SAndroid Build Coastguard Worker i20 += 32;
663*4bdc9457SAndroid Build Coastguard Worker
664*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
665*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi20xGHIJKLMNOPQRSTUV, vk20xGHIJKLMNOPQRSTUV));
666*4bdc9457SAndroid Build Coastguard Worker
667*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21));
668*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 672 * sizeof(int8_t))));
669*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i21 + 16)));
670*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 688 * sizeof(int8_t))));
671*4bdc9457SAndroid Build Coastguard Worker i21 += 32;
672*4bdc9457SAndroid Build Coastguard Worker
673*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
674*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi21xGHIJKLMNOPQRSTUV, vk21xGHIJKLMNOPQRSTUV));
675*4bdc9457SAndroid Build Coastguard Worker
676*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22));
677*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 704 * sizeof(int8_t))));
678*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i22 + 16)));
679*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 720 * sizeof(int8_t))));
680*4bdc9457SAndroid Build Coastguard Worker i22 += 32;
681*4bdc9457SAndroid Build Coastguard Worker
682*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
683*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi22xGHIJKLMNOPQRSTUV, vk22xGHIJKLMNOPQRSTUV));
684*4bdc9457SAndroid Build Coastguard Worker
685*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23));
686*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 736 * sizeof(int8_t))));
687*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i23 + 16)));
688*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 752 * sizeof(int8_t))));
689*4bdc9457SAndroid Build Coastguard Worker i23 += 32;
690*4bdc9457SAndroid Build Coastguard Worker
691*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
692*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi23xGHIJKLMNOPQRSTUV, vk23xGHIJKLMNOPQRSTUV));
693*4bdc9457SAndroid Build Coastguard Worker
694*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24));
695*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 768 * sizeof(int8_t))));
696*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i24 + 16)));
697*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 784 * sizeof(int8_t))));
698*4bdc9457SAndroid Build Coastguard Worker i24 += 32;
699*4bdc9457SAndroid Build Coastguard Worker
700*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
701*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi24xGHIJKLMNOPQRSTUV, vk24xGHIJKLMNOPQRSTUV));
702*4bdc9457SAndroid Build Coastguard Worker
703*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t));
704*4bdc9457SAndroid Build Coastguard Worker
705*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
706*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
707*4bdc9457SAndroid Build Coastguard Worker
708*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps(w);
709*4bdc9457SAndroid Build Coastguard Worker const __m512 vscaleGHIJKLMNOPQRSTUV = _mm512_loadu_ps((const void*) ((uintptr_t) w + 16 * sizeof(float)));
710*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(float));
711*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
712*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscaleGHIJKLMNOPQRSTUV);
713*4bdc9457SAndroid Build Coastguard Worker
714*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
715*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
716*4bdc9457SAndroid Build Coastguard Worker
717*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
718*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
719*4bdc9457SAndroid Build Coastguard Worker
720*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
721*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
722*4bdc9457SAndroid Build Coastguard Worker
723*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
724*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
725*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
726*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
727*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
728*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
729*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
730*4bdc9457SAndroid Build Coastguard Worker
731*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
732*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
733*4bdc9457SAndroid Build Coastguard Worker
734*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
735*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
736*4bdc9457SAndroid Build Coastguard Worker output += 32;
737*4bdc9457SAndroid Build Coastguard Worker }
738*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
739*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
740*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
741*4bdc9457SAndroid Build Coastguard Worker const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
742*4bdc9457SAndroid Build Coastguard Worker do {
743*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
744*4bdc9457SAndroid Build Coastguard Worker
745*4bdc9457SAndroid Build Coastguard Worker
746*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
747*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k));
748*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
749*4bdc9457SAndroid Build Coastguard Worker
750*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
751*4bdc9457SAndroid Build Coastguard Worker
752*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
753*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32)));
754*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
755*4bdc9457SAndroid Build Coastguard Worker
756*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
757*4bdc9457SAndroid Build Coastguard Worker
758*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
759*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64)));
760*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
761*4bdc9457SAndroid Build Coastguard Worker
762*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
763*4bdc9457SAndroid Build Coastguard Worker
764*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
765*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 96)));
766*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
767*4bdc9457SAndroid Build Coastguard Worker
768*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
769*4bdc9457SAndroid Build Coastguard Worker
770*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
771*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 128)));
772*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
773*4bdc9457SAndroid Build Coastguard Worker
774*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
775*4bdc9457SAndroid Build Coastguard Worker
776*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
777*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 160)));
778*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
779*4bdc9457SAndroid Build Coastguard Worker
780*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
781*4bdc9457SAndroid Build Coastguard Worker
782*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
783*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 192)));
784*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
785*4bdc9457SAndroid Build Coastguard Worker
786*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
787*4bdc9457SAndroid Build Coastguard Worker
788*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
789*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 224)));
790*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
791*4bdc9457SAndroid Build Coastguard Worker
792*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
793*4bdc9457SAndroid Build Coastguard Worker
794*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
795*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 256)));
796*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
797*4bdc9457SAndroid Build Coastguard Worker
798*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
799*4bdc9457SAndroid Build Coastguard Worker
800*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9));
801*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 288)));
802*4bdc9457SAndroid Build Coastguard Worker i9 += 16;
803*4bdc9457SAndroid Build Coastguard Worker
804*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
805*4bdc9457SAndroid Build Coastguard Worker
806*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10));
807*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 320)));
808*4bdc9457SAndroid Build Coastguard Worker i10 += 16;
809*4bdc9457SAndroid Build Coastguard Worker
810*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
811*4bdc9457SAndroid Build Coastguard Worker
812*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11));
813*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 352)));
814*4bdc9457SAndroid Build Coastguard Worker i11 += 16;
815*4bdc9457SAndroid Build Coastguard Worker
816*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
817*4bdc9457SAndroid Build Coastguard Worker
818*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12));
819*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 384)));
820*4bdc9457SAndroid Build Coastguard Worker i12 += 16;
821*4bdc9457SAndroid Build Coastguard Worker
822*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
823*4bdc9457SAndroid Build Coastguard Worker
824*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13));
825*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 416)));
826*4bdc9457SAndroid Build Coastguard Worker i13 += 16;
827*4bdc9457SAndroid Build Coastguard Worker
828*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
829*4bdc9457SAndroid Build Coastguard Worker
830*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14));
831*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 448)));
832*4bdc9457SAndroid Build Coastguard Worker i14 += 16;
833*4bdc9457SAndroid Build Coastguard Worker
834*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
835*4bdc9457SAndroid Build Coastguard Worker
836*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15));
837*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 480)));
838*4bdc9457SAndroid Build Coastguard Worker i15 += 16;
839*4bdc9457SAndroid Build Coastguard Worker
840*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
841*4bdc9457SAndroid Build Coastguard Worker
842*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16));
843*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 512)));
844*4bdc9457SAndroid Build Coastguard Worker i16 += 16;
845*4bdc9457SAndroid Build Coastguard Worker
846*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
847*4bdc9457SAndroid Build Coastguard Worker
848*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17));
849*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 544)));
850*4bdc9457SAndroid Build Coastguard Worker i17 += 16;
851*4bdc9457SAndroid Build Coastguard Worker
852*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
853*4bdc9457SAndroid Build Coastguard Worker
854*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18));
855*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 576)));
856*4bdc9457SAndroid Build Coastguard Worker i18 += 16;
857*4bdc9457SAndroid Build Coastguard Worker
858*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
859*4bdc9457SAndroid Build Coastguard Worker
860*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19));
861*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 608)));
862*4bdc9457SAndroid Build Coastguard Worker i19 += 16;
863*4bdc9457SAndroid Build Coastguard Worker
864*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
865*4bdc9457SAndroid Build Coastguard Worker
866*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20));
867*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 640)));
868*4bdc9457SAndroid Build Coastguard Worker i20 += 16;
869*4bdc9457SAndroid Build Coastguard Worker
870*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
871*4bdc9457SAndroid Build Coastguard Worker
872*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21));
873*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 672)));
874*4bdc9457SAndroid Build Coastguard Worker i21 += 16;
875*4bdc9457SAndroid Build Coastguard Worker
876*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
877*4bdc9457SAndroid Build Coastguard Worker
878*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22));
879*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 704)));
880*4bdc9457SAndroid Build Coastguard Worker i22 += 16;
881*4bdc9457SAndroid Build Coastguard Worker
882*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
883*4bdc9457SAndroid Build Coastguard Worker
884*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23));
885*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 736)));
886*4bdc9457SAndroid Build Coastguard Worker i23 += 16;
887*4bdc9457SAndroid Build Coastguard Worker
888*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
889*4bdc9457SAndroid Build Coastguard Worker
890*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24));
891*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 768)));
892*4bdc9457SAndroid Build Coastguard Worker i24 += 16;
893*4bdc9457SAndroid Build Coastguard Worker
894*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
895*4bdc9457SAndroid Build Coastguard Worker
896*4bdc9457SAndroid Build Coastguard Worker k += 16;
897*4bdc9457SAndroid Build Coastguard Worker
898*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
899*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps((const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t)));
900*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
901*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
902*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
903*4bdc9457SAndroid Build Coastguard Worker
904*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
905*4bdc9457SAndroid Build Coastguard Worker
906*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
907*4bdc9457SAndroid Build Coastguard Worker
908*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
909*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
910*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
911*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
912*4bdc9457SAndroid Build Coastguard Worker
913*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
914*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
915*4bdc9457SAndroid Build Coastguard Worker output += 16;
916*4bdc9457SAndroid Build Coastguard Worker c -= 16;
917*4bdc9457SAndroid Build Coastguard Worker } else {
918*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
919*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + c);
920*4bdc9457SAndroid Build Coastguard Worker c = 0;
921*4bdc9457SAndroid Build Coastguard Worker }
922*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
923*4bdc9457SAndroid Build Coastguard Worker }
924*4bdc9457SAndroid Build Coastguard Worker
925*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + output_increment);
926*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
927*4bdc9457SAndroid Build Coastguard Worker }
928*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_dwconv_minmax_fp32_ukernel_up32x3__avx512skx_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])929*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up32x3__avx512skx_mul32(
930*4bdc9457SAndroid Build Coastguard Worker size_t channels,
931*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
932*4bdc9457SAndroid Build Coastguard Worker const int8_t** input,
933*4bdc9457SAndroid Build Coastguard Worker const void* weights,
934*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
935*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
936*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
937*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
938*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
939*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
940*4bdc9457SAndroid Build Coastguard Worker {
941*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
942*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
943*4bdc9457SAndroid Build Coastguard Worker
944*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
945*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
946*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
947*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
948*4bdc9457SAndroid Build Coastguard Worker
949*4bdc9457SAndroid Build Coastguard Worker do {
950*4bdc9457SAndroid Build Coastguard Worker const int8_t* i0 = input[0];
951*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
952*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
953*4bdc9457SAndroid Build Coastguard Worker i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
954*4bdc9457SAndroid Build Coastguard Worker }
955*4bdc9457SAndroid Build Coastguard Worker const int8_t* i1 = input[1];
956*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
957*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
958*4bdc9457SAndroid Build Coastguard Worker i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
959*4bdc9457SAndroid Build Coastguard Worker }
960*4bdc9457SAndroid Build Coastguard Worker const int8_t* i2 = input[2];
961*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
962*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
963*4bdc9457SAndroid Build Coastguard Worker i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
964*4bdc9457SAndroid Build Coastguard Worker }
965*4bdc9457SAndroid Build Coastguard Worker input = (const int8_t**) ((uintptr_t) input + input_stride);
966*4bdc9457SAndroid Build Coastguard Worker
967*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
968*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
969*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
970*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
971*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
972*4bdc9457SAndroid Build Coastguard Worker
973*4bdc9457SAndroid Build Coastguard Worker
974*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
975*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
976*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
977*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
978*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
979*4bdc9457SAndroid Build Coastguard Worker
980*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
981*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
982*4bdc9457SAndroid Build Coastguard Worker
983*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
984*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
985*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
986*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
987*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
988*4bdc9457SAndroid Build Coastguard Worker
989*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
990*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
991*4bdc9457SAndroid Build Coastguard Worker
992*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
993*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
994*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
995*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
996*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
997*4bdc9457SAndroid Build Coastguard Worker
998*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
999*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
1000*4bdc9457SAndroid Build Coastguard Worker
1001*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t));
1002*4bdc9457SAndroid Build Coastguard Worker
1003*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
1004*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
1005*4bdc9457SAndroid Build Coastguard Worker
1006*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps(w);
1007*4bdc9457SAndroid Build Coastguard Worker const __m512 vscaleGHIJKLMNOPQRSTUV = _mm512_loadu_ps((const void*) ((uintptr_t) w + 16 * sizeof(float)));
1008*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(float));
1009*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
1010*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscaleGHIJKLMNOPQRSTUV);
1011*4bdc9457SAndroid Build Coastguard Worker
1012*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
1013*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
1014*4bdc9457SAndroid Build Coastguard Worker
1015*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
1016*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
1017*4bdc9457SAndroid Build Coastguard Worker
1018*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
1019*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
1020*4bdc9457SAndroid Build Coastguard Worker
1021*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
1022*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
1023*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
1024*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
1025*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
1026*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
1027*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
1028*4bdc9457SAndroid Build Coastguard Worker
1029*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
1030*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
1031*4bdc9457SAndroid Build Coastguard Worker
1032*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
1033*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
1034*4bdc9457SAndroid Build Coastguard Worker output += 32;
1035*4bdc9457SAndroid Build Coastguard Worker }
1036*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1037*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on the remaining channel count c).
1038*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
1039*4bdc9457SAndroid Build Coastguard Worker const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
1040*4bdc9457SAndroid Build Coastguard Worker do {
1041*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
1042*4bdc9457SAndroid Build Coastguard Worker
1043*4bdc9457SAndroid Build Coastguard Worker
1044*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
1045*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k));
1046*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1047*4bdc9457SAndroid Build Coastguard Worker
1048*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
1049*4bdc9457SAndroid Build Coastguard Worker
1050*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
1051*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32)));
1052*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1053*4bdc9457SAndroid Build Coastguard Worker
1054*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
1055*4bdc9457SAndroid Build Coastguard Worker
1056*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
1057*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64)));
1058*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
1059*4bdc9457SAndroid Build Coastguard Worker
1060*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
1061*4bdc9457SAndroid Build Coastguard Worker
1062*4bdc9457SAndroid Build Coastguard Worker k += 16;
1063*4bdc9457SAndroid Build Coastguard Worker
1064*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
1065*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps((const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t)));
1066*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
1067*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
1068*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
1069*4bdc9457SAndroid Build Coastguard Worker
1070*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
1071*4bdc9457SAndroid Build Coastguard Worker
1072*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
1073*4bdc9457SAndroid Build Coastguard Worker
1074*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
1075*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
1076*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
1077*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
1078*4bdc9457SAndroid Build Coastguard Worker
1079*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
1080*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
1081*4bdc9457SAndroid Build Coastguard Worker output += 16;
1082*4bdc9457SAndroid Build Coastguard Worker c -= 16;
1083*4bdc9457SAndroid Build Coastguard Worker } else {
1084*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
1085*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + c);
1086*4bdc9457SAndroid Build Coastguard Worker c = 0;
1087*4bdc9457SAndroid Build Coastguard Worker }
1088*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
1089*4bdc9457SAndroid Build Coastguard Worker }
1090*4bdc9457SAndroid Build Coastguard Worker
1091*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + output_increment);
1092*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1093*4bdc9457SAndroid Build Coastguard Worker }
1094*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1095*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(
1096*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1097*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1098*4bdc9457SAndroid Build Coastguard Worker const int8_t** input,
1099*4bdc9457SAndroid Build Coastguard Worker const void* weights,
1100*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
1101*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1102*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1103*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1104*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
1105*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
1106*4bdc9457SAndroid Build Coastguard Worker {
1107*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1108*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1109*4bdc9457SAndroid Build Coastguard Worker
1110*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1111*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
1112*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
1113*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
1114*4bdc9457SAndroid Build Coastguard Worker
1115*4bdc9457SAndroid Build Coastguard Worker do {
1116*4bdc9457SAndroid Build Coastguard Worker const int8_t* i0 = input[0];
1117*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
1118*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
1119*4bdc9457SAndroid Build Coastguard Worker i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1120*4bdc9457SAndroid Build Coastguard Worker }
1121*4bdc9457SAndroid Build Coastguard Worker const int8_t* i1 = input[1];
1122*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
1123*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
1124*4bdc9457SAndroid Build Coastguard Worker i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1125*4bdc9457SAndroid Build Coastguard Worker }
1126*4bdc9457SAndroid Build Coastguard Worker const int8_t* i2 = input[2];
1127*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
1128*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
1129*4bdc9457SAndroid Build Coastguard Worker i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1130*4bdc9457SAndroid Build Coastguard Worker }
1131*4bdc9457SAndroid Build Coastguard Worker const int8_t* i3 = input[3];
1132*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
1133*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
1134*4bdc9457SAndroid Build Coastguard Worker i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
1135*4bdc9457SAndroid Build Coastguard Worker }
1136*4bdc9457SAndroid Build Coastguard Worker const int8_t* i4 = input[4];
1137*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
1138*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
1139*4bdc9457SAndroid Build Coastguard Worker i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
1140*4bdc9457SAndroid Build Coastguard Worker }
1141*4bdc9457SAndroid Build Coastguard Worker const int8_t* i5 = input[5];
1142*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
1143*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
1144*4bdc9457SAndroid Build Coastguard Worker i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
1145*4bdc9457SAndroid Build Coastguard Worker }
1146*4bdc9457SAndroid Build Coastguard Worker const int8_t* i6 = input[6];
1147*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
1148*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
1149*4bdc9457SAndroid Build Coastguard Worker i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
1150*4bdc9457SAndroid Build Coastguard Worker }
1151*4bdc9457SAndroid Build Coastguard Worker const int8_t* i7 = input[7];
1152*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
1153*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
1154*4bdc9457SAndroid Build Coastguard Worker i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
1155*4bdc9457SAndroid Build Coastguard Worker }
1156*4bdc9457SAndroid Build Coastguard Worker const int8_t* i8 = input[8];
1157*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
1158*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
1159*4bdc9457SAndroid Build Coastguard Worker i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
1160*4bdc9457SAndroid Build Coastguard Worker }
1161*4bdc9457SAndroid Build Coastguard Worker input = (const int8_t**) ((uintptr_t) input + input_stride);
1162*4bdc9457SAndroid Build Coastguard Worker
1163*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1164*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
1165*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
1166*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
1167*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
1168*4bdc9457SAndroid Build Coastguard Worker
1169*4bdc9457SAndroid Build Coastguard Worker
1170*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
1171*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
1172*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
1173*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
1174*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
1175*4bdc9457SAndroid Build Coastguard Worker
1176*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
1177*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
1178*4bdc9457SAndroid Build Coastguard Worker
1179*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
1180*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
1181*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
1182*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
1183*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
1184*4bdc9457SAndroid Build Coastguard Worker
1185*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
1186*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
1187*4bdc9457SAndroid Build Coastguard Worker
1188*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
1189*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
1190*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
1191*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
1192*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
1193*4bdc9457SAndroid Build Coastguard Worker
1194*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
1195*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
1196*4bdc9457SAndroid Build Coastguard Worker
1197*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
1198*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t))));
1199*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
1200*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t))));
1201*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
1202*4bdc9457SAndroid Build Coastguard Worker
1203*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
1204*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
1205*4bdc9457SAndroid Build Coastguard Worker
1206*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
1207*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t))));
1208*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
1209*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t))));
1210*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
1211*4bdc9457SAndroid Build Coastguard Worker
1212*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
1213*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
1214*4bdc9457SAndroid Build Coastguard Worker
1215*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
1216*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t))));
1217*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
1218*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t))));
1219*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
1220*4bdc9457SAndroid Build Coastguard Worker
1221*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
1222*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
1223*4bdc9457SAndroid Build Coastguard Worker
1224*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
1225*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t))));
1226*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
1227*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t))));
1228*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
1229*4bdc9457SAndroid Build Coastguard Worker
1230*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
1231*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
1232*4bdc9457SAndroid Build Coastguard Worker
1233*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
1234*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t))));
1235*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
1236*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t))));
1237*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
1238*4bdc9457SAndroid Build Coastguard Worker
1239*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
1240*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
1241*4bdc9457SAndroid Build Coastguard Worker
1242*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
1243*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t))));
1244*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
1245*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t))));
1246*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
1247*4bdc9457SAndroid Build Coastguard Worker
1248*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
1249*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
1250*4bdc9457SAndroid Build Coastguard Worker
1251*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t));
1252*4bdc9457SAndroid Build Coastguard Worker
1253*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
1254*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
1255*4bdc9457SAndroid Build Coastguard Worker
1256*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps(w);
1257*4bdc9457SAndroid Build Coastguard Worker const __m512 vscaleGHIJKLMNOPQRSTUV = _mm512_loadu_ps((const void*) ((uintptr_t) w + 16 * sizeof(float)));
1258*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(float));
1259*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
1260*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscaleGHIJKLMNOPQRSTUV);
1261*4bdc9457SAndroid Build Coastguard Worker
1262*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
1263*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
1264*4bdc9457SAndroid Build Coastguard Worker
1265*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
1266*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
1267*4bdc9457SAndroid Build Coastguard Worker
1268*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
1269*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
1270*4bdc9457SAndroid Build Coastguard Worker
1271*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
1272*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
1273*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
1274*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
1275*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
1276*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
1277*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
1278*4bdc9457SAndroid Build Coastguard Worker
1279*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
1280*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
1281*4bdc9457SAndroid Build Coastguard Worker
1282*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
1283*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
1284*4bdc9457SAndroid Build Coastguard Worker output += 32;
1285*4bdc9457SAndroid Build Coastguard Worker }
1286*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1287*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
1288*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
1289*4bdc9457SAndroid Build Coastguard Worker const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
1290*4bdc9457SAndroid Build Coastguard Worker do {
1291*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
1292*4bdc9457SAndroid Build Coastguard Worker
1293*4bdc9457SAndroid Build Coastguard Worker
1294*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
1295*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k));
1296*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1297*4bdc9457SAndroid Build Coastguard Worker
1298*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
1299*4bdc9457SAndroid Build Coastguard Worker
1300*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
1301*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32)));
1302*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1303*4bdc9457SAndroid Build Coastguard Worker
1304*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
1305*4bdc9457SAndroid Build Coastguard Worker
1306*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
1307*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64)));
1308*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
1309*4bdc9457SAndroid Build Coastguard Worker
1310*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
1311*4bdc9457SAndroid Build Coastguard Worker
1312*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
1313*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 96)));
1314*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
1315*4bdc9457SAndroid Build Coastguard Worker
1316*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
1317*4bdc9457SAndroid Build Coastguard Worker
1318*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
1319*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 128)));
1320*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
1321*4bdc9457SAndroid Build Coastguard Worker
1322*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
1323*4bdc9457SAndroid Build Coastguard Worker
1324*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
1325*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 160)));
1326*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
1327*4bdc9457SAndroid Build Coastguard Worker
1328*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
1329*4bdc9457SAndroid Build Coastguard Worker
1330*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
1331*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 192)));
1332*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
1333*4bdc9457SAndroid Build Coastguard Worker
1334*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
1335*4bdc9457SAndroid Build Coastguard Worker
1336*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
1337*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 224)));
1338*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
1339*4bdc9457SAndroid Build Coastguard Worker
1340*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
1341*4bdc9457SAndroid Build Coastguard Worker
1342*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
1343*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 256)));
1344*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
1345*4bdc9457SAndroid Build Coastguard Worker
1346*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
1347*4bdc9457SAndroid Build Coastguard Worker
1348*4bdc9457SAndroid Build Coastguard Worker k += 16;
1349*4bdc9457SAndroid Build Coastguard Worker
1350*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
1351*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps((const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t)));
1352*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF);
1353*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
1354*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
1355*4bdc9457SAndroid Build Coastguard Worker
1356*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
1357*4bdc9457SAndroid Build Coastguard Worker
1358*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
1359*4bdc9457SAndroid Build Coastguard Worker
1360*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
1361*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
1362*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
1363*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
1364*4bdc9457SAndroid Build Coastguard Worker
1365*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
1366*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
1367*4bdc9457SAndroid Build Coastguard Worker output += 16;
1368*4bdc9457SAndroid Build Coastguard Worker c -= 16;
1369*4bdc9457SAndroid Build Coastguard Worker } else {
1370*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
1371*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + c);
1372*4bdc9457SAndroid Build Coastguard Worker c = 0;
1373*4bdc9457SAndroid Build Coastguard Worker }
1374*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
1375*4bdc9457SAndroid Build Coastguard Worker }
1376*4bdc9457SAndroid Build Coastguard Worker
1377*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + output_increment);
1378*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
1379*4bdc9457SAndroid Build Coastguard Worker }
1380*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1381*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(
1382*4bdc9457SAndroid Build Coastguard Worker size_t mr,
1383*4bdc9457SAndroid Build Coastguard Worker size_t nc,
1384*4bdc9457SAndroid Build Coastguard Worker size_t kc,
1385*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a,
1386*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
1387*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
1388*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
1389*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
1390*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
1391*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1392*4bdc9457SAndroid Build Coastguard Worker {
1393*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
1394*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
1395*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
1396*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
1397*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
1398*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
1399*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
1400*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
1401*4bdc9457SAndroid Build Coastguard Worker
1402*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
1403*4bdc9457SAndroid Build Coastguard Worker const int8_t* a0 = a;
1404*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
1405*4bdc9457SAndroid Build Coastguard Worker
1406*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
1407*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1408*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
1409*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
1410*4bdc9457SAndroid Build Coastguard Worker do {
1411*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
1412*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
1413*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
1414*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
1415*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
1416*4bdc9457SAndroid Build Coastguard Worker
1417*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
1418*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
1419*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
1420*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
1421*4bdc9457SAndroid Build Coastguard Worker
1422*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
1423*4bdc9457SAndroid Build Coastguard Worker
1424*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
1425*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
1426*4bdc9457SAndroid Build Coastguard Worker
1427*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
1428*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
1429*4bdc9457SAndroid Build Coastguard Worker
1430*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
1431*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
1432*4bdc9457SAndroid Build Coastguard Worker
1433*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
1434*4bdc9457SAndroid Build Coastguard Worker
1435*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
1436*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
1437*4bdc9457SAndroid Build Coastguard Worker }
1438*4bdc9457SAndroid Build Coastguard Worker
1439*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
1440*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
1441*4bdc9457SAndroid Build Coastguard Worker
1442*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
1443*4bdc9457SAndroid Build Coastguard Worker
1444*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
1445*4bdc9457SAndroid Build Coastguard Worker
1446*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale012345678ABCDEF = _mm512_load_ps(w);
1447*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 16);
1448*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale084C195D2A6E3B7F = _mm512_permutexvar_ps(_mm512_set_epi32(15, 7, 11, 3, 14, 6, 10, 2, 13, 5, 9, 1, 12, 4, 8, 0), vscale012345678ABCDEF);
1449*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1450*4bdc9457SAndroid Build Coastguard Worker
1451*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
1452*4bdc9457SAndroid Build Coastguard Worker
1453*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
1454*4bdc9457SAndroid Build Coastguard Worker
1455*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
1456*4bdc9457SAndroid Build Coastguard Worker
1457*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
1458*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
1459*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
1460*4bdc9457SAndroid Build Coastguard Worker
1461*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
1462*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
1463*4bdc9457SAndroid Build Coastguard Worker
1464*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 - k);
1465*4bdc9457SAndroid Build Coastguard Worker
1466*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
1467*4bdc9457SAndroid Build Coastguard Worker
1468*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
1469*4bdc9457SAndroid Build Coastguard Worker } else {
1470*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
1471*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
1472*4bdc9457SAndroid Build Coastguard Worker
1473*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
1474*4bdc9457SAndroid Build Coastguard Worker
1475*4bdc9457SAndroid Build Coastguard Worker nc = 0;
1476*4bdc9457SAndroid Build Coastguard Worker }
1477*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
1478*4bdc9457SAndroid Build Coastguard Worker }
1479*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1480*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(
1481*4bdc9457SAndroid Build Coastguard Worker size_t mr,
1482*4bdc9457SAndroid Build Coastguard Worker size_t nc,
1483*4bdc9457SAndroid Build Coastguard Worker size_t kc,
1484*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a,
1485*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
1486*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
1487*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
1488*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
1489*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
1490*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1491*4bdc9457SAndroid Build Coastguard Worker {
1492*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
1493*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
1494*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
1495*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
1496*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
1497*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
1498*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
1499*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
1500*4bdc9457SAndroid Build Coastguard Worker
1501*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
1502*4bdc9457SAndroid Build Coastguard Worker const int8_t* a0 = a;
1503*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
1504*4bdc9457SAndroid Build Coastguard Worker const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
1505*4bdc9457SAndroid Build Coastguard Worker int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
1506*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
1507*4bdc9457SAndroid Build Coastguard Worker a1 = a0;
1508*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
1509*4bdc9457SAndroid Build Coastguard Worker }
1510*4bdc9457SAndroid Build Coastguard Worker const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
1511*4bdc9457SAndroid Build Coastguard Worker int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
1512*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
1513*4bdc9457SAndroid Build Coastguard Worker a2 = a1;
1514*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
1515*4bdc9457SAndroid Build Coastguard Worker }
1516*4bdc9457SAndroid Build Coastguard Worker const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
1517*4bdc9457SAndroid Build Coastguard Worker int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
1518*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
1519*4bdc9457SAndroid Build Coastguard Worker a3 = a2;
1520*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
1521*4bdc9457SAndroid Build Coastguard Worker }
1522*4bdc9457SAndroid Build Coastguard Worker
1523*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
1524*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1525*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
1526*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
1527*4bdc9457SAndroid Build Coastguard Worker do {
1528*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
1529*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
1530*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
1531*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
1532*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
1533*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
1534*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
1535*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
1536*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
1537*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
1538*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
1539*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
1540*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
1541*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
1542*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
1543*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
1544*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
1545*4bdc9457SAndroid Build Coastguard Worker
1546*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
1547*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
1548*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
1549*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
1550*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
1551*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
1552*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
1553*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
1554*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
1555*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
1556*4bdc9457SAndroid Build Coastguard Worker
1557*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
1558*4bdc9457SAndroid Build Coastguard Worker
1559*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
1560*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
1561*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
1562*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
1563*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
1564*4bdc9457SAndroid Build Coastguard Worker
1565*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
1566*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
1567*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
1568*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
1569*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
1570*4bdc9457SAndroid Build Coastguard Worker
1571*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
1572*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
1573*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
1574*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
1575*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
1576*4bdc9457SAndroid Build Coastguard Worker
1577*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
1578*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
1579*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
1580*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
1581*4bdc9457SAndroid Build Coastguard Worker
1582*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
1583*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
1584*4bdc9457SAndroid Build Coastguard Worker }
1585*4bdc9457SAndroid Build Coastguard Worker
1586*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
1587*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
1588*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
1589*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
1590*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
1591*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
1592*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
1593*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
1594*4bdc9457SAndroid Build Coastguard Worker
1595*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
1596*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
1597*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
1598*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
1599*4bdc9457SAndroid Build Coastguard Worker
1600*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
1601*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
1602*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
1603*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
1604*4bdc9457SAndroid Build Coastguard Worker
1605*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale012345678ABCDEF = _mm512_load_ps(w);
1606*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 16);
1607*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale084C195D2A6E3B7F = _mm512_permutexvar_ps(_mm512_set_epi32(15, 7, 11, 3, 14, 6, 10, 2, 13, 5, 9, 1, 12, 4, 8, 0), vscale012345678ABCDEF);
1608*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1609*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1610*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1611*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1612*4bdc9457SAndroid Build Coastguard Worker
1613*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
1614*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
1615*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
1616*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
1617*4bdc9457SAndroid Build Coastguard Worker
1618*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
1619*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
1620*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
1621*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
1622*4bdc9457SAndroid Build Coastguard Worker
1623*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
1624*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
1625*4bdc9457SAndroid Build Coastguard Worker
1626*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
1627*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
1628*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
1629*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
1630*4bdc9457SAndroid Build Coastguard Worker
1631*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
1632*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
1633*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
1634*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
1635*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
1636*4bdc9457SAndroid Build Coastguard Worker
1637*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 - k);
1638*4bdc9457SAndroid Build Coastguard Worker a1 = (const int8_t*) ((uintptr_t) a1 - k);
1639*4bdc9457SAndroid Build Coastguard Worker a2 = (const int8_t*) ((uintptr_t) a2 - k);
1640*4bdc9457SAndroid Build Coastguard Worker a3 = (const int8_t*) ((uintptr_t) a3 - k);
1641*4bdc9457SAndroid Build Coastguard Worker
1642*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
1643*4bdc9457SAndroid Build Coastguard Worker c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
1644*4bdc9457SAndroid Build Coastguard Worker c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
1645*4bdc9457SAndroid Build Coastguard Worker c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
1646*4bdc9457SAndroid Build Coastguard Worker
1647*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
1648*4bdc9457SAndroid Build Coastguard Worker } else {
1649*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
1650*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
1651*4bdc9457SAndroid Build Coastguard Worker
1652*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
1653*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
1654*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
1655*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
1656*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
1657*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
1658*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
1659*4bdc9457SAndroid Build Coastguard Worker
1660*4bdc9457SAndroid Build Coastguard Worker nc = 0;
1661*4bdc9457SAndroid Build Coastguard Worker }
1662*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
1663*4bdc9457SAndroid Build Coastguard Worker }
1664*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1665*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(
1666*4bdc9457SAndroid Build Coastguard Worker size_t mr,
1667*4bdc9457SAndroid Build Coastguard Worker size_t nc,
1668*4bdc9457SAndroid Build Coastguard Worker size_t kc,
1669*4bdc9457SAndroid Build Coastguard Worker size_t ks,
1670*4bdc9457SAndroid Build Coastguard Worker const int8_t** restrict a,
1671*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
1672*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
1673*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
1674*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
1675*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
1676*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
1677*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1678*4bdc9457SAndroid Build Coastguard Worker {
1679*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
1680*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
1681*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
1682*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
1683*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
1684*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
1685*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
1686*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
1687*4bdc9457SAndroid Build Coastguard Worker
1688*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
1689*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
1690*4bdc9457SAndroid Build Coastguard Worker
1691*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
1692*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1693*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
1694*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
1695*4bdc9457SAndroid Build Coastguard Worker do {
1696*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
1697*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
1698*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
1699*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
1700*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
1701*4bdc9457SAndroid Build Coastguard Worker
1702*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
1703*4bdc9457SAndroid Build Coastguard Worker do {
1704*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a0 = a[0];
1705*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
1706*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
1707*4bdc9457SAndroid Build Coastguard Worker }
1708*4bdc9457SAndroid Build Coastguard Worker a += 1;
1709*4bdc9457SAndroid Build Coastguard Worker
1710*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
1711*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
1712*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
1713*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
1714*4bdc9457SAndroid Build Coastguard Worker
1715*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
1716*4bdc9457SAndroid Build Coastguard Worker
1717*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
1718*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
1719*4bdc9457SAndroid Build Coastguard Worker
1720*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
1721*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
1722*4bdc9457SAndroid Build Coastguard Worker
1723*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
1724*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
1725*4bdc9457SAndroid Build Coastguard Worker
1726*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
1727*4bdc9457SAndroid Build Coastguard Worker
1728*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
1729*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
1730*4bdc9457SAndroid Build Coastguard Worker }
1731*4bdc9457SAndroid Build Coastguard Worker p -= 1 * sizeof(void*);
1732*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
1733*4bdc9457SAndroid Build Coastguard Worker
1734*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
1735*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
1736*4bdc9457SAndroid Build Coastguard Worker
1737*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
1738*4bdc9457SAndroid Build Coastguard Worker
1739*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
1740*4bdc9457SAndroid Build Coastguard Worker
1741*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale012345678ABCDEF = _mm512_load_ps(w);
1742*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 16);
1743*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale084C195D2A6E3B7F = _mm512_permutexvar_ps(_mm512_set_epi32(15, 7, 11, 3, 14, 6, 10, 2, 13, 5, 9, 1, 12, 4, 8, 0), vscale012345678ABCDEF);
1744*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1745*4bdc9457SAndroid Build Coastguard Worker
1746*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
1747*4bdc9457SAndroid Build Coastguard Worker
1748*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
1749*4bdc9457SAndroid Build Coastguard Worker
1750*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
1751*4bdc9457SAndroid Build Coastguard Worker
1752*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
1753*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
1754*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
1755*4bdc9457SAndroid Build Coastguard Worker
1756*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
1757*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
1758*4bdc9457SAndroid Build Coastguard Worker
1759*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
1760*4bdc9457SAndroid Build Coastguard Worker
1761*4bdc9457SAndroid Build Coastguard Worker a = (const int8_t**restrict) ((uintptr_t) a - ks);
1762*4bdc9457SAndroid Build Coastguard Worker
1763*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
1764*4bdc9457SAndroid Build Coastguard Worker } else {
1765*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
1766*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
1767*4bdc9457SAndroid Build Coastguard Worker
1768*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
1769*4bdc9457SAndroid Build Coastguard Worker
1770*4bdc9457SAndroid Build Coastguard Worker nc = 0;
1771*4bdc9457SAndroid Build Coastguard Worker }
1772*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
1773*4bdc9457SAndroid Build Coastguard Worker }
1774*4bdc9457SAndroid Build Coastguard Worker
xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1775*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(
1776*4bdc9457SAndroid Build Coastguard Worker size_t mr,
1777*4bdc9457SAndroid Build Coastguard Worker size_t nc,
1778*4bdc9457SAndroid Build Coastguard Worker size_t kc,
1779*4bdc9457SAndroid Build Coastguard Worker size_t ks,
1780*4bdc9457SAndroid Build Coastguard Worker const int8_t** restrict a,
1781*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
1782*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
1783*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
1784*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
1785*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
1786*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
1787*4bdc9457SAndroid Build Coastguard Worker const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1788*4bdc9457SAndroid Build Coastguard Worker {
1789*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
1790*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
1791*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
1792*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
1793*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
1794*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
1795*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
1796*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
1797*4bdc9457SAndroid Build Coastguard Worker
1798*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
1799*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
1800*4bdc9457SAndroid Build Coastguard Worker int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
1801*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
1802*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
1803*4bdc9457SAndroid Build Coastguard Worker }
1804*4bdc9457SAndroid Build Coastguard Worker int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
1805*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
1806*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
1807*4bdc9457SAndroid Build Coastguard Worker }
1808*4bdc9457SAndroid Build Coastguard Worker int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
1809*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
1810*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
1811*4bdc9457SAndroid Build Coastguard Worker }
1812*4bdc9457SAndroid Build Coastguard Worker
1813*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
1814*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1815*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
1816*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
1817*4bdc9457SAndroid Build Coastguard Worker do {
1818*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
1819*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
1820*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
1821*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
1822*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
1823*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
1824*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
1825*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
1826*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
1827*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
1828*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
1829*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
1830*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
1831*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
1832*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
1833*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
1834*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
1835*4bdc9457SAndroid Build Coastguard Worker
1836*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
1837*4bdc9457SAndroid Build Coastguard Worker do {
1838*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a0 = a[0];
1839*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
1840*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
1841*4bdc9457SAndroid Build Coastguard Worker }
1842*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a1 = a[1];
1843*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a1 != zero) {
1844*4bdc9457SAndroid Build Coastguard Worker a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
1845*4bdc9457SAndroid Build Coastguard Worker }
1846*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a2 = a[2];
1847*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a2 != zero) {
1848*4bdc9457SAndroid Build Coastguard Worker a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
1849*4bdc9457SAndroid Build Coastguard Worker }
1850*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a3 = a[3];
1851*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a3 != zero) {
1852*4bdc9457SAndroid Build Coastguard Worker a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
1853*4bdc9457SAndroid Build Coastguard Worker }
1854*4bdc9457SAndroid Build Coastguard Worker a += 4;
1855*4bdc9457SAndroid Build Coastguard Worker
1856*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
1857*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
1858*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
1859*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
1860*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
1861*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
1862*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
1863*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
1864*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
1865*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
1866*4bdc9457SAndroid Build Coastguard Worker
1867*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
1868*4bdc9457SAndroid Build Coastguard Worker
1869*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
1870*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
1871*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
1872*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
1873*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
1874*4bdc9457SAndroid Build Coastguard Worker
1875*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
1876*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
1877*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
1878*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
1879*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
1880*4bdc9457SAndroid Build Coastguard Worker
1881*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
1882*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
1883*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
1884*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
1885*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
1886*4bdc9457SAndroid Build Coastguard Worker
1887*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
1888*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
1889*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
1890*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
1891*4bdc9457SAndroid Build Coastguard Worker
1892*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
1893*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
1894*4bdc9457SAndroid Build Coastguard Worker }
1895*4bdc9457SAndroid Build Coastguard Worker p -= 4 * sizeof(void*);
1896*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
1897*4bdc9457SAndroid Build Coastguard Worker
1898*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
1899*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
1900*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
1901*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
1902*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
1903*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
1904*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
1905*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
1906*4bdc9457SAndroid Build Coastguard Worker
1907*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
1908*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
1909*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
1910*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
1911*4bdc9457SAndroid Build Coastguard Worker
1912*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
1913*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
1914*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
1915*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
1916*4bdc9457SAndroid Build Coastguard Worker
1917*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale012345678ABCDEF = _mm512_load_ps(w);
1918*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 16);
1919*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale084C195D2A6E3B7F = _mm512_permutexvar_ps(_mm512_set_epi32(15, 7, 11, 3, 14, 6, 10, 2, 13, 5, 9, 1, 12, 4, 8, 0), vscale012345678ABCDEF);
1920*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1921*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1922*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1923*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale084C195D2A6E3B7F);
1924*4bdc9457SAndroid Build Coastguard Worker
1925*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
1926*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
1927*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
1928*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
1929*4bdc9457SAndroid Build Coastguard Worker
1930*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
1931*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
1932*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
1933*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
1934*4bdc9457SAndroid Build Coastguard Worker
1935*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
1936*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
1937*4bdc9457SAndroid Build Coastguard Worker
1938*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
1939*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
1940*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
1941*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
1942*4bdc9457SAndroid Build Coastguard Worker
1943*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
1944*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
1945*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
1946*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
1947*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
1948*4bdc9457SAndroid Build Coastguard Worker
1949*4bdc9457SAndroid Build Coastguard Worker c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
1950*4bdc9457SAndroid Build Coastguard Worker c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
1951*4bdc9457SAndroid Build Coastguard Worker c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
1952*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
1953*4bdc9457SAndroid Build Coastguard Worker
1954*4bdc9457SAndroid Build Coastguard Worker a = (const int8_t**restrict) ((uintptr_t) a - ks);
1955*4bdc9457SAndroid Build Coastguard Worker
1956*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
1957*4bdc9457SAndroid Build Coastguard Worker } else {
1958*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
1959*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << (nc + 48)) - (UINT64_C(1) << 48)));
1960*4bdc9457SAndroid Build Coastguard Worker
1961*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
1962*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
1963*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
1964*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
1965*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
1966*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
1967*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
1968*4bdc9457SAndroid Build Coastguard Worker
1969*4bdc9457SAndroid Build Coastguard Worker nc = 0;
1970*4bdc9457SAndroid Build Coastguard Worker }
1971*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
1972*4bdc9457SAndroid Build Coastguard Worker }
1973*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1974*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(
1975*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1976*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
1977*4bdc9457SAndroid Build Coastguard Worker const int8_t** input,
1978*4bdc9457SAndroid Build Coastguard Worker const void* weights,
1979*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
1980*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1981*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
1982*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
1983*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
1984*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
1985*4bdc9457SAndroid Build Coastguard Worker {
1986*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1987*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
1988*4bdc9457SAndroid Build Coastguard Worker
1989*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
1990*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
1991*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
1992*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
1993*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
1994*4bdc9457SAndroid Build Coastguard Worker
1995*4bdc9457SAndroid Build Coastguard Worker do {
1996*4bdc9457SAndroid Build Coastguard Worker const int8_t* i0 = input[0];
1997*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
1998*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
1999*4bdc9457SAndroid Build Coastguard Worker i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2000*4bdc9457SAndroid Build Coastguard Worker }
2001*4bdc9457SAndroid Build Coastguard Worker const int8_t* i1 = input[1];
2002*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
2003*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
2004*4bdc9457SAndroid Build Coastguard Worker i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2005*4bdc9457SAndroid Build Coastguard Worker }
2006*4bdc9457SAndroid Build Coastguard Worker const int8_t* i2 = input[2];
2007*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
2008*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
2009*4bdc9457SAndroid Build Coastguard Worker i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2010*4bdc9457SAndroid Build Coastguard Worker }
2011*4bdc9457SAndroid Build Coastguard Worker const int8_t* i3 = input[3];
2012*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
2013*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
2014*4bdc9457SAndroid Build Coastguard Worker i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2015*4bdc9457SAndroid Build Coastguard Worker }
2016*4bdc9457SAndroid Build Coastguard Worker const int8_t* i4 = input[4];
2017*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
2018*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
2019*4bdc9457SAndroid Build Coastguard Worker i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2020*4bdc9457SAndroid Build Coastguard Worker }
2021*4bdc9457SAndroid Build Coastguard Worker const int8_t* i5 = input[5];
2022*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
2023*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
2024*4bdc9457SAndroid Build Coastguard Worker i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2025*4bdc9457SAndroid Build Coastguard Worker }
2026*4bdc9457SAndroid Build Coastguard Worker const int8_t* i6 = input[6];
2027*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
2028*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
2029*4bdc9457SAndroid Build Coastguard Worker i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2030*4bdc9457SAndroid Build Coastguard Worker }
2031*4bdc9457SAndroid Build Coastguard Worker const int8_t* i7 = input[7];
2032*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
2033*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
2034*4bdc9457SAndroid Build Coastguard Worker i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2035*4bdc9457SAndroid Build Coastguard Worker }
2036*4bdc9457SAndroid Build Coastguard Worker const int8_t* i8 = input[8];
2037*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
2038*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
2039*4bdc9457SAndroid Build Coastguard Worker i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2040*4bdc9457SAndroid Build Coastguard Worker }
2041*4bdc9457SAndroid Build Coastguard Worker const int8_t* i9 = input[9];
2042*4bdc9457SAndroid Build Coastguard Worker assert(i9 != NULL);
2043*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i9 != zero) {
2044*4bdc9457SAndroid Build Coastguard Worker i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2045*4bdc9457SAndroid Build Coastguard Worker }
2046*4bdc9457SAndroid Build Coastguard Worker const int8_t* i10 = input[10];
2047*4bdc9457SAndroid Build Coastguard Worker assert(i10 != NULL);
2048*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i10 != zero) {
2049*4bdc9457SAndroid Build Coastguard Worker i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2050*4bdc9457SAndroid Build Coastguard Worker }
2051*4bdc9457SAndroid Build Coastguard Worker const int8_t* i11 = input[11];
2052*4bdc9457SAndroid Build Coastguard Worker assert(i11 != NULL);
2053*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i11 != zero) {
2054*4bdc9457SAndroid Build Coastguard Worker i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2055*4bdc9457SAndroid Build Coastguard Worker }
2056*4bdc9457SAndroid Build Coastguard Worker const int8_t* i12 = input[12];
2057*4bdc9457SAndroid Build Coastguard Worker assert(i12 != NULL);
2058*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i12 != zero) {
2059*4bdc9457SAndroid Build Coastguard Worker i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2060*4bdc9457SAndroid Build Coastguard Worker }
2061*4bdc9457SAndroid Build Coastguard Worker const int8_t* i13 = input[13];
2062*4bdc9457SAndroid Build Coastguard Worker assert(i13 != NULL);
2063*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i13 != zero) {
2064*4bdc9457SAndroid Build Coastguard Worker i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2065*4bdc9457SAndroid Build Coastguard Worker }
2066*4bdc9457SAndroid Build Coastguard Worker const int8_t* i14 = input[14];
2067*4bdc9457SAndroid Build Coastguard Worker assert(i14 != NULL);
2068*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i14 != zero) {
2069*4bdc9457SAndroid Build Coastguard Worker i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
2070*4bdc9457SAndroid Build Coastguard Worker }
2071*4bdc9457SAndroid Build Coastguard Worker const int8_t* i15 = input[15];
2072*4bdc9457SAndroid Build Coastguard Worker assert(i15 != NULL);
2073*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i15 != zero) {
2074*4bdc9457SAndroid Build Coastguard Worker i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
2075*4bdc9457SAndroid Build Coastguard Worker }
2076*4bdc9457SAndroid Build Coastguard Worker const int8_t* i16 = input[16];
2077*4bdc9457SAndroid Build Coastguard Worker assert(i16 != NULL);
2078*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i16 != zero) {
2079*4bdc9457SAndroid Build Coastguard Worker i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
2080*4bdc9457SAndroid Build Coastguard Worker }
2081*4bdc9457SAndroid Build Coastguard Worker const int8_t* i17 = input[17];
2082*4bdc9457SAndroid Build Coastguard Worker assert(i17 != NULL);
2083*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i17 != zero) {
2084*4bdc9457SAndroid Build Coastguard Worker i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
2085*4bdc9457SAndroid Build Coastguard Worker }
2086*4bdc9457SAndroid Build Coastguard Worker const int8_t* i18 = input[18];
2087*4bdc9457SAndroid Build Coastguard Worker assert(i18 != NULL);
2088*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i18 != zero) {
2089*4bdc9457SAndroid Build Coastguard Worker i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
2090*4bdc9457SAndroid Build Coastguard Worker }
2091*4bdc9457SAndroid Build Coastguard Worker const int8_t* i19 = input[19];
2092*4bdc9457SAndroid Build Coastguard Worker assert(i19 != NULL);
2093*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i19 != zero) {
2094*4bdc9457SAndroid Build Coastguard Worker i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
2095*4bdc9457SAndroid Build Coastguard Worker }
2096*4bdc9457SAndroid Build Coastguard Worker const int8_t* i20 = input[20];
2097*4bdc9457SAndroid Build Coastguard Worker assert(i20 != NULL);
2098*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i20 != zero) {
2099*4bdc9457SAndroid Build Coastguard Worker i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
2100*4bdc9457SAndroid Build Coastguard Worker }
2101*4bdc9457SAndroid Build Coastguard Worker const int8_t* i21 = input[21];
2102*4bdc9457SAndroid Build Coastguard Worker assert(i21 != NULL);
2103*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i21 != zero) {
2104*4bdc9457SAndroid Build Coastguard Worker i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
2105*4bdc9457SAndroid Build Coastguard Worker }
2106*4bdc9457SAndroid Build Coastguard Worker const int8_t* i22 = input[22];
2107*4bdc9457SAndroid Build Coastguard Worker assert(i22 != NULL);
2108*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i22 != zero) {
2109*4bdc9457SAndroid Build Coastguard Worker i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
2110*4bdc9457SAndroid Build Coastguard Worker }
2111*4bdc9457SAndroid Build Coastguard Worker const int8_t* i23 = input[23];
2112*4bdc9457SAndroid Build Coastguard Worker assert(i23 != NULL);
2113*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i23 != zero) {
2114*4bdc9457SAndroid Build Coastguard Worker i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
2115*4bdc9457SAndroid Build Coastguard Worker }
2116*4bdc9457SAndroid Build Coastguard Worker const int8_t* i24 = input[24];
2117*4bdc9457SAndroid Build Coastguard Worker assert(i24 != NULL);
2118*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i24 != zero) {
2119*4bdc9457SAndroid Build Coastguard Worker i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
2120*4bdc9457SAndroid Build Coastguard Worker }
2121*4bdc9457SAndroid Build Coastguard Worker input = (const int8_t**) ((uintptr_t) input + input_stride);
2122*4bdc9457SAndroid Build Coastguard Worker
2123*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
2124*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
2125*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
2126*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
2127*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
2128*4bdc9457SAndroid Build Coastguard Worker
2129*4bdc9457SAndroid Build Coastguard Worker
2130*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
2131*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2132*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
2133*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2134*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
2135*4bdc9457SAndroid Build Coastguard Worker
2136*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
2137*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
2138*4bdc9457SAndroid Build Coastguard Worker
2139*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
2140*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2141*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
2142*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
2143*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
2144*4bdc9457SAndroid Build Coastguard Worker
2145*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
2146*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
2147*4bdc9457SAndroid Build Coastguard Worker
2148*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
2149*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
2150*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
2151*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
2152*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
2153*4bdc9457SAndroid Build Coastguard Worker
2154*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
2155*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
2156*4bdc9457SAndroid Build Coastguard Worker
2157*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
2158*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t))));
2159*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
2160*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t))));
2161*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
2162*4bdc9457SAndroid Build Coastguard Worker
2163*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
2164*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
2165*4bdc9457SAndroid Build Coastguard Worker
2166*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
2167*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t))));
2168*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
2169*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t))));
2170*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
2171*4bdc9457SAndroid Build Coastguard Worker
2172*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
2173*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
2174*4bdc9457SAndroid Build Coastguard Worker
2175*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
2176*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t))));
2177*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
2178*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t))));
2179*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
2180*4bdc9457SAndroid Build Coastguard Worker
2181*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
2182*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
2183*4bdc9457SAndroid Build Coastguard Worker
2184*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
2185*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t))));
2186*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
2187*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t))));
2188*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
2189*4bdc9457SAndroid Build Coastguard Worker
2190*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
2191*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
2192*4bdc9457SAndroid Build Coastguard Worker
2193*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
2194*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t))));
2195*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
2196*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t))));
2197*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
2198*4bdc9457SAndroid Build Coastguard Worker
2199*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
2200*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
2201*4bdc9457SAndroid Build Coastguard Worker
2202*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
2203*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t))));
2204*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
2205*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t))));
2206*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
2207*4bdc9457SAndroid Build Coastguard Worker
2208*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
2209*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
2210*4bdc9457SAndroid Build Coastguard Worker
2211*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9));
2212*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t))));
2213*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i9 + 16)));
2214*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 304 * sizeof(int8_t))));
2215*4bdc9457SAndroid Build Coastguard Worker i9 += 32;
2216*4bdc9457SAndroid Build Coastguard Worker
2217*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
2218*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi9xGHIJKLMNOPQRSTUV, vk9xGHIJKLMNOPQRSTUV));
2219*4bdc9457SAndroid Build Coastguard Worker
2220*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10));
2221*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 320 * sizeof(int8_t))));
2222*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i10 + 16)));
2223*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 336 * sizeof(int8_t))));
2224*4bdc9457SAndroid Build Coastguard Worker i10 += 32;
2225*4bdc9457SAndroid Build Coastguard Worker
2226*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
2227*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi10xGHIJKLMNOPQRSTUV, vk10xGHIJKLMNOPQRSTUV));
2228*4bdc9457SAndroid Build Coastguard Worker
2229*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11));
2230*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 352 * sizeof(int8_t))));
2231*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i11 + 16)));
2232*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 368 * sizeof(int8_t))));
2233*4bdc9457SAndroid Build Coastguard Worker i11 += 32;
2234*4bdc9457SAndroid Build Coastguard Worker
2235*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
2236*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi11xGHIJKLMNOPQRSTUV, vk11xGHIJKLMNOPQRSTUV));
2237*4bdc9457SAndroid Build Coastguard Worker
2238*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12));
2239*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 384 * sizeof(int8_t))));
2240*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i12 + 16)));
2241*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 400 * sizeof(int8_t))));
2242*4bdc9457SAndroid Build Coastguard Worker i12 += 32;
2243*4bdc9457SAndroid Build Coastguard Worker
2244*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
2245*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi12xGHIJKLMNOPQRSTUV, vk12xGHIJKLMNOPQRSTUV));
2246*4bdc9457SAndroid Build Coastguard Worker
2247*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13));
2248*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 416 * sizeof(int8_t))));
2249*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i13 + 16)));
2250*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 432 * sizeof(int8_t))));
2251*4bdc9457SAndroid Build Coastguard Worker i13 += 32;
2252*4bdc9457SAndroid Build Coastguard Worker
2253*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
2254*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi13xGHIJKLMNOPQRSTUV, vk13xGHIJKLMNOPQRSTUV));
2255*4bdc9457SAndroid Build Coastguard Worker
2256*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14));
2257*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 448 * sizeof(int8_t))));
2258*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i14 + 16)));
2259*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 464 * sizeof(int8_t))));
2260*4bdc9457SAndroid Build Coastguard Worker i14 += 32;
2261*4bdc9457SAndroid Build Coastguard Worker
2262*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
2263*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi14xGHIJKLMNOPQRSTUV, vk14xGHIJKLMNOPQRSTUV));
2264*4bdc9457SAndroid Build Coastguard Worker
2265*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15));
2266*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 480 * sizeof(int8_t))));
2267*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i15 + 16)));
2268*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 496 * sizeof(int8_t))));
2269*4bdc9457SAndroid Build Coastguard Worker i15 += 32;
2270*4bdc9457SAndroid Build Coastguard Worker
2271*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
2272*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi15xGHIJKLMNOPQRSTUV, vk15xGHIJKLMNOPQRSTUV));
2273*4bdc9457SAndroid Build Coastguard Worker
2274*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16));
2275*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 512 * sizeof(int8_t))));
2276*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i16 + 16)));
2277*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 528 * sizeof(int8_t))));
2278*4bdc9457SAndroid Build Coastguard Worker i16 += 32;
2279*4bdc9457SAndroid Build Coastguard Worker
2280*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
2281*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi16xGHIJKLMNOPQRSTUV, vk16xGHIJKLMNOPQRSTUV));
2282*4bdc9457SAndroid Build Coastguard Worker
2283*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17));
2284*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 544 * sizeof(int8_t))));
2285*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i17 + 16)));
2286*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 560 * sizeof(int8_t))));
2287*4bdc9457SAndroid Build Coastguard Worker i17 += 32;
2288*4bdc9457SAndroid Build Coastguard Worker
2289*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
2290*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi17xGHIJKLMNOPQRSTUV, vk17xGHIJKLMNOPQRSTUV));
2291*4bdc9457SAndroid Build Coastguard Worker
2292*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18));
2293*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 576 * sizeof(int8_t))));
2294*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i18 + 16)));
2295*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 592 * sizeof(int8_t))));
2296*4bdc9457SAndroid Build Coastguard Worker i18 += 32;
2297*4bdc9457SAndroid Build Coastguard Worker
2298*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
2299*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi18xGHIJKLMNOPQRSTUV, vk18xGHIJKLMNOPQRSTUV));
2300*4bdc9457SAndroid Build Coastguard Worker
2301*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19));
2302*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 608 * sizeof(int8_t))));
2303*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i19 + 16)));
2304*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 624 * sizeof(int8_t))));
2305*4bdc9457SAndroid Build Coastguard Worker i19 += 32;
2306*4bdc9457SAndroid Build Coastguard Worker
2307*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
2308*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi19xGHIJKLMNOPQRSTUV, vk19xGHIJKLMNOPQRSTUV));
2309*4bdc9457SAndroid Build Coastguard Worker
2310*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20));
2311*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 640 * sizeof(int8_t))));
2312*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i20 + 16)));
2313*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 656 * sizeof(int8_t))));
2314*4bdc9457SAndroid Build Coastguard Worker i20 += 32;
2315*4bdc9457SAndroid Build Coastguard Worker
2316*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
2317*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi20xGHIJKLMNOPQRSTUV, vk20xGHIJKLMNOPQRSTUV));
2318*4bdc9457SAndroid Build Coastguard Worker
2319*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21));
2320*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 672 * sizeof(int8_t))));
2321*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i21 + 16)));
2322*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 688 * sizeof(int8_t))));
2323*4bdc9457SAndroid Build Coastguard Worker i21 += 32;
2324*4bdc9457SAndroid Build Coastguard Worker
2325*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
2326*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi21xGHIJKLMNOPQRSTUV, vk21xGHIJKLMNOPQRSTUV));
2327*4bdc9457SAndroid Build Coastguard Worker
2328*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22));
2329*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 704 * sizeof(int8_t))));
2330*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i22 + 16)));
2331*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 720 * sizeof(int8_t))));
2332*4bdc9457SAndroid Build Coastguard Worker i22 += 32;
2333*4bdc9457SAndroid Build Coastguard Worker
2334*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
2335*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi22xGHIJKLMNOPQRSTUV, vk22xGHIJKLMNOPQRSTUV));
2336*4bdc9457SAndroid Build Coastguard Worker
2337*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23));
2338*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 736 * sizeof(int8_t))));
2339*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i23 + 16)));
2340*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 752 * sizeof(int8_t))));
2341*4bdc9457SAndroid Build Coastguard Worker i23 += 32;
2342*4bdc9457SAndroid Build Coastguard Worker
2343*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
2344*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi23xGHIJKLMNOPQRSTUV, vk23xGHIJKLMNOPQRSTUV));
2345*4bdc9457SAndroid Build Coastguard Worker
2346*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24));
2347*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 768 * sizeof(int8_t))));
2348*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i24 + 16)));
2349*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 784 * sizeof(int8_t))));
2350*4bdc9457SAndroid Build Coastguard Worker i24 += 32;
2351*4bdc9457SAndroid Build Coastguard Worker
2352*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
2353*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi24xGHIJKLMNOPQRSTUV, vk24xGHIJKLMNOPQRSTUV));
2354*4bdc9457SAndroid Build Coastguard Worker
2355*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t));
2356*4bdc9457SAndroid Build Coastguard Worker
2357*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
2358*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
2359*4bdc9457SAndroid Build Coastguard Worker
2360*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
2361*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscale);
2362*4bdc9457SAndroid Build Coastguard Worker
2363*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
2364*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
2365*4bdc9457SAndroid Build Coastguard Worker
2366*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
2367*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
2368*4bdc9457SAndroid Build Coastguard Worker
2369*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
2370*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
2371*4bdc9457SAndroid Build Coastguard Worker
2372*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
2373*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
2374*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
2375*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
2376*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
2377*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
2378*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
2379*4bdc9457SAndroid Build Coastguard Worker
2380*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
2381*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
2382*4bdc9457SAndroid Build Coastguard Worker
2383*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
2384*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
2385*4bdc9457SAndroid Build Coastguard Worker output += 32;
2386*4bdc9457SAndroid Build Coastguard Worker }
2387*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
2388*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on the low 4 bits of c).
2389*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
2390*4bdc9457SAndroid Build Coastguard Worker const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
2391*4bdc9457SAndroid Build Coastguard Worker do {
2392*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
2393*4bdc9457SAndroid Build Coastguard Worker
2394*4bdc9457SAndroid Build Coastguard Worker
2395*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
2396*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k));
2397*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
2398*4bdc9457SAndroid Build Coastguard Worker
2399*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
2400*4bdc9457SAndroid Build Coastguard Worker
2401*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
2402*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32)));
2403*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
2404*4bdc9457SAndroid Build Coastguard Worker
2405*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
2406*4bdc9457SAndroid Build Coastguard Worker
2407*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
2408*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64)));
2409*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
2410*4bdc9457SAndroid Build Coastguard Worker
2411*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
2412*4bdc9457SAndroid Build Coastguard Worker
2413*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
2414*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 96)));
2415*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
2416*4bdc9457SAndroid Build Coastguard Worker
2417*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
2418*4bdc9457SAndroid Build Coastguard Worker
2419*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
2420*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 128)));
2421*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
2422*4bdc9457SAndroid Build Coastguard Worker
2423*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
2424*4bdc9457SAndroid Build Coastguard Worker
2425*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
2426*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 160)));
2427*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
2428*4bdc9457SAndroid Build Coastguard Worker
2429*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
2430*4bdc9457SAndroid Build Coastguard Worker
2431*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
2432*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 192)));
2433*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
2434*4bdc9457SAndroid Build Coastguard Worker
2435*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
2436*4bdc9457SAndroid Build Coastguard Worker
2437*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
2438*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 224)));
2439*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
2440*4bdc9457SAndroid Build Coastguard Worker
2441*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
2442*4bdc9457SAndroid Build Coastguard Worker
2443*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
2444*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 256)));
2445*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
2446*4bdc9457SAndroid Build Coastguard Worker
2447*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
2448*4bdc9457SAndroid Build Coastguard Worker
2449*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9));
2450*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 288)));
2451*4bdc9457SAndroid Build Coastguard Worker i9 += 16;
2452*4bdc9457SAndroid Build Coastguard Worker
2453*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
2454*4bdc9457SAndroid Build Coastguard Worker
2455*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10));
2456*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 320)));
2457*4bdc9457SAndroid Build Coastguard Worker i10 += 16;
2458*4bdc9457SAndroid Build Coastguard Worker
2459*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
2460*4bdc9457SAndroid Build Coastguard Worker
2461*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11));
2462*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 352)));
2463*4bdc9457SAndroid Build Coastguard Worker i11 += 16;
2464*4bdc9457SAndroid Build Coastguard Worker
2465*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
2466*4bdc9457SAndroid Build Coastguard Worker
2467*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12));
2468*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 384)));
2469*4bdc9457SAndroid Build Coastguard Worker i12 += 16;
2470*4bdc9457SAndroid Build Coastguard Worker
2471*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
2472*4bdc9457SAndroid Build Coastguard Worker
2473*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13));
2474*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 416)));
2475*4bdc9457SAndroid Build Coastguard Worker i13 += 16;
2476*4bdc9457SAndroid Build Coastguard Worker
2477*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
2478*4bdc9457SAndroid Build Coastguard Worker
2479*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14));
2480*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 448)));
2481*4bdc9457SAndroid Build Coastguard Worker i14 += 16;
2482*4bdc9457SAndroid Build Coastguard Worker
2483*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
2484*4bdc9457SAndroid Build Coastguard Worker
2485*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15));
2486*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 480)));
2487*4bdc9457SAndroid Build Coastguard Worker i15 += 16;
2488*4bdc9457SAndroid Build Coastguard Worker
2489*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
2490*4bdc9457SAndroid Build Coastguard Worker
2491*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16));
2492*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 512)));
2493*4bdc9457SAndroid Build Coastguard Worker i16 += 16;
2494*4bdc9457SAndroid Build Coastguard Worker
2495*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
2496*4bdc9457SAndroid Build Coastguard Worker
2497*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17));
2498*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 544)));
2499*4bdc9457SAndroid Build Coastguard Worker i17 += 16;
2500*4bdc9457SAndroid Build Coastguard Worker
2501*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
2502*4bdc9457SAndroid Build Coastguard Worker
2503*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18));
2504*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 576)));
2505*4bdc9457SAndroid Build Coastguard Worker i18 += 16;
2506*4bdc9457SAndroid Build Coastguard Worker
2507*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
2508*4bdc9457SAndroid Build Coastguard Worker
2509*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19));
2510*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 608)));
2511*4bdc9457SAndroid Build Coastguard Worker i19 += 16;
2512*4bdc9457SAndroid Build Coastguard Worker
2513*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
2514*4bdc9457SAndroid Build Coastguard Worker
2515*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20));
2516*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 640)));
2517*4bdc9457SAndroid Build Coastguard Worker i20 += 16;
2518*4bdc9457SAndroid Build Coastguard Worker
2519*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
2520*4bdc9457SAndroid Build Coastguard Worker
2521*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21));
2522*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 672)));
2523*4bdc9457SAndroid Build Coastguard Worker i21 += 16;
2524*4bdc9457SAndroid Build Coastguard Worker
2525*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
2526*4bdc9457SAndroid Build Coastguard Worker
2527*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22));
2528*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 704)));
2529*4bdc9457SAndroid Build Coastguard Worker i22 += 16;
2530*4bdc9457SAndroid Build Coastguard Worker
2531*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
2532*4bdc9457SAndroid Build Coastguard Worker
2533*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23));
2534*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 736)));
2535*4bdc9457SAndroid Build Coastguard Worker i23 += 16;
2536*4bdc9457SAndroid Build Coastguard Worker
2537*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
2538*4bdc9457SAndroid Build Coastguard Worker
2539*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24));
2540*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 768)));
2541*4bdc9457SAndroid Build Coastguard Worker i24 += 16;
2542*4bdc9457SAndroid Build Coastguard Worker
2543*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
2544*4bdc9457SAndroid Build Coastguard Worker
2545*4bdc9457SAndroid Build Coastguard Worker k += 16;
2546*4bdc9457SAndroid Build Coastguard Worker
2547*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
2548*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
2549*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
2550*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
2551*4bdc9457SAndroid Build Coastguard Worker
2552*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
2553*4bdc9457SAndroid Build Coastguard Worker
2554*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
2555*4bdc9457SAndroid Build Coastguard Worker
2556*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
2557*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
2558*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
2559*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
2560*4bdc9457SAndroid Build Coastguard Worker
2561*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
2562*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2563*4bdc9457SAndroid Build Coastguard Worker output += 16;
2564*4bdc9457SAndroid Build Coastguard Worker c -= 16;
2565*4bdc9457SAndroid Build Coastguard Worker } else {
2566*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
2567*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + c);
2568*4bdc9457SAndroid Build Coastguard Worker c = 0;
2569*4bdc9457SAndroid Build Coastguard Worker }
2570*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
2571*4bdc9457SAndroid Build Coastguard Worker }
2572*4bdc9457SAndroid Build Coastguard Worker
2573*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + output_increment);
2574*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
2575*4bdc9457SAndroid Build Coastguard Worker }
2576*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2577*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(
2578*4bdc9457SAndroid Build Coastguard Worker size_t channels,
2579*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
2580*4bdc9457SAndroid Build Coastguard Worker const int8_t** input,
2581*4bdc9457SAndroid Build Coastguard Worker const void* weights,
2582*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
2583*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
2584*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
2585*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
2586*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
2587*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
2588*4bdc9457SAndroid Build Coastguard Worker {
2589*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
2590*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
2591*4bdc9457SAndroid Build Coastguard Worker
2592*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
2593*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
2594*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
2595*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
2596*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
2597*4bdc9457SAndroid Build Coastguard Worker
2598*4bdc9457SAndroid Build Coastguard Worker do {
2599*4bdc9457SAndroid Build Coastguard Worker const int8_t* i0 = input[0];
2600*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
2601*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
2602*4bdc9457SAndroid Build Coastguard Worker i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2603*4bdc9457SAndroid Build Coastguard Worker }
2604*4bdc9457SAndroid Build Coastguard Worker const int8_t* i1 = input[1];
2605*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
2606*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
2607*4bdc9457SAndroid Build Coastguard Worker i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2608*4bdc9457SAndroid Build Coastguard Worker }
2609*4bdc9457SAndroid Build Coastguard Worker const int8_t* i2 = input[2];
2610*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
2611*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
2612*4bdc9457SAndroid Build Coastguard Worker i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2613*4bdc9457SAndroid Build Coastguard Worker }
2614*4bdc9457SAndroid Build Coastguard Worker const int8_t* i3 = input[3];
2615*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
2616*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
2617*4bdc9457SAndroid Build Coastguard Worker i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2618*4bdc9457SAndroid Build Coastguard Worker }
2619*4bdc9457SAndroid Build Coastguard Worker const int8_t* i4 = input[4];
2620*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
2621*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
2622*4bdc9457SAndroid Build Coastguard Worker i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2623*4bdc9457SAndroid Build Coastguard Worker }
2624*4bdc9457SAndroid Build Coastguard Worker const int8_t* i5 = input[5];
2625*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
2626*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
2627*4bdc9457SAndroid Build Coastguard Worker i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2628*4bdc9457SAndroid Build Coastguard Worker }
2629*4bdc9457SAndroid Build Coastguard Worker const int8_t* i6 = input[6];
2630*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
2631*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
2632*4bdc9457SAndroid Build Coastguard Worker i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2633*4bdc9457SAndroid Build Coastguard Worker }
2634*4bdc9457SAndroid Build Coastguard Worker const int8_t* i7 = input[7];
2635*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
2636*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
2637*4bdc9457SAndroid Build Coastguard Worker i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2638*4bdc9457SAndroid Build Coastguard Worker }
2639*4bdc9457SAndroid Build Coastguard Worker const int8_t* i8 = input[8];
2640*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
2641*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
2642*4bdc9457SAndroid Build Coastguard Worker i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2643*4bdc9457SAndroid Build Coastguard Worker }
2644*4bdc9457SAndroid Build Coastguard Worker input = (const int8_t**) ((uintptr_t) input + input_stride);
2645*4bdc9457SAndroid Build Coastguard Worker
2646*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
2647*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
2648*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
2649*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
2650*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
2651*4bdc9457SAndroid Build Coastguard Worker
2652*4bdc9457SAndroid Build Coastguard Worker
2653*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
2654*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2655*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
2656*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2657*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
2658*4bdc9457SAndroid Build Coastguard Worker
2659*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
2660*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
2661*4bdc9457SAndroid Build Coastguard Worker
2662*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
2663*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2664*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
2665*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t))));
2666*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
2667*4bdc9457SAndroid Build Coastguard Worker
2668*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
2669*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
2670*4bdc9457SAndroid Build Coastguard Worker
2671*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
2672*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t))));
2673*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
2674*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t))));
2675*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
2676*4bdc9457SAndroid Build Coastguard Worker
2677*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
2678*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
2679*4bdc9457SAndroid Build Coastguard Worker
2680*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
2681*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t))));
2682*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
2683*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t))));
2684*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
2685*4bdc9457SAndroid Build Coastguard Worker
2686*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
2687*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
2688*4bdc9457SAndroid Build Coastguard Worker
2689*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
2690*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t))));
2691*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
2692*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t))));
2693*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
2694*4bdc9457SAndroid Build Coastguard Worker
2695*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
2696*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
2697*4bdc9457SAndroid Build Coastguard Worker
2698*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
2699*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t))));
2700*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
2701*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t))));
2702*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
2703*4bdc9457SAndroid Build Coastguard Worker
2704*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
2705*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
2706*4bdc9457SAndroid Build Coastguard Worker
2707*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
2708*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t))));
2709*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
2710*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t))));
2711*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
2712*4bdc9457SAndroid Build Coastguard Worker
2713*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
2714*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
2715*4bdc9457SAndroid Build Coastguard Worker
2716*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
2717*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t))));
2718*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
2719*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t))));
2720*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
2721*4bdc9457SAndroid Build Coastguard Worker
2722*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
2723*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
2724*4bdc9457SAndroid Build Coastguard Worker
2725*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
2726*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t))));
2727*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
2728*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t))));
2729*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
2730*4bdc9457SAndroid Build Coastguard Worker
2731*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
2732*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
2733*4bdc9457SAndroid Build Coastguard Worker
2734*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t));
2735*4bdc9457SAndroid Build Coastguard Worker
2736*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
2737*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
2738*4bdc9457SAndroid Build Coastguard Worker
2739*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
2740*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscale);
2741*4bdc9457SAndroid Build Coastguard Worker
2742*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
2743*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
2744*4bdc9457SAndroid Build Coastguard Worker
2745*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
2746*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
2747*4bdc9457SAndroid Build Coastguard Worker
2748*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
2749*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
2750*4bdc9457SAndroid Build Coastguard Worker
2751*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
2752*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
2753*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
2754*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
2755*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
2756*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
2757*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
2758*4bdc9457SAndroid Build Coastguard Worker
2759*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
2760*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
2761*4bdc9457SAndroid Build Coastguard Worker
2762*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
2763*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
2764*4bdc9457SAndroid Build Coastguard Worker output += 32;
2765*4bdc9457SAndroid Build Coastguard Worker }
2766*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
2767*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
2768*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
2769*4bdc9457SAndroid Build Coastguard Worker const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
2770*4bdc9457SAndroid Build Coastguard Worker do {
2771*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
2772*4bdc9457SAndroid Build Coastguard Worker
2773*4bdc9457SAndroid Build Coastguard Worker
2774*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0));
2775*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k));
2776*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
2777*4bdc9457SAndroid Build Coastguard Worker
2778*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
2779*4bdc9457SAndroid Build Coastguard Worker
2780*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1));
2781*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32)));
2782*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
2783*4bdc9457SAndroid Build Coastguard Worker
2784*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
2785*4bdc9457SAndroid Build Coastguard Worker
2786*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2));
2787*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64)));
2788*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
2789*4bdc9457SAndroid Build Coastguard Worker
2790*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
2791*4bdc9457SAndroid Build Coastguard Worker
2792*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3));
2793*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 96)));
2794*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
2795*4bdc9457SAndroid Build Coastguard Worker
2796*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
2797*4bdc9457SAndroid Build Coastguard Worker
2798*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4));
2799*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 128)));
2800*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
2801*4bdc9457SAndroid Build Coastguard Worker
2802*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
2803*4bdc9457SAndroid Build Coastguard Worker
2804*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5));
2805*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 160)));
2806*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
2807*4bdc9457SAndroid Build Coastguard Worker
2808*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
2809*4bdc9457SAndroid Build Coastguard Worker
2810*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6));
2811*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 192)));
2812*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
2813*4bdc9457SAndroid Build Coastguard Worker
2814*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
2815*4bdc9457SAndroid Build Coastguard Worker
2816*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7));
2817*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 224)));
2818*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
2819*4bdc9457SAndroid Build Coastguard Worker
2820*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
2821*4bdc9457SAndroid Build Coastguard Worker
2822*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8));
2823*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 256)));
2824*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
2825*4bdc9457SAndroid Build Coastguard Worker
2826*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
2827*4bdc9457SAndroid Build Coastguard Worker
2828*4bdc9457SAndroid Build Coastguard Worker k += 16;
2829*4bdc9457SAndroid Build Coastguard Worker
2830*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
2831*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
2832*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
2833*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
2834*4bdc9457SAndroid Build Coastguard Worker
2835*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
2836*4bdc9457SAndroid Build Coastguard Worker
2837*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
2838*4bdc9457SAndroid Build Coastguard Worker
2839*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
2840*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
2841*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
2842*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
2843*4bdc9457SAndroid Build Coastguard Worker
2844*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
2845*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2846*4bdc9457SAndroid Build Coastguard Worker output += 16;
2847*4bdc9457SAndroid Build Coastguard Worker c -= 16;
2848*4bdc9457SAndroid Build Coastguard Worker } else {
2849*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
2850*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + c);
2851*4bdc9457SAndroid Build Coastguard Worker c = 0;
2852*4bdc9457SAndroid Build Coastguard Worker }
2853*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
2854*4bdc9457SAndroid Build Coastguard Worker }
2855*4bdc9457SAndroid Build Coastguard Worker
2856*4bdc9457SAndroid Build Coastguard Worker output = (int8_t*) ((uintptr_t) output + output_increment);
2857*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
2858*4bdc9457SAndroid Build Coastguard Worker }
2859*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_f32_vcvt_ukernel__avx512skx_x32(size_t n,const int8_t * x,float * y,const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])2860*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_f32_vcvt_ukernel__avx512skx_x32(
2861*4bdc9457SAndroid Build Coastguard Worker size_t n,
2862*4bdc9457SAndroid Build Coastguard Worker const int8_t* x,
2863*4bdc9457SAndroid Build Coastguard Worker float* y,
2864*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2865*4bdc9457SAndroid Build Coastguard Worker {
2866*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2867*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(int8_t) == 0);
2868*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
2869*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
2870*4bdc9457SAndroid Build Coastguard Worker
2871*4bdc9457SAndroid Build Coastguard Worker const __m512i vminus_zero_point = _mm512_load_si512(params->avx512.minus_zero_point);
2872*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->avx512.scale);
2873*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
2874*4bdc9457SAndroid Build Coastguard Worker __m512i vx0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) x));
2875*4bdc9457SAndroid Build Coastguard Worker __m512i vxGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (x + 16)));
2876*4bdc9457SAndroid Build Coastguard Worker x += 32;
2877*4bdc9457SAndroid Build Coastguard Worker
2878*4bdc9457SAndroid Build Coastguard Worker vx0123456789ABCDEF = _mm512_add_epi32(vx0123456789ABCDEF, vminus_zero_point);
2879*4bdc9457SAndroid Build Coastguard Worker vxGHIJKLMNOPQRSTUV = _mm512_add_epi32(vxGHIJKLMNOPQRSTUV, vminus_zero_point);
2880*4bdc9457SAndroid Build Coastguard Worker
2881*4bdc9457SAndroid Build Coastguard Worker __m512 vy0123456789ABCDEF = _mm512_cvtepi32_ps(vx0123456789ABCDEF);
2882*4bdc9457SAndroid Build Coastguard Worker __m512 vyGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vxGHIJKLMNOPQRSTUV);
2883*4bdc9457SAndroid Build Coastguard Worker
2884*4bdc9457SAndroid Build Coastguard Worker vy0123456789ABCDEF = _mm512_mul_ps(vy0123456789ABCDEF, vscale);
2885*4bdc9457SAndroid Build Coastguard Worker vyGHIJKLMNOPQRSTUV = _mm512_mul_ps(vyGHIJKLMNOPQRSTUV, vscale);
2886*4bdc9457SAndroid Build Coastguard Worker
2887*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y, vy0123456789ABCDEF);
2888*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y + 16, vyGHIJKLMNOPQRSTUV);
2889*4bdc9457SAndroid Build Coastguard Worker y += 32;
2890*4bdc9457SAndroid Build Coastguard Worker }
2891*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
2892*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) x));
2893*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_add_epi32(vx, vminus_zero_point);
2894*4bdc9457SAndroid Build Coastguard Worker x += 16;
2895*4bdc9457SAndroid Build Coastguard Worker
2896*4bdc9457SAndroid Build Coastguard Worker __m512 vy = _mm512_cvtepi32_ps(vx);
2897*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_mul_ps(vy, vscale);
2898*4bdc9457SAndroid Build Coastguard Worker
2899*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y, vy);
2900*4bdc9457SAndroid Build Coastguard Worker y += 16;
2901*4bdc9457SAndroid Build Coastguard Worker }
2902*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2903*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(int8_t));
2904*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(int8_t));
2905*4bdc9457SAndroid Build Coastguard Worker
2906*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid elements (depends on n).
2907*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
2908*4bdc9457SAndroid Build Coastguard Worker
2909*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_cvtepi8_epi32(_mm_maskz_loadu_epi8(vmask, x));
2910*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_add_epi32(vx, vminus_zero_point);
2911*4bdc9457SAndroid Build Coastguard Worker
2912*4bdc9457SAndroid Build Coastguard Worker __m512 vy = _mm512_cvtepi32_ps(vx);
2913*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_mul_ps(vy, vscale);
2914*4bdc9457SAndroid Build Coastguard Worker
2915*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_ps(y, vmask, vy);
2916*4bdc9457SAndroid Build Coastguard Worker }
2917*4bdc9457SAndroid Build Coastguard Worker }
2918*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2919*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(
2920*4bdc9457SAndroid Build Coastguard Worker size_t mr,
2921*4bdc9457SAndroid Build Coastguard Worker size_t nc,
2922*4bdc9457SAndroid Build Coastguard Worker size_t kc,
2923*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a,
2924*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
2925*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
2926*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
2927*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
2928*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
2929*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2930*4bdc9457SAndroid Build Coastguard Worker {
2931*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
2932*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
2933*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
2934*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
2935*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
2936*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
2937*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
2938*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
2939*4bdc9457SAndroid Build Coastguard Worker
2940*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
2941*4bdc9457SAndroid Build Coastguard Worker const int8_t* a0 = a;
2942*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
2943*4bdc9457SAndroid Build Coastguard Worker
2944*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
2945*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
2946*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
2947*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
2948*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
2949*4bdc9457SAndroid Build Coastguard Worker do {
2950*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
2951*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
2952*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
2953*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
2954*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
2955*4bdc9457SAndroid Build Coastguard Worker
2956*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
2957*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
2958*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
2959*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
2960*4bdc9457SAndroid Build Coastguard Worker
2961*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
2962*4bdc9457SAndroid Build Coastguard Worker
2963*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
2964*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
2965*4bdc9457SAndroid Build Coastguard Worker
2966*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
2967*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
2968*4bdc9457SAndroid Build Coastguard Worker
2969*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
2970*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
2971*4bdc9457SAndroid Build Coastguard Worker
2972*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
2973*4bdc9457SAndroid Build Coastguard Worker
2974*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
2975*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
2976*4bdc9457SAndroid Build Coastguard Worker }
2977*4bdc9457SAndroid Build Coastguard Worker
2978*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
2979*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
2980*4bdc9457SAndroid Build Coastguard Worker
2981*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
2982*4bdc9457SAndroid Build Coastguard Worker
2983*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
2984*4bdc9457SAndroid Build Coastguard Worker
2985*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
2986*4bdc9457SAndroid Build Coastguard Worker
2987*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
2988*4bdc9457SAndroid Build Coastguard Worker
2989*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
2990*4bdc9457SAndroid Build Coastguard Worker
2991*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
2992*4bdc9457SAndroid Build Coastguard Worker
2993*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
2994*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
2995*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
2996*4bdc9457SAndroid Build Coastguard Worker
2997*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
2998*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
2999*4bdc9457SAndroid Build Coastguard Worker
3000*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 - k);
3001*4bdc9457SAndroid Build Coastguard Worker
3002*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3003*4bdc9457SAndroid Build Coastguard Worker
3004*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3005*4bdc9457SAndroid Build Coastguard Worker } else {
3006*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
3007*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
3008*4bdc9457SAndroid Build Coastguard Worker
3009*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
3010*4bdc9457SAndroid Build Coastguard Worker
3011*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3012*4bdc9457SAndroid Build Coastguard Worker }
3013*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3014*4bdc9457SAndroid Build Coastguard Worker }
3015*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3016*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(
3017*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3018*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3019*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3020*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a,
3021*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
3022*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
3023*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
3024*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3025*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3026*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3027*4bdc9457SAndroid Build Coastguard Worker {
3028*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3029*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
3030*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3031*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3032*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
3033*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3034*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3035*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3036*4bdc9457SAndroid Build Coastguard Worker
3037*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
3038*4bdc9457SAndroid Build Coastguard Worker const int8_t* a0 = a;
3039*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
3040*4bdc9457SAndroid Build Coastguard Worker const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
3041*4bdc9457SAndroid Build Coastguard Worker int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
3042*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
3043*4bdc9457SAndroid Build Coastguard Worker a1 = a0;
3044*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
3045*4bdc9457SAndroid Build Coastguard Worker }
3046*4bdc9457SAndroid Build Coastguard Worker const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
3047*4bdc9457SAndroid Build Coastguard Worker int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
3048*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
3049*4bdc9457SAndroid Build Coastguard Worker a2 = a1;
3050*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
3051*4bdc9457SAndroid Build Coastguard Worker }
3052*4bdc9457SAndroid Build Coastguard Worker const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
3053*4bdc9457SAndroid Build Coastguard Worker int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
3054*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
3055*4bdc9457SAndroid Build Coastguard Worker a3 = a2;
3056*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
3057*4bdc9457SAndroid Build Coastguard Worker }
3058*4bdc9457SAndroid Build Coastguard Worker
3059*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
3060*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
3061*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
3062*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
3063*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
3064*4bdc9457SAndroid Build Coastguard Worker do {
3065*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
3066*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
3067*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
3068*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
3069*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
3070*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
3071*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
3072*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
3073*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
3074*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
3075*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
3076*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
3077*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
3078*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
3079*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
3080*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
3081*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
3082*4bdc9457SAndroid Build Coastguard Worker
3083*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
3084*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
3085*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
3086*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
3087*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
3088*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
3089*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
3090*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
3091*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
3092*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
3093*4bdc9457SAndroid Build Coastguard Worker
3094*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
3095*4bdc9457SAndroid Build Coastguard Worker
3096*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
3097*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
3098*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
3099*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
3100*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
3101*4bdc9457SAndroid Build Coastguard Worker
3102*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
3103*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
3104*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
3105*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
3106*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
3107*4bdc9457SAndroid Build Coastguard Worker
3108*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
3109*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
3110*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
3111*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
3112*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
3113*4bdc9457SAndroid Build Coastguard Worker
3114*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
3115*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
3116*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
3117*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
3118*4bdc9457SAndroid Build Coastguard Worker
3119*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
3120*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
3121*4bdc9457SAndroid Build Coastguard Worker }
3122*4bdc9457SAndroid Build Coastguard Worker
3123*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
3124*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
3125*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
3126*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
3127*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
3128*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
3129*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
3130*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
3131*4bdc9457SAndroid Build Coastguard Worker
3132*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
3133*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
3134*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
3135*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
3136*4bdc9457SAndroid Build Coastguard Worker
3137*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
3138*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
3139*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
3140*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
3141*4bdc9457SAndroid Build Coastguard Worker
3142*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
3143*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale);
3144*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale);
3145*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale);
3146*4bdc9457SAndroid Build Coastguard Worker
3147*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
3148*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
3149*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
3150*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
3151*4bdc9457SAndroid Build Coastguard Worker
3152*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
3153*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
3154*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
3155*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
3156*4bdc9457SAndroid Build Coastguard Worker
3157*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
3158*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
3159*4bdc9457SAndroid Build Coastguard Worker
3160*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
3161*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
3162*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
3163*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
3164*4bdc9457SAndroid Build Coastguard Worker
3165*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
3166*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
3167*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
3168*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
3169*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
3170*4bdc9457SAndroid Build Coastguard Worker
3171*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 - k);
3172*4bdc9457SAndroid Build Coastguard Worker a1 = (const int8_t*) ((uintptr_t) a1 - k);
3173*4bdc9457SAndroid Build Coastguard Worker a2 = (const int8_t*) ((uintptr_t) a2 - k);
3174*4bdc9457SAndroid Build Coastguard Worker a3 = (const int8_t*) ((uintptr_t) a3 - k);
3175*4bdc9457SAndroid Build Coastguard Worker
3176*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3177*4bdc9457SAndroid Build Coastguard Worker c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
3178*4bdc9457SAndroid Build Coastguard Worker c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
3179*4bdc9457SAndroid Build Coastguard Worker c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
3180*4bdc9457SAndroid Build Coastguard Worker
3181*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3182*4bdc9457SAndroid Build Coastguard Worker } else {
3183*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
3184*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
3185*4bdc9457SAndroid Build Coastguard Worker
3186*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
3187*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
3188*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
3189*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
3190*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
3191*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
3192*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
3193*4bdc9457SAndroid Build Coastguard Worker
3194*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3195*4bdc9457SAndroid Build Coastguard Worker }
3196*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3197*4bdc9457SAndroid Build Coastguard Worker }
3198*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3199*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(
3200*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3201*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3202*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3203*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3204*4bdc9457SAndroid Build Coastguard Worker const int8_t** restrict a,
3205*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
3206*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
3207*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3208*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3209*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3210*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
3211*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3212*4bdc9457SAndroid Build Coastguard Worker {
3213*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3214*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
3215*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3216*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3217*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
3218*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3219*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3220*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3221*4bdc9457SAndroid Build Coastguard Worker
3222*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
3223*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
3224*4bdc9457SAndroid Build Coastguard Worker
3225*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
3226*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
3227*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
3228*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
3229*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
3230*4bdc9457SAndroid Build Coastguard Worker do {
3231*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
3232*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
3233*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
3234*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
3235*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
3236*4bdc9457SAndroid Build Coastguard Worker
3237*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3238*4bdc9457SAndroid Build Coastguard Worker do {
3239*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a0 = a[0];
3240*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3241*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
3242*4bdc9457SAndroid Build Coastguard Worker }
3243*4bdc9457SAndroid Build Coastguard Worker a += 1;
3244*4bdc9457SAndroid Build Coastguard Worker
3245*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
3246*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
3247*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
3248*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
3249*4bdc9457SAndroid Build Coastguard Worker
3250*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
3251*4bdc9457SAndroid Build Coastguard Worker
3252*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
3253*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
3254*4bdc9457SAndroid Build Coastguard Worker
3255*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
3256*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
3257*4bdc9457SAndroid Build Coastguard Worker
3258*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
3259*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
3260*4bdc9457SAndroid Build Coastguard Worker
3261*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
3262*4bdc9457SAndroid Build Coastguard Worker
3263*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
3264*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
3265*4bdc9457SAndroid Build Coastguard Worker }
3266*4bdc9457SAndroid Build Coastguard Worker p -= 1 * sizeof(void*);
3267*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3268*4bdc9457SAndroid Build Coastguard Worker
3269*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
3270*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
3271*4bdc9457SAndroid Build Coastguard Worker
3272*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
3273*4bdc9457SAndroid Build Coastguard Worker
3274*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
3275*4bdc9457SAndroid Build Coastguard Worker
3276*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
3277*4bdc9457SAndroid Build Coastguard Worker
3278*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
3279*4bdc9457SAndroid Build Coastguard Worker
3280*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
3281*4bdc9457SAndroid Build Coastguard Worker
3282*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
3283*4bdc9457SAndroid Build Coastguard Worker
3284*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
3285*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
3286*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
3287*4bdc9457SAndroid Build Coastguard Worker
3288*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
3289*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
3290*4bdc9457SAndroid Build Coastguard Worker
3291*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3292*4bdc9457SAndroid Build Coastguard Worker
3293*4bdc9457SAndroid Build Coastguard Worker a = (const int8_t**restrict) ((uintptr_t) a - ks);
3294*4bdc9457SAndroid Build Coastguard Worker
3295*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3296*4bdc9457SAndroid Build Coastguard Worker } else {
3297*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
3298*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
3299*4bdc9457SAndroid Build Coastguard Worker
3300*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
3301*4bdc9457SAndroid Build Coastguard Worker
3302*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3303*4bdc9457SAndroid Build Coastguard Worker }
3304*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3305*4bdc9457SAndroid Build Coastguard Worker }
3306*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3307*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(
3308*4bdc9457SAndroid Build Coastguard Worker size_t mr,
3309*4bdc9457SAndroid Build Coastguard Worker size_t nc,
3310*4bdc9457SAndroid Build Coastguard Worker size_t kc,
3311*4bdc9457SAndroid Build Coastguard Worker size_t ks,
3312*4bdc9457SAndroid Build Coastguard Worker const int8_t** restrict a,
3313*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
3314*4bdc9457SAndroid Build Coastguard Worker int8_t* restrict c,
3315*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
3316*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
3317*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
3318*4bdc9457SAndroid Build Coastguard Worker const int8_t* zero,
3319*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3320*4bdc9457SAndroid Build Coastguard Worker {
3321*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
3322*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
3323*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
3324*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
3325*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(int8_t) == 0);
3326*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
3327*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
3328*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
3329*4bdc9457SAndroid Build Coastguard Worker
3330*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
3331*4bdc9457SAndroid Build Coastguard Worker int8_t* c0 = c;
3332*4bdc9457SAndroid Build Coastguard Worker int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
3333*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
3334*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
3335*4bdc9457SAndroid Build Coastguard Worker }
3336*4bdc9457SAndroid Build Coastguard Worker int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
3337*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
3338*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
3339*4bdc9457SAndroid Build Coastguard Worker }
3340*4bdc9457SAndroid Build Coastguard Worker int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
3341*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
3342*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
3343*4bdc9457SAndroid Build Coastguard Worker }
3344*4bdc9457SAndroid Build Coastguard Worker
3345*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
3346*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
3347*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
3348*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
3349*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
3350*4bdc9457SAndroid Build Coastguard Worker do {
3351*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
3352*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
3353*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
3354*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
3355*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
3356*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
3357*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
3358*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
3359*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
3360*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
3361*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
3362*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
3363*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
3364*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
3365*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
3366*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
3367*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
3368*4bdc9457SAndroid Build Coastguard Worker
3369*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
3370*4bdc9457SAndroid Build Coastguard Worker do {
3371*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a0 = a[0];
3372*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
3373*4bdc9457SAndroid Build Coastguard Worker a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
3374*4bdc9457SAndroid Build Coastguard Worker }
3375*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a1 = a[1];
3376*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a1 != zero) {
3377*4bdc9457SAndroid Build Coastguard Worker a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
3378*4bdc9457SAndroid Build Coastguard Worker }
3379*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a2 = a[2];
3380*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a2 != zero) {
3381*4bdc9457SAndroid Build Coastguard Worker a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
3382*4bdc9457SAndroid Build Coastguard Worker }
3383*4bdc9457SAndroid Build Coastguard Worker const int8_t* restrict a3 = a[3];
3384*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a3 != zero) {
3385*4bdc9457SAndroid Build Coastguard Worker a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
3386*4bdc9457SAndroid Build Coastguard Worker }
3387*4bdc9457SAndroid Build Coastguard Worker a += 4;
3388*4bdc9457SAndroid Build Coastguard Worker
3389*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
3390*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
3391*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
3392*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
3393*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
3394*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
3395*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
3396*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
3397*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
3398*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
3399*4bdc9457SAndroid Build Coastguard Worker
3400*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) w));
3401*4bdc9457SAndroid Build Coastguard Worker
3402*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
3403*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
3404*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
3405*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
3406*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 32)));
3407*4bdc9457SAndroid Build Coastguard Worker
3408*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
3409*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
3410*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
3411*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
3412*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 64)));
3413*4bdc9457SAndroid Build Coastguard Worker
3414*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
3415*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
3416*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
3417*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
3418*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_cvtepi8_epi16(_mm256_load_si256((const __m256i*) ((const int8_t*) w + 96)));
3419*4bdc9457SAndroid Build Coastguard Worker
3420*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
3421*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
3422*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
3423*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
3424*4bdc9457SAndroid Build Coastguard Worker
3425*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int8_t*) w + 128);
3426*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(int8_t);
3427*4bdc9457SAndroid Build Coastguard Worker }
3428*4bdc9457SAndroid Build Coastguard Worker p -= 4 * sizeof(void*);
3429*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
3430*4bdc9457SAndroid Build Coastguard Worker
3431*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
3432*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
3433*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
3434*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
3435*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
3436*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
3437*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
3438*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
3439*4bdc9457SAndroid Build Coastguard Worker
3440*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
3441*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
3442*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
3443*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
3444*4bdc9457SAndroid Build Coastguard Worker
3445*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
3446*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
3447*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
3448*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
3449*4bdc9457SAndroid Build Coastguard Worker
3450*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
3451*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale);
3452*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale);
3453*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale);
3454*4bdc9457SAndroid Build Coastguard Worker
3455*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
3456*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
3457*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
3458*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
3459*4bdc9457SAndroid Build Coastguard Worker
3460*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
3461*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
3462*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
3463*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
3464*4bdc9457SAndroid Build Coastguard Worker
3465*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
3466*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
3467*4bdc9457SAndroid Build Coastguard Worker
3468*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
3469*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
3470*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
3471*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
3472*4bdc9457SAndroid Build Coastguard Worker
3473*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
3474*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
3475*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
3476*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
3477*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
3478*4bdc9457SAndroid Build Coastguard Worker
3479*4bdc9457SAndroid Build Coastguard Worker c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
3480*4bdc9457SAndroid Build Coastguard Worker c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
3481*4bdc9457SAndroid Build Coastguard Worker c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
3482*4bdc9457SAndroid Build Coastguard Worker c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3483*4bdc9457SAndroid Build Coastguard Worker
3484*4bdc9457SAndroid Build Coastguard Worker a = (const int8_t**restrict) ((uintptr_t) a - ks);
3485*4bdc9457SAndroid Build Coastguard Worker
3486*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
3487*4bdc9457SAndroid Build Coastguard Worker } else {
3488*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
3489*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << (nc + 48)) - (UINT64_C(1) << 48)));
3490*4bdc9457SAndroid Build Coastguard Worker
3491*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
3492*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
3493*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
3494*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
3495*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
3496*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
3497*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
3498*4bdc9457SAndroid Build Coastguard Worker
3499*4bdc9457SAndroid Build Coastguard Worker nc = 0;
3500*4bdc9457SAndroid Build Coastguard Worker }
3501*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
3502*4bdc9457SAndroid Build Coastguard Worker }
3503*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16(size_t n,const int8_t * input_a,const int8_t * input_b,int8_t * output,const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3504*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16(
3505*4bdc9457SAndroid Build Coastguard Worker size_t n,
3506*4bdc9457SAndroid Build Coastguard Worker const int8_t* input_a,
3507*4bdc9457SAndroid Build Coastguard Worker const int8_t* input_b,
3508*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
3509*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3510*4bdc9457SAndroid Build Coastguard Worker {
3511*4bdc9457SAndroid Build Coastguard Worker const __m512i vbias = _mm512_load_si512(params->avx512.bias);
3512*4bdc9457SAndroid Build Coastguard Worker const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
3513*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
3514*4bdc9457SAndroid Build Coastguard Worker const __m128i vshift = _mm_load_si128((const __m128i*) params->avx512.shift);
3515*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
3516*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
3517*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx512.output_max);
3518*4bdc9457SAndroid Build Coastguard Worker
3519*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
3520*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a));
3521*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_b));
3522*4bdc9457SAndroid Build Coastguard Worker input_a += 16;
3523*4bdc9457SAndroid Build Coastguard Worker input_b += 16;
3524*4bdc9457SAndroid Build Coastguard Worker
3525*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
3526*4bdc9457SAndroid Build Coastguard Worker
3527*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
3528*4bdc9457SAndroid Build Coastguard Worker
3529*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
3530*4bdc9457SAndroid Build Coastguard Worker
3531*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
3532*4bdc9457SAndroid Build Coastguard Worker
3533*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3534*4bdc9457SAndroid Build Coastguard Worker
3535*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3536*4bdc9457SAndroid Build Coastguard Worker
3537*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
3538*4bdc9457SAndroid Build Coastguard Worker
3539*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3540*4bdc9457SAndroid Build Coastguard Worker output += 16;
3541*4bdc9457SAndroid Build Coastguard Worker }
3542*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
3543*4bdc9457SAndroid Build Coastguard Worker {
3544*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << n) - UINT32_C(1)));
3545*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_maskz_loadu_epi8(vmask, input_a));
3546*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_maskz_loadu_epi8(vmask, input_b));
3547*4bdc9457SAndroid Build Coastguard Worker
3548*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
3549*4bdc9457SAndroid Build Coastguard Worker
3550*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
3551*4bdc9457SAndroid Build Coastguard Worker
3552*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
3553*4bdc9457SAndroid Build Coastguard Worker
3554*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
3555*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3556*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3557*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
3558*4bdc9457SAndroid Build Coastguard Worker
3559*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
3560*4bdc9457SAndroid Build Coastguard Worker }
3561*4bdc9457SAndroid Build Coastguard Worker }
3562*4bdc9457SAndroid Build Coastguard Worker }
3563*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16(size_t n,const int8_t * input_a,const int8_t * input_b,int8_t * output,const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3564*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16(
3565*4bdc9457SAndroid Build Coastguard Worker size_t n,
3566*4bdc9457SAndroid Build Coastguard Worker const int8_t* input_a,
3567*4bdc9457SAndroid Build Coastguard Worker const int8_t* input_b,
3568*4bdc9457SAndroid Build Coastguard Worker int8_t* output,
3569*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3570*4bdc9457SAndroid Build Coastguard Worker {
3571*4bdc9457SAndroid Build Coastguard Worker const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
3572*4bdc9457SAndroid Build Coastguard Worker const __m128i vshift = _mm_load_si128((const __m128i*) params->avx512.shift);
3573*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
3574*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
3575*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx512.output_max);
3576*4bdc9457SAndroid Build Coastguard Worker
3577*4bdc9457SAndroid Build Coastguard Worker const __m512i vbias = _mm512_add_epi32(
3578*4bdc9457SAndroid Build Coastguard Worker _mm512_broadcastd_epi32(_mm_cvtsi32_si128(params->avx512.b_multiplier[0] * (int32_t) *input_b)),
3579*4bdc9457SAndroid Build Coastguard Worker _mm512_load_si512(params->avx512.bias));
3580*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
3581*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a));
3582*4bdc9457SAndroid Build Coastguard Worker input_a += 16;
3583*4bdc9457SAndroid Build Coastguard Worker
3584*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
3585*4bdc9457SAndroid Build Coastguard Worker
3586*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
3587*4bdc9457SAndroid Build Coastguard Worker
3588*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
3589*4bdc9457SAndroid Build Coastguard Worker
3590*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3591*4bdc9457SAndroid Build Coastguard Worker
3592*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3593*4bdc9457SAndroid Build Coastguard Worker
3594*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
3595*4bdc9457SAndroid Build Coastguard Worker
3596*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3597*4bdc9457SAndroid Build Coastguard Worker output += 16;
3598*4bdc9457SAndroid Build Coastguard Worker }
3599*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
3600*4bdc9457SAndroid Build Coastguard Worker {
3601*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << n) - UINT32_C(1)));
3602*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_maskz_loadu_epi8(vmask, input_a));
3603*4bdc9457SAndroid Build Coastguard Worker
3604*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
3605*4bdc9457SAndroid Build Coastguard Worker
3606*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
3607*4bdc9457SAndroid Build Coastguard Worker
3608*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
3609*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3610*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3611*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
3612*4bdc9457SAndroid Build Coastguard Worker
3613*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
3614*4bdc9457SAndroid Build Coastguard Worker }
3615*4bdc9457SAndroid Build Coastguard Worker }
3616*4bdc9457SAndroid Build Coastguard Worker }
3617*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3618*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32(
3619*4bdc9457SAndroid Build Coastguard Worker size_t channels,
3620*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
3621*4bdc9457SAndroid Build Coastguard Worker const uint8_t** input,
3622*4bdc9457SAndroid Build Coastguard Worker const void* weights,
3623*4bdc9457SAndroid Build Coastguard Worker uint8_t* output,
3624*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
3625*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
3626*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
3627*4bdc9457SAndroid Build Coastguard Worker const uint8_t* zero,
3628*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
3629*4bdc9457SAndroid Build Coastguard Worker {
3630*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
3631*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
3632*4bdc9457SAndroid Build Coastguard Worker
3633*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
3634*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
3635*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
3636*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
3637*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
3638*4bdc9457SAndroid Build Coastguard Worker
3639*4bdc9457SAndroid Build Coastguard Worker const __m512i vk_zero_point = _mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i*) params->fp32_avx512.kernel_zero_point));
3640*4bdc9457SAndroid Build Coastguard Worker do {
3641*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i0 = input[0];
3642*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
3643*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
3644*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
3645*4bdc9457SAndroid Build Coastguard Worker }
3646*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i1 = input[1];
3647*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
3648*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
3649*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
3650*4bdc9457SAndroid Build Coastguard Worker }
3651*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i2 = input[2];
3652*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
3653*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
3654*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
3655*4bdc9457SAndroid Build Coastguard Worker }
3656*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i3 = input[3];
3657*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
3658*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
3659*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
3660*4bdc9457SAndroid Build Coastguard Worker }
3661*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i4 = input[4];
3662*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
3663*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
3664*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
3665*4bdc9457SAndroid Build Coastguard Worker }
3666*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i5 = input[5];
3667*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
3668*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
3669*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
3670*4bdc9457SAndroid Build Coastguard Worker }
3671*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i6 = input[6];
3672*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
3673*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
3674*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
3675*4bdc9457SAndroid Build Coastguard Worker }
3676*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i7 = input[7];
3677*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
3678*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
3679*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
3680*4bdc9457SAndroid Build Coastguard Worker }
3681*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i8 = input[8];
3682*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
3683*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
3684*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
3685*4bdc9457SAndroid Build Coastguard Worker }
3686*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i9 = input[9];
3687*4bdc9457SAndroid Build Coastguard Worker assert(i9 != NULL);
3688*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i9 != zero) {
3689*4bdc9457SAndroid Build Coastguard Worker i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
3690*4bdc9457SAndroid Build Coastguard Worker }
3691*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i10 = input[10];
3692*4bdc9457SAndroid Build Coastguard Worker assert(i10 != NULL);
3693*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i10 != zero) {
3694*4bdc9457SAndroid Build Coastguard Worker i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
3695*4bdc9457SAndroid Build Coastguard Worker }
3696*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i11 = input[11];
3697*4bdc9457SAndroid Build Coastguard Worker assert(i11 != NULL);
3698*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i11 != zero) {
3699*4bdc9457SAndroid Build Coastguard Worker i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
3700*4bdc9457SAndroid Build Coastguard Worker }
3701*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i12 = input[12];
3702*4bdc9457SAndroid Build Coastguard Worker assert(i12 != NULL);
3703*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i12 != zero) {
3704*4bdc9457SAndroid Build Coastguard Worker i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
3705*4bdc9457SAndroid Build Coastguard Worker }
3706*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i13 = input[13];
3707*4bdc9457SAndroid Build Coastguard Worker assert(i13 != NULL);
3708*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i13 != zero) {
3709*4bdc9457SAndroid Build Coastguard Worker i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
3710*4bdc9457SAndroid Build Coastguard Worker }
3711*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i14 = input[14];
3712*4bdc9457SAndroid Build Coastguard Worker assert(i14 != NULL);
3713*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i14 != zero) {
3714*4bdc9457SAndroid Build Coastguard Worker i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
3715*4bdc9457SAndroid Build Coastguard Worker }
3716*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i15 = input[15];
3717*4bdc9457SAndroid Build Coastguard Worker assert(i15 != NULL);
3718*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i15 != zero) {
3719*4bdc9457SAndroid Build Coastguard Worker i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
3720*4bdc9457SAndroid Build Coastguard Worker }
3721*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i16 = input[16];
3722*4bdc9457SAndroid Build Coastguard Worker assert(i16 != NULL);
3723*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i16 != zero) {
3724*4bdc9457SAndroid Build Coastguard Worker i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
3725*4bdc9457SAndroid Build Coastguard Worker }
3726*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i17 = input[17];
3727*4bdc9457SAndroid Build Coastguard Worker assert(i17 != NULL);
3728*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i17 != zero) {
3729*4bdc9457SAndroid Build Coastguard Worker i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
3730*4bdc9457SAndroid Build Coastguard Worker }
3731*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i18 = input[18];
3732*4bdc9457SAndroid Build Coastguard Worker assert(i18 != NULL);
3733*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i18 != zero) {
3734*4bdc9457SAndroid Build Coastguard Worker i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
3735*4bdc9457SAndroid Build Coastguard Worker }
3736*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i19 = input[19];
3737*4bdc9457SAndroid Build Coastguard Worker assert(i19 != NULL);
3738*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i19 != zero) {
3739*4bdc9457SAndroid Build Coastguard Worker i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
3740*4bdc9457SAndroid Build Coastguard Worker }
3741*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i20 = input[20];
3742*4bdc9457SAndroid Build Coastguard Worker assert(i20 != NULL);
3743*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i20 != zero) {
3744*4bdc9457SAndroid Build Coastguard Worker i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
3745*4bdc9457SAndroid Build Coastguard Worker }
3746*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i21 = input[21];
3747*4bdc9457SAndroid Build Coastguard Worker assert(i21 != NULL);
3748*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i21 != zero) {
3749*4bdc9457SAndroid Build Coastguard Worker i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
3750*4bdc9457SAndroid Build Coastguard Worker }
3751*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i22 = input[22];
3752*4bdc9457SAndroid Build Coastguard Worker assert(i22 != NULL);
3753*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i22 != zero) {
3754*4bdc9457SAndroid Build Coastguard Worker i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
3755*4bdc9457SAndroid Build Coastguard Worker }
3756*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i23 = input[23];
3757*4bdc9457SAndroid Build Coastguard Worker assert(i23 != NULL);
3758*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i23 != zero) {
3759*4bdc9457SAndroid Build Coastguard Worker i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
3760*4bdc9457SAndroid Build Coastguard Worker }
3761*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i24 = input[24];
3762*4bdc9457SAndroid Build Coastguard Worker assert(i24 != NULL);
3763*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i24 != zero) {
3764*4bdc9457SAndroid Build Coastguard Worker i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
3765*4bdc9457SAndroid Build Coastguard Worker }
3766*4bdc9457SAndroid Build Coastguard Worker input = (const uint8_t**) ((uintptr_t) input + input_stride);
3767*4bdc9457SAndroid Build Coastguard Worker
3768*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
3769*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
3770*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
3771*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
3772*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
3773*4bdc9457SAndroid Build Coastguard Worker
3774*4bdc9457SAndroid Build Coastguard Worker
3775*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i0));
3776*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
3777*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
3778*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
3779*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
3780*4bdc9457SAndroid Build Coastguard Worker
3781*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
3782*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
3783*4bdc9457SAndroid Build Coastguard Worker
3784*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i1));
3785*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
3786*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
3787*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
3788*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
3789*4bdc9457SAndroid Build Coastguard Worker
3790*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
3791*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
3792*4bdc9457SAndroid Build Coastguard Worker
3793*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i2));
3794*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
3795*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
3796*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
3797*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
3798*4bdc9457SAndroid Build Coastguard Worker
3799*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
3800*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
3801*4bdc9457SAndroid Build Coastguard Worker
3802*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i3));
3803*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
3804*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
3805*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
3806*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
3807*4bdc9457SAndroid Build Coastguard Worker
3808*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
3809*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
3810*4bdc9457SAndroid Build Coastguard Worker
3811*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i4));
3812*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
3813*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
3814*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
3815*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
3816*4bdc9457SAndroid Build Coastguard Worker
3817*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
3818*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
3819*4bdc9457SAndroid Build Coastguard Worker
3820*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i5));
3821*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
3822*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
3823*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
3824*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
3825*4bdc9457SAndroid Build Coastguard Worker
3826*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
3827*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
3828*4bdc9457SAndroid Build Coastguard Worker
3829*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i6));
3830*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
3831*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
3832*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
3833*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
3834*4bdc9457SAndroid Build Coastguard Worker
3835*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
3836*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
3837*4bdc9457SAndroid Build Coastguard Worker
3838*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i7));
3839*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
3840*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
3841*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
3842*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
3843*4bdc9457SAndroid Build Coastguard Worker
3844*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
3845*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
3846*4bdc9457SAndroid Build Coastguard Worker
3847*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i8));
3848*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
3849*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
3850*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
3851*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
3852*4bdc9457SAndroid Build Coastguard Worker
3853*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
3854*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
3855*4bdc9457SAndroid Build Coastguard Worker
3856*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i9));
3857*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
3858*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i9 + 16)));
3859*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
3860*4bdc9457SAndroid Build Coastguard Worker i9 += 32;
3861*4bdc9457SAndroid Build Coastguard Worker
3862*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
3863*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi9xGHIJKLMNOPQRSTUV, vk9xGHIJKLMNOPQRSTUV));
3864*4bdc9457SAndroid Build Coastguard Worker
3865*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i10));
3866*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
3867*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i10 + 16)));
3868*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
3869*4bdc9457SAndroid Build Coastguard Worker i10 += 32;
3870*4bdc9457SAndroid Build Coastguard Worker
3871*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
3872*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi10xGHIJKLMNOPQRSTUV, vk10xGHIJKLMNOPQRSTUV));
3873*4bdc9457SAndroid Build Coastguard Worker
3874*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i11));
3875*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
3876*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i11 + 16)));
3877*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
3878*4bdc9457SAndroid Build Coastguard Worker i11 += 32;
3879*4bdc9457SAndroid Build Coastguard Worker
3880*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
3881*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi11xGHIJKLMNOPQRSTUV, vk11xGHIJKLMNOPQRSTUV));
3882*4bdc9457SAndroid Build Coastguard Worker
3883*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i12));
3884*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
3885*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i12 + 16)));
3886*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 400 * sizeof(uint8_t)))), vk_zero_point);
3887*4bdc9457SAndroid Build Coastguard Worker i12 += 32;
3888*4bdc9457SAndroid Build Coastguard Worker
3889*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
3890*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi12xGHIJKLMNOPQRSTUV, vk12xGHIJKLMNOPQRSTUV));
3891*4bdc9457SAndroid Build Coastguard Worker
3892*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i13));
3893*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 416 * sizeof(uint8_t)))), vk_zero_point);
3894*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i13 + 16)));
3895*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 432 * sizeof(uint8_t)))), vk_zero_point);
3896*4bdc9457SAndroid Build Coastguard Worker i13 += 32;
3897*4bdc9457SAndroid Build Coastguard Worker
3898*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
3899*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi13xGHIJKLMNOPQRSTUV, vk13xGHIJKLMNOPQRSTUV));
3900*4bdc9457SAndroid Build Coastguard Worker
3901*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i14));
3902*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 448 * sizeof(uint8_t)))), vk_zero_point);
3903*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i14 + 16)));
3904*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 464 * sizeof(uint8_t)))), vk_zero_point);
3905*4bdc9457SAndroid Build Coastguard Worker i14 += 32;
3906*4bdc9457SAndroid Build Coastguard Worker
3907*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
3908*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi14xGHIJKLMNOPQRSTUV, vk14xGHIJKLMNOPQRSTUV));
3909*4bdc9457SAndroid Build Coastguard Worker
3910*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i15));
3911*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 480 * sizeof(uint8_t)))), vk_zero_point);
3912*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i15 + 16)));
3913*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 496 * sizeof(uint8_t)))), vk_zero_point);
3914*4bdc9457SAndroid Build Coastguard Worker i15 += 32;
3915*4bdc9457SAndroid Build Coastguard Worker
3916*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
3917*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi15xGHIJKLMNOPQRSTUV, vk15xGHIJKLMNOPQRSTUV));
3918*4bdc9457SAndroid Build Coastguard Worker
3919*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i16));
3920*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 512 * sizeof(uint8_t)))), vk_zero_point);
3921*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i16 + 16)));
3922*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 528 * sizeof(uint8_t)))), vk_zero_point);
3923*4bdc9457SAndroid Build Coastguard Worker i16 += 32;
3924*4bdc9457SAndroid Build Coastguard Worker
3925*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
3926*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi16xGHIJKLMNOPQRSTUV, vk16xGHIJKLMNOPQRSTUV));
3927*4bdc9457SAndroid Build Coastguard Worker
3928*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i17));
3929*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 544 * sizeof(uint8_t)))), vk_zero_point);
3930*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i17 + 16)));
3931*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 560 * sizeof(uint8_t)))), vk_zero_point);
3932*4bdc9457SAndroid Build Coastguard Worker i17 += 32;
3933*4bdc9457SAndroid Build Coastguard Worker
3934*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
3935*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi17xGHIJKLMNOPQRSTUV, vk17xGHIJKLMNOPQRSTUV));
3936*4bdc9457SAndroid Build Coastguard Worker
3937*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i18));
3938*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 576 * sizeof(uint8_t)))), vk_zero_point);
3939*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i18 + 16)));
3940*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 592 * sizeof(uint8_t)))), vk_zero_point);
3941*4bdc9457SAndroid Build Coastguard Worker i18 += 32;
3942*4bdc9457SAndroid Build Coastguard Worker
3943*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
3944*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi18xGHIJKLMNOPQRSTUV, vk18xGHIJKLMNOPQRSTUV));
3945*4bdc9457SAndroid Build Coastguard Worker
3946*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i19));
3947*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 608 * sizeof(uint8_t)))), vk_zero_point);
3948*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i19 + 16)));
3949*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 624 * sizeof(uint8_t)))), vk_zero_point);
3950*4bdc9457SAndroid Build Coastguard Worker i19 += 32;
3951*4bdc9457SAndroid Build Coastguard Worker
3952*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
3953*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi19xGHIJKLMNOPQRSTUV, vk19xGHIJKLMNOPQRSTUV));
3954*4bdc9457SAndroid Build Coastguard Worker
3955*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i20));
3956*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 640 * sizeof(uint8_t)))), vk_zero_point);
3957*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i20 + 16)));
3958*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 656 * sizeof(uint8_t)))), vk_zero_point);
3959*4bdc9457SAndroid Build Coastguard Worker i20 += 32;
3960*4bdc9457SAndroid Build Coastguard Worker
3961*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
3962*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi20xGHIJKLMNOPQRSTUV, vk20xGHIJKLMNOPQRSTUV));
3963*4bdc9457SAndroid Build Coastguard Worker
3964*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i21));
3965*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 672 * sizeof(uint8_t)))), vk_zero_point);
3966*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i21 + 16)));
3967*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 688 * sizeof(uint8_t)))), vk_zero_point);
3968*4bdc9457SAndroid Build Coastguard Worker i21 += 32;
3969*4bdc9457SAndroid Build Coastguard Worker
3970*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
3971*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi21xGHIJKLMNOPQRSTUV, vk21xGHIJKLMNOPQRSTUV));
3972*4bdc9457SAndroid Build Coastguard Worker
3973*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i22));
3974*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 704 * sizeof(uint8_t)))), vk_zero_point);
3975*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i22 + 16)));
3976*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 720 * sizeof(uint8_t)))), vk_zero_point);
3977*4bdc9457SAndroid Build Coastguard Worker i22 += 32;
3978*4bdc9457SAndroid Build Coastguard Worker
3979*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
3980*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi22xGHIJKLMNOPQRSTUV, vk22xGHIJKLMNOPQRSTUV));
3981*4bdc9457SAndroid Build Coastguard Worker
3982*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i23));
3983*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 736 * sizeof(uint8_t)))), vk_zero_point);
3984*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i23 + 16)));
3985*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 752 * sizeof(uint8_t)))), vk_zero_point);
3986*4bdc9457SAndroid Build Coastguard Worker i23 += 32;
3987*4bdc9457SAndroid Build Coastguard Worker
3988*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
3989*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi23xGHIJKLMNOPQRSTUV, vk23xGHIJKLMNOPQRSTUV));
3990*4bdc9457SAndroid Build Coastguard Worker
3991*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i24));
3992*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 768 * sizeof(uint8_t)))), vk_zero_point);
3993*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i24 + 16)));
3994*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 784 * sizeof(uint8_t)))), vk_zero_point);
3995*4bdc9457SAndroid Build Coastguard Worker i24 += 32;
3996*4bdc9457SAndroid Build Coastguard Worker
3997*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
3998*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi24xGHIJKLMNOPQRSTUV, vk24xGHIJKLMNOPQRSTUV));
3999*4bdc9457SAndroid Build Coastguard Worker
4000*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(uint8_t));
4001*4bdc9457SAndroid Build Coastguard Worker
4002*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
4003*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
4004*4bdc9457SAndroid Build Coastguard Worker
4005*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
4006*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscale);
4007*4bdc9457SAndroid Build Coastguard Worker
4008*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
4009*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
4010*4bdc9457SAndroid Build Coastguard Worker
4011*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
4012*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
4013*4bdc9457SAndroid Build Coastguard Worker
4014*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
4015*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
4016*4bdc9457SAndroid Build Coastguard Worker
4017*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
4018*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
4019*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packus_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
4020*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
4021*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
4022*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
4023*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packus_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
4024*4bdc9457SAndroid Build Coastguard Worker
4025*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epu8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
4026*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epu8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
4027*4bdc9457SAndroid Build Coastguard Worker
4028*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
4029*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
4030*4bdc9457SAndroid Build Coastguard Worker output += 32;
4031*4bdc9457SAndroid Build Coastguard Worker }
4032*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
4033*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on c): the low (c & 15) bits are set.
4034*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
4035*4bdc9457SAndroid Build Coastguard Worker const uint8_t* k = (const uint8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
4036*4bdc9457SAndroid Build Coastguard Worker do {
4037*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
4038*4bdc9457SAndroid Build Coastguard Worker
4039*4bdc9457SAndroid Build Coastguard Worker
4040*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i0));
4041*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) k)), vk_zero_point);
4042*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
4043*4bdc9457SAndroid Build Coastguard Worker
4044*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
4045*4bdc9457SAndroid Build Coastguard Worker
4046*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i1));
4047*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 32))), vk_zero_point);
4048*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
4049*4bdc9457SAndroid Build Coastguard Worker
4050*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
4051*4bdc9457SAndroid Build Coastguard Worker
4052*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i2));
4053*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 64))), vk_zero_point);
4054*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
4055*4bdc9457SAndroid Build Coastguard Worker
4056*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
4057*4bdc9457SAndroid Build Coastguard Worker
4058*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i3));
4059*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 96))), vk_zero_point);
4060*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
4061*4bdc9457SAndroid Build Coastguard Worker
4062*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
4063*4bdc9457SAndroid Build Coastguard Worker
4064*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i4));
4065*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 128))), vk_zero_point);
4066*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
4067*4bdc9457SAndroid Build Coastguard Worker
4068*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
4069*4bdc9457SAndroid Build Coastguard Worker
4070*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i5));
4071*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 160))), vk_zero_point);
4072*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
4073*4bdc9457SAndroid Build Coastguard Worker
4074*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
4075*4bdc9457SAndroid Build Coastguard Worker
4076*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i6));
4077*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 192))), vk_zero_point);
4078*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
4079*4bdc9457SAndroid Build Coastguard Worker
4080*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
4081*4bdc9457SAndroid Build Coastguard Worker
4082*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i7));
4083*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 224))), vk_zero_point);
4084*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
4085*4bdc9457SAndroid Build Coastguard Worker
4086*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
4087*4bdc9457SAndroid Build Coastguard Worker
4088*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i8));
4089*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 256))), vk_zero_point);
4090*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
4091*4bdc9457SAndroid Build Coastguard Worker
4092*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
4093*4bdc9457SAndroid Build Coastguard Worker
4094*4bdc9457SAndroid Build Coastguard Worker const __m512i vi9x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i9));
4095*4bdc9457SAndroid Build Coastguard Worker const __m512i vk9x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 288))), vk_zero_point);
4096*4bdc9457SAndroid Build Coastguard Worker i9 += 16;
4097*4bdc9457SAndroid Build Coastguard Worker
4098*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF));
4099*4bdc9457SAndroid Build Coastguard Worker
4100*4bdc9457SAndroid Build Coastguard Worker const __m512i vi10x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i10));
4101*4bdc9457SAndroid Build Coastguard Worker const __m512i vk10x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 320))), vk_zero_point);
4102*4bdc9457SAndroid Build Coastguard Worker i10 += 16;
4103*4bdc9457SAndroid Build Coastguard Worker
4104*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF));
4105*4bdc9457SAndroid Build Coastguard Worker
4106*4bdc9457SAndroid Build Coastguard Worker const __m512i vi11x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i11));
4107*4bdc9457SAndroid Build Coastguard Worker const __m512i vk11x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 352))), vk_zero_point);
4108*4bdc9457SAndroid Build Coastguard Worker i11 += 16;
4109*4bdc9457SAndroid Build Coastguard Worker
4110*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF));
4111*4bdc9457SAndroid Build Coastguard Worker
4112*4bdc9457SAndroid Build Coastguard Worker const __m512i vi12x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i12));
4113*4bdc9457SAndroid Build Coastguard Worker const __m512i vk12x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 384))), vk_zero_point);
4114*4bdc9457SAndroid Build Coastguard Worker i12 += 16;
4115*4bdc9457SAndroid Build Coastguard Worker
4116*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF));
4117*4bdc9457SAndroid Build Coastguard Worker
4118*4bdc9457SAndroid Build Coastguard Worker const __m512i vi13x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i13));
4119*4bdc9457SAndroid Build Coastguard Worker const __m512i vk13x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 416))), vk_zero_point);
4120*4bdc9457SAndroid Build Coastguard Worker i13 += 16;
4121*4bdc9457SAndroid Build Coastguard Worker
4122*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF));
4123*4bdc9457SAndroid Build Coastguard Worker
4124*4bdc9457SAndroid Build Coastguard Worker const __m512i vi14x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i14));
4125*4bdc9457SAndroid Build Coastguard Worker const __m512i vk14x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 448))), vk_zero_point);
4126*4bdc9457SAndroid Build Coastguard Worker i14 += 16;
4127*4bdc9457SAndroid Build Coastguard Worker
4128*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF));
4129*4bdc9457SAndroid Build Coastguard Worker
4130*4bdc9457SAndroid Build Coastguard Worker const __m512i vi15x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i15));
4131*4bdc9457SAndroid Build Coastguard Worker const __m512i vk15x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 480))), vk_zero_point);
4132*4bdc9457SAndroid Build Coastguard Worker i15 += 16;
4133*4bdc9457SAndroid Build Coastguard Worker
4134*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF));
4135*4bdc9457SAndroid Build Coastguard Worker
4136*4bdc9457SAndroid Build Coastguard Worker const __m512i vi16x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i16));
4137*4bdc9457SAndroid Build Coastguard Worker const __m512i vk16x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 512))), vk_zero_point);
4138*4bdc9457SAndroid Build Coastguard Worker i16 += 16;
4139*4bdc9457SAndroid Build Coastguard Worker
4140*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF));
4141*4bdc9457SAndroid Build Coastguard Worker
4142*4bdc9457SAndroid Build Coastguard Worker const __m512i vi17x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i17));
4143*4bdc9457SAndroid Build Coastguard Worker const __m512i vk17x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 544))), vk_zero_point);
4144*4bdc9457SAndroid Build Coastguard Worker i17 += 16;
4145*4bdc9457SAndroid Build Coastguard Worker
4146*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF));
4147*4bdc9457SAndroid Build Coastguard Worker
4148*4bdc9457SAndroid Build Coastguard Worker const __m512i vi18x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i18));
4149*4bdc9457SAndroid Build Coastguard Worker const __m512i vk18x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 576))), vk_zero_point);
4150*4bdc9457SAndroid Build Coastguard Worker i18 += 16;
4151*4bdc9457SAndroid Build Coastguard Worker
4152*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF));
4153*4bdc9457SAndroid Build Coastguard Worker
4154*4bdc9457SAndroid Build Coastguard Worker const __m512i vi19x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i19));
4155*4bdc9457SAndroid Build Coastguard Worker const __m512i vk19x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 608))), vk_zero_point);
4156*4bdc9457SAndroid Build Coastguard Worker i19 += 16;
4157*4bdc9457SAndroid Build Coastguard Worker
4158*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF));
4159*4bdc9457SAndroid Build Coastguard Worker
4160*4bdc9457SAndroid Build Coastguard Worker const __m512i vi20x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i20));
4161*4bdc9457SAndroid Build Coastguard Worker const __m512i vk20x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 640))), vk_zero_point);
4162*4bdc9457SAndroid Build Coastguard Worker i20 += 16;
4163*4bdc9457SAndroid Build Coastguard Worker
4164*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF));
4165*4bdc9457SAndroid Build Coastguard Worker
4166*4bdc9457SAndroid Build Coastguard Worker const __m512i vi21x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i21));
4167*4bdc9457SAndroid Build Coastguard Worker const __m512i vk21x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 672))), vk_zero_point);
4168*4bdc9457SAndroid Build Coastguard Worker i21 += 16;
4169*4bdc9457SAndroid Build Coastguard Worker
4170*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF));
4171*4bdc9457SAndroid Build Coastguard Worker
4172*4bdc9457SAndroid Build Coastguard Worker const __m512i vi22x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i22));
4173*4bdc9457SAndroid Build Coastguard Worker const __m512i vk22x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 704))), vk_zero_point);
4174*4bdc9457SAndroid Build Coastguard Worker i22 += 16;
4175*4bdc9457SAndroid Build Coastguard Worker
4176*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF));
4177*4bdc9457SAndroid Build Coastguard Worker
4178*4bdc9457SAndroid Build Coastguard Worker const __m512i vi23x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i23));
4179*4bdc9457SAndroid Build Coastguard Worker const __m512i vk23x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 736))), vk_zero_point);
4180*4bdc9457SAndroid Build Coastguard Worker i23 += 16;
4181*4bdc9457SAndroid Build Coastguard Worker
4182*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF));
4183*4bdc9457SAndroid Build Coastguard Worker
4184*4bdc9457SAndroid Build Coastguard Worker const __m512i vi24x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i24));
4185*4bdc9457SAndroid Build Coastguard Worker const __m512i vk24x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 768))), vk_zero_point);
4186*4bdc9457SAndroid Build Coastguard Worker i24 += 16;
4187*4bdc9457SAndroid Build Coastguard Worker
4188*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF));
4189*4bdc9457SAndroid Build Coastguard Worker
4190*4bdc9457SAndroid Build Coastguard Worker k += 16;
4191*4bdc9457SAndroid Build Coastguard Worker
4192*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
4193*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
4194*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
4195*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
4196*4bdc9457SAndroid Build Coastguard Worker
4197*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
4198*4bdc9457SAndroid Build Coastguard Worker
4199*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
4200*4bdc9457SAndroid Build Coastguard Worker
4201*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
4202*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
4203*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
4204*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
4205*4bdc9457SAndroid Build Coastguard Worker
4206*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
4207*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4208*4bdc9457SAndroid Build Coastguard Worker output += 16;
4209*4bdc9457SAndroid Build Coastguard Worker c -= 16;
4210*4bdc9457SAndroid Build Coastguard Worker } else {
4211*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
4212*4bdc9457SAndroid Build Coastguard Worker output = (uint8_t*) ((uintptr_t) output + c);
4213*4bdc9457SAndroid Build Coastguard Worker c = 0;
4214*4bdc9457SAndroid Build Coastguard Worker }
4215*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
4216*4bdc9457SAndroid Build Coastguard Worker }
4217*4bdc9457SAndroid Build Coastguard Worker
4218*4bdc9457SAndroid Build Coastguard Worker output = (uint8_t*) ((uintptr_t) output + output_increment);
4219*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
4220*4bdc9457SAndroid Build Coastguard Worker }
4221*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4222*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32(
4223*4bdc9457SAndroid Build Coastguard Worker size_t channels,
4224*4bdc9457SAndroid Build Coastguard Worker size_t output_width,
4225*4bdc9457SAndroid Build Coastguard Worker const uint8_t** input,
4226*4bdc9457SAndroid Build Coastguard Worker const void* weights,
4227*4bdc9457SAndroid Build Coastguard Worker uint8_t* output,
4228*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
4229*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
4230*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
4231*4bdc9457SAndroid Build Coastguard Worker const uint8_t* zero,
4232*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_MSAN
4233*4bdc9457SAndroid Build Coastguard Worker {
4234*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
4235*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0);
4236*4bdc9457SAndroid Build Coastguard Worker
4237*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
4238*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
4239*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
4240*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_min);
4241*4bdc9457SAndroid Build Coastguard Worker const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
4242*4bdc9457SAndroid Build Coastguard Worker
4243*4bdc9457SAndroid Build Coastguard Worker const __m512i vk_zero_point = _mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i*) params->fp32_avx512.kernel_zero_point));
4244*4bdc9457SAndroid Build Coastguard Worker do {
4245*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i0 = input[0];
4246*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
4247*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
4248*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
4249*4bdc9457SAndroid Build Coastguard Worker }
4250*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i1 = input[1];
4251*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
4252*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
4253*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
4254*4bdc9457SAndroid Build Coastguard Worker }
4255*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i2 = input[2];
4256*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
4257*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
4258*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
4259*4bdc9457SAndroid Build Coastguard Worker }
4260*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i3 = input[3];
4261*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
4262*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
4263*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
4264*4bdc9457SAndroid Build Coastguard Worker }
4265*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i4 = input[4];
4266*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
4267*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
4268*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
4269*4bdc9457SAndroid Build Coastguard Worker }
4270*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i5 = input[5];
4271*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
4272*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
4273*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
4274*4bdc9457SAndroid Build Coastguard Worker }
4275*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i6 = input[6];
4276*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
4277*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
4278*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
4279*4bdc9457SAndroid Build Coastguard Worker }
4280*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i7 = input[7];
4281*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
4282*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
4283*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
4284*4bdc9457SAndroid Build Coastguard Worker }
4285*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i8 = input[8];
4286*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
4287*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
4288*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
4289*4bdc9457SAndroid Build Coastguard Worker }
4290*4bdc9457SAndroid Build Coastguard Worker input = (const uint8_t**) ((uintptr_t) input + input_stride);
4291*4bdc9457SAndroid Build Coastguard Worker
4292*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
4293*4bdc9457SAndroid Build Coastguard Worker const void* w = weights;
4294*4bdc9457SAndroid Build Coastguard Worker for (; c >= 32; c -= 32) {
4295*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
4296*4bdc9457SAndroid Build Coastguard Worker __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
4297*4bdc9457SAndroid Build Coastguard Worker
4298*4bdc9457SAndroid Build Coastguard Worker
4299*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i0));
4300*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
4301*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16)));
4302*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
4303*4bdc9457SAndroid Build Coastguard Worker i0 += 32;
4304*4bdc9457SAndroid Build Coastguard Worker
4305*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
4306*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV));
4307*4bdc9457SAndroid Build Coastguard Worker
4308*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i1));
4309*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
4310*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16)));
4311*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
4312*4bdc9457SAndroid Build Coastguard Worker i1 += 32;
4313*4bdc9457SAndroid Build Coastguard Worker
4314*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
4315*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV));
4316*4bdc9457SAndroid Build Coastguard Worker
4317*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i2));
4318*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
4319*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16)));
4320*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
4321*4bdc9457SAndroid Build Coastguard Worker i2 += 32;
4322*4bdc9457SAndroid Build Coastguard Worker
4323*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
4324*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV));
4325*4bdc9457SAndroid Build Coastguard Worker
4326*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i3));
4327*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
4328*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16)));
4329*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
4330*4bdc9457SAndroid Build Coastguard Worker i3 += 32;
4331*4bdc9457SAndroid Build Coastguard Worker
4332*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
4333*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV));
4334*4bdc9457SAndroid Build Coastguard Worker
4335*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i4));
4336*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
4337*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16)));
4338*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
4339*4bdc9457SAndroid Build Coastguard Worker i4 += 32;
4340*4bdc9457SAndroid Build Coastguard Worker
4341*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
4342*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV));
4343*4bdc9457SAndroid Build Coastguard Worker
4344*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i5));
4345*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
4346*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16)));
4347*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
4348*4bdc9457SAndroid Build Coastguard Worker i5 += 32;
4349*4bdc9457SAndroid Build Coastguard Worker
4350*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
4351*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV));
4352*4bdc9457SAndroid Build Coastguard Worker
4353*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i6));
4354*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
4355*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16)));
4356*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
4357*4bdc9457SAndroid Build Coastguard Worker i6 += 32;
4358*4bdc9457SAndroid Build Coastguard Worker
4359*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
4360*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV));
4361*4bdc9457SAndroid Build Coastguard Worker
4362*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i7));
4363*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
4364*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16)));
4365*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
4366*4bdc9457SAndroid Build Coastguard Worker i7 += 32;
4367*4bdc9457SAndroid Build Coastguard Worker
4368*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
4369*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV));
4370*4bdc9457SAndroid Build Coastguard Worker
4371*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i8));
4372*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
4373*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16)));
4374*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
4375*4bdc9457SAndroid Build Coastguard Worker i8 += 32;
4376*4bdc9457SAndroid Build Coastguard Worker
4377*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
4378*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV));
4379*4bdc9457SAndroid Build Coastguard Worker
4380*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(uint8_t));
4381*4bdc9457SAndroid Build Coastguard Worker
4382*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
4383*4bdc9457SAndroid Build Coastguard Worker __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV);
4384*4bdc9457SAndroid Build Coastguard Worker
4385*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
4386*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscale);
4387*4bdc9457SAndroid Build Coastguard Worker
4388*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
4389*4bdc9457SAndroid Build Coastguard Worker vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point);
4390*4bdc9457SAndroid Build Coastguard Worker
4391*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
4392*4bdc9457SAndroid Build Coastguard Worker vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV);
4393*4bdc9457SAndroid Build Coastguard Worker
4394*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
4395*4bdc9457SAndroid Build Coastguard Worker __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point));
4396*4bdc9457SAndroid Build Coastguard Worker
4397*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV);
4398*4bdc9457SAndroid Build Coastguard Worker const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1);
4399*4bdc9457SAndroid Build Coastguard Worker const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packus_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV);
4400*4bdc9457SAndroid Build Coastguard Worker __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask);
4401*4bdc9457SAndroid Build Coastguard Worker const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV);
4402*4bdc9457SAndroid Build Coastguard Worker const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1);
4403*4bdc9457SAndroid Build Coastguard Worker __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packus_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0));
4404*4bdc9457SAndroid Build Coastguard Worker
4405*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epu8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min);
4406*4bdc9457SAndroid Build Coastguard Worker voutGHIJKLMNOPQRSTUV = _mm_max_epu8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min));
4407*4bdc9457SAndroid Build Coastguard Worker
4408*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV);
4409*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
4410*4bdc9457SAndroid Build Coastguard Worker output += 32;
4411*4bdc9457SAndroid Build Coastguard Worker }
4412*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
4413*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
4414*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1)));
4415*4bdc9457SAndroid Build Coastguard Worker const uint8_t* k = (const uint8_t*) ((uintptr_t) w + 32 * sizeof(int32_t));
4416*4bdc9457SAndroid Build Coastguard Worker do {
4417*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w);
4418*4bdc9457SAndroid Build Coastguard Worker
4419*4bdc9457SAndroid Build Coastguard Worker
4420*4bdc9457SAndroid Build Coastguard Worker const __m512i vi0x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i0));
4421*4bdc9457SAndroid Build Coastguard Worker const __m512i vk0x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) k)), vk_zero_point);
4422*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
4423*4bdc9457SAndroid Build Coastguard Worker
4424*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF));
4425*4bdc9457SAndroid Build Coastguard Worker
4426*4bdc9457SAndroid Build Coastguard Worker const __m512i vi1x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i1));
4427*4bdc9457SAndroid Build Coastguard Worker const __m512i vk1x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 32))), vk_zero_point);
4428*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
4429*4bdc9457SAndroid Build Coastguard Worker
4430*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF));
4431*4bdc9457SAndroid Build Coastguard Worker
4432*4bdc9457SAndroid Build Coastguard Worker const __m512i vi2x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i2));
4433*4bdc9457SAndroid Build Coastguard Worker const __m512i vk2x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 64))), vk_zero_point);
4434*4bdc9457SAndroid Build Coastguard Worker i2 += 16;
4435*4bdc9457SAndroid Build Coastguard Worker
4436*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF));
4437*4bdc9457SAndroid Build Coastguard Worker
4438*4bdc9457SAndroid Build Coastguard Worker const __m512i vi3x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i3));
4439*4bdc9457SAndroid Build Coastguard Worker const __m512i vk3x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 96))), vk_zero_point);
4440*4bdc9457SAndroid Build Coastguard Worker i3 += 16;
4441*4bdc9457SAndroid Build Coastguard Worker
4442*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF));
4443*4bdc9457SAndroid Build Coastguard Worker
4444*4bdc9457SAndroid Build Coastguard Worker const __m512i vi4x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i4));
4445*4bdc9457SAndroid Build Coastguard Worker const __m512i vk4x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 128))), vk_zero_point);
4446*4bdc9457SAndroid Build Coastguard Worker i4 += 16;
4447*4bdc9457SAndroid Build Coastguard Worker
4448*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF));
4449*4bdc9457SAndroid Build Coastguard Worker
4450*4bdc9457SAndroid Build Coastguard Worker const __m512i vi5x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i5));
4451*4bdc9457SAndroid Build Coastguard Worker const __m512i vk5x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 160))), vk_zero_point);
4452*4bdc9457SAndroid Build Coastguard Worker i5 += 16;
4453*4bdc9457SAndroid Build Coastguard Worker
4454*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF));
4455*4bdc9457SAndroid Build Coastguard Worker
4456*4bdc9457SAndroid Build Coastguard Worker const __m512i vi6x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i6));
4457*4bdc9457SAndroid Build Coastguard Worker const __m512i vk6x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 192))), vk_zero_point);
4458*4bdc9457SAndroid Build Coastguard Worker i6 += 16;
4459*4bdc9457SAndroid Build Coastguard Worker
4460*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF));
4461*4bdc9457SAndroid Build Coastguard Worker
4462*4bdc9457SAndroid Build Coastguard Worker const __m512i vi7x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i7));
4463*4bdc9457SAndroid Build Coastguard Worker const __m512i vk7x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 224))), vk_zero_point);
4464*4bdc9457SAndroid Build Coastguard Worker i7 += 16;
4465*4bdc9457SAndroid Build Coastguard Worker
4466*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF));
4467*4bdc9457SAndroid Build Coastguard Worker
4468*4bdc9457SAndroid Build Coastguard Worker const __m512i vi8x0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) i8));
4469*4bdc9457SAndroid Build Coastguard Worker const __m512i vk8x0123456789ABCDEF = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (k + 256))), vk_zero_point);
4470*4bdc9457SAndroid Build Coastguard Worker i8 += 16;
4471*4bdc9457SAndroid Build Coastguard Worker
4472*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF));
4473*4bdc9457SAndroid Build Coastguard Worker
4474*4bdc9457SAndroid Build Coastguard Worker k += 16;
4475*4bdc9457SAndroid Build Coastguard Worker
4476*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF);
4477*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale);
4478*4bdc9457SAndroid Build Coastguard Worker vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point);
4479*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF);
4480*4bdc9457SAndroid Build Coastguard Worker
4481*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
4482*4bdc9457SAndroid Build Coastguard Worker
4483*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
4484*4bdc9457SAndroid Build Coastguard Worker
4485*4bdc9457SAndroid Build Coastguard Worker const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF);
4486*4bdc9457SAndroid Build Coastguard Worker const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1);
4487*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0));
4488*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min));
4489*4bdc9457SAndroid Build Coastguard Worker
4490*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(c >= 16) {
4491*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4492*4bdc9457SAndroid Build Coastguard Worker output += 16;
4493*4bdc9457SAndroid Build Coastguard Worker c -= 16;
4494*4bdc9457SAndroid Build Coastguard Worker } else {
4495*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
4496*4bdc9457SAndroid Build Coastguard Worker output = (uint8_t*) ((uintptr_t) output + c);
4497*4bdc9457SAndroid Build Coastguard Worker c = 0;
4498*4bdc9457SAndroid Build Coastguard Worker }
4499*4bdc9457SAndroid Build Coastguard Worker } while (c != 0);
4500*4bdc9457SAndroid Build Coastguard Worker }
4501*4bdc9457SAndroid Build Coastguard Worker
4502*4bdc9457SAndroid Build Coastguard Worker output = (uint8_t*) ((uintptr_t) output + output_increment);
4503*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0);
4504*4bdc9457SAndroid Build Coastguard Worker }
4505*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_f32_vcvt_ukernel__avx512skx_x32(size_t n,const uint8_t * x,float * y,const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])4506*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_f32_vcvt_ukernel__avx512skx_x32(
4507*4bdc9457SAndroid Build Coastguard Worker size_t n,
4508*4bdc9457SAndroid Build Coastguard Worker const uint8_t* x,
4509*4bdc9457SAndroid Build Coastguard Worker float* y,
4510*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4511*4bdc9457SAndroid Build Coastguard Worker {
4512*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
4513*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint8_t) == 0);
4514*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
4515*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
4516*4bdc9457SAndroid Build Coastguard Worker
4517*4bdc9457SAndroid Build Coastguard Worker const __m512i vminus_zero_point = _mm512_load_si512(params->avx512.minus_zero_point);
4518*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->avx512.scale);
4519*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
4520*4bdc9457SAndroid Build Coastguard Worker __m512i vx0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) x));
4521*4bdc9457SAndroid Build Coastguard Worker __m512i vxGHIJKLMNOPQRSTUV = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) (x + 16)));
4522*4bdc9457SAndroid Build Coastguard Worker x += 32;
4523*4bdc9457SAndroid Build Coastguard Worker
4524*4bdc9457SAndroid Build Coastguard Worker vx0123456789ABCDEF = _mm512_add_epi32(vx0123456789ABCDEF, vminus_zero_point);
4525*4bdc9457SAndroid Build Coastguard Worker vxGHIJKLMNOPQRSTUV = _mm512_add_epi32(vxGHIJKLMNOPQRSTUV, vminus_zero_point);
4526*4bdc9457SAndroid Build Coastguard Worker
4527*4bdc9457SAndroid Build Coastguard Worker __m512 vy0123456789ABCDEF = _mm512_cvtepi32_ps(vx0123456789ABCDEF);
4528*4bdc9457SAndroid Build Coastguard Worker __m512 vyGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vxGHIJKLMNOPQRSTUV);
4529*4bdc9457SAndroid Build Coastguard Worker
4530*4bdc9457SAndroid Build Coastguard Worker vy0123456789ABCDEF = _mm512_mul_ps(vy0123456789ABCDEF, vscale);
4531*4bdc9457SAndroid Build Coastguard Worker vyGHIJKLMNOPQRSTUV = _mm512_mul_ps(vyGHIJKLMNOPQRSTUV, vscale);
4532*4bdc9457SAndroid Build Coastguard Worker
4533*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y, vy0123456789ABCDEF);
4534*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y + 16, vyGHIJKLMNOPQRSTUV);
4535*4bdc9457SAndroid Build Coastguard Worker y += 32;
4536*4bdc9457SAndroid Build Coastguard Worker }
4537*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
4538*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) x));
4539*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_add_epi32(vx, vminus_zero_point);
4540*4bdc9457SAndroid Build Coastguard Worker x += 16;
4541*4bdc9457SAndroid Build Coastguard Worker
4542*4bdc9457SAndroid Build Coastguard Worker __m512 vy = _mm512_cvtepi32_ps(vx);
4543*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_mul_ps(vy, vscale);
4544*4bdc9457SAndroid Build Coastguard Worker
4545*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_ps(y, vy);
4546*4bdc9457SAndroid Build Coastguard Worker y += 16;
4547*4bdc9457SAndroid Build Coastguard Worker }
4548*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
4549*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint8_t));
4550*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(uint8_t));
4551*4bdc9457SAndroid Build Coastguard Worker
4552*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid elements (depends on n).
4553*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
4554*4bdc9457SAndroid Build Coastguard Worker
4555*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(vmask, x));
4556*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_add_epi32(vx, vminus_zero_point);
4557*4bdc9457SAndroid Build Coastguard Worker
4558*4bdc9457SAndroid Build Coastguard Worker __m512 vy = _mm512_cvtepi32_ps(vx);
4559*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_mul_ps(vy, vscale);
4560*4bdc9457SAndroid Build Coastguard Worker
4561*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_ps(y, vmask, vy);
4562*4bdc9457SAndroid Build Coastguard Worker }
4563*4bdc9457SAndroid Build Coastguard Worker }
4564*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const uint8_t * restrict a,size_t a_stride,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4565*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx(
4566*4bdc9457SAndroid Build Coastguard Worker size_t mr,
4567*4bdc9457SAndroid Build Coastguard Worker size_t nc,
4568*4bdc9457SAndroid Build Coastguard Worker size_t kc,
4569*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a,
4570*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
4571*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
4572*4bdc9457SAndroid Build Coastguard Worker uint8_t* restrict c,
4573*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
4574*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
4575*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4576*4bdc9457SAndroid Build Coastguard Worker {
4577*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
4578*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
4579*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
4580*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
4581*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(uint8_t) == 0);
4582*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
4583*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
4584*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
4585*4bdc9457SAndroid Build Coastguard Worker
4586*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
4587*4bdc9457SAndroid Build Coastguard Worker const uint8_t* a0 = a;
4588*4bdc9457SAndroid Build Coastguard Worker uint8_t* c0 = c;
4589*4bdc9457SAndroid Build Coastguard Worker
4590*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
4591*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
4592*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
4593*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
4594*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
4595*4bdc9457SAndroid Build Coastguard Worker do {
4596*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
4597*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
4598*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
4599*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
4600*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
4601*4bdc9457SAndroid Build Coastguard Worker
4602*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
4603*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_zero_point = _mm512_load_si512(params->fp32_avx512.kernel_zero_point);
4604*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
4605*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
4606*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
4607*4bdc9457SAndroid Build Coastguard Worker
4608*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) w)), vb_zero_point);
4609*4bdc9457SAndroid Build Coastguard Worker
4610*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
4611*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 32))), vb_zero_point);
4612*4bdc9457SAndroid Build Coastguard Worker
4613*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
4614*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 64))), vb_zero_point);
4615*4bdc9457SAndroid Build Coastguard Worker
4616*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
4617*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 96))), vb_zero_point);
4618*4bdc9457SAndroid Build Coastguard Worker
4619*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
4620*4bdc9457SAndroid Build Coastguard Worker
4621*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const uint8_t*) w + 128);
4622*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(uint8_t);
4623*4bdc9457SAndroid Build Coastguard Worker }
4624*4bdc9457SAndroid Build Coastguard Worker
4625*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
4626*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
4627*4bdc9457SAndroid Build Coastguard Worker
4628*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
4629*4bdc9457SAndroid Build Coastguard Worker
4630*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
4631*4bdc9457SAndroid Build Coastguard Worker
4632*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
4633*4bdc9457SAndroid Build Coastguard Worker
4634*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
4635*4bdc9457SAndroid Build Coastguard Worker
4636*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
4637*4bdc9457SAndroid Build Coastguard Worker
4638*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
4639*4bdc9457SAndroid Build Coastguard Worker
4640*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packus_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
4641*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
4642*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epu8(vout0x0123456789ABCDEF, voutput_min);
4643*4bdc9457SAndroid Build Coastguard Worker
4644*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
4645*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
4646*4bdc9457SAndroid Build Coastguard Worker
4647*4bdc9457SAndroid Build Coastguard Worker a0 = (const uint8_t*) ((uintptr_t) a0 - k);
4648*4bdc9457SAndroid Build Coastguard Worker
4649*4bdc9457SAndroid Build Coastguard Worker c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
4650*4bdc9457SAndroid Build Coastguard Worker
4651*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
4652*4bdc9457SAndroid Build Coastguard Worker } else {
4653*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
4654*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
4655*4bdc9457SAndroid Build Coastguard Worker
4656*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
4657*4bdc9457SAndroid Build Coastguard Worker
4658*4bdc9457SAndroid Build Coastguard Worker nc = 0;
4659*4bdc9457SAndroid Build Coastguard Worker }
4660*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
4661*4bdc9457SAndroid Build Coastguard Worker }
4662*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,const uint8_t * restrict a,size_t a_stride,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4663*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx(
4664*4bdc9457SAndroid Build Coastguard Worker size_t mr,
4665*4bdc9457SAndroid Build Coastguard Worker size_t nc,
4666*4bdc9457SAndroid Build Coastguard Worker size_t kc,
4667*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a,
4668*4bdc9457SAndroid Build Coastguard Worker size_t a_stride,
4669*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
4670*4bdc9457SAndroid Build Coastguard Worker uint8_t* restrict c,
4671*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
4672*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
4673*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4674*4bdc9457SAndroid Build Coastguard Worker {
4675*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
4676*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
4677*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
4678*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
4679*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(uint8_t) == 0);
4680*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
4681*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
4682*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
4683*4bdc9457SAndroid Build Coastguard Worker
4684*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
4685*4bdc9457SAndroid Build Coastguard Worker const uint8_t* a0 = a;
4686*4bdc9457SAndroid Build Coastguard Worker uint8_t* c0 = c;
4687*4bdc9457SAndroid Build Coastguard Worker const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
4688*4bdc9457SAndroid Build Coastguard Worker uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
4689*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
4690*4bdc9457SAndroid Build Coastguard Worker a1 = a0;
4691*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
4692*4bdc9457SAndroid Build Coastguard Worker }
4693*4bdc9457SAndroid Build Coastguard Worker const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
4694*4bdc9457SAndroid Build Coastguard Worker uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
4695*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
4696*4bdc9457SAndroid Build Coastguard Worker a2 = a1;
4697*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
4698*4bdc9457SAndroid Build Coastguard Worker }
4699*4bdc9457SAndroid Build Coastguard Worker const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
4700*4bdc9457SAndroid Build Coastguard Worker uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
4701*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
4702*4bdc9457SAndroid Build Coastguard Worker a3 = a2;
4703*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
4704*4bdc9457SAndroid Build Coastguard Worker }
4705*4bdc9457SAndroid Build Coastguard Worker
4706*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
4707*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
4708*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
4709*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
4710*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
4711*4bdc9457SAndroid Build Coastguard Worker do {
4712*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
4713*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
4714*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
4715*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
4716*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
4717*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
4718*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
4719*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
4720*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
4721*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
4722*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
4723*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
4724*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
4725*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
4726*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
4727*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
4728*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
4729*4bdc9457SAndroid Build Coastguard Worker
4730*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
4731*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_zero_point = _mm512_load_si512(params->fp32_avx512.kernel_zero_point);
4732*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
4733*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
4734*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
4735*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
4736*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
4737*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
4738*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
4739*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
4740*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
4741*4bdc9457SAndroid Build Coastguard Worker
4742*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) w)), vb_zero_point);
4743*4bdc9457SAndroid Build Coastguard Worker
4744*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
4745*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
4746*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
4747*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
4748*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 32))), vb_zero_point);
4749*4bdc9457SAndroid Build Coastguard Worker
4750*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
4751*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
4752*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
4753*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
4754*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 64))), vb_zero_point);
4755*4bdc9457SAndroid Build Coastguard Worker
4756*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
4757*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
4758*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
4759*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
4760*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 96))), vb_zero_point);
4761*4bdc9457SAndroid Build Coastguard Worker
4762*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
4763*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
4764*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
4765*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
4766*4bdc9457SAndroid Build Coastguard Worker
4767*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const uint8_t*) w + 128);
4768*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(uint8_t);
4769*4bdc9457SAndroid Build Coastguard Worker }
4770*4bdc9457SAndroid Build Coastguard Worker
4771*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
4772*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
4773*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
4774*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
4775*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
4776*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
4777*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
4778*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
4779*4bdc9457SAndroid Build Coastguard Worker
4780*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
4781*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
4782*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
4783*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
4784*4bdc9457SAndroid Build Coastguard Worker
4785*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
4786*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
4787*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
4788*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
4789*4bdc9457SAndroid Build Coastguard Worker
4790*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
4791*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale);
4792*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale);
4793*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale);
4794*4bdc9457SAndroid Build Coastguard Worker
4795*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
4796*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
4797*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
4798*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
4799*4bdc9457SAndroid Build Coastguard Worker
4800*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
4801*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
4802*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
4803*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
4804*4bdc9457SAndroid Build Coastguard Worker
4805*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
4806*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
4807*4bdc9457SAndroid Build Coastguard Worker
4808*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packus_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
4809*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
4810*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
4811*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epu8(vout0123x0123456789ABCDEF, voutput_min);
4812*4bdc9457SAndroid Build Coastguard Worker
4813*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
4814*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
4815*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
4816*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
4817*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
4818*4bdc9457SAndroid Build Coastguard Worker
4819*4bdc9457SAndroid Build Coastguard Worker a0 = (const uint8_t*) ((uintptr_t) a0 - k);
4820*4bdc9457SAndroid Build Coastguard Worker a1 = (const uint8_t*) ((uintptr_t) a1 - k);
4821*4bdc9457SAndroid Build Coastguard Worker a2 = (const uint8_t*) ((uintptr_t) a2 - k);
4822*4bdc9457SAndroid Build Coastguard Worker a3 = (const uint8_t*) ((uintptr_t) a3 - k);
4823*4bdc9457SAndroid Build Coastguard Worker
4824*4bdc9457SAndroid Build Coastguard Worker c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
4825*4bdc9457SAndroid Build Coastguard Worker c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
4826*4bdc9457SAndroid Build Coastguard Worker c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
4827*4bdc9457SAndroid Build Coastguard Worker c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
4828*4bdc9457SAndroid Build Coastguard Worker
4829*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
4830*4bdc9457SAndroid Build Coastguard Worker } else {
4831*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
4832*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
4833*4bdc9457SAndroid Build Coastguard Worker
4834*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
4835*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
4836*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
4837*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
4838*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
4839*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftli_mask64(vmask, 16);
4840*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
4841*4bdc9457SAndroid Build Coastguard Worker
4842*4bdc9457SAndroid Build Coastguard Worker nc = 0;
4843*4bdc9457SAndroid Build Coastguard Worker }
4844*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
4845*4bdc9457SAndroid Build Coastguard Worker }
4846*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const uint8_t ** restrict a,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4847*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx(
4848*4bdc9457SAndroid Build Coastguard Worker size_t mr,
4849*4bdc9457SAndroid Build Coastguard Worker size_t nc,
4850*4bdc9457SAndroid Build Coastguard Worker size_t kc,
4851*4bdc9457SAndroid Build Coastguard Worker size_t ks,
4852*4bdc9457SAndroid Build Coastguard Worker const uint8_t** restrict a,
4853*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
4854*4bdc9457SAndroid Build Coastguard Worker uint8_t* restrict c,
4855*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
4856*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
4857*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
4858*4bdc9457SAndroid Build Coastguard Worker const uint8_t* zero,
4859*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4860*4bdc9457SAndroid Build Coastguard Worker {
4861*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
4862*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 1);
4863*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
4864*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
4865*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(uint8_t) == 0);
4866*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
4867*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
4868*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
4869*4bdc9457SAndroid Build Coastguard Worker
4870*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
4871*4bdc9457SAndroid Build Coastguard Worker uint8_t* c0 = c;
4872*4bdc9457SAndroid Build Coastguard Worker
4873*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
4874*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
4875*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
4876*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx512.output_zero_point);
4877*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx512.output_min);
4878*4bdc9457SAndroid Build Coastguard Worker do {
4879*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
4880*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
4881*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
4882*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
4883*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
4884*4bdc9457SAndroid Build Coastguard Worker
4885*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
4886*4bdc9457SAndroid Build Coastguard Worker do {
4887*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a0 = a[0];
4888*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
4889*4bdc9457SAndroid Build Coastguard Worker a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
4890*4bdc9457SAndroid Build Coastguard Worker }
4891*4bdc9457SAndroid Build Coastguard Worker a += 1;
4892*4bdc9457SAndroid Build Coastguard Worker
4893*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
4894*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_zero_point = _mm512_load_si512(params->fp32_avx512.kernel_zero_point);
4895*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
4896*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
4897*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
4898*4bdc9457SAndroid Build Coastguard Worker
4899*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) w)), vb_zero_point);
4900*4bdc9457SAndroid Build Coastguard Worker
4901*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
4902*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 32))), vb_zero_point);
4903*4bdc9457SAndroid Build Coastguard Worker
4904*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
4905*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 64))), vb_zero_point);
4906*4bdc9457SAndroid Build Coastguard Worker
4907*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
4908*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 96))), vb_zero_point);
4909*4bdc9457SAndroid Build Coastguard Worker
4910*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
4911*4bdc9457SAndroid Build Coastguard Worker
4912*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const uint8_t*) w + 128);
4913*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(uint8_t);
4914*4bdc9457SAndroid Build Coastguard Worker }
4915*4bdc9457SAndroid Build Coastguard Worker p -= 1 * sizeof(void*);
4916*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
4917*4bdc9457SAndroid Build Coastguard Worker
4918*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
4919*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
4920*4bdc9457SAndroid Build Coastguard Worker
4921*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
4922*4bdc9457SAndroid Build Coastguard Worker
4923*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
4924*4bdc9457SAndroid Build Coastguard Worker
4925*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
4926*4bdc9457SAndroid Build Coastguard Worker
4927*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
4928*4bdc9457SAndroid Build Coastguard Worker
4929*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
4930*4bdc9457SAndroid Build Coastguard Worker
4931*4bdc9457SAndroid Build Coastguard Worker const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
4932*4bdc9457SAndroid Build Coastguard Worker
4933*4bdc9457SAndroid Build Coastguard Worker const __m128i vout0x084C2A6E195D3B7F = _mm_packus_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
4934*4bdc9457SAndroid Build Coastguard Worker __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
4935*4bdc9457SAndroid Build Coastguard Worker vout0x0123456789ABCDEF = _mm_max_epu8(vout0x0123456789ABCDEF, voutput_min);
4936*4bdc9457SAndroid Build Coastguard Worker
4937*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
4938*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
4939*4bdc9457SAndroid Build Coastguard Worker
4940*4bdc9457SAndroid Build Coastguard Worker c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
4941*4bdc9457SAndroid Build Coastguard Worker
4942*4bdc9457SAndroid Build Coastguard Worker a = (const uint8_t**restrict) ((uintptr_t) a - ks);
4943*4bdc9457SAndroid Build Coastguard Worker
4944*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
4945*4bdc9457SAndroid Build Coastguard Worker } else {
4946*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
4947*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT32_C(1) << nc) - UINT32_C(1)));
4948*4bdc9457SAndroid Build Coastguard Worker
4949*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF);
4950*4bdc9457SAndroid Build Coastguard Worker
4951*4bdc9457SAndroid Build Coastguard Worker nc = 0;
4952*4bdc9457SAndroid Build Coastguard Worker }
4953*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
4954*4bdc9457SAndroid Build Coastguard Worker }
4955*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(size_t mr,size_t nc,size_t kc,size_t ks,const uint8_t ** restrict a,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4956*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx(
4957*4bdc9457SAndroid Build Coastguard Worker size_t mr,
4958*4bdc9457SAndroid Build Coastguard Worker size_t nc,
4959*4bdc9457SAndroid Build Coastguard Worker size_t kc,
4960*4bdc9457SAndroid Build Coastguard Worker size_t ks,
4961*4bdc9457SAndroid Build Coastguard Worker const uint8_t** restrict a,
4962*4bdc9457SAndroid Build Coastguard Worker const void* restrict w,
4963*4bdc9457SAndroid Build Coastguard Worker uint8_t* restrict c,
4964*4bdc9457SAndroid Build Coastguard Worker size_t cm_stride,
4965*4bdc9457SAndroid Build Coastguard Worker size_t cn_stride,
4966*4bdc9457SAndroid Build Coastguard Worker size_t a_offset,
4967*4bdc9457SAndroid Build Coastguard Worker const uint8_t* zero,
4968*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4969*4bdc9457SAndroid Build Coastguard Worker {
4970*4bdc9457SAndroid Build Coastguard Worker assert(mr != 0);
4971*4bdc9457SAndroid Build Coastguard Worker assert(mr <= 4);
4972*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
4973*4bdc9457SAndroid Build Coastguard Worker assert(kc != 0);
4974*4bdc9457SAndroid Build Coastguard Worker assert(kc % sizeof(uint8_t) == 0);
4975*4bdc9457SAndroid Build Coastguard Worker assert(a != NULL);
4976*4bdc9457SAndroid Build Coastguard Worker assert(w != NULL);
4977*4bdc9457SAndroid Build Coastguard Worker assert(c != NULL);
4978*4bdc9457SAndroid Build Coastguard Worker
4979*4bdc9457SAndroid Build Coastguard Worker kc = round_up_po2(kc, 8);
4980*4bdc9457SAndroid Build Coastguard Worker uint8_t* c0 = c;
4981*4bdc9457SAndroid Build Coastguard Worker uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
4982*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr < 2) {
4983*4bdc9457SAndroid Build Coastguard Worker c1 = c0;
4984*4bdc9457SAndroid Build Coastguard Worker }
4985*4bdc9457SAndroid Build Coastguard Worker uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
4986*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr <= 2) {
4987*4bdc9457SAndroid Build Coastguard Worker c2 = c1;
4988*4bdc9457SAndroid Build Coastguard Worker }
4989*4bdc9457SAndroid Build Coastguard Worker uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
4990*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(mr != 4) {
4991*4bdc9457SAndroid Build Coastguard Worker c3 = c2;
4992*4bdc9457SAndroid Build Coastguard Worker }
4993*4bdc9457SAndroid Build Coastguard Worker
4994*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
4995*4bdc9457SAndroid Build Coastguard Worker const __m512 vscale = _mm512_load_ps(params->fp32_avx512.scale);
4996*4bdc9457SAndroid Build Coastguard Worker const __m512 voutput_max_less_zero_point = _mm512_load_ps(params->fp32_avx512.output_max_less_zero_point);
4997*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_zero_point = _mm512_load_si512(params->fp32_avx512.output_zero_point);
4998*4bdc9457SAndroid Build Coastguard Worker const __m512i voutput_min = _mm512_load_si512(params->fp32_avx512.output_min);
4999*4bdc9457SAndroid Build Coastguard Worker do {
5000*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
5001*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 4));
5002*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x89AB = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 8));
5003*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0xCDEF = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((const int32_t*) w + 12));
5004*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x0123 = vacc0x0123;
5005*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x4567 = vacc0x4567;
5006*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x89AB = vacc0x89AB;
5007*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1xCDEF = vacc0xCDEF;
5008*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x0123 = vacc0x0123;
5009*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x4567 = vacc0x4567;
5010*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x89AB = vacc0x89AB;
5011*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2xCDEF = vacc0xCDEF;
5012*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x0123 = vacc0x0123;
5013*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x4567 = vacc0x4567;
5014*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x89AB = vacc0x89AB;
5015*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3xCDEF = vacc0xCDEF;
5016*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const int32_t*) w + 16);
5017*4bdc9457SAndroid Build Coastguard Worker
5018*4bdc9457SAndroid Build Coastguard Worker size_t p = ks;
5019*4bdc9457SAndroid Build Coastguard Worker do {
5020*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a0 = a[0];
5021*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a0 != zero) {
5022*4bdc9457SAndroid Build Coastguard Worker a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
5023*4bdc9457SAndroid Build Coastguard Worker }
5024*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a1 = a[1];
5025*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a1 != zero) {
5026*4bdc9457SAndroid Build Coastguard Worker a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
5027*4bdc9457SAndroid Build Coastguard Worker }
5028*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a2 = a[2];
5029*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a2 != zero) {
5030*4bdc9457SAndroid Build Coastguard Worker a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
5031*4bdc9457SAndroid Build Coastguard Worker }
5032*4bdc9457SAndroid Build Coastguard Worker const uint8_t* restrict a3 = a[3];
5033*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(a3 != zero) {
5034*4bdc9457SAndroid Build Coastguard Worker a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
5035*4bdc9457SAndroid Build Coastguard Worker }
5036*4bdc9457SAndroid Build Coastguard Worker a += 4;
5037*4bdc9457SAndroid Build Coastguard Worker
5038*4bdc9457SAndroid Build Coastguard Worker size_t k = 0;
5039*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_zero_point = _mm512_load_si512(params->fp32_avx512.kernel_zero_point);
5040*4bdc9457SAndroid Build Coastguard Worker while (k < kc) {
5041*4bdc9457SAndroid Build Coastguard Worker const __m512i va0 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)));
5042*4bdc9457SAndroid Build Coastguard Worker a0 += 8;
5043*4bdc9457SAndroid Build Coastguard Worker const __m512i va1 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a1)));
5044*4bdc9457SAndroid Build Coastguard Worker a1 += 8;
5045*4bdc9457SAndroid Build Coastguard Worker const __m512i va2 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a2)));
5046*4bdc9457SAndroid Build Coastguard Worker a2 += 8;
5047*4bdc9457SAndroid Build Coastguard Worker const __m512i va3 = _mm512_broadcast_i32x4(_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a3)));
5048*4bdc9457SAndroid Build Coastguard Worker a3 += 8;
5049*4bdc9457SAndroid Build Coastguard Worker
5050*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) w)), vb_zero_point);
5051*4bdc9457SAndroid Build Coastguard Worker
5052*4bdc9457SAndroid Build Coastguard Worker vacc0x0123 = _mm512_add_epi32(vacc0x0123, _mm512_madd_epi16(va0, vb0123));
5053*4bdc9457SAndroid Build Coastguard Worker vacc1x0123 = _mm512_add_epi32(vacc1x0123, _mm512_madd_epi16(va1, vb0123));
5054*4bdc9457SAndroid Build Coastguard Worker vacc2x0123 = _mm512_add_epi32(vacc2x0123, _mm512_madd_epi16(va2, vb0123));
5055*4bdc9457SAndroid Build Coastguard Worker vacc3x0123 = _mm512_add_epi32(vacc3x0123, _mm512_madd_epi16(va3, vb0123));
5056*4bdc9457SAndroid Build Coastguard Worker const __m512i vb4567 = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 32))), vb_zero_point);
5057*4bdc9457SAndroid Build Coastguard Worker
5058*4bdc9457SAndroid Build Coastguard Worker vacc0x4567 = _mm512_add_epi32(vacc0x4567, _mm512_madd_epi16(va0, vb4567));
5059*4bdc9457SAndroid Build Coastguard Worker vacc1x4567 = _mm512_add_epi32(vacc1x4567, _mm512_madd_epi16(va1, vb4567));
5060*4bdc9457SAndroid Build Coastguard Worker vacc2x4567 = _mm512_add_epi32(vacc2x4567, _mm512_madd_epi16(va2, vb4567));
5061*4bdc9457SAndroid Build Coastguard Worker vacc3x4567 = _mm512_add_epi32(vacc3x4567, _mm512_madd_epi16(va3, vb4567));
5062*4bdc9457SAndroid Build Coastguard Worker const __m512i vb89AB = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 64))), vb_zero_point);
5063*4bdc9457SAndroid Build Coastguard Worker
5064*4bdc9457SAndroid Build Coastguard Worker vacc0x89AB = _mm512_add_epi32(vacc0x89AB, _mm512_madd_epi16(va0, vb89AB));
5065*4bdc9457SAndroid Build Coastguard Worker vacc1x89AB = _mm512_add_epi32(vacc1x89AB, _mm512_madd_epi16(va1, vb89AB));
5066*4bdc9457SAndroid Build Coastguard Worker vacc2x89AB = _mm512_add_epi32(vacc2x89AB, _mm512_madd_epi16(va2, vb89AB));
5067*4bdc9457SAndroid Build Coastguard Worker vacc3x89AB = _mm512_add_epi32(vacc3x89AB, _mm512_madd_epi16(va3, vb89AB));
5068*4bdc9457SAndroid Build Coastguard Worker const __m512i vbCDEF = _mm512_sub_epi16(_mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i*) ((const uint8_t*) w + 96))), vb_zero_point);
5069*4bdc9457SAndroid Build Coastguard Worker
5070*4bdc9457SAndroid Build Coastguard Worker vacc0xCDEF = _mm512_add_epi32(vacc0xCDEF, _mm512_madd_epi16(va0, vbCDEF));
5071*4bdc9457SAndroid Build Coastguard Worker vacc1xCDEF = _mm512_add_epi32(vacc1xCDEF, _mm512_madd_epi16(va1, vbCDEF));
5072*4bdc9457SAndroid Build Coastguard Worker vacc2xCDEF = _mm512_add_epi32(vacc2xCDEF, _mm512_madd_epi16(va2, vbCDEF));
5073*4bdc9457SAndroid Build Coastguard Worker vacc3xCDEF = _mm512_add_epi32(vacc3xCDEF, _mm512_madd_epi16(va3, vbCDEF));
5074*4bdc9457SAndroid Build Coastguard Worker
5075*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const uint8_t*) w + 128);
5076*4bdc9457SAndroid Build Coastguard Worker k += 8 * sizeof(uint8_t);
5077*4bdc9457SAndroid Build Coastguard Worker }
5078*4bdc9457SAndroid Build Coastguard Worker p -= 4 * sizeof(void*);
5079*4bdc9457SAndroid Build Coastguard Worker } while (p != 0);
5080*4bdc9457SAndroid Build Coastguard Worker
5081*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x0123, vacc0x4567), _mm512_unpackhi_epi32(vacc0x0123, vacc0x4567));
5082*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc0x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x89AB, vacc0xCDEF), _mm512_unpackhi_epi32(vacc0x89AB, vacc0xCDEF));
5083*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x0123, vacc1x4567), _mm512_unpackhi_epi32(vacc1x0123, vacc1x4567));
5084*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc1x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x89AB, vacc1xCDEF), _mm512_unpackhi_epi32(vacc1x89AB, vacc1xCDEF));
5085*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x0123, vacc2x4567), _mm512_unpackhi_epi32(vacc2x0123, vacc2x4567));
5086*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc2x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x89AB, vacc2xCDEF), _mm512_unpackhi_epi32(vacc2x89AB, vacc2xCDEF));
5087*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x04152637 = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x0123, vacc3x4567), _mm512_unpackhi_epi32(vacc3x0123, vacc3x4567));
5088*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc3x8C9DAEBF = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x89AB, vacc3xCDEF), _mm512_unpackhi_epi32(vacc3x89AB, vacc3xCDEF));
5089*4bdc9457SAndroid Build Coastguard Worker
5090*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc0x04152637, vacc0x8C9DAEBF), _mm512_unpackhi_epi32(vacc0x04152637, vacc0x8C9DAEBF));
5091*4bdc9457SAndroid Build Coastguard Worker __m512i vacc1x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc1x04152637, vacc1x8C9DAEBF), _mm512_unpackhi_epi32(vacc1x04152637, vacc1x8C9DAEBF));
5092*4bdc9457SAndroid Build Coastguard Worker __m512i vacc2x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc2x04152637, vacc2x8C9DAEBF), _mm512_unpackhi_epi32(vacc2x04152637, vacc2x8C9DAEBF));
5093*4bdc9457SAndroid Build Coastguard Worker __m512i vacc3x084C195D2A6E3B7F = _mm512_add_epi32(_mm512_unpacklo_epi32(vacc3x04152637, vacc3x8C9DAEBF), _mm512_unpackhi_epi32(vacc3x04152637, vacc3x8C9DAEBF));
5094*4bdc9457SAndroid Build Coastguard Worker
5095*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled0x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc0x084C195D2A6E3B7F);
5096*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled1x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc1x084C195D2A6E3B7F);
5097*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled2x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc2x084C195D2A6E3B7F);
5098*4bdc9457SAndroid Build Coastguard Worker __m512 vscaled3x084C195D2A6E3B7F = _mm512_cvtepi32_ps(vacc3x084C195D2A6E3B7F);
5099*4bdc9457SAndroid Build Coastguard Worker
5100*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_mul_ps(vscaled0x084C195D2A6E3B7F, vscale);
5101*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_mul_ps(vscaled1x084C195D2A6E3B7F, vscale);
5102*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_mul_ps(vscaled2x084C195D2A6E3B7F, vscale);
5103*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_mul_ps(vscaled3x084C195D2A6E3B7F, vscale);
5104*4bdc9457SAndroid Build Coastguard Worker
5105*4bdc9457SAndroid Build Coastguard Worker vscaled0x084C195D2A6E3B7F = _mm512_min_ps(vscaled0x084C195D2A6E3B7F, voutput_max_less_zero_point);
5106*4bdc9457SAndroid Build Coastguard Worker vscaled1x084C195D2A6E3B7F = _mm512_min_ps(vscaled1x084C195D2A6E3B7F, voutput_max_less_zero_point);
5107*4bdc9457SAndroid Build Coastguard Worker vscaled2x084C195D2A6E3B7F = _mm512_min_ps(vscaled2x084C195D2A6E3B7F, voutput_max_less_zero_point);
5108*4bdc9457SAndroid Build Coastguard Worker vscaled3x084C195D2A6E3B7F = _mm512_min_ps(vscaled3x084C195D2A6E3B7F, voutput_max_less_zero_point);
5109*4bdc9457SAndroid Build Coastguard Worker
5110*4bdc9457SAndroid Build Coastguard Worker vacc0x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled0x084C195D2A6E3B7F);
5111*4bdc9457SAndroid Build Coastguard Worker vacc1x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled1x084C195D2A6E3B7F);
5112*4bdc9457SAndroid Build Coastguard Worker vacc2x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled2x084C195D2A6E3B7F);
5113*4bdc9457SAndroid Build Coastguard Worker vacc3x084C195D2A6E3B7F = _mm512_cvtps_epi32(vscaled3x084C195D2A6E3B7F);
5114*4bdc9457SAndroid Build Coastguard Worker
5115*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
5116*4bdc9457SAndroid Build Coastguard Worker const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
5117*4bdc9457SAndroid Build Coastguard Worker
5118*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packus_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
5119*4bdc9457SAndroid Build Coastguard Worker vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
5120*4bdc9457SAndroid Build Coastguard Worker __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
5121*4bdc9457SAndroid Build Coastguard Worker vout0123x0123456789ABCDEF = _mm512_max_epu8(vout0123x0123456789ABCDEF, voutput_min);
5122*4bdc9457SAndroid Build Coastguard Worker
5123*4bdc9457SAndroid Build Coastguard Worker if (nc >= 16) {
5124*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
5125*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 2));
5126*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c1, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 1));
5127*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
5128*4bdc9457SAndroid Build Coastguard Worker
5129*4bdc9457SAndroid Build Coastguard Worker c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
5130*4bdc9457SAndroid Build Coastguard Worker c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
5131*4bdc9457SAndroid Build Coastguard Worker c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
5132*4bdc9457SAndroid Build Coastguard Worker c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
5133*4bdc9457SAndroid Build Coastguard Worker
5134*4bdc9457SAndroid Build Coastguard Worker a = (const uint8_t**restrict) ((uintptr_t) a - ks);
5135*4bdc9457SAndroid Build Coastguard Worker
5136*4bdc9457SAndroid Build Coastguard Worker nc -= 16;
5137*4bdc9457SAndroid Build Coastguard Worker } else {
5138*4bdc9457SAndroid Build Coastguard Worker // Prepare mask for valid 8-bit elements (depends on nc).
5139*4bdc9457SAndroid Build Coastguard Worker __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << (nc + 48)) - (UINT64_C(1) << 48)));
5140*4bdc9457SAndroid Build Coastguard Worker
5141*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c3 - 48, vmask, vout0123x0123456789ABCDEF);
5142*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
5143*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c2 - 32, vmask, vout0123x0123456789ABCDEF);
5144*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
5145*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c1 - 16, vmask, vout0123x0123456789ABCDEF);
5146*4bdc9457SAndroid Build Coastguard Worker vmask = _kshiftri_mask64(vmask, 16);
5147*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(c0, vmask, vout0123x0123456789ABCDEF);
5148*4bdc9457SAndroid Build Coastguard Worker
5149*4bdc9457SAndroid Build Coastguard Worker nc = 0;
5150*4bdc9457SAndroid Build Coastguard Worker }
5151*4bdc9457SAndroid Build Coastguard Worker } while (nc != 0);
5152*4bdc9457SAndroid Build Coastguard Worker }
5153*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16(size_t n,const uint8_t * input_a,const uint8_t * input_b,uint8_t * output,const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5154*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16(
5155*4bdc9457SAndroid Build Coastguard Worker size_t n,
5156*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input_a,
5157*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input_b,
5158*4bdc9457SAndroid Build Coastguard Worker uint8_t* output,
5159*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5160*4bdc9457SAndroid Build Coastguard Worker {
5161*4bdc9457SAndroid Build Coastguard Worker const __m512i vbias = _mm512_load_si512(params->avx512.bias);
5162*4bdc9457SAndroid Build Coastguard Worker const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
5163*4bdc9457SAndroid Build Coastguard Worker const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
5164*4bdc9457SAndroid Build Coastguard Worker const __m128i vshift = _mm_load_si128((const __m128i*) params->avx512.shift);
5165*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
5166*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
5167*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx512.output_max);
5168*4bdc9457SAndroid Build Coastguard Worker
5169*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
5170*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a));
5171*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_b));
5172*4bdc9457SAndroid Build Coastguard Worker input_a += 16;
5173*4bdc9457SAndroid Build Coastguard Worker input_b += 16;
5174*4bdc9457SAndroid Build Coastguard Worker
5175*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
5176*4bdc9457SAndroid Build Coastguard Worker
5177*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
5178*4bdc9457SAndroid Build Coastguard Worker
5179*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
5180*4bdc9457SAndroid Build Coastguard Worker
5181*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
5182*4bdc9457SAndroid Build Coastguard Worker
5183*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5184*4bdc9457SAndroid Build Coastguard Worker
5185*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5186*4bdc9457SAndroid Build Coastguard Worker
5187*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
5188*4bdc9457SAndroid Build Coastguard Worker
5189*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5190*4bdc9457SAndroid Build Coastguard Worker output += 16;
5191*4bdc9457SAndroid Build Coastguard Worker }
5192*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
5193*4bdc9457SAndroid Build Coastguard Worker {
5194*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << n) - UINT32_C(1)));
5195*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(vmask, input_a));
5196*4bdc9457SAndroid Build Coastguard Worker const __m512i vb0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(vmask, input_b));
5197*4bdc9457SAndroid Build Coastguard Worker
5198*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
5199*4bdc9457SAndroid Build Coastguard Worker
5200*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
5201*4bdc9457SAndroid Build Coastguard Worker
5202*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
5203*4bdc9457SAndroid Build Coastguard Worker
5204*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
5205*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5206*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5207*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
5208*4bdc9457SAndroid Build Coastguard Worker
5209*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
5210*4bdc9457SAndroid Build Coastguard Worker }
5211*4bdc9457SAndroid Build Coastguard Worker }
5212*4bdc9457SAndroid Build Coastguard Worker }
5213*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16(size_t n,const uint8_t * input_a,const uint8_t * input_b,uint8_t * output,const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5214*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16(
5215*4bdc9457SAndroid Build Coastguard Worker size_t n,
5216*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input_a,
5217*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input_b,
5218*4bdc9457SAndroid Build Coastguard Worker uint8_t* output,
5219*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5220*4bdc9457SAndroid Build Coastguard Worker {
5221*4bdc9457SAndroid Build Coastguard Worker const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
5222*4bdc9457SAndroid Build Coastguard Worker const __m128i vshift = _mm_load_si128((const __m128i*) params->avx512.shift);
5223*4bdc9457SAndroid Build Coastguard Worker const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
5224*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
5225*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx512.output_max);
5226*4bdc9457SAndroid Build Coastguard Worker
5227*4bdc9457SAndroid Build Coastguard Worker const __m512i vbias = _mm512_add_epi32(
5228*4bdc9457SAndroid Build Coastguard Worker _mm512_broadcastd_epi32(_mm_cvtsi32_si128(params->avx512.b_multiplier[0] * (int32_t) *input_b)),
5229*4bdc9457SAndroid Build Coastguard Worker _mm512_load_si512(params->avx512.bias));
5230*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
5231*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a));
5232*4bdc9457SAndroid Build Coastguard Worker input_a += 16;
5233*4bdc9457SAndroid Build Coastguard Worker
5234*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
5235*4bdc9457SAndroid Build Coastguard Worker
5236*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
5237*4bdc9457SAndroid Build Coastguard Worker
5238*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
5239*4bdc9457SAndroid Build Coastguard Worker
5240*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5241*4bdc9457SAndroid Build Coastguard Worker
5242*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5243*4bdc9457SAndroid Build Coastguard Worker
5244*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
5245*4bdc9457SAndroid Build Coastguard Worker
5246*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5247*4bdc9457SAndroid Build Coastguard Worker output += 16;
5248*4bdc9457SAndroid Build Coastguard Worker }
5249*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
5250*4bdc9457SAndroid Build Coastguard Worker {
5251*4bdc9457SAndroid Build Coastguard Worker const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << n) - UINT32_C(1)));
5252*4bdc9457SAndroid Build Coastguard Worker const __m512i va0123456789ABCDEF = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(vmask, input_a));
5253*4bdc9457SAndroid Build Coastguard Worker
5254*4bdc9457SAndroid Build Coastguard Worker __m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
5255*4bdc9457SAndroid Build Coastguard Worker
5256*4bdc9457SAndroid Build Coastguard Worker vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
5257*4bdc9457SAndroid Build Coastguard Worker
5258*4bdc9457SAndroid Build Coastguard Worker __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
5259*4bdc9457SAndroid Build Coastguard Worker __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5260*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
5261*4bdc9457SAndroid Build Coastguard Worker vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
5262*4bdc9457SAndroid Build Coastguard Worker
5263*4bdc9457SAndroid Build Coastguard Worker _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF);
5264*4bdc9457SAndroid Build Coastguard Worker }
5265*4bdc9457SAndroid Build Coastguard Worker }
5266*4bdc9457SAndroid Build Coastguard Worker }
5267*4bdc9457SAndroid Build Coastguard Worker
xnn_x8_lut_ukernel__avx512skx_vpshufb_x64(size_t n,const uint8_t * x,uint8_t * y,const uint8_t t[restrict XNN_MIN_ELEMENTS (256)])5268*4bdc9457SAndroid Build Coastguard Worker void xnn_x8_lut_ukernel__avx512skx_vpshufb_x64(
5269*4bdc9457SAndroid Build Coastguard Worker size_t n,
5270*4bdc9457SAndroid Build Coastguard Worker const uint8_t* x,
5271*4bdc9457SAndroid Build Coastguard Worker uint8_t* y,
5272*4bdc9457SAndroid Build Coastguard Worker const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
5273*4bdc9457SAndroid Build Coastguard Worker {
5274*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
5275*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
5276*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
5277*4bdc9457SAndroid Build Coastguard Worker
5278*4bdc9457SAndroid Build Coastguard Worker const __m512i vt0 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) t));
5279*4bdc9457SAndroid Build Coastguard Worker const __m512i vt1 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 16)));
5280*4bdc9457SAndroid Build Coastguard Worker const __m512i vt2 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 32)));
5281*4bdc9457SAndroid Build Coastguard Worker const __m512i vt3 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 48)));
5282*4bdc9457SAndroid Build Coastguard Worker const __m512i vt4 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 64)));
5283*4bdc9457SAndroid Build Coastguard Worker const __m512i vt5 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 80)));
5284*4bdc9457SAndroid Build Coastguard Worker const __m512i vt6 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 96)));
5285*4bdc9457SAndroid Build Coastguard Worker const __m512i vt7 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 112)));
5286*4bdc9457SAndroid Build Coastguard Worker const __m512i vt8 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 128)));
5287*4bdc9457SAndroid Build Coastguard Worker const __m512i vt9 = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 144)));
5288*4bdc9457SAndroid Build Coastguard Worker const __m512i vtA = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 160)));
5289*4bdc9457SAndroid Build Coastguard Worker const __m512i vtB = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 176)));
5290*4bdc9457SAndroid Build Coastguard Worker const __m512i vtC = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 192)));
5291*4bdc9457SAndroid Build Coastguard Worker const __m512i vtD = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 208)));
5292*4bdc9457SAndroid Build Coastguard Worker const __m512i vtE = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 224)));
5293*4bdc9457SAndroid Build Coastguard Worker const __m512i vtF = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) (t + 240)));
5294*4bdc9457SAndroid Build Coastguard Worker
5295*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable0 = vt0;
5296*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable1 = _mm512_xor_si512(vt0, vt1);
5297*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable2 = _mm512_xor_si512(vt1, vt2);
5298*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable3 = _mm512_xor_si512(vt2, vt3);
5299*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable4 = _mm512_xor_si512(vt3, vt4);
5300*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable5 = _mm512_xor_si512(vt4, vt5);
5301*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable6 = _mm512_xor_si512(vt5, vt6);
5302*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable7 = _mm512_xor_si512(vt6, vt7);
5303*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable8 = _mm512_xor_si512(_mm512_xor_si512(vt7, vt8), vtable0);
5304*4bdc9457SAndroid Build Coastguard Worker const __m512i vtable9 = _mm512_xor_si512(_mm512_xor_si512(vt8, vt9), vtable1);
5305*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableA = _mm512_xor_si512(_mm512_xor_si512(vt9, vtA), vtable2);
5306*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableB = _mm512_xor_si512(_mm512_xor_si512(vtA, vtB), vtable3);
5307*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableC = _mm512_xor_si512(_mm512_xor_si512(vtB, vtC), vtable4);
5308*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableD = _mm512_xor_si512(_mm512_xor_si512(vtC, vtD), vtable5);
5309*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableE = _mm512_xor_si512(_mm512_xor_si512(vtD, vtE), vtable6);
5310*4bdc9457SAndroid Build Coastguard Worker const __m512i vtableF = _mm512_xor_si512(_mm512_xor_si512(vtE, vtF), vtable7);
5311*4bdc9457SAndroid Build Coastguard Worker
5312*4bdc9457SAndroid Build Coastguard Worker const __m512i voffset = _mm512_set1_epi8(16);
5313*4bdc9457SAndroid Build Coastguard Worker for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
5314*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_loadu_si512(x);
5315*4bdc9457SAndroid Build Coastguard Worker x += 64;
5316*4bdc9457SAndroid Build Coastguard Worker
5317*4bdc9457SAndroid Build Coastguard Worker __m512i vy = _mm512_shuffle_epi8(vtable0, vx);
5318*4bdc9457SAndroid Build Coastguard Worker
5319*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5320*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable1, vx));
5321*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5322*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable2, vx));
5323*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5324*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable3, vx));
5325*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5326*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable4, vx));
5327*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5328*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable5, vx));
5329*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5330*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable6, vx));
5331*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5332*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable7, vx));
5333*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5334*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable8, vx));
5335*4bdc9457SAndroid Build Coastguard Worker
5336*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5337*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable9, vx));
5338*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5339*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableA, vx));
5340*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5341*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableB, vx));
5342*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5343*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableC, vx));
5344*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5345*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableD, vx));
5346*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5347*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableE, vx));
5348*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5349*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableF, vx));
5350*4bdc9457SAndroid Build Coastguard Worker
5351*4bdc9457SAndroid Build Coastguard Worker _mm512_storeu_si512(y, vy);
5352*4bdc9457SAndroid Build Coastguard Worker y += 64;
5353*4bdc9457SAndroid Build Coastguard Worker }
5354*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
5355*4bdc9457SAndroid Build Coastguard Worker assert(n < 64);
5356*4bdc9457SAndroid Build Coastguard Worker const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << n) - UINT64_C(1)));
5357*4bdc9457SAndroid Build Coastguard Worker
5358*4bdc9457SAndroid Build Coastguard Worker __m512i vx = _mm512_maskz_loadu_epi8(vmask, x);
5359*4bdc9457SAndroid Build Coastguard Worker
5360*4bdc9457SAndroid Build Coastguard Worker __m512i vy = _mm512_shuffle_epi8(vtable0, vx);
5361*4bdc9457SAndroid Build Coastguard Worker
5362*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5363*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable1, vx));
5364*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5365*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable2, vx));
5366*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5367*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable3, vx));
5368*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5369*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable4, vx));
5370*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5371*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable5, vx));
5372*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5373*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable6, vx));
5374*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5375*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable7, vx));
5376*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_sub_epi8(vx, voffset);
5377*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable8, vx));
5378*4bdc9457SAndroid Build Coastguard Worker
5379*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5380*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtable9, vx));
5381*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5382*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableA, vx));
5383*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5384*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableB, vx));
5385*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5386*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableC, vx));
5387*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5388*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableD, vx));
5389*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5390*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableE, vx));
5391*4bdc9457SAndroid Build Coastguard Worker vx = _mm512_subs_epi8(vx, voffset);
5392*4bdc9457SAndroid Build Coastguard Worker vy = _mm512_xor_si512(vy, _mm512_shuffle_epi8(vtableF, vx));
5393*4bdc9457SAndroid Build Coastguard Worker
5394*4bdc9457SAndroid Build Coastguard Worker _mm512_mask_storeu_epi8(y, vmask, vy);
5395*4bdc9457SAndroid Build Coastguard Worker }
5396*4bdc9457SAndroid Build Coastguard Worker }
5397