xref: /aosp_15_r20/external/XNNPACK/src/amalgam/avx2.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1*4bdc9457SAndroid Build Coastguard Worker // Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker //
3*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker 
6*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
7*4bdc9457SAndroid Build Coastguard Worker 
8*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
9*4bdc9457SAndroid Build Coastguard Worker 
10*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/dwconv.h>
12*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/gemm.h>
13*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/igemm.h>
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/intrinsics-polyfill.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/lut.h>
16*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
17*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/pavgpool.h>
18*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/raddstoreexpminusmax.h>
19*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/unaligned.h>
20*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vadd.h>
21*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vcvt.h>
22*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vlrelu.h>
23*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vunary.h>
24*4bdc9457SAndroid Build Coastguard Worker 
25*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(size_t mr,size_t nc,size_t kc,const void * restrict a,size_t a_stride,const void * restrict w,void * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])26*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(
27*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
28*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
29*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
30*4bdc9457SAndroid Build Coastguard Worker     const void*restrict a,
31*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
32*4bdc9457SAndroid Build Coastguard Worker     const void*restrict w,
33*4bdc9457SAndroid Build Coastguard Worker     void*restrict c,
34*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
35*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
36*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
37*4bdc9457SAndroid Build Coastguard Worker {
38*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
39*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
40*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
41*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
42*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint16_t) == 0);
43*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
44*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
45*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
46*4bdc9457SAndroid Build Coastguard Worker 
47*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* a0 = a;
48*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c0 = c;
49*4bdc9457SAndroid Build Coastguard Worker 
50*4bdc9457SAndroid Build Coastguard Worker   do {
51*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
52*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
53*4bdc9457SAndroid Build Coastguard Worker     w = (const uint16_t*) w + 16;
54*4bdc9457SAndroid Build Coastguard Worker 
55*4bdc9457SAndroid Build Coastguard Worker     size_t k = kc;
56*4bdc9457SAndroid Build Coastguard Worker     do {
57*4bdc9457SAndroid Build Coastguard Worker       const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
58*4bdc9457SAndroid Build Coastguard Worker       a0 += 1;
59*4bdc9457SAndroid Build Coastguard Worker 
60*4bdc9457SAndroid Build Coastguard Worker       const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
61*4bdc9457SAndroid Build Coastguard Worker       const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
62*4bdc9457SAndroid Build Coastguard Worker       w = (const uint16_t*) w + 16;
63*4bdc9457SAndroid Build Coastguard Worker 
64*4bdc9457SAndroid Build Coastguard Worker       vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
65*4bdc9457SAndroid Build Coastguard Worker       vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
66*4bdc9457SAndroid Build Coastguard Worker 
67*4bdc9457SAndroid Build Coastguard Worker       k -= sizeof(uint16_t);
68*4bdc9457SAndroid Build Coastguard Worker     } while (k != 0);
69*4bdc9457SAndroid Build Coastguard Worker 
70*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmin = _mm256_load_ps(params->avx.min);
71*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
72*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
73*4bdc9457SAndroid Build Coastguard Worker 
74*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmax = _mm256_load_ps(params->avx.max);
75*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
76*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
77*4bdc9457SAndroid Build Coastguard Worker 
78*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 16) {
79*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
80*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
81*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
82*4bdc9457SAndroid Build Coastguard Worker 
83*4bdc9457SAndroid Build Coastguard Worker       a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
84*4bdc9457SAndroid Build Coastguard Worker 
85*4bdc9457SAndroid Build Coastguard Worker       nc -= 16;
86*4bdc9457SAndroid Build Coastguard Worker     } else {
87*4bdc9457SAndroid Build Coastguard Worker       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
88*4bdc9457SAndroid Build Coastguard Worker       if (nc & 8) {
89*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
90*4bdc9457SAndroid Build Coastguard Worker 
91*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
92*4bdc9457SAndroid Build Coastguard Worker 
93*4bdc9457SAndroid Build Coastguard Worker         c0 += 8;
94*4bdc9457SAndroid Build Coastguard Worker       }
95*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
96*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
97*4bdc9457SAndroid Build Coastguard Worker 
98*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
99*4bdc9457SAndroid Build Coastguard Worker 
100*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
101*4bdc9457SAndroid Build Coastguard Worker       }
102*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
103*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vh0x01234567);
104*4bdc9457SAndroid Build Coastguard Worker 
105*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
106*4bdc9457SAndroid Build Coastguard Worker 
107*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
108*4bdc9457SAndroid Build Coastguard Worker       }
109*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
110*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
111*4bdc9457SAndroid Build Coastguard Worker       }
112*4bdc9457SAndroid Build Coastguard Worker 
113*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
114*4bdc9457SAndroid Build Coastguard Worker     }
115*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
116*4bdc9457SAndroid Build Coastguard Worker }
117*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(size_t mr,size_t nc,size_t kc,const void * restrict a,size_t a_stride,const void * restrict w,void * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])118*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(
119*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
120*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
121*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
122*4bdc9457SAndroid Build Coastguard Worker     const void*restrict a,
123*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
124*4bdc9457SAndroid Build Coastguard Worker     const void*restrict w,
125*4bdc9457SAndroid Build Coastguard Worker     void*restrict c,
126*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
127*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
128*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
129*4bdc9457SAndroid Build Coastguard Worker {
130*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
131*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
132*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
133*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
134*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint16_t) == 0);
135*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
136*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
137*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
138*4bdc9457SAndroid Build Coastguard Worker 
139*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* a0 = a;
140*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c0 = c;
141*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* a1 = (const uint16_t*) ((uintptr_t) a0 + a_stride);
142*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
143*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
144*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
145*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
146*4bdc9457SAndroid Build Coastguard Worker   }
147*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* a2 = (const uint16_t*) ((uintptr_t) a1 + a_stride);
148*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
149*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
150*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
151*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
152*4bdc9457SAndroid Build Coastguard Worker   }
153*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* a3 = (const uint16_t*) ((uintptr_t) a2 + a_stride);
154*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
155*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
156*4bdc9457SAndroid Build Coastguard Worker     a3 = a2;
157*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
158*4bdc9457SAndroid Build Coastguard Worker   }
159*4bdc9457SAndroid Build Coastguard Worker 
160*4bdc9457SAndroid Build Coastguard Worker   do {
161*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
162*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
163*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc1x01234567 = vacc0x01234567;
164*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
165*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc2x01234567 = vacc0x01234567;
166*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
167*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc3x01234567 = vacc0x01234567;
168*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
169*4bdc9457SAndroid Build Coastguard Worker     w = (const uint16_t*) w + 16;
170*4bdc9457SAndroid Build Coastguard Worker 
171*4bdc9457SAndroid Build Coastguard Worker     size_t k = kc;
172*4bdc9457SAndroid Build Coastguard Worker     do {
173*4bdc9457SAndroid Build Coastguard Worker       const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
174*4bdc9457SAndroid Build Coastguard Worker       a0 += 1;
175*4bdc9457SAndroid Build Coastguard Worker       const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
176*4bdc9457SAndroid Build Coastguard Worker       a1 += 1;
177*4bdc9457SAndroid Build Coastguard Worker       const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
178*4bdc9457SAndroid Build Coastguard Worker       a2 += 1;
179*4bdc9457SAndroid Build Coastguard Worker       const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
180*4bdc9457SAndroid Build Coastguard Worker       a3 += 1;
181*4bdc9457SAndroid Build Coastguard Worker 
182*4bdc9457SAndroid Build Coastguard Worker       const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
183*4bdc9457SAndroid Build Coastguard Worker       const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
184*4bdc9457SAndroid Build Coastguard Worker       w = (const uint16_t*) w + 16;
185*4bdc9457SAndroid Build Coastguard Worker 
186*4bdc9457SAndroid Build Coastguard Worker       vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
187*4bdc9457SAndroid Build Coastguard Worker       vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
188*4bdc9457SAndroid Build Coastguard Worker       vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
189*4bdc9457SAndroid Build Coastguard Worker       vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
190*4bdc9457SAndroid Build Coastguard Worker       vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
191*4bdc9457SAndroid Build Coastguard Worker       vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
192*4bdc9457SAndroid Build Coastguard Worker       vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
193*4bdc9457SAndroid Build Coastguard Worker       vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
194*4bdc9457SAndroid Build Coastguard Worker 
195*4bdc9457SAndroid Build Coastguard Worker       k -= sizeof(uint16_t);
196*4bdc9457SAndroid Build Coastguard Worker     } while (k != 0);
197*4bdc9457SAndroid Build Coastguard Worker 
198*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmin = _mm256_load_ps(params->avx.min);
199*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
200*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
201*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
202*4bdc9457SAndroid Build Coastguard Worker     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
203*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
204*4bdc9457SAndroid Build Coastguard Worker     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
205*4bdc9457SAndroid Build Coastguard Worker     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
206*4bdc9457SAndroid Build Coastguard Worker     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
207*4bdc9457SAndroid Build Coastguard Worker 
208*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmax = _mm256_load_ps(params->avx.max);
209*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
210*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
211*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
212*4bdc9457SAndroid Build Coastguard Worker     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
213*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
214*4bdc9457SAndroid Build Coastguard Worker     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
215*4bdc9457SAndroid Build Coastguard Worker     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
216*4bdc9457SAndroid Build Coastguard Worker     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
217*4bdc9457SAndroid Build Coastguard Worker 
218*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 16) {
219*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
220*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
221*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
222*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
223*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
224*4bdc9457SAndroid Build Coastguard Worker       c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
225*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
226*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
227*4bdc9457SAndroid Build Coastguard Worker       c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
228*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
229*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
230*4bdc9457SAndroid Build Coastguard Worker       c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
231*4bdc9457SAndroid Build Coastguard Worker 
232*4bdc9457SAndroid Build Coastguard Worker       a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
233*4bdc9457SAndroid Build Coastguard Worker       a1 = (const uint16_t*) ((uintptr_t) a1 - kc);
234*4bdc9457SAndroid Build Coastguard Worker       a2 = (const uint16_t*) ((uintptr_t) a2 - kc);
235*4bdc9457SAndroid Build Coastguard Worker       a3 = (const uint16_t*) ((uintptr_t) a3 - kc);
236*4bdc9457SAndroid Build Coastguard Worker 
237*4bdc9457SAndroid Build Coastguard Worker       nc -= 16;
238*4bdc9457SAndroid Build Coastguard Worker     } else {
239*4bdc9457SAndroid Build Coastguard Worker       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
240*4bdc9457SAndroid Build Coastguard Worker       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
241*4bdc9457SAndroid Build Coastguard Worker       __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
242*4bdc9457SAndroid Build Coastguard Worker       __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
243*4bdc9457SAndroid Build Coastguard Worker       if (nc & 8) {
244*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
245*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c1, vh1x01234567);
246*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c2, vh2x01234567);
247*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c3, vh3x01234567);
248*4bdc9457SAndroid Build Coastguard Worker 
249*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
250*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
251*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
252*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
253*4bdc9457SAndroid Build Coastguard Worker 
254*4bdc9457SAndroid Build Coastguard Worker         c0 += 8;
255*4bdc9457SAndroid Build Coastguard Worker         c1 += 8;
256*4bdc9457SAndroid Build Coastguard Worker         c2 += 8;
257*4bdc9457SAndroid Build Coastguard Worker         c3 += 8;
258*4bdc9457SAndroid Build Coastguard Worker       }
259*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
260*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
261*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c1, vh1x01234567);
262*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c2, vh2x01234567);
263*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c3, vh3x01234567);
264*4bdc9457SAndroid Build Coastguard Worker 
265*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
266*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
267*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
268*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
269*4bdc9457SAndroid Build Coastguard Worker 
270*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
271*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
272*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
273*4bdc9457SAndroid Build Coastguard Worker         c3 += 4;
274*4bdc9457SAndroid Build Coastguard Worker       }
275*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
276*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vh0x01234567);
277*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vh1x01234567);
278*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c2, vh2x01234567);
279*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c3, vh3x01234567);
280*4bdc9457SAndroid Build Coastguard Worker 
281*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
282*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
283*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
284*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
285*4bdc9457SAndroid Build Coastguard Worker 
286*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
287*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
288*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
289*4bdc9457SAndroid Build Coastguard Worker         c3 += 2;
290*4bdc9457SAndroid Build Coastguard Worker       }
291*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
292*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
293*4bdc9457SAndroid Build Coastguard Worker         *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
294*4bdc9457SAndroid Build Coastguard Worker         *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
295*4bdc9457SAndroid Build Coastguard Worker         *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
296*4bdc9457SAndroid Build Coastguard Worker       }
297*4bdc9457SAndroid Build Coastguard Worker 
298*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
299*4bdc9457SAndroid Build Coastguard Worker     }
300*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
301*4bdc9457SAndroid Build Coastguard Worker }
302*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const void ** restrict a,const void * restrict w,void * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])303*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(
304*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
305*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
306*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
307*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
308*4bdc9457SAndroid Build Coastguard Worker     const void**restrict a,
309*4bdc9457SAndroid Build Coastguard Worker     const void*restrict w,
310*4bdc9457SAndroid Build Coastguard Worker     void*restrict c,
311*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
312*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
313*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
314*4bdc9457SAndroid Build Coastguard Worker     const void* zero,
315*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
316*4bdc9457SAndroid Build Coastguard Worker {
317*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
318*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
319*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
320*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
321*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint16_t) == 0);
322*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
323*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (1 * sizeof(void*)) == 0);
324*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(uint16_t) == 0);
325*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
326*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
327*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
328*4bdc9457SAndroid Build Coastguard Worker 
329*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c0 = c;
330*4bdc9457SAndroid Build Coastguard Worker 
331*4bdc9457SAndroid Build Coastguard Worker   do {
332*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
333*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
334*4bdc9457SAndroid Build Coastguard Worker     w = (const uint16_t*) w + 16;
335*4bdc9457SAndroid Build Coastguard Worker 
336*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
337*4bdc9457SAndroid Build Coastguard Worker     do {
338*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* restrict a0 = (const uint16_t*) a[0];
339*4bdc9457SAndroid Build Coastguard Worker       assert(a0 != NULL);
340*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
341*4bdc9457SAndroid Build Coastguard Worker         a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
342*4bdc9457SAndroid Build Coastguard Worker       }
343*4bdc9457SAndroid Build Coastguard Worker       a += 1;
344*4bdc9457SAndroid Build Coastguard Worker 
345*4bdc9457SAndroid Build Coastguard Worker       size_t k = kc;
346*4bdc9457SAndroid Build Coastguard Worker       do {
347*4bdc9457SAndroid Build Coastguard Worker         const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
348*4bdc9457SAndroid Build Coastguard Worker         const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
349*4bdc9457SAndroid Build Coastguard Worker         w = (const uint16_t*) w + 16;
350*4bdc9457SAndroid Build Coastguard Worker 
351*4bdc9457SAndroid Build Coastguard Worker         const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
352*4bdc9457SAndroid Build Coastguard Worker         a0 += 1;
353*4bdc9457SAndroid Build Coastguard Worker 
354*4bdc9457SAndroid Build Coastguard Worker         vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
355*4bdc9457SAndroid Build Coastguard Worker         vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
356*4bdc9457SAndroid Build Coastguard Worker 
357*4bdc9457SAndroid Build Coastguard Worker         k -= sizeof(uint16_t);
358*4bdc9457SAndroid Build Coastguard Worker       } while (k != 0);
359*4bdc9457SAndroid Build Coastguard Worker       p -= 1 * sizeof(void*);
360*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
361*4bdc9457SAndroid Build Coastguard Worker 
362*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmin = _mm256_load_ps(params->avx.min);
363*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
364*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
365*4bdc9457SAndroid Build Coastguard Worker 
366*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmax = _mm256_load_ps(params->avx.max);
367*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
368*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
369*4bdc9457SAndroid Build Coastguard Worker 
370*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 16) {
371*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
372*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
373*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
374*4bdc9457SAndroid Build Coastguard Worker 
375*4bdc9457SAndroid Build Coastguard Worker       a = (const void**restrict) ((uintptr_t) a - ks);
376*4bdc9457SAndroid Build Coastguard Worker       nc -= 16;
377*4bdc9457SAndroid Build Coastguard Worker     } else {
378*4bdc9457SAndroid Build Coastguard Worker       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
379*4bdc9457SAndroid Build Coastguard Worker       if (nc & 8) {
380*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
381*4bdc9457SAndroid Build Coastguard Worker 
382*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
383*4bdc9457SAndroid Build Coastguard Worker 
384*4bdc9457SAndroid Build Coastguard Worker         c0 += 8;
385*4bdc9457SAndroid Build Coastguard Worker       }
386*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
387*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
388*4bdc9457SAndroid Build Coastguard Worker 
389*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
390*4bdc9457SAndroid Build Coastguard Worker 
391*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
392*4bdc9457SAndroid Build Coastguard Worker       }
393*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
394*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vh0x01234567);
395*4bdc9457SAndroid Build Coastguard Worker 
396*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
397*4bdc9457SAndroid Build Coastguard Worker 
398*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
399*4bdc9457SAndroid Build Coastguard Worker       }
400*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
401*4bdc9457SAndroid Build Coastguard Worker         *c0 = _mm_extract_epi16(vh0x01234567, 0);
402*4bdc9457SAndroid Build Coastguard Worker       }
403*4bdc9457SAndroid Build Coastguard Worker 
404*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
405*4bdc9457SAndroid Build Coastguard Worker     }
406*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
407*4bdc9457SAndroid Build Coastguard Worker }
408*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const void ** restrict a,const void * restrict w,void * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const void * zero,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])409*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(
410*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
411*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
412*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
413*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
414*4bdc9457SAndroid Build Coastguard Worker     const void**restrict a,
415*4bdc9457SAndroid Build Coastguard Worker     const void*restrict w,
416*4bdc9457SAndroid Build Coastguard Worker     void*restrict c,
417*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
418*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
419*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
420*4bdc9457SAndroid Build Coastguard Worker     const void* zero,
421*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
422*4bdc9457SAndroid Build Coastguard Worker {
423*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
424*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 4);
425*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
426*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
427*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint16_t) == 0);
428*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
429*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (4 * sizeof(void*)) == 0);
430*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(uint16_t) == 0);
431*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
432*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
433*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
434*4bdc9457SAndroid Build Coastguard Worker 
435*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c0 = c;
436*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
437*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
438*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
439*4bdc9457SAndroid Build Coastguard Worker   }
440*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
441*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
442*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
443*4bdc9457SAndroid Build Coastguard Worker   }
444*4bdc9457SAndroid Build Coastguard Worker   uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
445*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr != 4) {
446*4bdc9457SAndroid Build Coastguard Worker     c3 = c2;
447*4bdc9457SAndroid Build Coastguard Worker   }
448*4bdc9457SAndroid Build Coastguard Worker 
449*4bdc9457SAndroid Build Coastguard Worker   do {
450*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
451*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
452*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc1x01234567 = vacc0x01234567;
453*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
454*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc2x01234567 = vacc0x01234567;
455*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
456*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc3x01234567 = vacc0x01234567;
457*4bdc9457SAndroid Build Coastguard Worker     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
458*4bdc9457SAndroid Build Coastguard Worker     w = (const uint16_t*) w + 16;
459*4bdc9457SAndroid Build Coastguard Worker 
460*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
461*4bdc9457SAndroid Build Coastguard Worker     do {
462*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* restrict a0 = (const uint16_t*) a[0];
463*4bdc9457SAndroid Build Coastguard Worker       assert(a0 != NULL);
464*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
465*4bdc9457SAndroid Build Coastguard Worker         a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
466*4bdc9457SAndroid Build Coastguard Worker       }
467*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* restrict a1 = (const uint16_t*) a[1];
468*4bdc9457SAndroid Build Coastguard Worker       assert(a1 != NULL);
469*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
470*4bdc9457SAndroid Build Coastguard Worker         a1 = (const uint16_t*) ((uintptr_t) a1 + a_offset);
471*4bdc9457SAndroid Build Coastguard Worker       }
472*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* restrict a2 = (const uint16_t*) a[2];
473*4bdc9457SAndroid Build Coastguard Worker       assert(a2 != NULL);
474*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
475*4bdc9457SAndroid Build Coastguard Worker         a2 = (const uint16_t*) ((uintptr_t) a2 + a_offset);
476*4bdc9457SAndroid Build Coastguard Worker       }
477*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* restrict a3 = (const uint16_t*) a[3];
478*4bdc9457SAndroid Build Coastguard Worker       assert(a3 != NULL);
479*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a3 != zero) {
480*4bdc9457SAndroid Build Coastguard Worker         a3 = (const uint16_t*) ((uintptr_t) a3 + a_offset);
481*4bdc9457SAndroid Build Coastguard Worker       }
482*4bdc9457SAndroid Build Coastguard Worker       a += 4;
483*4bdc9457SAndroid Build Coastguard Worker 
484*4bdc9457SAndroid Build Coastguard Worker       size_t k = kc;
485*4bdc9457SAndroid Build Coastguard Worker       do {
486*4bdc9457SAndroid Build Coastguard Worker         const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
487*4bdc9457SAndroid Build Coastguard Worker         const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
488*4bdc9457SAndroid Build Coastguard Worker         w = (const uint16_t*) w + 16;
489*4bdc9457SAndroid Build Coastguard Worker 
490*4bdc9457SAndroid Build Coastguard Worker         const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
491*4bdc9457SAndroid Build Coastguard Worker         a0 += 1;
492*4bdc9457SAndroid Build Coastguard Worker         const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
493*4bdc9457SAndroid Build Coastguard Worker         a1 += 1;
494*4bdc9457SAndroid Build Coastguard Worker         const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
495*4bdc9457SAndroid Build Coastguard Worker         a2 += 1;
496*4bdc9457SAndroid Build Coastguard Worker         const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
497*4bdc9457SAndroid Build Coastguard Worker         a3 += 1;
498*4bdc9457SAndroid Build Coastguard Worker 
499*4bdc9457SAndroid Build Coastguard Worker         vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
500*4bdc9457SAndroid Build Coastguard Worker         vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
501*4bdc9457SAndroid Build Coastguard Worker         vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
502*4bdc9457SAndroid Build Coastguard Worker         vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
503*4bdc9457SAndroid Build Coastguard Worker         vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
504*4bdc9457SAndroid Build Coastguard Worker         vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
505*4bdc9457SAndroid Build Coastguard Worker         vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
506*4bdc9457SAndroid Build Coastguard Worker         vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
507*4bdc9457SAndroid Build Coastguard Worker 
508*4bdc9457SAndroid Build Coastguard Worker         k -= sizeof(uint16_t);
509*4bdc9457SAndroid Build Coastguard Worker       } while (k != 0);
510*4bdc9457SAndroid Build Coastguard Worker       p -= 4 * sizeof(void*);
511*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
512*4bdc9457SAndroid Build Coastguard Worker 
513*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmin = _mm256_load_ps(params->avx.min);
514*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
515*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
516*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
517*4bdc9457SAndroid Build Coastguard Worker     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
518*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
519*4bdc9457SAndroid Build Coastguard Worker     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
520*4bdc9457SAndroid Build Coastguard Worker     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
521*4bdc9457SAndroid Build Coastguard Worker     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
522*4bdc9457SAndroid Build Coastguard Worker 
523*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmax = _mm256_load_ps(params->avx.max);
524*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
525*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
526*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
527*4bdc9457SAndroid Build Coastguard Worker     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
528*4bdc9457SAndroid Build Coastguard Worker     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
529*4bdc9457SAndroid Build Coastguard Worker     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
530*4bdc9457SAndroid Build Coastguard Worker     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
531*4bdc9457SAndroid Build Coastguard Worker     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
532*4bdc9457SAndroid Build Coastguard Worker 
533*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(nc >= 16) {
534*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
535*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
536*4bdc9457SAndroid Build Coastguard Worker       c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
537*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
538*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
539*4bdc9457SAndroid Build Coastguard Worker       c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
540*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
541*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
542*4bdc9457SAndroid Build Coastguard Worker       c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
543*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
544*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
545*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
546*4bdc9457SAndroid Build Coastguard Worker 
547*4bdc9457SAndroid Build Coastguard Worker       a = (const void**restrict) ((uintptr_t) a - ks);
548*4bdc9457SAndroid Build Coastguard Worker       nc -= 16;
549*4bdc9457SAndroid Build Coastguard Worker     } else {
550*4bdc9457SAndroid Build Coastguard Worker       __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
551*4bdc9457SAndroid Build Coastguard Worker       __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
552*4bdc9457SAndroid Build Coastguard Worker       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
553*4bdc9457SAndroid Build Coastguard Worker       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
554*4bdc9457SAndroid Build Coastguard Worker       if (nc & 8) {
555*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c3, vh3x01234567);
556*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c2, vh2x01234567);
557*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c1, vh1x01234567);
558*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) c0, vh0x01234567);
559*4bdc9457SAndroid Build Coastguard Worker 
560*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
561*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
562*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
563*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
564*4bdc9457SAndroid Build Coastguard Worker 
565*4bdc9457SAndroid Build Coastguard Worker         c3 += 8;
566*4bdc9457SAndroid Build Coastguard Worker         c2 += 8;
567*4bdc9457SAndroid Build Coastguard Worker         c1 += 8;
568*4bdc9457SAndroid Build Coastguard Worker         c0 += 8;
569*4bdc9457SAndroid Build Coastguard Worker       }
570*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
571*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c3, vh3x01234567);
572*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c2, vh2x01234567);
573*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c1, vh1x01234567);
574*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) c0, vh0x01234567);
575*4bdc9457SAndroid Build Coastguard Worker 
576*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
577*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
578*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
579*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
580*4bdc9457SAndroid Build Coastguard Worker 
581*4bdc9457SAndroid Build Coastguard Worker         c3 += 4;
582*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
583*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
584*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
585*4bdc9457SAndroid Build Coastguard Worker       }
586*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
587*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c3, vh3x01234567);
588*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c2, vh2x01234567);
589*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vh1x01234567);
590*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vh0x01234567);
591*4bdc9457SAndroid Build Coastguard Worker 
592*4bdc9457SAndroid Build Coastguard Worker         vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
593*4bdc9457SAndroid Build Coastguard Worker         vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
594*4bdc9457SAndroid Build Coastguard Worker         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
595*4bdc9457SAndroid Build Coastguard Worker         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
596*4bdc9457SAndroid Build Coastguard Worker 
597*4bdc9457SAndroid Build Coastguard Worker         c3 += 2;
598*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
599*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
600*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
601*4bdc9457SAndroid Build Coastguard Worker       }
602*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
603*4bdc9457SAndroid Build Coastguard Worker         *c3 = _mm_extract_epi16(vh3x01234567, 0);
604*4bdc9457SAndroid Build Coastguard Worker         *c2 = _mm_extract_epi16(vh2x01234567, 0);
605*4bdc9457SAndroid Build Coastguard Worker         *c1 = _mm_extract_epi16(vh1x01234567, 0);
606*4bdc9457SAndroid Build Coastguard Worker         *c0 = _mm_extract_epi16(vh0x01234567, 0);
607*4bdc9457SAndroid Build Coastguard Worker       }
608*4bdc9457SAndroid Build Coastguard Worker 
609*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
610*4bdc9457SAndroid Build Coastguard Worker     }
611*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
612*4bdc9457SAndroid Build Coastguard Worker }
613*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_pavgpool_minmax_ukernel_9p8x__avx2_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,const void * zero,const void * multiplier,void * buffer,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])614*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_pavgpool_minmax_ukernel_9p8x__avx2_c8(
615*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
616*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
617*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
618*4bdc9457SAndroid Build Coastguard Worker     const void** input,
619*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
620*4bdc9457SAndroid Build Coastguard Worker     const void* zero,
621*4bdc9457SAndroid Build Coastguard Worker     const void* multiplier,
622*4bdc9457SAndroid Build Coastguard Worker     void* buffer,
623*4bdc9457SAndroid Build Coastguard Worker     void* output,
624*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
625*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
626*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
627*4bdc9457SAndroid Build Coastguard Worker {
628*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
629*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements > 9);
630*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
631*4bdc9457SAndroid Build Coastguard Worker 
632*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_min = _mm256_load_ps(params->avx.min);
633*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_max = _mm256_load_ps(params->avx.max);
634*4bdc9457SAndroid Build Coastguard Worker 
635*4bdc9457SAndroid Build Coastguard Worker   uint16_t* o = (uint16_t*) output;
636*4bdc9457SAndroid Build Coastguard Worker   do {
637*4bdc9457SAndroid Build Coastguard Worker     {
638*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i0 = (const uint16_t*) *input++;
639*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
640*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
641*4bdc9457SAndroid Build Coastguard Worker         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
642*4bdc9457SAndroid Build Coastguard Worker       }
643*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i1 = (const uint16_t*) *input++;
644*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
645*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
646*4bdc9457SAndroid Build Coastguard Worker         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
647*4bdc9457SAndroid Build Coastguard Worker       }
648*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i2 = (const uint16_t*) *input++;
649*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
650*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
651*4bdc9457SAndroid Build Coastguard Worker         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
652*4bdc9457SAndroid Build Coastguard Worker       }
653*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i3 = (const uint16_t*) *input++;
654*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
655*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
656*4bdc9457SAndroid Build Coastguard Worker         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
657*4bdc9457SAndroid Build Coastguard Worker       }
658*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i4 = (const uint16_t*) *input++;
659*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
660*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
661*4bdc9457SAndroid Build Coastguard Worker         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
662*4bdc9457SAndroid Build Coastguard Worker       }
663*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i5 = (const uint16_t*) *input++;
664*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
665*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
666*4bdc9457SAndroid Build Coastguard Worker         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
667*4bdc9457SAndroid Build Coastguard Worker       }
668*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i6 = (const uint16_t*) *input++;
669*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
670*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
671*4bdc9457SAndroid Build Coastguard Worker         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
672*4bdc9457SAndroid Build Coastguard Worker       }
673*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i7 = (const uint16_t*) *input++;
674*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
675*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
676*4bdc9457SAndroid Build Coastguard Worker         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
677*4bdc9457SAndroid Build Coastguard Worker       }
678*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i8 = (const uint16_t*) *input++;
679*4bdc9457SAndroid Build Coastguard Worker       assert(i8 != NULL);
680*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i8 != zero) {
681*4bdc9457SAndroid Build Coastguard Worker         i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
682*4bdc9457SAndroid Build Coastguard Worker       }
683*4bdc9457SAndroid Build Coastguard Worker 
684*4bdc9457SAndroid Build Coastguard Worker       uint16_t* b = (uint16_t*) buffer;
685*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 8) {
686*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
687*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
688*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
689*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
690*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
691*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
692*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
693*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
694*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
695*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
696*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
697*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
698*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
699*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
700*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
701*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
702*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
703*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
704*4bdc9457SAndroid Build Coastguard Worker 
705*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
706*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
707*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
708*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
709*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
710*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
711*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
712*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
713*4bdc9457SAndroid Build Coastguard Worker 
714*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) b, _mm256_cvtps_ph(vsum, _MM_FROUND_NO_EXC));
715*4bdc9457SAndroid Build Coastguard Worker         b += 8;
716*4bdc9457SAndroid Build Coastguard Worker       }
717*4bdc9457SAndroid Build Coastguard Worker     }
718*4bdc9457SAndroid Build Coastguard Worker 
719*4bdc9457SAndroid Build Coastguard Worker     size_t k = kernel_elements;
720*4bdc9457SAndroid Build Coastguard Worker     for (k -= 9; k > 8; k -= 8) {
721*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i0 = (const uint16_t*) *input++;
722*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
723*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
724*4bdc9457SAndroid Build Coastguard Worker         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
725*4bdc9457SAndroid Build Coastguard Worker       }
726*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i1 = (const uint16_t*) *input++;
727*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
728*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
729*4bdc9457SAndroid Build Coastguard Worker         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
730*4bdc9457SAndroid Build Coastguard Worker       }
731*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i2 = (const uint16_t*) *input++;
732*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
733*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
734*4bdc9457SAndroid Build Coastguard Worker         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
735*4bdc9457SAndroid Build Coastguard Worker       }
736*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i3 = (const uint16_t*) *input++;
737*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
738*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
739*4bdc9457SAndroid Build Coastguard Worker         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
740*4bdc9457SAndroid Build Coastguard Worker       }
741*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i4 = (const uint16_t*) *input++;
742*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
743*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
744*4bdc9457SAndroid Build Coastguard Worker         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
745*4bdc9457SAndroid Build Coastguard Worker       }
746*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i5 = (const uint16_t*) *input++;
747*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
748*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
749*4bdc9457SAndroid Build Coastguard Worker         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
750*4bdc9457SAndroid Build Coastguard Worker       }
751*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i6 = (const uint16_t*) *input++;
752*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
753*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
754*4bdc9457SAndroid Build Coastguard Worker         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
755*4bdc9457SAndroid Build Coastguard Worker       }
756*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i7 = (const uint16_t*) *input++;
757*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
758*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
759*4bdc9457SAndroid Build Coastguard Worker         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
760*4bdc9457SAndroid Build Coastguard Worker       }
761*4bdc9457SAndroid Build Coastguard Worker 
762*4bdc9457SAndroid Build Coastguard Worker       uint16_t* b = (uint16_t*) buffer;
763*4bdc9457SAndroid Build Coastguard Worker       for (size_t c = 0; c < channels; c += 8) {
764*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
765*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
766*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
767*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
768*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
769*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
770*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
771*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
772*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
773*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
774*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
775*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
776*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
777*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
778*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
779*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
780*4bdc9457SAndroid Build Coastguard Worker         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
781*4bdc9457SAndroid Build Coastguard Worker 
782*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
783*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
784*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
785*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
786*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
787*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
788*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
789*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
790*4bdc9457SAndroid Build Coastguard Worker 
791*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) b, _mm256_cvtps_ph(vsum, _MM_FROUND_NO_EXC));
792*4bdc9457SAndroid Build Coastguard Worker         b += 8;
793*4bdc9457SAndroid Build Coastguard Worker       }
794*4bdc9457SAndroid Build Coastguard Worker     }
795*4bdc9457SAndroid Build Coastguard Worker 
796*4bdc9457SAndroid Build Coastguard Worker     {
797*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i0 = (const uint16_t*) input[0];
798*4bdc9457SAndroid Build Coastguard Worker       assert(i0 != NULL);
799*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i1 = (const uint16_t*) input[1];
800*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i2 = (const uint16_t*) input[2];
801*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i3 = (const uint16_t*) input[3];
802*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i4 = (const uint16_t*) input[4];
803*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i5 = (const uint16_t*) input[5];
804*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i6 = (const uint16_t*) input[6];
805*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* i7 = (const uint16_t*) input[7];
806*4bdc9457SAndroid Build Coastguard Worker       input = (const void**) ((uintptr_t) input + input_increment);
807*4bdc9457SAndroid Build Coastguard Worker       if (k < 2) {
808*4bdc9457SAndroid Build Coastguard Worker         i1 = (const uint16_t*) zero;
809*4bdc9457SAndroid Build Coastguard Worker       }
810*4bdc9457SAndroid Build Coastguard Worker       assert(i1 != NULL);
811*4bdc9457SAndroid Build Coastguard Worker       if (k <= 2) {
812*4bdc9457SAndroid Build Coastguard Worker         i2 = (const uint16_t*) zero;
813*4bdc9457SAndroid Build Coastguard Worker       }
814*4bdc9457SAndroid Build Coastguard Worker       assert(i2 != NULL);
815*4bdc9457SAndroid Build Coastguard Worker       if (k < 4) {
816*4bdc9457SAndroid Build Coastguard Worker         i3 = (const uint16_t*) zero;
817*4bdc9457SAndroid Build Coastguard Worker       }
818*4bdc9457SAndroid Build Coastguard Worker       assert(i3 != NULL);
819*4bdc9457SAndroid Build Coastguard Worker       if (k <= 4) {
820*4bdc9457SAndroid Build Coastguard Worker         i4 = (const uint16_t*) zero;
821*4bdc9457SAndroid Build Coastguard Worker       }
822*4bdc9457SAndroid Build Coastguard Worker       assert(i4 != NULL);
823*4bdc9457SAndroid Build Coastguard Worker       if (k < 6) {
824*4bdc9457SAndroid Build Coastguard Worker         i5 = (const uint16_t*) zero;
825*4bdc9457SAndroid Build Coastguard Worker       }
826*4bdc9457SAndroid Build Coastguard Worker       assert(i5 != NULL);
827*4bdc9457SAndroid Build Coastguard Worker       if (k <= 6) {
828*4bdc9457SAndroid Build Coastguard Worker         i6 = (const uint16_t*) zero;
829*4bdc9457SAndroid Build Coastguard Worker       }
830*4bdc9457SAndroid Build Coastguard Worker       assert(i6 != NULL);
831*4bdc9457SAndroid Build Coastguard Worker       if (k < 8) {
832*4bdc9457SAndroid Build Coastguard Worker         i7 = (const uint16_t*) zero;
833*4bdc9457SAndroid Build Coastguard Worker       }
834*4bdc9457SAndroid Build Coastguard Worker       assert(i7 != NULL);
835*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i0 != zero) {
836*4bdc9457SAndroid Build Coastguard Worker         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
837*4bdc9457SAndroid Build Coastguard Worker       }
838*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i1 != zero) {
839*4bdc9457SAndroid Build Coastguard Worker         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
840*4bdc9457SAndroid Build Coastguard Worker       }
841*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i2 != zero) {
842*4bdc9457SAndroid Build Coastguard Worker         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
843*4bdc9457SAndroid Build Coastguard Worker       }
844*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i3 != zero) {
845*4bdc9457SAndroid Build Coastguard Worker         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
846*4bdc9457SAndroid Build Coastguard Worker       }
847*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i4 != zero) {
848*4bdc9457SAndroid Build Coastguard Worker         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
849*4bdc9457SAndroid Build Coastguard Worker       }
850*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i5 != zero) {
851*4bdc9457SAndroid Build Coastguard Worker         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
852*4bdc9457SAndroid Build Coastguard Worker       }
853*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i6 != zero) {
854*4bdc9457SAndroid Build Coastguard Worker         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
855*4bdc9457SAndroid Build Coastguard Worker       }
856*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(i7 != zero) {
857*4bdc9457SAndroid Build Coastguard Worker         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
858*4bdc9457SAndroid Build Coastguard Worker       }
859*4bdc9457SAndroid Build Coastguard Worker 
860*4bdc9457SAndroid Build Coastguard Worker       const __m256 vmultiplier = _mm256_cvtph_ps(_mm_set1_epi16((short) *((const uint16_t*) multiplier)));
861*4bdc9457SAndroid Build Coastguard Worker       multiplier = (const uint16_t*) multiplier + 1;
862*4bdc9457SAndroid Build Coastguard Worker 
863*4bdc9457SAndroid Build Coastguard Worker       size_t c = channels;
864*4bdc9457SAndroid Build Coastguard Worker       const uint16_t* b = (const uint16_t*) buffer;
865*4bdc9457SAndroid Build Coastguard Worker       while (c >= 8) {
866*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
867*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
868*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
869*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
870*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
871*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
872*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
873*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
874*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
875*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
876*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
877*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
878*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
879*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
880*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
881*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
882*4bdc9457SAndroid Build Coastguard Worker         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
883*4bdc9457SAndroid Build Coastguard Worker         b += 8;
884*4bdc9457SAndroid Build Coastguard Worker 
885*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
886*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
887*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
888*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
889*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
890*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
891*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
892*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
893*4bdc9457SAndroid Build Coastguard Worker 
894*4bdc9457SAndroid Build Coastguard Worker         __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vmultiplier), _MM_FROUND_NO_EXC));
895*4bdc9457SAndroid Build Coastguard Worker         vout = _mm256_max_ps(vout, voutput_min);
896*4bdc9457SAndroid Build Coastguard Worker         vout = _mm256_min_ps(vout, voutput_max);
897*4bdc9457SAndroid Build Coastguard Worker 
898*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
899*4bdc9457SAndroid Build Coastguard Worker         o += 8;
900*4bdc9457SAndroid Build Coastguard Worker 
901*4bdc9457SAndroid Build Coastguard Worker         c -= 8;
902*4bdc9457SAndroid Build Coastguard Worker       }
903*4bdc9457SAndroid Build Coastguard Worker       if (c != 0) {
904*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
905*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
906*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
907*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
908*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
909*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
910*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
911*4bdc9457SAndroid Build Coastguard Worker         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
912*4bdc9457SAndroid Build Coastguard Worker         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
913*4bdc9457SAndroid Build Coastguard Worker 
914*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
915*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
916*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
917*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
918*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
919*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
920*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
921*4bdc9457SAndroid Build Coastguard Worker         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
922*4bdc9457SAndroid Build Coastguard Worker 
923*4bdc9457SAndroid Build Coastguard Worker         __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vmultiplier), _MM_FROUND_NO_EXC));
924*4bdc9457SAndroid Build Coastguard Worker         vout = _mm256_max_ps(vout, voutput_min);
925*4bdc9457SAndroid Build Coastguard Worker         vout = _mm256_min_ps(vout, voutput_max);
926*4bdc9457SAndroid Build Coastguard Worker 
927*4bdc9457SAndroid Build Coastguard Worker         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
928*4bdc9457SAndroid Build Coastguard Worker         if (c & 4) {
929*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) o, vh);
930*4bdc9457SAndroid Build Coastguard Worker           vh = _mm_unpackhi_epi64(vh, vh);
931*4bdc9457SAndroid Build Coastguard Worker           o += 4;
932*4bdc9457SAndroid Build Coastguard Worker         }
933*4bdc9457SAndroid Build Coastguard Worker         if (c & 2) {
934*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si32(o, vh);
935*4bdc9457SAndroid Build Coastguard Worker           vh = _mm_srli_epi64(vh, 32);
936*4bdc9457SAndroid Build Coastguard Worker           o += 2;
937*4bdc9457SAndroid Build Coastguard Worker         }
938*4bdc9457SAndroid Build Coastguard Worker         if (c & 1) {
939*4bdc9457SAndroid Build Coastguard Worker           *o = (uint16_t) _mm_extract_epi16(vh, 0);
940*4bdc9457SAndroid Build Coastguard Worker           o += 1;
941*4bdc9457SAndroid Build Coastguard Worker         }
942*4bdc9457SAndroid Build Coastguard Worker       }
943*4bdc9457SAndroid Build Coastguard Worker     }
944*4bdc9457SAndroid Build Coastguard Worker     o = (uint16_t*) ((uintptr_t) o + output_increment);
945*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
946*4bdc9457SAndroid Build Coastguard Worker }
947*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_pavgpool_minmax_ukernel_9x__avx2_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,const void * zero,const void * multiplier,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])948*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_pavgpool_minmax_ukernel_9x__avx2_c8(
949*4bdc9457SAndroid Build Coastguard Worker     size_t output_pixels,
950*4bdc9457SAndroid Build Coastguard Worker     size_t kernel_elements,
951*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
952*4bdc9457SAndroid Build Coastguard Worker     const void** input,
953*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
954*4bdc9457SAndroid Build Coastguard Worker     const void* zero,
955*4bdc9457SAndroid Build Coastguard Worker     const void* multiplier,
956*4bdc9457SAndroid Build Coastguard Worker     void* output,
957*4bdc9457SAndroid Build Coastguard Worker     size_t input_increment,
958*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
959*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
960*4bdc9457SAndroid Build Coastguard Worker {
961*4bdc9457SAndroid Build Coastguard Worker   assert(output_pixels != 0);
962*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements != 0);
963*4bdc9457SAndroid Build Coastguard Worker   assert(kernel_elements <= 9);
964*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
965*4bdc9457SAndroid Build Coastguard Worker 
966*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_min = _mm256_load_ps(params->avx.min);
967*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_max = _mm256_load_ps(params->avx.max);
968*4bdc9457SAndroid Build Coastguard Worker 
969*4bdc9457SAndroid Build Coastguard Worker   uint16_t* o = (uint16_t*) output;
970*4bdc9457SAndroid Build Coastguard Worker   do {
971*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i0 = (const uint16_t*) input[0];
972*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
973*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i1 = (const uint16_t*) input[1];
974*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i2 = (const uint16_t*) input[2];
975*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i3 = (const uint16_t*) input[3];
976*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i4 = (const uint16_t*) input[4];
977*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i5 = (const uint16_t*) input[5];
978*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i6 = (const uint16_t*) input[6];
979*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i7 = (const uint16_t*) input[7];
980*4bdc9457SAndroid Build Coastguard Worker     const uint16_t* i8 = (const uint16_t*) input[8];
981*4bdc9457SAndroid Build Coastguard Worker     input = (const void**) ((uintptr_t) input + input_increment);
982*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 2) {
983*4bdc9457SAndroid Build Coastguard Worker       i1 = (const uint16_t*) zero;
984*4bdc9457SAndroid Build Coastguard Worker     }
985*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
986*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 2) {
987*4bdc9457SAndroid Build Coastguard Worker       i2 = (const uint16_t*) zero;
988*4bdc9457SAndroid Build Coastguard Worker     }
989*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
990*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 4) {
991*4bdc9457SAndroid Build Coastguard Worker       i3 = (const uint16_t*) zero;
992*4bdc9457SAndroid Build Coastguard Worker     }
993*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
994*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 4) {
995*4bdc9457SAndroid Build Coastguard Worker       i4 = (const uint16_t*) zero;
996*4bdc9457SAndroid Build Coastguard Worker     }
997*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
998*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 6) {
999*4bdc9457SAndroid Build Coastguard Worker       i5 = (const uint16_t*) zero;
1000*4bdc9457SAndroid Build Coastguard Worker     }
1001*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
1002*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 6) {
1003*4bdc9457SAndroid Build Coastguard Worker       i6 = (const uint16_t*) zero;
1004*4bdc9457SAndroid Build Coastguard Worker     }
1005*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
1006*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements < 8) {
1007*4bdc9457SAndroid Build Coastguard Worker       i7 = (const uint16_t*) zero;
1008*4bdc9457SAndroid Build Coastguard Worker     }
1009*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
1010*4bdc9457SAndroid Build Coastguard Worker     if (kernel_elements <= 8) {
1011*4bdc9457SAndroid Build Coastguard Worker       i8 = (const uint16_t*) zero;
1012*4bdc9457SAndroid Build Coastguard Worker     }
1013*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
1014*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
1015*4bdc9457SAndroid Build Coastguard Worker       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
1016*4bdc9457SAndroid Build Coastguard Worker     }
1017*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
1018*4bdc9457SAndroid Build Coastguard Worker       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
1019*4bdc9457SAndroid Build Coastguard Worker     }
1020*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
1021*4bdc9457SAndroid Build Coastguard Worker       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
1022*4bdc9457SAndroid Build Coastguard Worker     }
1023*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
1024*4bdc9457SAndroid Build Coastguard Worker       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
1025*4bdc9457SAndroid Build Coastguard Worker     }
1026*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
1027*4bdc9457SAndroid Build Coastguard Worker       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
1028*4bdc9457SAndroid Build Coastguard Worker     }
1029*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
1030*4bdc9457SAndroid Build Coastguard Worker       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
1031*4bdc9457SAndroid Build Coastguard Worker     }
1032*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
1033*4bdc9457SAndroid Build Coastguard Worker       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
1034*4bdc9457SAndroid Build Coastguard Worker     }
1035*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
1036*4bdc9457SAndroid Build Coastguard Worker       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
1037*4bdc9457SAndroid Build Coastguard Worker     }
1038*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
1039*4bdc9457SAndroid Build Coastguard Worker       i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
1040*4bdc9457SAndroid Build Coastguard Worker     }
1041*4bdc9457SAndroid Build Coastguard Worker 
1042*4bdc9457SAndroid Build Coastguard Worker     const __m256 vmultiplier = _mm256_cvtph_ps(_mm_set1_epi16((short) *((const uint16_t*) multiplier)));
1043*4bdc9457SAndroid Build Coastguard Worker     multiplier = (const uint16_t*) multiplier + 1;
1044*4bdc9457SAndroid Build Coastguard Worker 
1045*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
1046*4bdc9457SAndroid Build Coastguard Worker     while (c >= 8) {
1047*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1048*4bdc9457SAndroid Build Coastguard Worker       i0 += 8;
1049*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1050*4bdc9457SAndroid Build Coastguard Worker       i1 += 8;
1051*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1052*4bdc9457SAndroid Build Coastguard Worker       i2 += 8;
1053*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1054*4bdc9457SAndroid Build Coastguard Worker       i3 += 8;
1055*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1056*4bdc9457SAndroid Build Coastguard Worker       i4 += 8;
1057*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1058*4bdc9457SAndroid Build Coastguard Worker       i5 += 8;
1059*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1060*4bdc9457SAndroid Build Coastguard Worker       i6 += 8;
1061*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1062*4bdc9457SAndroid Build Coastguard Worker       i7 += 8;
1063*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
1064*4bdc9457SAndroid Build Coastguard Worker       i8 += 8;
1065*4bdc9457SAndroid Build Coastguard Worker 
1066*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
1067*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
1068*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
1069*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
1070*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
1071*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
1072*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
1073*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
1074*4bdc9457SAndroid Build Coastguard Worker 
1075*4bdc9457SAndroid Build Coastguard Worker       __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vmultiplier), _MM_FROUND_NO_EXC));
1076*4bdc9457SAndroid Build Coastguard Worker       vout = _mm256_max_ps(vout, voutput_min);
1077*4bdc9457SAndroid Build Coastguard Worker       vout = _mm256_min_ps(vout, voutput_max);
1078*4bdc9457SAndroid Build Coastguard Worker 
1079*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
1080*4bdc9457SAndroid Build Coastguard Worker       o += 8;
1081*4bdc9457SAndroid Build Coastguard Worker 
1082*4bdc9457SAndroid Build Coastguard Worker       c -= 8;
1083*4bdc9457SAndroid Build Coastguard Worker     }
1084*4bdc9457SAndroid Build Coastguard Worker     if (c != 0) {
1085*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1086*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1087*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1088*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1089*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1090*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1091*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1092*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1093*4bdc9457SAndroid Build Coastguard Worker       const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
1094*4bdc9457SAndroid Build Coastguard Worker 
1095*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
1096*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
1097*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
1098*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
1099*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
1100*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
1101*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
1102*4bdc9457SAndroid Build Coastguard Worker       const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
1103*4bdc9457SAndroid Build Coastguard Worker 
1104*4bdc9457SAndroid Build Coastguard Worker       __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vmultiplier), _MM_FROUND_NO_EXC));
1105*4bdc9457SAndroid Build Coastguard Worker       vout = _mm256_max_ps(vout, voutput_min);
1106*4bdc9457SAndroid Build Coastguard Worker       vout = _mm256_min_ps(vout, voutput_max);
1107*4bdc9457SAndroid Build Coastguard Worker 
1108*4bdc9457SAndroid Build Coastguard Worker       __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1109*4bdc9457SAndroid Build Coastguard Worker       if (c & 4) {
1110*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) o, vh);
1111*4bdc9457SAndroid Build Coastguard Worker         vh = _mm_unpackhi_epi64(vh, vh);
1112*4bdc9457SAndroid Build Coastguard Worker         o += 4;
1113*4bdc9457SAndroid Build Coastguard Worker       }
1114*4bdc9457SAndroid Build Coastguard Worker       if (c & 2) {
1115*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(o, vh);
1116*4bdc9457SAndroid Build Coastguard Worker         vh = _mm_srli_epi64(vh, 32);
1117*4bdc9457SAndroid Build Coastguard Worker         o += 2;
1118*4bdc9457SAndroid Build Coastguard Worker       }
1119*4bdc9457SAndroid Build Coastguard Worker       if (c & 1) {
1120*4bdc9457SAndroid Build Coastguard Worker         *o = (uint16_t) _mm_extract_epi16(vh, 0);
1121*4bdc9457SAndroid Build Coastguard Worker         o += 1;
1122*4bdc9457SAndroid Build Coastguard Worker       }
1123*4bdc9457SAndroid Build Coastguard Worker     }
1124*4bdc9457SAndroid Build Coastguard Worker     o = (uint16_t*) ((uintptr_t) o + output_increment);
1125*4bdc9457SAndroid Build Coastguard Worker   } while (--output_pixels != 0);
1126*4bdc9457SAndroid Build Coastguard Worker }
1127*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40(size_t batch,const void * input,const void * max,void * output,void * sum,const union xnn_f16_expminus_params params[restrict XNN_MIN_ELEMENTS (1)])1128*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40(
1129*4bdc9457SAndroid Build Coastguard Worker     size_t batch,
1130*4bdc9457SAndroid Build Coastguard Worker     const void* input,
1131*4bdc9457SAndroid Build Coastguard Worker     const void* max,
1132*4bdc9457SAndroid Build Coastguard Worker     void* output,
1133*4bdc9457SAndroid Build Coastguard Worker     void* sum,
1134*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1135*4bdc9457SAndroid Build Coastguard Worker {
1136*4bdc9457SAndroid Build Coastguard Worker   assert(batch % sizeof(uint16_t) == 0);
1137*4bdc9457SAndroid Build Coastguard Worker 
1138*4bdc9457SAndroid Build Coastguard Worker   const __m256 vi_max = _mm256_cvtph_ps(_mm_set1_epi16((short) *((const uint16_t*) max)));
1139*4bdc9457SAndroid Build Coastguard Worker   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p2.log2e);
1140*4bdc9457SAndroid Build Coastguard Worker   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p2.magic_bias);
1141*4bdc9457SAndroid Build Coastguard Worker   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);
1142*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p2.c2);
1143*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p2.c1);
1144*4bdc9457SAndroid Build Coastguard Worker   const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx2_rr1_p2.denorm_cutoff);
1145*4bdc9457SAndroid Build Coastguard Worker 
1146*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* i = (const uint16_t*) input;
1147*4bdc9457SAndroid Build Coastguard Worker   uint16_t* o = (uint16_t*) output;
1148*4bdc9457SAndroid Build Coastguard Worker   __m256 vacc0 = _mm256_setzero_ps();
1149*4bdc9457SAndroid Build Coastguard Worker   for (; batch >= 40 * sizeof(uint16_t); batch -= 40 * sizeof(uint16_t)) {
1150*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1151*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1152*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 16)));
1153*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 24)));
1154*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 32)));
1155*4bdc9457SAndroid Build Coastguard Worker     i += 40;
1156*4bdc9457SAndroid Build Coastguard Worker 
1157*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx0 = _mm256_sub_ps(vi0, vi_max);
1158*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx1 = _mm256_sub_ps(vi1, vi_max);
1159*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx2 = _mm256_sub_ps(vi2, vi_max);
1160*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx3 = _mm256_sub_ps(vi3, vi_max);
1161*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx4 = _mm256_sub_ps(vi4, vi_max);
1162*4bdc9457SAndroid Build Coastguard Worker 
1163*4bdc9457SAndroid Build Coastguard Worker     __m256 vn0 = _mm256_fmadd_ps(vx0, vlog2e, vmagic_bias);
1164*4bdc9457SAndroid Build Coastguard Worker     __m256 vn1 = _mm256_fmadd_ps(vx1, vlog2e, vmagic_bias);
1165*4bdc9457SAndroid Build Coastguard Worker     __m256 vn2 = _mm256_fmadd_ps(vx2, vlog2e, vmagic_bias);
1166*4bdc9457SAndroid Build Coastguard Worker     __m256 vn3 = _mm256_fmadd_ps(vx3, vlog2e, vmagic_bias);
1167*4bdc9457SAndroid Build Coastguard Worker     __m256 vn4 = _mm256_fmadd_ps(vx4, vlog2e, vmagic_bias);
1168*4bdc9457SAndroid Build Coastguard Worker 
1169*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
1170*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
1171*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
1172*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));
1173*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23));
1174*4bdc9457SAndroid Build Coastguard Worker 
1175*4bdc9457SAndroid Build Coastguard Worker     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1176*4bdc9457SAndroid Build Coastguard Worker     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1177*4bdc9457SAndroid Build Coastguard Worker     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
1178*4bdc9457SAndroid Build Coastguard Worker     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
1179*4bdc9457SAndroid Build Coastguard Worker     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
1180*4bdc9457SAndroid Build Coastguard Worker 
1181*4bdc9457SAndroid Build Coastguard Worker     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
1182*4bdc9457SAndroid Build Coastguard Worker     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
1183*4bdc9457SAndroid Build Coastguard Worker     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
1184*4bdc9457SAndroid Build Coastguard Worker     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
1185*4bdc9457SAndroid Build Coastguard Worker     __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
1186*4bdc9457SAndroid Build Coastguard Worker 
1187*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp0 = _mm256_fmadd_ps(vc2, vt0, vc1);
1188*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp1 = _mm256_fmadd_ps(vc2, vt1, vc1);
1189*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp2 = _mm256_fmadd_ps(vc2, vt2, vc1);
1190*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp3 = _mm256_fmadd_ps(vc2, vt3, vc1);
1191*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp4 = _mm256_fmadd_ps(vc2, vt4, vc1);
1192*4bdc9457SAndroid Build Coastguard Worker 
1193*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, vs0);
1194*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, vs1);
1195*4bdc9457SAndroid Build Coastguard Worker     vt2 = _mm256_mul_ps(vt2, vs2);
1196*4bdc9457SAndroid Build Coastguard Worker     vt3 = _mm256_mul_ps(vt3, vs3);
1197*4bdc9457SAndroid Build Coastguard Worker     vt4 = _mm256_mul_ps(vt4, vs4);
1198*4bdc9457SAndroid Build Coastguard Worker 
1199*4bdc9457SAndroid Build Coastguard Worker     __m256 vf0 = _mm256_fmadd_ps(vt0, vp0, vs0);
1200*4bdc9457SAndroid Build Coastguard Worker     __m256 vf1 = _mm256_fmadd_ps(vt1, vp1, vs1);
1201*4bdc9457SAndroid Build Coastguard Worker     __m256 vf2 = _mm256_fmadd_ps(vt2, vp2, vs2);
1202*4bdc9457SAndroid Build Coastguard Worker     __m256 vf3 = _mm256_fmadd_ps(vt3, vp3, vs3);
1203*4bdc9457SAndroid Build Coastguard Worker     __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4);
1204*4bdc9457SAndroid Build Coastguard Worker 
1205*4bdc9457SAndroid Build Coastguard Worker     vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vx0, vdenorm_cutoff, _CMP_LT_OS), vf0);
1206*4bdc9457SAndroid Build Coastguard Worker     vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vx1, vdenorm_cutoff, _CMP_LT_OS), vf1);
1207*4bdc9457SAndroid Build Coastguard Worker     vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vx2, vdenorm_cutoff, _CMP_LT_OS), vf2);
1208*4bdc9457SAndroid Build Coastguard Worker     vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vx3, vdenorm_cutoff, _CMP_LT_OS), vf3);
1209*4bdc9457SAndroid Build Coastguard Worker     vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vx4, vdenorm_cutoff, _CMP_LT_OS), vf4);
1210*4bdc9457SAndroid Build Coastguard Worker 
1211*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
1212*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
1213*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 16), _mm256_cvtps_ph(vf2, _MM_FROUND_NO_EXC));
1214*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 24), _mm256_cvtps_ph(vf3, _MM_FROUND_NO_EXC));
1215*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 32), _mm256_cvtps_ph(vf4, _MM_FROUND_NO_EXC));
1216*4bdc9457SAndroid Build Coastguard Worker     o += 40;
1217*4bdc9457SAndroid Build Coastguard Worker 
1218*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_add_ps(vacc0, vf0);
1219*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_add_ps(vacc0, vf1);
1220*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_add_ps(vacc0, vf2);
1221*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_add_ps(vacc0, vf3);
1222*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_add_ps(vacc0, vf4);
1223*4bdc9457SAndroid Build Coastguard Worker   }
1224*4bdc9457SAndroid Build Coastguard Worker 
1225*4bdc9457SAndroid Build Coastguard Worker   __m256 vacc = vacc0;
1226*4bdc9457SAndroid Build Coastguard Worker   for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
1227*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1228*4bdc9457SAndroid Build Coastguard Worker     i += 8;
1229*4bdc9457SAndroid Build Coastguard Worker 
1230*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_sub_ps(vi, vi_max);
1231*4bdc9457SAndroid Build Coastguard Worker 
1232*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
1233*4bdc9457SAndroid Build Coastguard Worker 
1234*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1235*4bdc9457SAndroid Build Coastguard Worker 
1236*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1237*4bdc9457SAndroid Build Coastguard Worker 
1238*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
1239*4bdc9457SAndroid Build Coastguard Worker 
1240*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp = _mm256_fmadd_ps(vc2, vt, vc1);
1241*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1242*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
1243*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
1244*4bdc9457SAndroid Build Coastguard Worker 
1245*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
1246*4bdc9457SAndroid Build Coastguard Worker     o += 8;
1247*4bdc9457SAndroid Build Coastguard Worker 
1248*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_add_ps(vacc, vf);
1249*4bdc9457SAndroid Build Coastguard Worker   }
1250*4bdc9457SAndroid Build Coastguard Worker   __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1));
1251*4bdc9457SAndroid Build Coastguard Worker   if (batch != 0) {
1252*4bdc9457SAndroid Build Coastguard Worker     assert(batch >= 1 * sizeof(uint16_t));
1253*4bdc9457SAndroid Build Coastguard Worker     assert(batch <= 7 * sizeof(uint16_t));
1254*4bdc9457SAndroid Build Coastguard Worker 
1255*4bdc9457SAndroid Build Coastguard Worker     const __m256 vi = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1256*4bdc9457SAndroid Build Coastguard Worker 
1257*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_sub_ps(vi, vi_max);
1258*4bdc9457SAndroid Build Coastguard Worker 
1259*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
1260*4bdc9457SAndroid Build Coastguard Worker 
1261*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1262*4bdc9457SAndroid Build Coastguard Worker 
1263*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1264*4bdc9457SAndroid Build Coastguard Worker 
1265*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
1266*4bdc9457SAndroid Build Coastguard Worker 
1267*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp = _mm256_fmadd_ps(vc2, vt, vc1);
1268*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1269*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
1270*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
1271*4bdc9457SAndroid Build Coastguard Worker 
1272*4bdc9457SAndroid Build Coastguard Worker     __m128i vh = _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC);
1273*4bdc9457SAndroid Build Coastguard Worker     __m128 vf_lo = _mm256_castps256_ps128(vf);
1274*4bdc9457SAndroid Build Coastguard Worker     if (batch & (4 * sizeof(uint16_t))) {
1275*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) o, vh);
1276*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_unpackhi_epi64(vh, vh);
1277*4bdc9457SAndroid Build Coastguard Worker       vacc_lo = _mm_add_ps(vacc_lo, vf_lo);
1278*4bdc9457SAndroid Build Coastguard Worker       vf_lo = _mm256_extractf128_ps(vf, 1);
1279*4bdc9457SAndroid Build Coastguard Worker       o += 4;
1280*4bdc9457SAndroid Build Coastguard Worker     }
1281*4bdc9457SAndroid Build Coastguard Worker     if (batch & (2 * sizeof(uint16_t))) {
1282*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(o, vh);
1283*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_srli_epi64(vh, 32);
1284*4bdc9457SAndroid Build Coastguard Worker       vacc_lo = _mm_blend_ps(_mm_add_ps(vacc_lo, vf_lo), vacc_lo, 0xC);
1285*4bdc9457SAndroid Build Coastguard Worker       vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
1286*4bdc9457SAndroid Build Coastguard Worker       o += 2;
1287*4bdc9457SAndroid Build Coastguard Worker     }
1288*4bdc9457SAndroid Build Coastguard Worker     if (batch & (1 * sizeof(uint16_t))) {
1289*4bdc9457SAndroid Build Coastguard Worker       *o = (uint16_t) _mm_extract_epi16(vh, 0);
1290*4bdc9457SAndroid Build Coastguard Worker       vacc_lo = _mm_add_ss(vacc_lo, vf_lo);
1291*4bdc9457SAndroid Build Coastguard Worker     }
1292*4bdc9457SAndroid Build Coastguard Worker   }
1293*4bdc9457SAndroid Build Coastguard Worker   vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo));
1294*4bdc9457SAndroid Build Coastguard Worker   vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo));
1295*4bdc9457SAndroid Build Coastguard Worker   *((uint16_t*) sum) = (uint16_t) _mm_extract_epi16(_mm_cvtps_ph(vacc_lo, _MM_FROUND_NO_EXC), 0);
1296*4bdc9457SAndroid Build Coastguard Worker   _mm256_zeroupper();
1297*4bdc9457SAndroid Build Coastguard Worker }
1298*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_velu_ukernel__avx2_rr1_p3_x16(size_t n,const void * input,void * output,const union xnn_f16_elu_params params[restrict XNN_MIN_ELEMENTS (1)])1299*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_velu_ukernel__avx2_rr1_p3_x16(
1300*4bdc9457SAndroid Build Coastguard Worker     size_t n,
1301*4bdc9457SAndroid Build Coastguard Worker     const void* input,
1302*4bdc9457SAndroid Build Coastguard Worker     void* output,
1303*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
1304*4bdc9457SAndroid Build Coastguard Worker {
1305*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(uint16_t) == 0);
1306*4bdc9457SAndroid Build Coastguard Worker 
1307*4bdc9457SAndroid Build Coastguard Worker   const __m256 vprescale = _mm256_load_ps(params->avx2_rr1_p3.prescale);
1308*4bdc9457SAndroid Build Coastguard Worker   const __m256 vsat_cutoff = _mm256_load_ps(params->avx2_rr1_p3.sat_cutoff);
1309*4bdc9457SAndroid Build Coastguard Worker   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p3.magic_bias);
1310*4bdc9457SAndroid Build Coastguard Worker   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p3.log2e);
1311*4bdc9457SAndroid Build Coastguard Worker   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p3.minus_ln2);
1312*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_p3.c3);
1313*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p3.c2);
1314*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p3.c1);
1315*4bdc9457SAndroid Build Coastguard Worker   const __m256 valpha = _mm256_load_ps(params->avx2_rr1_p3.alpha);
1316*4bdc9457SAndroid Build Coastguard Worker   const __m256 vbeta = _mm256_load_ps(params->avx2_rr1_p3.beta);
1317*4bdc9457SAndroid Build Coastguard Worker 
1318*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* i = (const uint16_t*) input;
1319*4bdc9457SAndroid Build Coastguard Worker   uint16_t* o = (uint16_t*) output;
1320*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1321*4bdc9457SAndroid Build Coastguard Worker     __m256 vx0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1322*4bdc9457SAndroid Build Coastguard Worker     __m256 vx1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1323*4bdc9457SAndroid Build Coastguard Worker     i += 16;
1324*4bdc9457SAndroid Build Coastguard Worker 
1325*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
1326*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
1327*4bdc9457SAndroid Build Coastguard Worker 
1328*4bdc9457SAndroid Build Coastguard Worker     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
1329*4bdc9457SAndroid Build Coastguard Worker     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
1330*4bdc9457SAndroid Build Coastguard Worker 
1331*4bdc9457SAndroid Build Coastguard Worker     __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
1332*4bdc9457SAndroid Build Coastguard Worker     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1333*4bdc9457SAndroid Build Coastguard Worker     __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
1334*4bdc9457SAndroid Build Coastguard Worker     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1335*4bdc9457SAndroid Build Coastguard Worker 
1336*4bdc9457SAndroid Build Coastguard Worker     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
1337*4bdc9457SAndroid Build Coastguard Worker     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
1338*4bdc9457SAndroid Build Coastguard Worker 
1339*4bdc9457SAndroid Build Coastguard Worker     __m256 vp0 = _mm256_fmadd_ps(vc3, vt0, vc2);
1340*4bdc9457SAndroid Build Coastguard Worker     __m256 vp1 = _mm256_fmadd_ps(vc3, vt1, vc2);
1341*4bdc9457SAndroid Build Coastguard Worker 
1342*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vc1);
1343*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, valpha);
1344*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vc1);
1345*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, valpha);
1346*4bdc9457SAndroid Build Coastguard Worker 
1347*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, vs0);
1348*4bdc9457SAndroid Build Coastguard Worker     vs0 = _mm256_fmsub_ps(vs0, valpha, valpha);
1349*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, vs1);
1350*4bdc9457SAndroid Build Coastguard Worker     vs1 = _mm256_fmsub_ps(vs1, valpha, valpha);
1351*4bdc9457SAndroid Build Coastguard Worker 
1352*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve0 = _mm256_fmadd_ps(vp0, vt0, vs0);
1353*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_mul_ps(vx0, vbeta);
1354*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve1 = _mm256_fmadd_ps(vp1, vt1, vs1);
1355*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_mul_ps(vx1, vbeta);
1356*4bdc9457SAndroid Build Coastguard Worker 
1357*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
1358*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
1359*4bdc9457SAndroid Build Coastguard Worker 
1360*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy0, _MM_FROUND_NO_EXC));
1361*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vy1, _MM_FROUND_NO_EXC));
1362*4bdc9457SAndroid Build Coastguard Worker     o += 16;
1363*4bdc9457SAndroid Build Coastguard Worker   }
1364*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1365*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1366*4bdc9457SAndroid Build Coastguard Worker     i += 8;
1367*4bdc9457SAndroid Build Coastguard Worker 
1368*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1369*4bdc9457SAndroid Build Coastguard Worker 
1370*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1371*4bdc9457SAndroid Build Coastguard Worker     __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1372*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1373*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1374*4bdc9457SAndroid Build Coastguard Worker 
1375*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc3, vt, vc2);
1376*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc1);
1377*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, valpha);
1378*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1379*4bdc9457SAndroid Build Coastguard Worker     vs = _mm256_fmsub_ps(vs, valpha, valpha);
1380*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vp, vt, vs);
1381*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vbeta);
1382*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1383*4bdc9457SAndroid Build Coastguard Worker 
1384*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1385*4bdc9457SAndroid Build Coastguard Worker     o += 8;
1386*4bdc9457SAndroid Build Coastguard Worker   }
1387*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
1388*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(uint16_t));
1389*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(uint16_t));
1390*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1391*4bdc9457SAndroid Build Coastguard Worker 
1392*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
1393*4bdc9457SAndroid Build Coastguard Worker 
1394*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1395*4bdc9457SAndroid Build Coastguard Worker     __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1396*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1397*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1398*4bdc9457SAndroid Build Coastguard Worker 
1399*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc3, vt, vc2);
1400*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc1);
1401*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, valpha);
1402*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1403*4bdc9457SAndroid Build Coastguard Worker     vs = _mm256_fmsub_ps(vs, valpha, valpha);
1404*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vp, vt, vs);
1405*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vbeta);
1406*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
1407*4bdc9457SAndroid Build Coastguard Worker 
1408*4bdc9457SAndroid Build Coastguard Worker     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1409*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(uint16_t))) {
1410*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) o, vh);
1411*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_unpackhi_epi64(vh, vh);
1412*4bdc9457SAndroid Build Coastguard Worker       o += 4;
1413*4bdc9457SAndroid Build Coastguard Worker     }
1414*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(uint16_t))) {
1415*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(o, vh);
1416*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_srli_epi64(vh, 32);
1417*4bdc9457SAndroid Build Coastguard Worker       o += 2;
1418*4bdc9457SAndroid Build Coastguard Worker     }
1419*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(uint16_t))) {
1420*4bdc9457SAndroid Build Coastguard Worker       *o = (uint16_t) _mm_extract_epi16(vh, 0);
1421*4bdc9457SAndroid Build Coastguard Worker     }
1422*4bdc9457SAndroid Build Coastguard Worker   }
1423*4bdc9457SAndroid Build Coastguard Worker }
1424*4bdc9457SAndroid Build Coastguard Worker 
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32(size_t batch,const void * input,void * output,const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS (1)])1425*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32(
1426*4bdc9457SAndroid Build Coastguard Worker     size_t batch,
1427*4bdc9457SAndroid Build Coastguard Worker     const void* input,
1428*4bdc9457SAndroid Build Coastguard Worker     void* output,
1429*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
1430*4bdc9457SAndroid Build Coastguard Worker {
1431*4bdc9457SAndroid Build Coastguard Worker   assert(batch % sizeof(uint16_t) == 0);
1432*4bdc9457SAndroid Build Coastguard Worker 
1433*4bdc9457SAndroid Build Coastguard Worker   const __m256 vsign_mask = _mm256_load_ps(params->avx2_rr1_p2.sign_mask);
1434*4bdc9457SAndroid Build Coastguard Worker   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p2.magic_bias);
1435*4bdc9457SAndroid Build Coastguard Worker   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p2.log2e);
1436*4bdc9457SAndroid Build Coastguard Worker   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);
1437*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p2.c2);
1438*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p2.c1);
1439*4bdc9457SAndroid Build Coastguard Worker   const __m256 vone = _mm256_load_ps(params->avx2_rr1_p2.one);
1440*4bdc9457SAndroid Build Coastguard Worker   const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx2_rr1_p2.denorm_cutoff);
1441*4bdc9457SAndroid Build Coastguard Worker 
1442*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* i = (const uint16_t*) input;
1443*4bdc9457SAndroid Build Coastguard Worker   uint16_t* o = (uint16_t*) output;
1444*4bdc9457SAndroid Build Coastguard Worker   for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
1445*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1446*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1447*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 16)));
1448*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 24)));
1449*4bdc9457SAndroid Build Coastguard Worker     i += 32;
1450*4bdc9457SAndroid Build Coastguard Worker 
1451*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
1452*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
1453*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
1454*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
1455*4bdc9457SAndroid Build Coastguard Worker 
1456*4bdc9457SAndroid Build Coastguard Worker     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
1457*4bdc9457SAndroid Build Coastguard Worker     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
1458*4bdc9457SAndroid Build Coastguard Worker     __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
1459*4bdc9457SAndroid Build Coastguard Worker     __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
1460*4bdc9457SAndroid Build Coastguard Worker 
1461*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
1462*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
1463*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
1464*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));
1465*4bdc9457SAndroid Build Coastguard Worker 
1466*4bdc9457SAndroid Build Coastguard Worker     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1467*4bdc9457SAndroid Build Coastguard Worker     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1468*4bdc9457SAndroid Build Coastguard Worker     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
1469*4bdc9457SAndroid Build Coastguard Worker     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
1470*4bdc9457SAndroid Build Coastguard Worker 
1471*4bdc9457SAndroid Build Coastguard Worker     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
1472*4bdc9457SAndroid Build Coastguard Worker     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
1473*4bdc9457SAndroid Build Coastguard Worker     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
1474*4bdc9457SAndroid Build Coastguard Worker     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
1475*4bdc9457SAndroid Build Coastguard Worker 
1476*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp0 = _mm256_fmadd_ps(vc2, vt0, vc1);
1477*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp1 = _mm256_fmadd_ps(vc2, vt1, vc1);
1478*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp2 = _mm256_fmadd_ps(vc2, vt2, vc1);
1479*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp3 = _mm256_fmadd_ps(vc2, vt3, vc1);
1480*4bdc9457SAndroid Build Coastguard Worker 
1481*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, vs0);
1482*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, vs1);
1483*4bdc9457SAndroid Build Coastguard Worker     vt2 = _mm256_mul_ps(vt2, vs2);
1484*4bdc9457SAndroid Build Coastguard Worker     vt3 = _mm256_mul_ps(vt3, vs3);
1485*4bdc9457SAndroid Build Coastguard Worker 
1486*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve0 = _mm256_fmadd_ps(vt0, vp0, vs0);
1487*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve1 = _mm256_fmadd_ps(vt1, vp1, vs1);
1488*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve2 = _mm256_fmadd_ps(vt2, vp2, vs2);
1489*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve3 = _mm256_fmadd_ps(vt3, vp3, vs3);
1490*4bdc9457SAndroid Build Coastguard Worker 
1491*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd0 = _mm256_add_ps(ve0, vone);
1492*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd1 = _mm256_add_ps(ve1, vone);
1493*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd2 = _mm256_add_ps(ve2, vone);
1494*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd3 = _mm256_add_ps(ve3, vone);
1495*4bdc9457SAndroid Build Coastguard Worker 
1496*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr0 = _mm256_rcp_ps(vd0);
1497*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr1 = _mm256_rcp_ps(vd1);
1498*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr2 = _mm256_rcp_ps(vd2);
1499*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr3 = _mm256_rcp_ps(vd3);
1500*4bdc9457SAndroid Build Coastguard Worker 
1501*4bdc9457SAndroid Build Coastguard Worker     __m256 vf0 = _mm256_mul_ps(ve0, vr0);
1502*4bdc9457SAndroid Build Coastguard Worker     __m256 vf1 = _mm256_mul_ps(ve1, vr1);
1503*4bdc9457SAndroid Build Coastguard Worker     __m256 vf2 = _mm256_mul_ps(ve2, vr2);
1504*4bdc9457SAndroid Build Coastguard Worker     __m256 vf3 = _mm256_mul_ps(ve3, vr3);
1505*4bdc9457SAndroid Build Coastguard Worker 
1506*4bdc9457SAndroid Build Coastguard Worker     vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
1507*4bdc9457SAndroid Build Coastguard Worker     vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
1508*4bdc9457SAndroid Build Coastguard Worker     vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
1509*4bdc9457SAndroid Build Coastguard Worker     vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
1510*4bdc9457SAndroid Build Coastguard Worker 
1511*4bdc9457SAndroid Build Coastguard Worker     vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
1512*4bdc9457SAndroid Build Coastguard Worker     vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
1513*4bdc9457SAndroid Build Coastguard Worker     vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
1514*4bdc9457SAndroid Build Coastguard Worker     vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
1515*4bdc9457SAndroid Build Coastguard Worker 
1516*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
1517*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
1518*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 16), _mm256_cvtps_ph(vf2, _MM_FROUND_NO_EXC));
1519*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) (o + 24), _mm256_cvtps_ph(vf3, _MM_FROUND_NO_EXC));
1520*4bdc9457SAndroid Build Coastguard Worker     o += 32;
1521*4bdc9457SAndroid Build Coastguard Worker   }
1522*4bdc9457SAndroid Build Coastguard Worker   for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
1523*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1524*4bdc9457SAndroid Build Coastguard Worker     i += 8;
1525*4bdc9457SAndroid Build Coastguard Worker 
1526*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1527*4bdc9457SAndroid Build Coastguard Worker 
1528*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1529*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1530*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1531*4bdc9457SAndroid Build Coastguard Worker 
1532*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1533*4bdc9457SAndroid Build Coastguard Worker 
1534*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp = _mm256_fmadd_ps(vc2, vt, vc1);
1535*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1536*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1537*4bdc9457SAndroid Build Coastguard Worker 
1538*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd = _mm256_add_ps(ve, vone);
1539*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr = _mm256_rcp_ps(vd);
1540*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_mul_ps(ve, vr);
1541*4bdc9457SAndroid Build Coastguard Worker 
1542*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1543*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1544*4bdc9457SAndroid Build Coastguard Worker 
1545*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
1546*4bdc9457SAndroid Build Coastguard Worker     o += 8;
1547*4bdc9457SAndroid Build Coastguard Worker   }
1548*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(batch != 0) {
1549*4bdc9457SAndroid Build Coastguard Worker     assert(batch >= 1 * sizeof(uint16_t));
1550*4bdc9457SAndroid Build Coastguard Worker     assert(batch <= 7 * sizeof(uint16_t));
1551*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1552*4bdc9457SAndroid Build Coastguard Worker 
1553*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
1554*4bdc9457SAndroid Build Coastguard Worker 
1555*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
1556*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
1557*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
1558*4bdc9457SAndroid Build Coastguard Worker 
1559*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
1560*4bdc9457SAndroid Build Coastguard Worker 
1561*4bdc9457SAndroid Build Coastguard Worker     const __m256 vp = _mm256_fmadd_ps(vc2, vt, vc1);
1562*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
1563*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
1564*4bdc9457SAndroid Build Coastguard Worker 
1565*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd = _mm256_add_ps(ve, vone);
1566*4bdc9457SAndroid Build Coastguard Worker     const __m256 vr = _mm256_rcp_ps(vd);
1567*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_mul_ps(ve, vr);
1568*4bdc9457SAndroid Build Coastguard Worker 
1569*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
1570*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
1571*4bdc9457SAndroid Build Coastguard Worker 
1572*4bdc9457SAndroid Build Coastguard Worker     __m128i vh = _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC);
1573*4bdc9457SAndroid Build Coastguard Worker     if (batch & (4 * sizeof(uint16_t))) {
1574*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) o, vh);
1575*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_unpackhi_epi64(vh, vh);
1576*4bdc9457SAndroid Build Coastguard Worker       o += 4;
1577*4bdc9457SAndroid Build Coastguard Worker     }
1578*4bdc9457SAndroid Build Coastguard Worker     if (batch & (2 * sizeof(uint16_t))) {
1579*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(o, vh);
1580*4bdc9457SAndroid Build Coastguard Worker       vh = _mm_srli_epi64(vh, 32);
1581*4bdc9457SAndroid Build Coastguard Worker       o += 2;
1582*4bdc9457SAndroid Build Coastguard Worker     }
1583*4bdc9457SAndroid Build Coastguard Worker     if (batch & (1 * sizeof(uint16_t))) {
1584*4bdc9457SAndroid Build Coastguard Worker       *o = (uint16_t) _mm_extract_epi16(vh, 0);
1585*4bdc9457SAndroid Build Coastguard Worker     }
1586*4bdc9457SAndroid Build Coastguard Worker   }
1587*4bdc9457SAndroid Build Coastguard Worker }
1588*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_qs8_vcvt_ukernel__avx2_x64(size_t n,const float * x,int8_t * y,const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])1589*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_qs8_vcvt_ukernel__avx2_x64(
1590*4bdc9457SAndroid Build Coastguard Worker     size_t n,
1591*4bdc9457SAndroid Build Coastguard Worker     const float* x,
1592*4bdc9457SAndroid Build Coastguard Worker     int8_t* y,
1593*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
1594*4bdc9457SAndroid Build Coastguard Worker {
1595*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
1596*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
1597*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
1598*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
1599*4bdc9457SAndroid Build Coastguard Worker 
1600*4bdc9457SAndroid Build Coastguard Worker   const __m256 vscale = _mm256_load_ps(params->avx2.scale);
1601*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
1602*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
1603*4bdc9457SAndroid Build Coastguard Worker   const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
1604*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
1605*4bdc9457SAndroid Build Coastguard Worker 
1606*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
1607*4bdc9457SAndroid Build Coastguard Worker     __m256 vx01 = _mm256_loadu_ps(x);
1608*4bdc9457SAndroid Build Coastguard Worker     __m256 vx23 = _mm256_loadu_ps(x + 8);
1609*4bdc9457SAndroid Build Coastguard Worker     __m256 vx45 = _mm256_loadu_ps(x + 16);
1610*4bdc9457SAndroid Build Coastguard Worker     __m256 vx67 = _mm256_loadu_ps(x + 24);
1611*4bdc9457SAndroid Build Coastguard Worker     __m256 vx89 = _mm256_loadu_ps(x + 32);
1612*4bdc9457SAndroid Build Coastguard Worker     __m256 vxAB = _mm256_loadu_ps(x + 40);
1613*4bdc9457SAndroid Build Coastguard Worker     __m256 vxCD = _mm256_loadu_ps(x + 48);
1614*4bdc9457SAndroid Build Coastguard Worker     __m256 vxEF = _mm256_loadu_ps(x + 56);
1615*4bdc9457SAndroid Build Coastguard Worker     x += 64;
1616*4bdc9457SAndroid Build Coastguard Worker 
1617*4bdc9457SAndroid Build Coastguard Worker     vx01 = _mm256_mul_ps(vx01, vscale);
1618*4bdc9457SAndroid Build Coastguard Worker     vx23 = _mm256_mul_ps(vx23, vscale);
1619*4bdc9457SAndroid Build Coastguard Worker     vx45 = _mm256_mul_ps(vx45, vscale);
1620*4bdc9457SAndroid Build Coastguard Worker     vx67 = _mm256_mul_ps(vx67, vscale);
1621*4bdc9457SAndroid Build Coastguard Worker     vx89 = _mm256_mul_ps(vx89, vscale);
1622*4bdc9457SAndroid Build Coastguard Worker     vxAB = _mm256_mul_ps(vxAB, vscale);
1623*4bdc9457SAndroid Build Coastguard Worker     vxCD = _mm256_mul_ps(vxCD, vscale);
1624*4bdc9457SAndroid Build Coastguard Worker     vxEF = _mm256_mul_ps(vxEF, vscale);
1625*4bdc9457SAndroid Build Coastguard Worker 
1626*4bdc9457SAndroid Build Coastguard Worker     vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
1627*4bdc9457SAndroid Build Coastguard Worker     vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
1628*4bdc9457SAndroid Build Coastguard Worker     vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
1629*4bdc9457SAndroid Build Coastguard Worker     vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
1630*4bdc9457SAndroid Build Coastguard Worker     vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
1631*4bdc9457SAndroid Build Coastguard Worker     vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
1632*4bdc9457SAndroid Build Coastguard Worker     vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
1633*4bdc9457SAndroid Build Coastguard Worker     vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
1634*4bdc9457SAndroid Build Coastguard Worker 
1635*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
1636*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
1637*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
1638*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
1639*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
1640*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
1641*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
1642*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
1643*4bdc9457SAndroid Build Coastguard Worker 
1644*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
1645*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
1646*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
1647*4bdc9457SAndroid Build Coastguard Worker     __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
1648*4bdc9457SAndroid Build Coastguard Worker 
1649*4bdc9457SAndroid Build Coastguard Worker     vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
1650*4bdc9457SAndroid Build Coastguard Worker     vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
1651*4bdc9457SAndroid Build Coastguard Worker     vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
1652*4bdc9457SAndroid Build Coastguard Worker     vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
1653*4bdc9457SAndroid Build Coastguard Worker 
1654*4bdc9457SAndroid Build Coastguard Worker     const __m256i vy02461357 = _mm256_packs_epi16(vacc0213, vacc4657);
1655*4bdc9457SAndroid Build Coastguard Worker     const __m256i vy8ACE9BDF = _mm256_packs_epi16(vacc8A9B, vaccCEDF);
1656*4bdc9457SAndroid Build Coastguard Worker 
1657*4bdc9457SAndroid Build Coastguard Worker     __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
1658*4bdc9457SAndroid Build Coastguard Worker     __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
1659*4bdc9457SAndroid Build Coastguard Worker 
1660*4bdc9457SAndroid Build Coastguard Worker     vy01234567 = _mm256_max_epi8(vy01234567, voutput_min);
1661*4bdc9457SAndroid Build Coastguard Worker     vy89ABCDEF = _mm256_max_epi8(vy89ABCDEF, voutput_min);
1662*4bdc9457SAndroid Build Coastguard Worker 
1663*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy01234567);
1664*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
1665*4bdc9457SAndroid Build Coastguard Worker     y += 64;
1666*4bdc9457SAndroid Build Coastguard Worker   }
1667*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1668*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_loadu_ps(x);
1669*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vscale);
1670*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
1671*4bdc9457SAndroid Build Coastguard Worker     x += 8;
1672*4bdc9457SAndroid Build Coastguard Worker 
1673*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc = _mm256_cvtps_epi32(vx);
1674*4bdc9457SAndroid Build Coastguard Worker 
1675*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
1676*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
1677*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_packs_epi16(vy, vy);
1678*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
1679*4bdc9457SAndroid Build Coastguard Worker 
1680*4bdc9457SAndroid Build Coastguard Worker     _mm_storel_epi64((__m128i*) y, vy);
1681*4bdc9457SAndroid Build Coastguard Worker     y += 8;
1682*4bdc9457SAndroid Build Coastguard Worker   }
1683*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
1684*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(float));
1685*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(float));
1686*4bdc9457SAndroid Build Coastguard Worker     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
1687*4bdc9457SAndroid Build Coastguard Worker 
1688*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_maskload_ps(x, vmask);
1689*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vscale);
1690*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
1691*4bdc9457SAndroid Build Coastguard Worker 
1692*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc = _mm256_cvtps_epi32(vx);
1693*4bdc9457SAndroid Build Coastguard Worker 
1694*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
1695*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
1696*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_packs_epi16(vy, vy);
1697*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min));
1698*4bdc9457SAndroid Build Coastguard Worker 
1699*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(float))) {
1700*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
1701*4bdc9457SAndroid Build Coastguard Worker       y += 4;
1702*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
1703*4bdc9457SAndroid Build Coastguard Worker     }
1704*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
1705*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
1706*4bdc9457SAndroid Build Coastguard Worker       y += 2;
1707*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
1708*4bdc9457SAndroid Build Coastguard Worker     }
1709*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
1710*4bdc9457SAndroid Build Coastguard Worker       *y = (int8_t) _mm_extract_epi8(vy, 0);
1711*4bdc9457SAndroid Build Coastguard Worker     }
1712*4bdc9457SAndroid Build Coastguard Worker   }
1713*4bdc9457SAndroid Build Coastguard Worker }
1714*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_qu8_vcvt_ukernel__avx2_x64(size_t n,const float * x,uint8_t * y,const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])1715*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_qu8_vcvt_ukernel__avx2_x64(
1716*4bdc9457SAndroid Build Coastguard Worker     size_t n,
1717*4bdc9457SAndroid Build Coastguard Worker     const float* x,
1718*4bdc9457SAndroid Build Coastguard Worker     uint8_t* y,
1719*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
1720*4bdc9457SAndroid Build Coastguard Worker {
1721*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
1722*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
1723*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
1724*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
1725*4bdc9457SAndroid Build Coastguard Worker 
1726*4bdc9457SAndroid Build Coastguard Worker   const __m256 vscale = _mm256_load_ps(params->avx2.scale);
1727*4bdc9457SAndroid Build Coastguard Worker   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx2.output_max_less_zero_point);
1728*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
1729*4bdc9457SAndroid Build Coastguard Worker   const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) params->avx2.shuffle_mask);
1730*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx2.output_min);
1731*4bdc9457SAndroid Build Coastguard Worker 
1732*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 64 * sizeof(float); n -= 64 * sizeof(float)) {
1733*4bdc9457SAndroid Build Coastguard Worker     __m256 vx01 = _mm256_loadu_ps(x);
1734*4bdc9457SAndroid Build Coastguard Worker     __m256 vx23 = _mm256_loadu_ps(x + 8);
1735*4bdc9457SAndroid Build Coastguard Worker     __m256 vx45 = _mm256_loadu_ps(x + 16);
1736*4bdc9457SAndroid Build Coastguard Worker     __m256 vx67 = _mm256_loadu_ps(x + 24);
1737*4bdc9457SAndroid Build Coastguard Worker     __m256 vx89 = _mm256_loadu_ps(x + 32);
1738*4bdc9457SAndroid Build Coastguard Worker     __m256 vxAB = _mm256_loadu_ps(x + 40);
1739*4bdc9457SAndroid Build Coastguard Worker     __m256 vxCD = _mm256_loadu_ps(x + 48);
1740*4bdc9457SAndroid Build Coastguard Worker     __m256 vxEF = _mm256_loadu_ps(x + 56);
1741*4bdc9457SAndroid Build Coastguard Worker     x += 64;
1742*4bdc9457SAndroid Build Coastguard Worker 
1743*4bdc9457SAndroid Build Coastguard Worker     vx01 = _mm256_mul_ps(vx01, vscale);
1744*4bdc9457SAndroid Build Coastguard Worker     vx23 = _mm256_mul_ps(vx23, vscale);
1745*4bdc9457SAndroid Build Coastguard Worker     vx45 = _mm256_mul_ps(vx45, vscale);
1746*4bdc9457SAndroid Build Coastguard Worker     vx67 = _mm256_mul_ps(vx67, vscale);
1747*4bdc9457SAndroid Build Coastguard Worker     vx89 = _mm256_mul_ps(vx89, vscale);
1748*4bdc9457SAndroid Build Coastguard Worker     vxAB = _mm256_mul_ps(vxAB, vscale);
1749*4bdc9457SAndroid Build Coastguard Worker     vxCD = _mm256_mul_ps(vxCD, vscale);
1750*4bdc9457SAndroid Build Coastguard Worker     vxEF = _mm256_mul_ps(vxEF, vscale);
1751*4bdc9457SAndroid Build Coastguard Worker 
1752*4bdc9457SAndroid Build Coastguard Worker     vx01 = _mm256_min_ps(vx01, voutput_max_less_zero_point);
1753*4bdc9457SAndroid Build Coastguard Worker     vx23 = _mm256_min_ps(vx23, voutput_max_less_zero_point);
1754*4bdc9457SAndroid Build Coastguard Worker     vx45 = _mm256_min_ps(vx45, voutput_max_less_zero_point);
1755*4bdc9457SAndroid Build Coastguard Worker     vx67 = _mm256_min_ps(vx67, voutput_max_less_zero_point);
1756*4bdc9457SAndroid Build Coastguard Worker     vx89 = _mm256_min_ps(vx89, voutput_max_less_zero_point);
1757*4bdc9457SAndroid Build Coastguard Worker     vxAB = _mm256_min_ps(vxAB, voutput_max_less_zero_point);
1758*4bdc9457SAndroid Build Coastguard Worker     vxCD = _mm256_min_ps(vxCD, voutput_max_less_zero_point);
1759*4bdc9457SAndroid Build Coastguard Worker     vxEF = _mm256_min_ps(vxEF, voutput_max_less_zero_point);
1760*4bdc9457SAndroid Build Coastguard Worker 
1761*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc01 = _mm256_cvtps_epi32(vx01);
1762*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc23 = _mm256_cvtps_epi32(vx23);
1763*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc45 = _mm256_cvtps_epi32(vx45);
1764*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc67 = _mm256_cvtps_epi32(vx67);
1765*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc89 = _mm256_cvtps_epi32(vx89);
1766*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccAB = _mm256_cvtps_epi32(vxAB);
1767*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccCD = _mm256_cvtps_epi32(vxCD);
1768*4bdc9457SAndroid Build Coastguard Worker     const __m256i vaccEF = _mm256_cvtps_epi32(vxEF);
1769*4bdc9457SAndroid Build Coastguard Worker 
1770*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0213 = _mm256_packs_epi32(vacc01, vacc23);
1771*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc4657 = _mm256_packs_epi32(vacc45, vacc67);
1772*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc8A9B = _mm256_packs_epi32(vacc89, vaccAB);
1773*4bdc9457SAndroid Build Coastguard Worker     __m256i vaccCEDF = _mm256_packs_epi32(vaccCD, vaccEF);
1774*4bdc9457SAndroid Build Coastguard Worker 
1775*4bdc9457SAndroid Build Coastguard Worker     vacc0213 = _mm256_adds_epi16(vacc0213, voutput_zero_point);
1776*4bdc9457SAndroid Build Coastguard Worker     vacc4657 = _mm256_adds_epi16(vacc4657, voutput_zero_point);
1777*4bdc9457SAndroid Build Coastguard Worker     vacc8A9B = _mm256_adds_epi16(vacc8A9B, voutput_zero_point);
1778*4bdc9457SAndroid Build Coastguard Worker     vaccCEDF = _mm256_adds_epi16(vaccCEDF, voutput_zero_point);
1779*4bdc9457SAndroid Build Coastguard Worker 
1780*4bdc9457SAndroid Build Coastguard Worker     const __m256i vy02461357 = _mm256_packus_epi16(vacc0213, vacc4657);
1781*4bdc9457SAndroid Build Coastguard Worker     const __m256i vy8ACE9BDF = _mm256_packus_epi16(vacc8A9B, vaccCEDF);
1782*4bdc9457SAndroid Build Coastguard Worker 
1783*4bdc9457SAndroid Build Coastguard Worker     __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask);
1784*4bdc9457SAndroid Build Coastguard Worker     __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask);
1785*4bdc9457SAndroid Build Coastguard Worker 
1786*4bdc9457SAndroid Build Coastguard Worker     vy01234567 = _mm256_max_epu8(vy01234567, voutput_min);
1787*4bdc9457SAndroid Build Coastguard Worker     vy89ABCDEF = _mm256_max_epu8(vy89ABCDEF, voutput_min);
1788*4bdc9457SAndroid Build Coastguard Worker 
1789*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy01234567);
1790*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) (y + 32), vy89ABCDEF);
1791*4bdc9457SAndroid Build Coastguard Worker     y += 64;
1792*4bdc9457SAndroid Build Coastguard Worker   }
1793*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1794*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_loadu_ps(x);
1795*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vscale);
1796*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
1797*4bdc9457SAndroid Build Coastguard Worker     x += 8;
1798*4bdc9457SAndroid Build Coastguard Worker 
1799*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc = _mm256_cvtps_epi32(vx);
1800*4bdc9457SAndroid Build Coastguard Worker 
1801*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
1802*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
1803*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_packus_epi16(vy, vy);
1804*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
1805*4bdc9457SAndroid Build Coastguard Worker 
1806*4bdc9457SAndroid Build Coastguard Worker     _mm_storel_epi64((__m128i*) y, vy);
1807*4bdc9457SAndroid Build Coastguard Worker     y += 8;
1808*4bdc9457SAndroid Build Coastguard Worker   }
1809*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
1810*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(float));
1811*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(float));
1812*4bdc9457SAndroid Build Coastguard Worker     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2.mask_table[7] - n));
1813*4bdc9457SAndroid Build Coastguard Worker 
1814*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_maskload_ps(x, vmask);
1815*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vscale);
1816*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
1817*4bdc9457SAndroid Build Coastguard Worker 
1818*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc = _mm256_cvtps_epi32(vx);
1819*4bdc9457SAndroid Build Coastguard Worker 
1820*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1));
1821*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point));
1822*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_packus_epi16(vy, vy);
1823*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min));
1824*4bdc9457SAndroid Build Coastguard Worker 
1825*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(float))) {
1826*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
1827*4bdc9457SAndroid Build Coastguard Worker       y += 4;
1828*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
1829*4bdc9457SAndroid Build Coastguard Worker     }
1830*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
1831*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
1832*4bdc9457SAndroid Build Coastguard Worker       y += 2;
1833*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
1834*4bdc9457SAndroid Build Coastguard Worker     }
1835*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
1836*4bdc9457SAndroid Build Coastguard Worker       *y = (uint8_t) _mm_extract_epi8(vy, 0);
1837*4bdc9457SAndroid Build Coastguard Worker     }
1838*4bdc9457SAndroid Build Coastguard Worker   }
1839*4bdc9457SAndroid Build Coastguard Worker }
1840*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56(size_t n,const float * x,float * y,const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS (1)])1841*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56(
1842*4bdc9457SAndroid Build Coastguard Worker     size_t n,
1843*4bdc9457SAndroid Build Coastguard Worker     const float* x,
1844*4bdc9457SAndroid Build Coastguard Worker     float* y,
1845*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
1846*4bdc9457SAndroid Build Coastguard Worker {
1847*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
1848*4bdc9457SAndroid Build Coastguard Worker 
1849*4bdc9457SAndroid Build Coastguard Worker   const __m256 vprescale = _mm256_load_ps(params->avx2_rr1_lut4_p4.prescale);
1850*4bdc9457SAndroid Build Coastguard Worker   const __m256 valpha = _mm256_load_ps(params->avx2_rr1_lut4_p4.alpha);
1851*4bdc9457SAndroid Build Coastguard Worker   const __m256 vbeta = _mm256_load_ps(params->avx2_rr1_lut4_p4.beta);
1852*4bdc9457SAndroid Build Coastguard Worker   const __m256 vsat_cutoff = _mm256_load_ps(params->avx2_rr1_lut4_p4.sat_cutoff);
1853*4bdc9457SAndroid Build Coastguard Worker   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_lut4_p4.magic_bias);
1854*4bdc9457SAndroid Build Coastguard Worker   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_lut4_p4.log2e);
1855*4bdc9457SAndroid Build Coastguard Worker   const __m256 vtable = _mm256_load_ps(params->avx2_rr1_lut4_p4.table);
1856*4bdc9457SAndroid Build Coastguard Worker   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.minus_ln2);
1857*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c4);
1858*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c3);
1859*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_lut4_p4.c2);
1860*4bdc9457SAndroid Build Coastguard Worker 
1861*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 56 * sizeof(float); n -= 56 * sizeof(float)) {
1862*4bdc9457SAndroid Build Coastguard Worker     __m256 vx0 = _mm256_loadu_ps(x);
1863*4bdc9457SAndroid Build Coastguard Worker     __m256 vx1 = _mm256_loadu_ps(x + 8);
1864*4bdc9457SAndroid Build Coastguard Worker     __m256 vx2 = _mm256_loadu_ps(x + 16);
1865*4bdc9457SAndroid Build Coastguard Worker     __m256 vx3 = _mm256_loadu_ps(x + 24);
1866*4bdc9457SAndroid Build Coastguard Worker     __m256 vx4 = _mm256_loadu_ps(x + 32);
1867*4bdc9457SAndroid Build Coastguard Worker     __m256 vx5 = _mm256_loadu_ps(x + 40);
1868*4bdc9457SAndroid Build Coastguard Worker     __m256 vx6 = _mm256_loadu_ps(x + 48);
1869*4bdc9457SAndroid Build Coastguard Worker     x += 56;
1870*4bdc9457SAndroid Build Coastguard Worker 
1871*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
1872*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
1873*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz2 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx2, vprescale));
1874*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz3 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx3, vprescale));
1875*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz4 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx4, vprescale));
1876*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz5 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx5, vprescale));
1877*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz6 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx6, vprescale));
1878*4bdc9457SAndroid Build Coastguard Worker 
1879*4bdc9457SAndroid Build Coastguard Worker     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
1880*4bdc9457SAndroid Build Coastguard Worker     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
1881*4bdc9457SAndroid Build Coastguard Worker     __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
1882*4bdc9457SAndroid Build Coastguard Worker     __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
1883*4bdc9457SAndroid Build Coastguard Worker     __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
1884*4bdc9457SAndroid Build Coastguard Worker     __m256 vn5 = _mm256_fmadd_ps(vz5, vlog2e, vmagic_bias);
1885*4bdc9457SAndroid Build Coastguard Worker     __m256 vn6 = _mm256_fmadd_ps(vz6, vlog2e, vmagic_bias);
1886*4bdc9457SAndroid Build Coastguard Worker 
1887*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven0 = _mm256_slli_epi32(_mm256_castps_si256(vn0), 21);
1888*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl0 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn0)));
1889*4bdc9457SAndroid Build Coastguard Worker     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
1890*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven1 = _mm256_slli_epi32(_mm256_castps_si256(vn1), 21);
1891*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl1 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn1)));
1892*4bdc9457SAndroid Build Coastguard Worker     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
1893*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven2 = _mm256_slli_epi32(_mm256_castps_si256(vn2), 21);
1894*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl2 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn2)));
1895*4bdc9457SAndroid Build Coastguard Worker     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
1896*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven3 = _mm256_slli_epi32(_mm256_castps_si256(vn3), 21);
1897*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl3 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn3)));
1898*4bdc9457SAndroid Build Coastguard Worker     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
1899*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven4 = _mm256_slli_epi32(_mm256_castps_si256(vn4), 21);
1900*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl4 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn4)));
1901*4bdc9457SAndroid Build Coastguard Worker     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
1902*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven5 = _mm256_slli_epi32(_mm256_castps_si256(vn5), 21);
1903*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl5 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn5)));
1904*4bdc9457SAndroid Build Coastguard Worker     vn5 = _mm256_sub_ps(vn5, vmagic_bias);
1905*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven6 = _mm256_slli_epi32(_mm256_castps_si256(vn6), 21);
1906*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl6 = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn6)));
1907*4bdc9457SAndroid Build Coastguard Worker     vn6 = _mm256_sub_ps(vn6, vmagic_bias);
1908*4bdc9457SAndroid Build Coastguard Worker 
1909*4bdc9457SAndroid Build Coastguard Worker     __m256 vs0 = _mm256_castsi256_ps(_mm256_add_epi32(vl0, ven0));
1910*4bdc9457SAndroid Build Coastguard Worker     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
1911*4bdc9457SAndroid Build Coastguard Worker     __m256 vs1 = _mm256_castsi256_ps(_mm256_add_epi32(vl1, ven1));
1912*4bdc9457SAndroid Build Coastguard Worker     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
1913*4bdc9457SAndroid Build Coastguard Worker     __m256 vs2 = _mm256_castsi256_ps(_mm256_add_epi32(vl2, ven2));
1914*4bdc9457SAndroid Build Coastguard Worker     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
1915*4bdc9457SAndroid Build Coastguard Worker     __m256 vs3 = _mm256_castsi256_ps(_mm256_add_epi32(vl3, ven3));
1916*4bdc9457SAndroid Build Coastguard Worker     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
1917*4bdc9457SAndroid Build Coastguard Worker     __m256 vs4 = _mm256_castsi256_ps(_mm256_add_epi32(vl4, ven4));
1918*4bdc9457SAndroid Build Coastguard Worker     __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
1919*4bdc9457SAndroid Build Coastguard Worker     __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));
1920*4bdc9457SAndroid Build Coastguard Worker     __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vz5);
1921*4bdc9457SAndroid Build Coastguard Worker     __m256 vs6 = _mm256_castsi256_ps(_mm256_add_epi32(vl6, ven6));
1922*4bdc9457SAndroid Build Coastguard Worker     __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vz6);
1923*4bdc9457SAndroid Build Coastguard Worker 
1924*4bdc9457SAndroid Build Coastguard Worker     __m256 vp0 = _mm256_fmadd_ps(vc4, vt0, vc3);
1925*4bdc9457SAndroid Build Coastguard Worker     __m256 vp1 = _mm256_fmadd_ps(vc4, vt1, vc3);
1926*4bdc9457SAndroid Build Coastguard Worker     __m256 vp2 = _mm256_fmadd_ps(vc4, vt2, vc3);
1927*4bdc9457SAndroid Build Coastguard Worker     __m256 vp3 = _mm256_fmadd_ps(vc4, vt3, vc3);
1928*4bdc9457SAndroid Build Coastguard Worker     __m256 vp4 = _mm256_fmadd_ps(vc4, vt4, vc3);
1929*4bdc9457SAndroid Build Coastguard Worker     __m256 vp5 = _mm256_fmadd_ps(vc4, vt5, vc3);
1930*4bdc9457SAndroid Build Coastguard Worker     __m256 vp6 = _mm256_fmadd_ps(vc4, vt6, vc3);
1931*4bdc9457SAndroid Build Coastguard Worker 
1932*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
1933*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
1934*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
1935*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
1936*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
1937*4bdc9457SAndroid Build Coastguard Worker     vp5 = _mm256_fmadd_ps(vp5, vt5, vc2);
1938*4bdc9457SAndroid Build Coastguard Worker     vp6 = _mm256_fmadd_ps(vp6, vt6, vc2);
1939*4bdc9457SAndroid Build Coastguard Worker 
1940*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_mul_ps(vp0, vt0);
1941*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, vs0);
1942*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_mul_ps(vp1, vt1);
1943*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, vs1);
1944*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_mul_ps(vp2, vt2);
1945*4bdc9457SAndroid Build Coastguard Worker     vt2 = _mm256_mul_ps(vt2, vs2);
1946*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_mul_ps(vp3, vt3);
1947*4bdc9457SAndroid Build Coastguard Worker     vt3 = _mm256_mul_ps(vt3, vs3);
1948*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_mul_ps(vp4, vt4);
1949*4bdc9457SAndroid Build Coastguard Worker     vt4 = _mm256_mul_ps(vt4, vs4);
1950*4bdc9457SAndroid Build Coastguard Worker     vp5 = _mm256_mul_ps(vp5, vt5);
1951*4bdc9457SAndroid Build Coastguard Worker     vt5 = _mm256_mul_ps(vt5, vs5);
1952*4bdc9457SAndroid Build Coastguard Worker     vp6 = _mm256_mul_ps(vp6, vt6);
1953*4bdc9457SAndroid Build Coastguard Worker     vt6 = _mm256_mul_ps(vt6, vs6);
1954*4bdc9457SAndroid Build Coastguard Worker 
1955*4bdc9457SAndroid Build Coastguard Worker     vs0 = _mm256_fmsub_ps(vs0, valpha, valpha);
1956*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vt0);
1957*4bdc9457SAndroid Build Coastguard Worker     vs1 = _mm256_fmsub_ps(vs1, valpha, valpha);
1958*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vt1);
1959*4bdc9457SAndroid Build Coastguard Worker     vs2 = _mm256_fmsub_ps(vs2, valpha, valpha);
1960*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_fmadd_ps(vp2, vt2, vt2);
1961*4bdc9457SAndroid Build Coastguard Worker     vs3 = _mm256_fmsub_ps(vs3, valpha, valpha);
1962*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_fmadd_ps(vp3, vt3, vt3);
1963*4bdc9457SAndroid Build Coastguard Worker     vs4 = _mm256_fmsub_ps(vs4, valpha, valpha);
1964*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_fmadd_ps(vp4, vt4, vt4);
1965*4bdc9457SAndroid Build Coastguard Worker     vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
1966*4bdc9457SAndroid Build Coastguard Worker     vp5 = _mm256_fmadd_ps(vp5, vt5, vt5);
1967*4bdc9457SAndroid Build Coastguard Worker     vs6 = _mm256_fmsub_ps(vs6, valpha, valpha);
1968*4bdc9457SAndroid Build Coastguard Worker     vp6 = _mm256_fmadd_ps(vp6, vt6, vt6);
1969*4bdc9457SAndroid Build Coastguard Worker 
1970*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve0 = _mm256_fmadd_ps(vp0, valpha, vs0);
1971*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_mul_ps(vx0, vbeta);
1972*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve1 = _mm256_fmadd_ps(vp1, valpha, vs1);
1973*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_mul_ps(vx1, vbeta);
1974*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve2 = _mm256_fmadd_ps(vp2, valpha, vs2);
1975*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_mul_ps(vx2, vbeta);
1976*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve3 = _mm256_fmadd_ps(vp3, valpha, vs3);
1977*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_mul_ps(vx3, vbeta);
1978*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve4 = _mm256_fmadd_ps(vp4, valpha, vs4);
1979*4bdc9457SAndroid Build Coastguard Worker     vx4 = _mm256_mul_ps(vx4, vbeta);
1980*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
1981*4bdc9457SAndroid Build Coastguard Worker     vx5 = _mm256_mul_ps(vx5, vbeta);
1982*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve6 = _mm256_fmadd_ps(vp6, valpha, vs6);
1983*4bdc9457SAndroid Build Coastguard Worker     vx6 = _mm256_mul_ps(vx6, vbeta);
1984*4bdc9457SAndroid Build Coastguard Worker 
1985*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
1986*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
1987*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy2 = _mm256_blendv_ps(vx2, ve2, vx2);
1988*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy3 = _mm256_blendv_ps(vx3, ve3, vx3);
1989*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy4 = _mm256_blendv_ps(vx4, ve4, vx4);
1990*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy5 = _mm256_blendv_ps(vx5, ve5, vx5);
1991*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy6 = _mm256_blendv_ps(vx6, ve6, vx6);
1992*4bdc9457SAndroid Build Coastguard Worker 
1993*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy0);
1994*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 8, vy1);
1995*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 16, vy2);
1996*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 24, vy3);
1997*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 32, vy4);
1998*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 40, vy5);
1999*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 48, vy6);
2000*4bdc9457SAndroid Build Coastguard Worker     y += 56;
2001*4bdc9457SAndroid Build Coastguard Worker   }
2002*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2003*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_loadu_ps(x);
2004*4bdc9457SAndroid Build Coastguard Worker     x += 8;
2005*4bdc9457SAndroid Build Coastguard Worker 
2006*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
2007*4bdc9457SAndroid Build Coastguard Worker 
2008*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
2009*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
2010*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
2011*4bdc9457SAndroid Build Coastguard Worker     __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
2012*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
2013*4bdc9457SAndroid Build Coastguard Worker 
2014*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
2015*4bdc9457SAndroid Build Coastguard Worker 
2016*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
2017*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc2);
2018*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_mul_ps(vp, vt);
2019*4bdc9457SAndroid Build Coastguard Worker 
2020*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
2021*4bdc9457SAndroid Build Coastguard Worker     vs = _mm256_fmsub_ps(vs, valpha, valpha);
2022*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vt);
2023*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
2024*4bdc9457SAndroid Build Coastguard Worker 
2025*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vbeta);
2026*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
2027*4bdc9457SAndroid Build Coastguard Worker 
2028*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy);
2029*4bdc9457SAndroid Build Coastguard Worker     y += 8;
2030*4bdc9457SAndroid Build Coastguard Worker   }
2031*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
2032*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(float));
2033*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(float));
2034*4bdc9457SAndroid Build Coastguard Worker     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_lut4_p4.mask_table[7] - n));
2035*4bdc9457SAndroid Build Coastguard Worker 
2036*4bdc9457SAndroid Build Coastguard Worker     __m256 vx = _mm256_maskload_ps(x, vmask);
2037*4bdc9457SAndroid Build Coastguard Worker 
2038*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
2039*4bdc9457SAndroid Build Coastguard Worker 
2040*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
2041*4bdc9457SAndroid Build Coastguard Worker     const __m256i ven = _mm256_slli_epi32(_mm256_castps_si256(vn), 21);
2042*4bdc9457SAndroid Build Coastguard Worker     const __m256i vl = _mm256_castps_si256(_mm256_permutevar_ps(vtable, _mm256_castps_si256(vn)));
2043*4bdc9457SAndroid Build Coastguard Worker     __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(vl, ven));
2044*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
2045*4bdc9457SAndroid Build Coastguard Worker 
2046*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
2047*4bdc9457SAndroid Build Coastguard Worker 
2048*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc4, vt, vc3);
2049*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc2);
2050*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_mul_ps(vp, vt);
2051*4bdc9457SAndroid Build Coastguard Worker 
2052*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
2053*4bdc9457SAndroid Build Coastguard Worker     vs = _mm256_fmsub_ps(vs, valpha, valpha);
2054*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vt);
2055*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vp, valpha, vs);
2056*4bdc9457SAndroid Build Coastguard Worker 
2057*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_mul_ps(vx, vbeta);
2058*4bdc9457SAndroid Build Coastguard Worker     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
2059*4bdc9457SAndroid Build Coastguard Worker 
2060*4bdc9457SAndroid Build Coastguard Worker     __m128 vy_lo = _mm256_castps256_ps128(vy);
2061*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(float))) {
2062*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(y, vy_lo);
2063*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm256_extractf128_ps(vy, 1);
2064*4bdc9457SAndroid Build Coastguard Worker       y += 4;
2065*4bdc9457SAndroid Build Coastguard Worker     }
2066*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
2067*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy_lo);
2068*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2069*4bdc9457SAndroid Build Coastguard Worker       y += 2;
2070*4bdc9457SAndroid Build Coastguard Worker     }
2071*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
2072*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy_lo);
2073*4bdc9457SAndroid Build Coastguard Worker     }
2074*4bdc9457SAndroid Build Coastguard Worker   }
2075*4bdc9457SAndroid Build Coastguard Worker }
2076*4bdc9457SAndroid Build Coastguard Worker 
xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40(size_t n,const float * x,float * y,const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS (1)])2077*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40(
2078*4bdc9457SAndroid Build Coastguard Worker     size_t n,
2079*4bdc9457SAndroid Build Coastguard Worker     const float* x,
2080*4bdc9457SAndroid Build Coastguard Worker     float* y,
2081*4bdc9457SAndroid Build Coastguard Worker     const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
2082*4bdc9457SAndroid Build Coastguard Worker {
2083*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(float) == 0);
2084*4bdc9457SAndroid Build Coastguard Worker 
2085*4bdc9457SAndroid Build Coastguard Worker   const __m256 vsign_mask = _mm256_load_ps(params->avx2_rr1_p5.sign_mask);
2086*4bdc9457SAndroid Build Coastguard Worker   const __m256 vmagic_bias = _mm256_load_ps(params->avx2_rr1_p5.magic_bias);
2087*4bdc9457SAndroid Build Coastguard Worker   const __m256 vlog2e = _mm256_load_ps(params->avx2_rr1_p5.log2e);
2088*4bdc9457SAndroid Build Coastguard Worker   const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p5.minus_ln2);
2089*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc5 = _mm256_load_ps(params->avx2_rr1_p5.c5);
2090*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc4 = _mm256_load_ps(params->avx2_rr1_p5.c4);
2091*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc3 = _mm256_load_ps(params->avx2_rr1_p5.c3);
2092*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc2 = _mm256_load_ps(params->avx2_rr1_p5.c2);
2093*4bdc9457SAndroid Build Coastguard Worker   const __m256 vc1 = _mm256_load_ps(params->avx2_rr1_p5.c1);
2094*4bdc9457SAndroid Build Coastguard Worker   const __m256 vone = _mm256_load_ps(params->avx2_rr1_p5.one);
2095*4bdc9457SAndroid Build Coastguard Worker   const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx2_rr1_p5.denorm_cutoff);
2096*4bdc9457SAndroid Build Coastguard Worker 
2097*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 40 * sizeof(float); n -= 40 * sizeof(float)) {
2098*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx0 = _mm256_loadu_ps(x);
2099*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx1 = _mm256_loadu_ps(x + 8);
2100*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx2 = _mm256_loadu_ps(x + 16);
2101*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx3 = _mm256_loadu_ps(x + 24);
2102*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx4 = _mm256_loadu_ps(x + 32);
2103*4bdc9457SAndroid Build Coastguard Worker     x += 40;
2104*4bdc9457SAndroid Build Coastguard Worker 
2105*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
2106*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
2107*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
2108*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
2109*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz4 = _mm256_or_ps(vx4, vsign_mask);
2110*4bdc9457SAndroid Build Coastguard Worker 
2111*4bdc9457SAndroid Build Coastguard Worker     __m256 vn0 = _mm256_fmadd_ps(vz0, vlog2e, vmagic_bias);
2112*4bdc9457SAndroid Build Coastguard Worker     __m256 vn1 = _mm256_fmadd_ps(vz1, vlog2e, vmagic_bias);
2113*4bdc9457SAndroid Build Coastguard Worker     __m256 vn2 = _mm256_fmadd_ps(vz2, vlog2e, vmagic_bias);
2114*4bdc9457SAndroid Build Coastguard Worker     __m256 vn3 = _mm256_fmadd_ps(vz3, vlog2e, vmagic_bias);
2115*4bdc9457SAndroid Build Coastguard Worker     __m256 vn4 = _mm256_fmadd_ps(vz4, vlog2e, vmagic_bias);
2116*4bdc9457SAndroid Build Coastguard Worker 
2117*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn0), 23));
2118*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn1), 23));
2119*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn2), 23));
2120*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs3 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn3), 23));
2121*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs4 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn4), 23));
2122*4bdc9457SAndroid Build Coastguard Worker 
2123*4bdc9457SAndroid Build Coastguard Worker     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
2124*4bdc9457SAndroid Build Coastguard Worker     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
2125*4bdc9457SAndroid Build Coastguard Worker     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
2126*4bdc9457SAndroid Build Coastguard Worker     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
2127*4bdc9457SAndroid Build Coastguard Worker     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
2128*4bdc9457SAndroid Build Coastguard Worker 
2129*4bdc9457SAndroid Build Coastguard Worker     __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vz0);
2130*4bdc9457SAndroid Build Coastguard Worker     __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vz1);
2131*4bdc9457SAndroid Build Coastguard Worker     __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vz2);
2132*4bdc9457SAndroid Build Coastguard Worker     __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vz3);
2133*4bdc9457SAndroid Build Coastguard Worker     __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);
2134*4bdc9457SAndroid Build Coastguard Worker 
2135*4bdc9457SAndroid Build Coastguard Worker     __m256 vp0 = _mm256_fmadd_ps(vc5, vt0, vc4);
2136*4bdc9457SAndroid Build Coastguard Worker     __m256 vp1 = _mm256_fmadd_ps(vc5, vt1, vc4);
2137*4bdc9457SAndroid Build Coastguard Worker     __m256 vp2 = _mm256_fmadd_ps(vc5, vt2, vc4);
2138*4bdc9457SAndroid Build Coastguard Worker     __m256 vp3 = _mm256_fmadd_ps(vc5, vt3, vc4);
2139*4bdc9457SAndroid Build Coastguard Worker     __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
2140*4bdc9457SAndroid Build Coastguard Worker 
2141*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vc3);
2142*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vc3);
2143*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_fmadd_ps(vp2, vt2, vc3);
2144*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_fmadd_ps(vp3, vt3, vc3);
2145*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
2146*4bdc9457SAndroid Build Coastguard Worker 
2147*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vc2);
2148*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vc2);
2149*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_fmadd_ps(vp2, vt2, vc2);
2150*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_fmadd_ps(vp3, vt3, vc2);
2151*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
2152*4bdc9457SAndroid Build Coastguard Worker 
2153*4bdc9457SAndroid Build Coastguard Worker     vp0 = _mm256_fmadd_ps(vp0, vt0, vc1);
2154*4bdc9457SAndroid Build Coastguard Worker     vp1 = _mm256_fmadd_ps(vp1, vt1, vc1);
2155*4bdc9457SAndroid Build Coastguard Worker     vp2 = _mm256_fmadd_ps(vp2, vt2, vc1);
2156*4bdc9457SAndroid Build Coastguard Worker     vp3 = _mm256_fmadd_ps(vp3, vt3, vc1);
2157*4bdc9457SAndroid Build Coastguard Worker     vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
2158*4bdc9457SAndroid Build Coastguard Worker 
2159*4bdc9457SAndroid Build Coastguard Worker     vt0 = _mm256_mul_ps(vt0, vs0);
2160*4bdc9457SAndroid Build Coastguard Worker     vt1 = _mm256_mul_ps(vt1, vs1);
2161*4bdc9457SAndroid Build Coastguard Worker     vt2 = _mm256_mul_ps(vt2, vs2);
2162*4bdc9457SAndroid Build Coastguard Worker     vt3 = _mm256_mul_ps(vt3, vs3);
2163*4bdc9457SAndroid Build Coastguard Worker     vt4 = _mm256_mul_ps(vt4, vs4);
2164*4bdc9457SAndroid Build Coastguard Worker 
2165*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve0 = _mm256_fmadd_ps(vt0, vp0, vs0);
2166*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve1 = _mm256_fmadd_ps(vt1, vp1, vs1);
2167*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve2 = _mm256_fmadd_ps(vt2, vp2, vs2);
2168*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve3 = _mm256_fmadd_ps(vt3, vp3, vs3);
2169*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve4 = _mm256_fmadd_ps(vt4, vp4, vs4);
2170*4bdc9457SAndroid Build Coastguard Worker 
2171*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd0 = _mm256_add_ps(ve0, vone);
2172*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd1 = _mm256_add_ps(ve1, vone);
2173*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd2 = _mm256_add_ps(ve2, vone);
2174*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd3 = _mm256_add_ps(ve3, vone);
2175*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd4 = _mm256_add_ps(ve4, vone);
2176*4bdc9457SAndroid Build Coastguard Worker 
2177*4bdc9457SAndroid Build Coastguard Worker     __m256 vf0 = _mm256_div_ps(ve0, vd0);
2178*4bdc9457SAndroid Build Coastguard Worker     __m256 vf1 = _mm256_div_ps(ve1, vd1);
2179*4bdc9457SAndroid Build Coastguard Worker     __m256 vf2 = _mm256_div_ps(ve2, vd2);
2180*4bdc9457SAndroid Build Coastguard Worker     __m256 vf3 = _mm256_div_ps(ve3, vd3);
2181*4bdc9457SAndroid Build Coastguard Worker     __m256 vf4 = _mm256_div_ps(ve4, vd4);
2182*4bdc9457SAndroid Build Coastguard Worker 
2183*4bdc9457SAndroid Build Coastguard Worker     vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
2184*4bdc9457SAndroid Build Coastguard Worker     vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
2185*4bdc9457SAndroid Build Coastguard Worker     vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
2186*4bdc9457SAndroid Build Coastguard Worker     vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
2187*4bdc9457SAndroid Build Coastguard Worker     vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vz4, vdenorm_cutoff, _CMP_LT_OS), vf4);
2188*4bdc9457SAndroid Build Coastguard Worker 
2189*4bdc9457SAndroid Build Coastguard Worker     vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
2190*4bdc9457SAndroid Build Coastguard Worker     vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
2191*4bdc9457SAndroid Build Coastguard Worker     vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
2192*4bdc9457SAndroid Build Coastguard Worker     vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
2193*4bdc9457SAndroid Build Coastguard Worker     vf4 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf4), vf4, vx4);
2194*4bdc9457SAndroid Build Coastguard Worker 
2195*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vf0);
2196*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 8, vf1);
2197*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 16, vf2);
2198*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 24, vf3);
2199*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 32, vf4);
2200*4bdc9457SAndroid Build Coastguard Worker     y += 40;
2201*4bdc9457SAndroid Build Coastguard Worker   }
2202*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2203*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_loadu_ps(x);
2204*4bdc9457SAndroid Build Coastguard Worker     x += 8;
2205*4bdc9457SAndroid Build Coastguard Worker 
2206*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
2207*4bdc9457SAndroid Build Coastguard Worker 
2208*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
2209*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
2210*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
2211*4bdc9457SAndroid Build Coastguard Worker 
2212*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
2213*4bdc9457SAndroid Build Coastguard Worker 
2214*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
2215*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc3);
2216*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc2);
2217*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc1);
2218*4bdc9457SAndroid Build Coastguard Worker 
2219*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
2220*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
2221*4bdc9457SAndroid Build Coastguard Worker 
2222*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd = _mm256_add_ps(ve, vone);
2223*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_div_ps(ve, vd);
2224*4bdc9457SAndroid Build Coastguard Worker 
2225*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
2226*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
2227*4bdc9457SAndroid Build Coastguard Worker 
2228*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vf);
2229*4bdc9457SAndroid Build Coastguard Worker     y += 8;
2230*4bdc9457SAndroid Build Coastguard Worker   }
2231*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
2232*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(float));
2233*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(float));
2234*4bdc9457SAndroid Build Coastguard Worker     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx2_rr1_p5.mask_table[7] - n));
2235*4bdc9457SAndroid Build Coastguard Worker 
2236*4bdc9457SAndroid Build Coastguard Worker     const __m256 vx = _mm256_maskload_ps(x, vmask);
2237*4bdc9457SAndroid Build Coastguard Worker 
2238*4bdc9457SAndroid Build Coastguard Worker     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
2239*4bdc9457SAndroid Build Coastguard Worker 
2240*4bdc9457SAndroid Build Coastguard Worker     __m256 vn = _mm256_fmadd_ps(vz, vlog2e, vmagic_bias);
2241*4bdc9457SAndroid Build Coastguard Worker     const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
2242*4bdc9457SAndroid Build Coastguard Worker     vn = _mm256_sub_ps(vn, vmagic_bias);
2243*4bdc9457SAndroid Build Coastguard Worker 
2244*4bdc9457SAndroid Build Coastguard Worker     __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);
2245*4bdc9457SAndroid Build Coastguard Worker 
2246*4bdc9457SAndroid Build Coastguard Worker     __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
2247*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc3);
2248*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc2);
2249*4bdc9457SAndroid Build Coastguard Worker     vp = _mm256_fmadd_ps(vp, vt, vc1);
2250*4bdc9457SAndroid Build Coastguard Worker 
2251*4bdc9457SAndroid Build Coastguard Worker     vt = _mm256_mul_ps(vt, vs);
2252*4bdc9457SAndroid Build Coastguard Worker     const __m256 ve = _mm256_fmadd_ps(vt, vp, vs);
2253*4bdc9457SAndroid Build Coastguard Worker 
2254*4bdc9457SAndroid Build Coastguard Worker     const __m256 vd = _mm256_add_ps(ve, vone);
2255*4bdc9457SAndroid Build Coastguard Worker     __m256 vf = _mm256_div_ps(ve, vd);
2256*4bdc9457SAndroid Build Coastguard Worker 
2257*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
2258*4bdc9457SAndroid Build Coastguard Worker     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
2259*4bdc9457SAndroid Build Coastguard Worker 
2260*4bdc9457SAndroid Build Coastguard Worker     __m128 vf_lo = _mm256_castps256_ps128(vf);
2261*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(float))) {
2262*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(y, vf_lo);
2263*4bdc9457SAndroid Build Coastguard Worker       vf_lo = _mm256_extractf128_ps(vf, 1);
2264*4bdc9457SAndroid Build Coastguard Worker       y += 4;
2265*4bdc9457SAndroid Build Coastguard Worker     }
2266*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(float))) {
2267*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vf_lo);
2268*4bdc9457SAndroid Build Coastguard Worker       vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
2269*4bdc9457SAndroid Build Coastguard Worker       y += 2;
2270*4bdc9457SAndroid Build Coastguard Worker     }
2271*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(float))) {
2272*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vf_lo);
2273*4bdc9457SAndroid Build Coastguard Worker     }
2274*4bdc9457SAndroid Build Coastguard Worker   }
2275*4bdc9457SAndroid Build Coastguard Worker }
2276*4bdc9457SAndroid Build Coastguard Worker 
2277*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
2278*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
2279*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
2280*4bdc9457SAndroid Build Coastguard Worker     const int8_t** input,
2281*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
2282*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
2283*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
2284*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
2285*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
2286*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
2287*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2288*4bdc9457SAndroid Build Coastguard Worker {
2289*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
2290*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
2291*4bdc9457SAndroid Build Coastguard Worker 
2292*4bdc9457SAndroid Build Coastguard Worker   do {
2293*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i0 = input[0];
2294*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
2295*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
2296*4bdc9457SAndroid Build Coastguard Worker       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2297*4bdc9457SAndroid Build Coastguard Worker     }
2298*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i1 = input[1];
2299*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
2300*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
2301*4bdc9457SAndroid Build Coastguard Worker       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2302*4bdc9457SAndroid Build Coastguard Worker     }
2303*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i2 = input[2];
2304*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
2305*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
2306*4bdc9457SAndroid Build Coastguard Worker       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2307*4bdc9457SAndroid Build Coastguard Worker     }
2308*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i3 = input[3];
2309*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
2310*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
2311*4bdc9457SAndroid Build Coastguard Worker       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2312*4bdc9457SAndroid Build Coastguard Worker     }
2313*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i4 = input[4];
2314*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
2315*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
2316*4bdc9457SAndroid Build Coastguard Worker       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2317*4bdc9457SAndroid Build Coastguard Worker     }
2318*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i5 = input[5];
2319*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
2320*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
2321*4bdc9457SAndroid Build Coastguard Worker       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2322*4bdc9457SAndroid Build Coastguard Worker     }
2323*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i6 = input[6];
2324*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
2325*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
2326*4bdc9457SAndroid Build Coastguard Worker       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2327*4bdc9457SAndroid Build Coastguard Worker     }
2328*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i7 = input[7];
2329*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
2330*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
2331*4bdc9457SAndroid Build Coastguard Worker       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2332*4bdc9457SAndroid Build Coastguard Worker     }
2333*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i8 = input[8];
2334*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
2335*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
2336*4bdc9457SAndroid Build Coastguard Worker       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2337*4bdc9457SAndroid Build Coastguard Worker     }
2338*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i9 = input[9];
2339*4bdc9457SAndroid Build Coastguard Worker     assert(i9 != NULL);
2340*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i9 != zero) {
2341*4bdc9457SAndroid Build Coastguard Worker       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2342*4bdc9457SAndroid Build Coastguard Worker     }
2343*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i10 = input[10];
2344*4bdc9457SAndroid Build Coastguard Worker     assert(i10 != NULL);
2345*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i10 != zero) {
2346*4bdc9457SAndroid Build Coastguard Worker       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2347*4bdc9457SAndroid Build Coastguard Worker     }
2348*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i11 = input[11];
2349*4bdc9457SAndroid Build Coastguard Worker     assert(i11 != NULL);
2350*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i11 != zero) {
2351*4bdc9457SAndroid Build Coastguard Worker       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2352*4bdc9457SAndroid Build Coastguard Worker     }
2353*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i12 = input[12];
2354*4bdc9457SAndroid Build Coastguard Worker     assert(i12 != NULL);
2355*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i12 != zero) {
2356*4bdc9457SAndroid Build Coastguard Worker       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2357*4bdc9457SAndroid Build Coastguard Worker     }
2358*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i13 = input[13];
2359*4bdc9457SAndroid Build Coastguard Worker     assert(i13 != NULL);
2360*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i13 != zero) {
2361*4bdc9457SAndroid Build Coastguard Worker       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2362*4bdc9457SAndroid Build Coastguard Worker     }
2363*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i14 = input[14];
2364*4bdc9457SAndroid Build Coastguard Worker     assert(i14 != NULL);
2365*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i14 != zero) {
2366*4bdc9457SAndroid Build Coastguard Worker       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
2367*4bdc9457SAndroid Build Coastguard Worker     }
2368*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i15 = input[15];
2369*4bdc9457SAndroid Build Coastguard Worker     assert(i15 != NULL);
2370*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i15 != zero) {
2371*4bdc9457SAndroid Build Coastguard Worker       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
2372*4bdc9457SAndroid Build Coastguard Worker     }
2373*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i16 = input[16];
2374*4bdc9457SAndroid Build Coastguard Worker     assert(i16 != NULL);
2375*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i16 != zero) {
2376*4bdc9457SAndroid Build Coastguard Worker       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
2377*4bdc9457SAndroid Build Coastguard Worker     }
2378*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i17 = input[17];
2379*4bdc9457SAndroid Build Coastguard Worker     assert(i17 != NULL);
2380*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i17 != zero) {
2381*4bdc9457SAndroid Build Coastguard Worker       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
2382*4bdc9457SAndroid Build Coastguard Worker     }
2383*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i18 = input[18];
2384*4bdc9457SAndroid Build Coastguard Worker     assert(i18 != NULL);
2385*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i18 != zero) {
2386*4bdc9457SAndroid Build Coastguard Worker       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
2387*4bdc9457SAndroid Build Coastguard Worker     }
2388*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i19 = input[19];
2389*4bdc9457SAndroid Build Coastguard Worker     assert(i19 != NULL);
2390*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i19 != zero) {
2391*4bdc9457SAndroid Build Coastguard Worker       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
2392*4bdc9457SAndroid Build Coastguard Worker     }
2393*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i20 = input[20];
2394*4bdc9457SAndroid Build Coastguard Worker     assert(i20 != NULL);
2395*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i20 != zero) {
2396*4bdc9457SAndroid Build Coastguard Worker       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
2397*4bdc9457SAndroid Build Coastguard Worker     }
2398*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i21 = input[21];
2399*4bdc9457SAndroid Build Coastguard Worker     assert(i21 != NULL);
2400*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i21 != zero) {
2401*4bdc9457SAndroid Build Coastguard Worker       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
2402*4bdc9457SAndroid Build Coastguard Worker     }
2403*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i22 = input[22];
2404*4bdc9457SAndroid Build Coastguard Worker     assert(i22 != NULL);
2405*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i22 != zero) {
2406*4bdc9457SAndroid Build Coastguard Worker       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
2407*4bdc9457SAndroid Build Coastguard Worker     }
2408*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i23 = input[23];
2409*4bdc9457SAndroid Build Coastguard Worker     assert(i23 != NULL);
2410*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i23 != zero) {
2411*4bdc9457SAndroid Build Coastguard Worker       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
2412*4bdc9457SAndroid Build Coastguard Worker     }
2413*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i24 = input[24];
2414*4bdc9457SAndroid Build Coastguard Worker     assert(i24 != NULL);
2415*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i24 != zero) {
2416*4bdc9457SAndroid Build Coastguard Worker       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
2417*4bdc9457SAndroid Build Coastguard Worker     }
2418*4bdc9457SAndroid Build Coastguard Worker     input = (const int8_t**) ((uintptr_t) input + input_stride);
2419*4bdc9457SAndroid Build Coastguard Worker 
2420*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
2421*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
2422*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
2423*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2424*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
2425*4bdc9457SAndroid Build Coastguard Worker 
2426*4bdc9457SAndroid Build Coastguard Worker 
2427*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2428*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2429*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
2430*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
2431*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
2432*4bdc9457SAndroid Build Coastguard Worker 
2433*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2434*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
2435*4bdc9457SAndroid Build Coastguard Worker 
2436*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2437*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2438*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
2439*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
2440*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
2441*4bdc9457SAndroid Build Coastguard Worker 
2442*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2443*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
2444*4bdc9457SAndroid Build Coastguard Worker 
2445*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2446*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2447*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
2448*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
2449*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
2450*4bdc9457SAndroid Build Coastguard Worker 
2451*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2452*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
2453*4bdc9457SAndroid Build Coastguard Worker 
2454*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2455*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
2456*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
2457*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
2458*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
2459*4bdc9457SAndroid Build Coastguard Worker 
2460*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2461*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
2462*4bdc9457SAndroid Build Coastguard Worker 
2463*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2464*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
2465*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
2466*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
2467*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
2468*4bdc9457SAndroid Build Coastguard Worker 
2469*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2470*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
2471*4bdc9457SAndroid Build Coastguard Worker 
2472*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2473*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
2474*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
2475*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
2476*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
2477*4bdc9457SAndroid Build Coastguard Worker 
2478*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2479*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
2480*4bdc9457SAndroid Build Coastguard Worker 
2481*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2482*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
2483*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
2484*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
2485*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
2486*4bdc9457SAndroid Build Coastguard Worker 
2487*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2488*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
2489*4bdc9457SAndroid Build Coastguard Worker 
2490*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2491*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
2492*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
2493*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
2494*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
2495*4bdc9457SAndroid Build Coastguard Worker 
2496*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2497*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
2498*4bdc9457SAndroid Build Coastguard Worker 
2499*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2500*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
2501*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
2502*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
2503*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
2504*4bdc9457SAndroid Build Coastguard Worker 
2505*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2506*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
2507*4bdc9457SAndroid Build Coastguard Worker 
2508*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
2509*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
2510*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
2511*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
2512*4bdc9457SAndroid Build Coastguard Worker       i9 += 16;
2513*4bdc9457SAndroid Build Coastguard Worker 
2514*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
2515*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
2516*4bdc9457SAndroid Build Coastguard Worker 
2517*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
2518*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
2519*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
2520*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
2521*4bdc9457SAndroid Build Coastguard Worker       i10 += 16;
2522*4bdc9457SAndroid Build Coastguard Worker 
2523*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
2524*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
2525*4bdc9457SAndroid Build Coastguard Worker 
2526*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
2527*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
2528*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
2529*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
2530*4bdc9457SAndroid Build Coastguard Worker       i11 += 16;
2531*4bdc9457SAndroid Build Coastguard Worker 
2532*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
2533*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
2534*4bdc9457SAndroid Build Coastguard Worker 
2535*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
2536*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
2537*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
2538*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
2539*4bdc9457SAndroid Build Coastguard Worker       i12 += 16;
2540*4bdc9457SAndroid Build Coastguard Worker 
2541*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
2542*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
2543*4bdc9457SAndroid Build Coastguard Worker 
2544*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
2545*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
2546*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
2547*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
2548*4bdc9457SAndroid Build Coastguard Worker       i13 += 16;
2549*4bdc9457SAndroid Build Coastguard Worker 
2550*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
2551*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
2552*4bdc9457SAndroid Build Coastguard Worker 
2553*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
2554*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
2555*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
2556*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
2557*4bdc9457SAndroid Build Coastguard Worker       i14 += 16;
2558*4bdc9457SAndroid Build Coastguard Worker 
2559*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
2560*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
2561*4bdc9457SAndroid Build Coastguard Worker 
2562*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
2563*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
2564*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
2565*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
2566*4bdc9457SAndroid Build Coastguard Worker       i15 += 16;
2567*4bdc9457SAndroid Build Coastguard Worker 
2568*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
2569*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
2570*4bdc9457SAndroid Build Coastguard Worker 
2571*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
2572*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
2573*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
2574*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
2575*4bdc9457SAndroid Build Coastguard Worker       i16 += 16;
2576*4bdc9457SAndroid Build Coastguard Worker 
2577*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
2578*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
2579*4bdc9457SAndroid Build Coastguard Worker 
2580*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
2581*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
2582*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
2583*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
2584*4bdc9457SAndroid Build Coastguard Worker       i17 += 16;
2585*4bdc9457SAndroid Build Coastguard Worker 
2586*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
2587*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
2588*4bdc9457SAndroid Build Coastguard Worker 
2589*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
2590*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
2591*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
2592*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
2593*4bdc9457SAndroid Build Coastguard Worker       i18 += 16;
2594*4bdc9457SAndroid Build Coastguard Worker 
2595*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
2596*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
2597*4bdc9457SAndroid Build Coastguard Worker 
2598*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
2599*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
2600*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
2601*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
2602*4bdc9457SAndroid Build Coastguard Worker       i19 += 16;
2603*4bdc9457SAndroid Build Coastguard Worker 
2604*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
2605*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
2606*4bdc9457SAndroid Build Coastguard Worker 
2607*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
2608*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
2609*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
2610*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
2611*4bdc9457SAndroid Build Coastguard Worker       i20 += 16;
2612*4bdc9457SAndroid Build Coastguard Worker 
2613*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
2614*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
2615*4bdc9457SAndroid Build Coastguard Worker 
2616*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
2617*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
2618*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
2619*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
2620*4bdc9457SAndroid Build Coastguard Worker       i21 += 16;
2621*4bdc9457SAndroid Build Coastguard Worker 
2622*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
2623*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
2624*4bdc9457SAndroid Build Coastguard Worker 
2625*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
2626*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
2627*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
2628*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
2629*4bdc9457SAndroid Build Coastguard Worker       i22 += 16;
2630*4bdc9457SAndroid Build Coastguard Worker 
2631*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
2632*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
2633*4bdc9457SAndroid Build Coastguard Worker 
2634*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
2635*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
2636*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
2637*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
2638*4bdc9457SAndroid Build Coastguard Worker       i23 += 16;
2639*4bdc9457SAndroid Build Coastguard Worker 
2640*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
2641*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
2642*4bdc9457SAndroid Build Coastguard Worker 
2643*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
2644*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
2645*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
2646*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
2647*4bdc9457SAndroid Build Coastguard Worker       i24 += 16;
2648*4bdc9457SAndroid Build Coastguard Worker 
2649*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
2650*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
2651*4bdc9457SAndroid Build Coastguard Worker 
2652*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
2653*4bdc9457SAndroid Build Coastguard Worker 
2654*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2655*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
2656*4bdc9457SAndroid Build Coastguard Worker 
2657*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
2658*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
2659*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const float*) w + 16);
2660*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2661*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
2662*4bdc9457SAndroid Build Coastguard Worker 
2663*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
2664*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
2665*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
2666*4bdc9457SAndroid Build Coastguard Worker 
2667*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2668*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
2669*4bdc9457SAndroid Build Coastguard Worker 
2670*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
2671*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
2672*4bdc9457SAndroid Build Coastguard Worker 
2673*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
2674*4bdc9457SAndroid Build Coastguard Worker 
2675*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
2676*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
2677*4bdc9457SAndroid Build Coastguard Worker 
2678*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2679*4bdc9457SAndroid Build Coastguard Worker       output += 16;
2680*4bdc9457SAndroid Build Coastguard Worker     }
2681*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
2682*4bdc9457SAndroid Build Coastguard Worker       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
2683*4bdc9457SAndroid Build Coastguard Worker       do {
2684*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2685*4bdc9457SAndroid Build Coastguard Worker 
2686*4bdc9457SAndroid Build Coastguard Worker 
2687*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2688*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
2689*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
2690*4bdc9457SAndroid Build Coastguard Worker 
2691*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2692*4bdc9457SAndroid Build Coastguard Worker 
2693*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2694*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
2695*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
2696*4bdc9457SAndroid Build Coastguard Worker 
2697*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2698*4bdc9457SAndroid Build Coastguard Worker 
2699*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2700*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
2701*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
2702*4bdc9457SAndroid Build Coastguard Worker 
2703*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2704*4bdc9457SAndroid Build Coastguard Worker 
2705*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
2706*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
2707*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
2708*4bdc9457SAndroid Build Coastguard Worker 
2709*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
2710*4bdc9457SAndroid Build Coastguard Worker 
2711*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
2712*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
2713*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
2714*4bdc9457SAndroid Build Coastguard Worker 
2715*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
2716*4bdc9457SAndroid Build Coastguard Worker 
2717*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
2718*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
2719*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
2720*4bdc9457SAndroid Build Coastguard Worker 
2721*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
2722*4bdc9457SAndroid Build Coastguard Worker 
2723*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
2724*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
2725*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
2726*4bdc9457SAndroid Build Coastguard Worker 
2727*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
2728*4bdc9457SAndroid Build Coastguard Worker 
2729*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
2730*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
2731*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
2732*4bdc9457SAndroid Build Coastguard Worker 
2733*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
2734*4bdc9457SAndroid Build Coastguard Worker 
2735*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
2736*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
2737*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
2738*4bdc9457SAndroid Build Coastguard Worker 
2739*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
2740*4bdc9457SAndroid Build Coastguard Worker 
2741*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
2742*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
2743*4bdc9457SAndroid Build Coastguard Worker         i9 += 8;
2744*4bdc9457SAndroid Build Coastguard Worker 
2745*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
2746*4bdc9457SAndroid Build Coastguard Worker 
2747*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
2748*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
2749*4bdc9457SAndroid Build Coastguard Worker         i10 += 8;
2750*4bdc9457SAndroid Build Coastguard Worker 
2751*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
2752*4bdc9457SAndroid Build Coastguard Worker 
2753*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
2754*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
2755*4bdc9457SAndroid Build Coastguard Worker         i11 += 8;
2756*4bdc9457SAndroid Build Coastguard Worker 
2757*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
2758*4bdc9457SAndroid Build Coastguard Worker 
2759*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
2760*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
2761*4bdc9457SAndroid Build Coastguard Worker         i12 += 8;
2762*4bdc9457SAndroid Build Coastguard Worker 
2763*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
2764*4bdc9457SAndroid Build Coastguard Worker 
2765*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
2766*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
2767*4bdc9457SAndroid Build Coastguard Worker         i13 += 8;
2768*4bdc9457SAndroid Build Coastguard Worker 
2769*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
2770*4bdc9457SAndroid Build Coastguard Worker 
2771*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
2772*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
2773*4bdc9457SAndroid Build Coastguard Worker         i14 += 8;
2774*4bdc9457SAndroid Build Coastguard Worker 
2775*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
2776*4bdc9457SAndroid Build Coastguard Worker 
2777*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
2778*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
2779*4bdc9457SAndroid Build Coastguard Worker         i15 += 8;
2780*4bdc9457SAndroid Build Coastguard Worker 
2781*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
2782*4bdc9457SAndroid Build Coastguard Worker 
2783*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
2784*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
2785*4bdc9457SAndroid Build Coastguard Worker         i16 += 8;
2786*4bdc9457SAndroid Build Coastguard Worker 
2787*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
2788*4bdc9457SAndroid Build Coastguard Worker 
2789*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
2790*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
2791*4bdc9457SAndroid Build Coastguard Worker         i17 += 8;
2792*4bdc9457SAndroid Build Coastguard Worker 
2793*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
2794*4bdc9457SAndroid Build Coastguard Worker 
2795*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
2796*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
2797*4bdc9457SAndroid Build Coastguard Worker         i18 += 8;
2798*4bdc9457SAndroid Build Coastguard Worker 
2799*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
2800*4bdc9457SAndroid Build Coastguard Worker 
2801*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
2802*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
2803*4bdc9457SAndroid Build Coastguard Worker         i19 += 8;
2804*4bdc9457SAndroid Build Coastguard Worker 
2805*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
2806*4bdc9457SAndroid Build Coastguard Worker 
2807*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
2808*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
2809*4bdc9457SAndroid Build Coastguard Worker         i20 += 8;
2810*4bdc9457SAndroid Build Coastguard Worker 
2811*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
2812*4bdc9457SAndroid Build Coastguard Worker 
2813*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
2814*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
2815*4bdc9457SAndroid Build Coastguard Worker         i21 += 8;
2816*4bdc9457SAndroid Build Coastguard Worker 
2817*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
2818*4bdc9457SAndroid Build Coastguard Worker 
2819*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
2820*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
2821*4bdc9457SAndroid Build Coastguard Worker         i22 += 8;
2822*4bdc9457SAndroid Build Coastguard Worker 
2823*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
2824*4bdc9457SAndroid Build Coastguard Worker 
2825*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
2826*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
2827*4bdc9457SAndroid Build Coastguard Worker         i23 += 8;
2828*4bdc9457SAndroid Build Coastguard Worker 
2829*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
2830*4bdc9457SAndroid Build Coastguard Worker 
2831*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
2832*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
2833*4bdc9457SAndroid Build Coastguard Worker         i24 += 8;
2834*4bdc9457SAndroid Build Coastguard Worker 
2835*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
2836*4bdc9457SAndroid Build Coastguard Worker 
2837*4bdc9457SAndroid Build Coastguard Worker         k += 8;
2838*4bdc9457SAndroid Build Coastguard Worker 
2839*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2840*4bdc9457SAndroid Build Coastguard Worker         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
2841*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2842*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
2843*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2844*4bdc9457SAndroid Build Coastguard Worker 
2845*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
2846*4bdc9457SAndroid Build Coastguard Worker 
2847*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
2848*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
2849*4bdc9457SAndroid Build Coastguard Worker 
2850*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2851*4bdc9457SAndroid Build Coastguard Worker 
2852*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
2853*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
2854*4bdc9457SAndroid Build Coastguard Worker 
2855*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
2856*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
2857*4bdc9457SAndroid Build Coastguard Worker           output += 8;
2858*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
2859*4bdc9457SAndroid Build Coastguard Worker         } else {
2860*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
2861*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
2862*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
2863*4bdc9457SAndroid Build Coastguard Worker             output += 4;
2864*4bdc9457SAndroid Build Coastguard Worker           }
2865*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
2866*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
2867*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
2868*4bdc9457SAndroid Build Coastguard Worker             output += 2;
2869*4bdc9457SAndroid Build Coastguard Worker           }
2870*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
2871*4bdc9457SAndroid Build Coastguard Worker             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
2872*4bdc9457SAndroid Build Coastguard Worker             output += 1;
2873*4bdc9457SAndroid Build Coastguard Worker           }
2874*4bdc9457SAndroid Build Coastguard Worker           c = 0;
2875*4bdc9457SAndroid Build Coastguard Worker         }
2876*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
2877*4bdc9457SAndroid Build Coastguard Worker     }
2878*4bdc9457SAndroid Build Coastguard Worker 
2879*4bdc9457SAndroid Build Coastguard Worker     output = (int8_t*) ((uintptr_t) output + output_increment);
2880*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
2881*4bdc9457SAndroid Build Coastguard Worker }
2882*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx2_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2883*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx2_mul32(
2884*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
2885*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
2886*4bdc9457SAndroid Build Coastguard Worker     const int8_t** input,
2887*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
2888*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
2889*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
2890*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
2891*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
2892*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
2893*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2894*4bdc9457SAndroid Build Coastguard Worker {
2895*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
2896*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
2897*4bdc9457SAndroid Build Coastguard Worker 
2898*4bdc9457SAndroid Build Coastguard Worker   do {
2899*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i0 = input[0];
2900*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
2901*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
2902*4bdc9457SAndroid Build Coastguard Worker       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2903*4bdc9457SAndroid Build Coastguard Worker     }
2904*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i1 = input[1];
2905*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
2906*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
2907*4bdc9457SAndroid Build Coastguard Worker       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2908*4bdc9457SAndroid Build Coastguard Worker     }
2909*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i2 = input[2];
2910*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
2911*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
2912*4bdc9457SAndroid Build Coastguard Worker       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2913*4bdc9457SAndroid Build Coastguard Worker     }
2914*4bdc9457SAndroid Build Coastguard Worker     input = (const int8_t**) ((uintptr_t) input + input_stride);
2915*4bdc9457SAndroid Build Coastguard Worker 
2916*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
2917*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
2918*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
2919*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2920*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
2921*4bdc9457SAndroid Build Coastguard Worker 
2922*4bdc9457SAndroid Build Coastguard Worker 
2923*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2924*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
2925*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
2926*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
2927*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
2928*4bdc9457SAndroid Build Coastguard Worker 
2929*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2930*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
2931*4bdc9457SAndroid Build Coastguard Worker 
2932*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2933*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
2934*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
2935*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
2936*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
2937*4bdc9457SAndroid Build Coastguard Worker 
2938*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2939*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
2940*4bdc9457SAndroid Build Coastguard Worker 
2941*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2942*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
2943*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
2944*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
2945*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
2946*4bdc9457SAndroid Build Coastguard Worker 
2947*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
2948*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
2949*4bdc9457SAndroid Build Coastguard Worker 
2950*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t));
2951*4bdc9457SAndroid Build Coastguard Worker 
2952*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
2953*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
2954*4bdc9457SAndroid Build Coastguard Worker 
2955*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
2956*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
2957*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const float*) w + 16);
2958*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
2959*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
2960*4bdc9457SAndroid Build Coastguard Worker 
2961*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
2962*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
2963*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
2964*4bdc9457SAndroid Build Coastguard Worker 
2965*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
2966*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
2967*4bdc9457SAndroid Build Coastguard Worker 
2968*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
2969*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
2970*4bdc9457SAndroid Build Coastguard Worker 
2971*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
2972*4bdc9457SAndroid Build Coastguard Worker 
2973*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
2974*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
2975*4bdc9457SAndroid Build Coastguard Worker 
2976*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
2977*4bdc9457SAndroid Build Coastguard Worker       output += 16;
2978*4bdc9457SAndroid Build Coastguard Worker     }
2979*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
2980*4bdc9457SAndroid Build Coastguard Worker       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
2981*4bdc9457SAndroid Build Coastguard Worker       do {
2982*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
2983*4bdc9457SAndroid Build Coastguard Worker 
2984*4bdc9457SAndroid Build Coastguard Worker 
2985*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
2986*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
2987*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
2988*4bdc9457SAndroid Build Coastguard Worker 
2989*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
2990*4bdc9457SAndroid Build Coastguard Worker 
2991*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
2992*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
2993*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
2994*4bdc9457SAndroid Build Coastguard Worker 
2995*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
2996*4bdc9457SAndroid Build Coastguard Worker 
2997*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
2998*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
2999*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
3000*4bdc9457SAndroid Build Coastguard Worker 
3001*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3002*4bdc9457SAndroid Build Coastguard Worker 
3003*4bdc9457SAndroid Build Coastguard Worker         k += 8;
3004*4bdc9457SAndroid Build Coastguard Worker 
3005*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3006*4bdc9457SAndroid Build Coastguard Worker         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3007*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
3008*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3009*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3010*4bdc9457SAndroid Build Coastguard Worker 
3011*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
3012*4bdc9457SAndroid Build Coastguard Worker 
3013*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3014*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3015*4bdc9457SAndroid Build Coastguard Worker 
3016*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3017*4bdc9457SAndroid Build Coastguard Worker 
3018*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3019*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3020*4bdc9457SAndroid Build Coastguard Worker 
3021*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
3022*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3023*4bdc9457SAndroid Build Coastguard Worker           output += 8;
3024*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
3025*4bdc9457SAndroid Build Coastguard Worker         } else {
3026*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
3027*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3028*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3029*4bdc9457SAndroid Build Coastguard Worker             output += 4;
3030*4bdc9457SAndroid Build Coastguard Worker           }
3031*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
3032*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3033*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3034*4bdc9457SAndroid Build Coastguard Worker             output += 2;
3035*4bdc9457SAndroid Build Coastguard Worker           }
3036*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
3037*4bdc9457SAndroid Build Coastguard Worker             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3038*4bdc9457SAndroid Build Coastguard Worker             output += 1;
3039*4bdc9457SAndroid Build Coastguard Worker           }
3040*4bdc9457SAndroid Build Coastguard Worker           c = 0;
3041*4bdc9457SAndroid Build Coastguard Worker         }
3042*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
3043*4bdc9457SAndroid Build Coastguard Worker     }
3044*4bdc9457SAndroid Build Coastguard Worker 
3045*4bdc9457SAndroid Build Coastguard Worker     output = (int8_t*) ((uintptr_t) output + output_increment);
3046*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
3047*4bdc9457SAndroid Build Coastguard Worker }
3048*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3049*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
3050*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
3051*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
3052*4bdc9457SAndroid Build Coastguard Worker     const int8_t** input,
3053*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
3054*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
3055*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
3056*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
3057*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
3058*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
3059*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3060*4bdc9457SAndroid Build Coastguard Worker {
3061*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
3062*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
3063*4bdc9457SAndroid Build Coastguard Worker 
3064*4bdc9457SAndroid Build Coastguard Worker   do {
3065*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i0 = input[0];
3066*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
3067*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
3068*4bdc9457SAndroid Build Coastguard Worker       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3069*4bdc9457SAndroid Build Coastguard Worker     }
3070*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i1 = input[1];
3071*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
3072*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
3073*4bdc9457SAndroid Build Coastguard Worker       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3074*4bdc9457SAndroid Build Coastguard Worker     }
3075*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i2 = input[2];
3076*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
3077*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
3078*4bdc9457SAndroid Build Coastguard Worker       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3079*4bdc9457SAndroid Build Coastguard Worker     }
3080*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i3 = input[3];
3081*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
3082*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
3083*4bdc9457SAndroid Build Coastguard Worker       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3084*4bdc9457SAndroid Build Coastguard Worker     }
3085*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i4 = input[4];
3086*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
3087*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
3088*4bdc9457SAndroid Build Coastguard Worker       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3089*4bdc9457SAndroid Build Coastguard Worker     }
3090*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i5 = input[5];
3091*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
3092*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
3093*4bdc9457SAndroid Build Coastguard Worker       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3094*4bdc9457SAndroid Build Coastguard Worker     }
3095*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i6 = input[6];
3096*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
3097*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
3098*4bdc9457SAndroid Build Coastguard Worker       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3099*4bdc9457SAndroid Build Coastguard Worker     }
3100*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i7 = input[7];
3101*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
3102*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
3103*4bdc9457SAndroid Build Coastguard Worker       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3104*4bdc9457SAndroid Build Coastguard Worker     }
3105*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i8 = input[8];
3106*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
3107*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
3108*4bdc9457SAndroid Build Coastguard Worker       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3109*4bdc9457SAndroid Build Coastguard Worker     }
3110*4bdc9457SAndroid Build Coastguard Worker     input = (const int8_t**) ((uintptr_t) input + input_stride);
3111*4bdc9457SAndroid Build Coastguard Worker 
3112*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
3113*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
3114*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
3115*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3116*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
3117*4bdc9457SAndroid Build Coastguard Worker 
3118*4bdc9457SAndroid Build Coastguard Worker 
3119*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3120*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
3121*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
3122*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
3123*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
3124*4bdc9457SAndroid Build Coastguard Worker 
3125*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3126*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
3127*4bdc9457SAndroid Build Coastguard Worker 
3128*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3129*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
3130*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
3131*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
3132*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
3133*4bdc9457SAndroid Build Coastguard Worker 
3134*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3135*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
3136*4bdc9457SAndroid Build Coastguard Worker 
3137*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3138*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
3139*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
3140*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
3141*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
3142*4bdc9457SAndroid Build Coastguard Worker 
3143*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3144*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
3145*4bdc9457SAndroid Build Coastguard Worker 
3146*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3147*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
3148*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
3149*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
3150*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
3151*4bdc9457SAndroid Build Coastguard Worker 
3152*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3153*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
3154*4bdc9457SAndroid Build Coastguard Worker 
3155*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3156*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
3157*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
3158*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
3159*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
3160*4bdc9457SAndroid Build Coastguard Worker 
3161*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3162*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
3163*4bdc9457SAndroid Build Coastguard Worker 
3164*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3165*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
3166*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
3167*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
3168*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
3169*4bdc9457SAndroid Build Coastguard Worker 
3170*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3171*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
3172*4bdc9457SAndroid Build Coastguard Worker 
3173*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3174*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
3175*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
3176*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
3177*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
3178*4bdc9457SAndroid Build Coastguard Worker 
3179*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3180*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
3181*4bdc9457SAndroid Build Coastguard Worker 
3182*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3183*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
3184*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
3185*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
3186*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
3187*4bdc9457SAndroid Build Coastguard Worker 
3188*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3189*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
3190*4bdc9457SAndroid Build Coastguard Worker 
3191*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3192*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
3193*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
3194*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
3195*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
3196*4bdc9457SAndroid Build Coastguard Worker 
3197*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3198*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
3199*4bdc9457SAndroid Build Coastguard Worker 
3200*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
3201*4bdc9457SAndroid Build Coastguard Worker 
3202*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3203*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
3204*4bdc9457SAndroid Build Coastguard Worker 
3205*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale01234567 = _mm256_loadu_ps((const float*) w);
3206*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale89ABCDEF = _mm256_loadu_ps((const float*) w + 8);
3207*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const float*) w + 16);
3208*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
3209*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale89ABCDEF);
3210*4bdc9457SAndroid Build Coastguard Worker 
3211*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3212*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
3213*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
3214*4bdc9457SAndroid Build Coastguard Worker 
3215*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3216*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
3217*4bdc9457SAndroid Build Coastguard Worker 
3218*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3219*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
3220*4bdc9457SAndroid Build Coastguard Worker 
3221*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
3222*4bdc9457SAndroid Build Coastguard Worker 
3223*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3224*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
3225*4bdc9457SAndroid Build Coastguard Worker 
3226*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
3227*4bdc9457SAndroid Build Coastguard Worker       output += 16;
3228*4bdc9457SAndroid Build Coastguard Worker     }
3229*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
3230*4bdc9457SAndroid Build Coastguard Worker       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
3231*4bdc9457SAndroid Build Coastguard Worker       do {
3232*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
3233*4bdc9457SAndroid Build Coastguard Worker 
3234*4bdc9457SAndroid Build Coastguard Worker 
3235*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
3236*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
3237*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
3238*4bdc9457SAndroid Build Coastguard Worker 
3239*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
3240*4bdc9457SAndroid Build Coastguard Worker 
3241*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
3242*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
3243*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
3244*4bdc9457SAndroid Build Coastguard Worker 
3245*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
3246*4bdc9457SAndroid Build Coastguard Worker 
3247*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
3248*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
3249*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
3250*4bdc9457SAndroid Build Coastguard Worker 
3251*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
3252*4bdc9457SAndroid Build Coastguard Worker 
3253*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
3254*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
3255*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
3256*4bdc9457SAndroid Build Coastguard Worker 
3257*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
3258*4bdc9457SAndroid Build Coastguard Worker 
3259*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
3260*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
3261*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
3262*4bdc9457SAndroid Build Coastguard Worker 
3263*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
3264*4bdc9457SAndroid Build Coastguard Worker 
3265*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
3266*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
3267*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
3268*4bdc9457SAndroid Build Coastguard Worker 
3269*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
3270*4bdc9457SAndroid Build Coastguard Worker 
3271*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
3272*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
3273*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
3274*4bdc9457SAndroid Build Coastguard Worker 
3275*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
3276*4bdc9457SAndroid Build Coastguard Worker 
3277*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
3278*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
3279*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
3280*4bdc9457SAndroid Build Coastguard Worker 
3281*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
3282*4bdc9457SAndroid Build Coastguard Worker 
3283*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
3284*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
3285*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
3286*4bdc9457SAndroid Build Coastguard Worker 
3287*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
3288*4bdc9457SAndroid Build Coastguard Worker 
3289*4bdc9457SAndroid Build Coastguard Worker         k += 8;
3290*4bdc9457SAndroid Build Coastguard Worker 
3291*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
3292*4bdc9457SAndroid Build Coastguard Worker         const __m256 vscale01234567 = _mm256_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3293*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale01234567);
3294*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
3295*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
3296*4bdc9457SAndroid Build Coastguard Worker 
3297*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
3298*4bdc9457SAndroid Build Coastguard Worker 
3299*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
3300*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
3301*4bdc9457SAndroid Build Coastguard Worker 
3302*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3303*4bdc9457SAndroid Build Coastguard Worker 
3304*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
3305*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3306*4bdc9457SAndroid Build Coastguard Worker 
3307*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
3308*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3309*4bdc9457SAndroid Build Coastguard Worker           output += 8;
3310*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
3311*4bdc9457SAndroid Build Coastguard Worker         } else {
3312*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
3313*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3314*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3315*4bdc9457SAndroid Build Coastguard Worker             output += 4;
3316*4bdc9457SAndroid Build Coastguard Worker           }
3317*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
3318*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3319*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3320*4bdc9457SAndroid Build Coastguard Worker             output += 2;
3321*4bdc9457SAndroid Build Coastguard Worker           }
3322*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
3323*4bdc9457SAndroid Build Coastguard Worker             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3324*4bdc9457SAndroid Build Coastguard Worker             output += 1;
3325*4bdc9457SAndroid Build Coastguard Worker           }
3326*4bdc9457SAndroid Build Coastguard Worker           c = 0;
3327*4bdc9457SAndroid Build Coastguard Worker         }
3328*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
3329*4bdc9457SAndroid Build Coastguard Worker     }
3330*4bdc9457SAndroid Build Coastguard Worker 
3331*4bdc9457SAndroid Build Coastguard Worker     output = (int8_t*) ((uintptr_t) output + output_increment);
3332*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
3333*4bdc9457SAndroid Build Coastguard Worker }
3334*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3335*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
3336*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
3337*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
3338*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
3339*4bdc9457SAndroid Build Coastguard Worker     const int8_t* restrict a,
3340*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
3341*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
3342*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
3343*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
3344*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
3345*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3346*4bdc9457SAndroid Build Coastguard Worker {
3347*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
3348*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
3349*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
3350*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
3351*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(int8_t) == 0);
3352*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
3353*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
3354*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
3355*4bdc9457SAndroid Build Coastguard Worker 
3356*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
3357*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a0 = a;
3358*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
3359*4bdc9457SAndroid Build Coastguard Worker 
3360*4bdc9457SAndroid Build Coastguard Worker   do {
3361*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
3362*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
3363*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3364*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
3365*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
3366*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3367*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
3368*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
3369*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3370*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
3371*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
3372*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3373*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
3374*4bdc9457SAndroid Build Coastguard Worker 
3375*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
3376*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
3377*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3378*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3379*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
3380*4bdc9457SAndroid Build Coastguard Worker 
3381*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3382*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3383*4bdc9457SAndroid Build Coastguard Worker 
3384*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3385*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3386*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3387*4bdc9457SAndroid Build Coastguard Worker 
3388*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3389*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3390*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3391*4bdc9457SAndroid Build Coastguard Worker 
3392*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3393*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3394*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3395*4bdc9457SAndroid Build Coastguard Worker 
3396*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3397*4bdc9457SAndroid Build Coastguard Worker 
3398*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const int8_t*) w + 64);
3399*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(int8_t);
3400*4bdc9457SAndroid Build Coastguard Worker     }
3401*4bdc9457SAndroid Build Coastguard Worker 
3402*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3403*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3404*4bdc9457SAndroid Build Coastguard Worker 
3405*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3406*4bdc9457SAndroid Build Coastguard Worker 
3407*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3408*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3409*4bdc9457SAndroid Build Coastguard Worker 
3410*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3411*4bdc9457SAndroid Build Coastguard Worker 
3412*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale01234567 = _mm256_load_ps(w);
3413*4bdc9457SAndroid Build Coastguard Worker     w = (const void*) ((const float*) w + 8);
3414*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
3415*4bdc9457SAndroid Build Coastguard Worker 
3416*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3417*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3418*4bdc9457SAndroid Build Coastguard Worker 
3419*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3420*4bdc9457SAndroid Build Coastguard Worker 
3421*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3422*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
3423*4bdc9457SAndroid Build Coastguard Worker 
3424*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3425*4bdc9457SAndroid Build Coastguard Worker 
3426*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
3427*4bdc9457SAndroid Build Coastguard Worker 
3428*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3429*4bdc9457SAndroid Build Coastguard Worker 
3430*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
3431*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3432*4bdc9457SAndroid Build Coastguard Worker 
3433*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
3434*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
3435*4bdc9457SAndroid Build Coastguard Worker 
3436*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3437*4bdc9457SAndroid Build Coastguard Worker 
3438*4bdc9457SAndroid Build Coastguard Worker       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
3439*4bdc9457SAndroid Build Coastguard Worker 
3440*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
3441*4bdc9457SAndroid Build Coastguard Worker     } else {
3442*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
3443*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
3444*4bdc9457SAndroid Build Coastguard Worker 
3445*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
3446*4bdc9457SAndroid Build Coastguard Worker 
3447*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
3448*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
3449*4bdc9457SAndroid Build Coastguard Worker       }
3450*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
3451*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
3452*4bdc9457SAndroid Build Coastguard Worker 
3453*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
3454*4bdc9457SAndroid Build Coastguard Worker 
3455*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
3456*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
3457*4bdc9457SAndroid Build Coastguard Worker       }
3458*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
3459*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
3460*4bdc9457SAndroid Build Coastguard Worker       }
3461*4bdc9457SAndroid Build Coastguard Worker 
3462*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
3463*4bdc9457SAndroid Build Coastguard Worker     }
3464*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
3465*4bdc9457SAndroid Build Coastguard Worker }
3466*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3467*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
3468*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
3469*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
3470*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
3471*4bdc9457SAndroid Build Coastguard Worker     const int8_t* restrict a,
3472*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
3473*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
3474*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
3475*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
3476*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
3477*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3478*4bdc9457SAndroid Build Coastguard Worker {
3479*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
3480*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
3481*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
3482*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
3483*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(int8_t) == 0);
3484*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
3485*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
3486*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
3487*4bdc9457SAndroid Build Coastguard Worker 
3488*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
3489*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a0 = a;
3490*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
3491*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
3492*4bdc9457SAndroid Build Coastguard Worker   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
3493*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
3494*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
3495*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
3496*4bdc9457SAndroid Build Coastguard Worker   }
3497*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
3498*4bdc9457SAndroid Build Coastguard Worker   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
3499*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
3500*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
3501*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
3502*4bdc9457SAndroid Build Coastguard Worker   }
3503*4bdc9457SAndroid Build Coastguard Worker 
3504*4bdc9457SAndroid Build Coastguard Worker   do {
3505*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
3506*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
3507*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3508*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
3509*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
3510*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3511*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
3512*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
3513*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3514*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
3515*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
3516*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3517*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
3518*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
3519*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
3520*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
3521*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
3522*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
3523*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
3524*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
3525*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
3526*4bdc9457SAndroid Build Coastguard Worker 
3527*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
3528*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
3529*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3530*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3531*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
3532*4bdc9457SAndroid Build Coastguard Worker       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
3533*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
3534*4bdc9457SAndroid Build Coastguard Worker       a1 += 8;
3535*4bdc9457SAndroid Build Coastguard Worker       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
3536*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
3537*4bdc9457SAndroid Build Coastguard Worker       a2 += 8;
3538*4bdc9457SAndroid Build Coastguard Worker 
3539*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3540*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3541*4bdc9457SAndroid Build Coastguard Worker 
3542*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3543*4bdc9457SAndroid Build Coastguard Worker       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
3544*4bdc9457SAndroid Build Coastguard Worker       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
3545*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3546*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3547*4bdc9457SAndroid Build Coastguard Worker 
3548*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3549*4bdc9457SAndroid Build Coastguard Worker       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
3550*4bdc9457SAndroid Build Coastguard Worker       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
3551*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3552*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3553*4bdc9457SAndroid Build Coastguard Worker 
3554*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3555*4bdc9457SAndroid Build Coastguard Worker       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
3556*4bdc9457SAndroid Build Coastguard Worker       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
3557*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3558*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3559*4bdc9457SAndroid Build Coastguard Worker 
3560*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3561*4bdc9457SAndroid Build Coastguard Worker       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
3562*4bdc9457SAndroid Build Coastguard Worker       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
3563*4bdc9457SAndroid Build Coastguard Worker 
3564*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const int8_t*) w + 64);
3565*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(int8_t);
3566*4bdc9457SAndroid Build Coastguard Worker     }
3567*4bdc9457SAndroid Build Coastguard Worker 
3568*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3569*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3570*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
3571*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
3572*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
3573*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
3574*4bdc9457SAndroid Build Coastguard Worker 
3575*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3576*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
3577*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
3578*4bdc9457SAndroid Build Coastguard Worker 
3579*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3580*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3581*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
3582*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
3583*4bdc9457SAndroid Build Coastguard Worker 
3584*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3585*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
3586*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
3587*4bdc9457SAndroid Build Coastguard Worker 
3588*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale01234567 = _mm256_load_ps(w);
3589*4bdc9457SAndroid Build Coastguard Worker     w = (const void*) ((const float*) w + 8);
3590*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
3591*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
3592*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
3593*4bdc9457SAndroid Build Coastguard Worker 
3594*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3595*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3596*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
3597*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
3598*4bdc9457SAndroid Build Coastguard Worker 
3599*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3600*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
3601*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
3602*4bdc9457SAndroid Build Coastguard Worker 
3603*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3604*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
3605*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
3606*4bdc9457SAndroid Build Coastguard Worker 
3607*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3608*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3609*4bdc9457SAndroid Build Coastguard Worker 
3610*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
3611*4bdc9457SAndroid Build Coastguard Worker 
3612*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3613*4bdc9457SAndroid Build Coastguard Worker 
3614*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
3615*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3616*4bdc9457SAndroid Build Coastguard Worker 
3617*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
3618*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
3619*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
3620*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
3621*4bdc9457SAndroid Build Coastguard Worker 
3622*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3623*4bdc9457SAndroid Build Coastguard Worker       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
3624*4bdc9457SAndroid Build Coastguard Worker       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
3625*4bdc9457SAndroid Build Coastguard Worker 
3626*4bdc9457SAndroid Build Coastguard Worker       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
3627*4bdc9457SAndroid Build Coastguard Worker       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
3628*4bdc9457SAndroid Build Coastguard Worker       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
3629*4bdc9457SAndroid Build Coastguard Worker 
3630*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
3631*4bdc9457SAndroid Build Coastguard Worker     } else {
3632*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
3633*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
3634*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
3635*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
3636*4bdc9457SAndroid Build Coastguard Worker 
3637*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
3638*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
3639*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
3640*4bdc9457SAndroid Build Coastguard Worker 
3641*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
3642*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
3643*4bdc9457SAndroid Build Coastguard Worker       }
3644*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
3645*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
3646*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
3647*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
3648*4bdc9457SAndroid Build Coastguard Worker 
3649*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
3650*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
3651*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
3652*4bdc9457SAndroid Build Coastguard Worker 
3653*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
3654*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
3655*4bdc9457SAndroid Build Coastguard Worker       }
3656*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
3657*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
3658*4bdc9457SAndroid Build Coastguard Worker         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
3659*4bdc9457SAndroid Build Coastguard Worker         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
3660*4bdc9457SAndroid Build Coastguard Worker       }
3661*4bdc9457SAndroid Build Coastguard Worker 
3662*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
3663*4bdc9457SAndroid Build Coastguard Worker     }
3664*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
3665*4bdc9457SAndroid Build Coastguard Worker }
3666*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3667*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
3668*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
3669*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
3670*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
3671*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
3672*4bdc9457SAndroid Build Coastguard Worker     const int8_t** restrict a,
3673*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
3674*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
3675*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
3676*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
3677*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
3678*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
3679*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3680*4bdc9457SAndroid Build Coastguard Worker {
3681*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
3682*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
3683*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
3684*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
3685*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
3686*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (1 * sizeof(void*)) == 0);
3687*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(int8_t) == 0);
3688*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
3689*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
3690*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
3691*4bdc9457SAndroid Build Coastguard Worker 
3692*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
3693*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
3694*4bdc9457SAndroid Build Coastguard Worker 
3695*4bdc9457SAndroid Build Coastguard Worker   do {
3696*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
3697*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
3698*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3699*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
3700*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
3701*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3702*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
3703*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
3704*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3705*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
3706*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
3707*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3708*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
3709*4bdc9457SAndroid Build Coastguard Worker 
3710*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
3711*4bdc9457SAndroid Build Coastguard Worker     do {
3712*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a0 = a[0];
3713*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
3714*4bdc9457SAndroid Build Coastguard Worker         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
3715*4bdc9457SAndroid Build Coastguard Worker       }
3716*4bdc9457SAndroid Build Coastguard Worker       a += 1;
3717*4bdc9457SAndroid Build Coastguard Worker 
3718*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
3719*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
3720*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3721*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3722*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
3723*4bdc9457SAndroid Build Coastguard Worker 
3724*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3725*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3726*4bdc9457SAndroid Build Coastguard Worker 
3727*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3728*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3729*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3730*4bdc9457SAndroid Build Coastguard Worker 
3731*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3732*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3733*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3734*4bdc9457SAndroid Build Coastguard Worker 
3735*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3736*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3737*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3738*4bdc9457SAndroid Build Coastguard Worker 
3739*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3740*4bdc9457SAndroid Build Coastguard Worker 
3741*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int8_t*) w + 64);
3742*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(int8_t);
3743*4bdc9457SAndroid Build Coastguard Worker       }
3744*4bdc9457SAndroid Build Coastguard Worker       p -= 1 * sizeof(void*);
3745*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
3746*4bdc9457SAndroid Build Coastguard Worker 
3747*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3748*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3749*4bdc9457SAndroid Build Coastguard Worker 
3750*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3751*4bdc9457SAndroid Build Coastguard Worker 
3752*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3753*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3754*4bdc9457SAndroid Build Coastguard Worker 
3755*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3756*4bdc9457SAndroid Build Coastguard Worker 
3757*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale01234567 = _mm256_load_ps(w);
3758*4bdc9457SAndroid Build Coastguard Worker     w = (const void*) ((const float*) w + 8);
3759*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
3760*4bdc9457SAndroid Build Coastguard Worker 
3761*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3762*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3763*4bdc9457SAndroid Build Coastguard Worker 
3764*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3765*4bdc9457SAndroid Build Coastguard Worker 
3766*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3767*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
3768*4bdc9457SAndroid Build Coastguard Worker 
3769*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3770*4bdc9457SAndroid Build Coastguard Worker 
3771*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
3772*4bdc9457SAndroid Build Coastguard Worker 
3773*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3774*4bdc9457SAndroid Build Coastguard Worker 
3775*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
3776*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3777*4bdc9457SAndroid Build Coastguard Worker 
3778*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
3779*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
3780*4bdc9457SAndroid Build Coastguard Worker 
3781*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3782*4bdc9457SAndroid Build Coastguard Worker 
3783*4bdc9457SAndroid Build Coastguard Worker       a = (const int8_t**restrict) ((uintptr_t) a - ks);
3784*4bdc9457SAndroid Build Coastguard Worker 
3785*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
3786*4bdc9457SAndroid Build Coastguard Worker     } else {
3787*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
3788*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
3789*4bdc9457SAndroid Build Coastguard Worker 
3790*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
3791*4bdc9457SAndroid Build Coastguard Worker 
3792*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
3793*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
3794*4bdc9457SAndroid Build Coastguard Worker       }
3795*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
3796*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
3797*4bdc9457SAndroid Build Coastguard Worker 
3798*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
3799*4bdc9457SAndroid Build Coastguard Worker 
3800*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
3801*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
3802*4bdc9457SAndroid Build Coastguard Worker       }
3803*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
3804*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
3805*4bdc9457SAndroid Build Coastguard Worker       }
3806*4bdc9457SAndroid Build Coastguard Worker 
3807*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
3808*4bdc9457SAndroid Build Coastguard Worker     }
3809*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
3810*4bdc9457SAndroid Build Coastguard Worker }
3811*4bdc9457SAndroid Build Coastguard Worker 
xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])3812*4bdc9457SAndroid Build Coastguard Worker void xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
3813*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
3814*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
3815*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
3816*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
3817*4bdc9457SAndroid Build Coastguard Worker     const int8_t** restrict a,
3818*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
3819*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
3820*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
3821*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
3822*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
3823*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
3824*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3825*4bdc9457SAndroid Build Coastguard Worker {
3826*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
3827*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
3828*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
3829*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
3830*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
3831*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (3 * sizeof(void*)) == 0);
3832*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(int8_t) == 0);
3833*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
3834*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
3835*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
3836*4bdc9457SAndroid Build Coastguard Worker 
3837*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
3838*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
3839*4bdc9457SAndroid Build Coastguard Worker   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
3840*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
3841*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
3842*4bdc9457SAndroid Build Coastguard Worker   }
3843*4bdc9457SAndroid Build Coastguard Worker   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
3844*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
3845*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
3846*4bdc9457SAndroid Build Coastguard Worker   }
3847*4bdc9457SAndroid Build Coastguard Worker 
3848*4bdc9457SAndroid Build Coastguard Worker   do {
3849*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
3850*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
3851*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
3852*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
3853*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
3854*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
3855*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
3856*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
3857*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
3858*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
3859*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
3860*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
3861*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
3862*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
3863*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
3864*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
3865*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
3866*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
3867*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
3868*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
3869*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
3870*4bdc9457SAndroid Build Coastguard Worker 
3871*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
3872*4bdc9457SAndroid Build Coastguard Worker     do {
3873*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a0 = a[0];
3874*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
3875*4bdc9457SAndroid Build Coastguard Worker         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
3876*4bdc9457SAndroid Build Coastguard Worker       }
3877*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a1 = a[1];
3878*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
3879*4bdc9457SAndroid Build Coastguard Worker         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
3880*4bdc9457SAndroid Build Coastguard Worker       }
3881*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a2 = a[2];
3882*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
3883*4bdc9457SAndroid Build Coastguard Worker         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
3884*4bdc9457SAndroid Build Coastguard Worker       }
3885*4bdc9457SAndroid Build Coastguard Worker       a += 3;
3886*4bdc9457SAndroid Build Coastguard Worker 
3887*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
3888*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
3889*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
3890*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
3891*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
3892*4bdc9457SAndroid Build Coastguard Worker         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
3893*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
3894*4bdc9457SAndroid Build Coastguard Worker         a1 += 8;
3895*4bdc9457SAndroid Build Coastguard Worker         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
3896*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
3897*4bdc9457SAndroid Build Coastguard Worker         a2 += 8;
3898*4bdc9457SAndroid Build Coastguard Worker 
3899*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
3900*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
3901*4bdc9457SAndroid Build Coastguard Worker 
3902*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
3903*4bdc9457SAndroid Build Coastguard Worker         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
3904*4bdc9457SAndroid Build Coastguard Worker         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
3905*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
3906*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
3907*4bdc9457SAndroid Build Coastguard Worker 
3908*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
3909*4bdc9457SAndroid Build Coastguard Worker         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
3910*4bdc9457SAndroid Build Coastguard Worker         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
3911*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
3912*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
3913*4bdc9457SAndroid Build Coastguard Worker 
3914*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
3915*4bdc9457SAndroid Build Coastguard Worker         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
3916*4bdc9457SAndroid Build Coastguard Worker         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
3917*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
3918*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
3919*4bdc9457SAndroid Build Coastguard Worker 
3920*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
3921*4bdc9457SAndroid Build Coastguard Worker         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
3922*4bdc9457SAndroid Build Coastguard Worker         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
3923*4bdc9457SAndroid Build Coastguard Worker 
3924*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int8_t*) w + 64);
3925*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(int8_t);
3926*4bdc9457SAndroid Build Coastguard Worker       }
3927*4bdc9457SAndroid Build Coastguard Worker       p -= 3 * sizeof(void*);
3928*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
3929*4bdc9457SAndroid Build Coastguard Worker 
3930*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
3931*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
3932*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
3933*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
3934*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
3935*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
3936*4bdc9457SAndroid Build Coastguard Worker 
3937*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
3938*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
3939*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
3940*4bdc9457SAndroid Build Coastguard Worker 
3941*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
3942*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
3943*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
3944*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
3945*4bdc9457SAndroid Build Coastguard Worker 
3946*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
3947*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
3948*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
3949*4bdc9457SAndroid Build Coastguard Worker 
3950*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale01234567 = _mm256_load_ps(w);
3951*4bdc9457SAndroid Build Coastguard Worker     w = (const void*) ((const float*) w + 8);
3952*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale01234567);
3953*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale01234567);
3954*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale01234567);
3955*4bdc9457SAndroid Build Coastguard Worker 
3956*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
3957*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
3958*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
3959*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
3960*4bdc9457SAndroid Build Coastguard Worker 
3961*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
3962*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
3963*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
3964*4bdc9457SAndroid Build Coastguard Worker 
3965*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
3966*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
3967*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
3968*4bdc9457SAndroid Build Coastguard Worker 
3969*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3970*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
3971*4bdc9457SAndroid Build Coastguard Worker 
3972*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
3973*4bdc9457SAndroid Build Coastguard Worker 
3974*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
3975*4bdc9457SAndroid Build Coastguard Worker 
3976*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
3977*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
3978*4bdc9457SAndroid Build Coastguard Worker 
3979*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
3980*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
3981*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
3982*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
3983*4bdc9457SAndroid Build Coastguard Worker 
3984*4bdc9457SAndroid Build Coastguard Worker       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
3985*4bdc9457SAndroid Build Coastguard Worker       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
3986*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
3987*4bdc9457SAndroid Build Coastguard Worker 
3988*4bdc9457SAndroid Build Coastguard Worker       a = (const int8_t**restrict) ((uintptr_t) a - ks);
3989*4bdc9457SAndroid Build Coastguard Worker 
3990*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
3991*4bdc9457SAndroid Build Coastguard Worker     } else {
3992*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
3993*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
3994*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
3995*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
3996*4bdc9457SAndroid Build Coastguard Worker 
3997*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
3998*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
3999*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
4000*4bdc9457SAndroid Build Coastguard Worker 
4001*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
4002*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
4003*4bdc9457SAndroid Build Coastguard Worker       }
4004*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
4005*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
4006*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
4007*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
4008*4bdc9457SAndroid Build Coastguard Worker 
4009*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
4010*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
4011*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
4012*4bdc9457SAndroid Build Coastguard Worker 
4013*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
4014*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
4015*4bdc9457SAndroid Build Coastguard Worker       }
4016*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
4017*4bdc9457SAndroid Build Coastguard Worker         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
4018*4bdc9457SAndroid Build Coastguard Worker         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
4019*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
4020*4bdc9457SAndroid Build Coastguard Worker       }
4021*4bdc9457SAndroid Build Coastguard Worker 
4022*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
4023*4bdc9457SAndroid Build Coastguard Worker     }
4024*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
4025*4bdc9457SAndroid Build Coastguard Worker }
4026*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4027*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
4028*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4029*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
4030*4bdc9457SAndroid Build Coastguard Worker     const int8_t** input,
4031*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
4032*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
4033*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
4034*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
4035*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
4036*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
4037*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4038*4bdc9457SAndroid Build Coastguard Worker {
4039*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4040*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
4041*4bdc9457SAndroid Build Coastguard Worker 
4042*4bdc9457SAndroid Build Coastguard Worker   do {
4043*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i0 = input[0];
4044*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
4045*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
4046*4bdc9457SAndroid Build Coastguard Worker       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
4047*4bdc9457SAndroid Build Coastguard Worker     }
4048*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i1 = input[1];
4049*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
4050*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
4051*4bdc9457SAndroid Build Coastguard Worker       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
4052*4bdc9457SAndroid Build Coastguard Worker     }
4053*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i2 = input[2];
4054*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
4055*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
4056*4bdc9457SAndroid Build Coastguard Worker       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
4057*4bdc9457SAndroid Build Coastguard Worker     }
4058*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i3 = input[3];
4059*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
4060*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
4061*4bdc9457SAndroid Build Coastguard Worker       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
4062*4bdc9457SAndroid Build Coastguard Worker     }
4063*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i4 = input[4];
4064*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
4065*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
4066*4bdc9457SAndroid Build Coastguard Worker       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
4067*4bdc9457SAndroid Build Coastguard Worker     }
4068*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i5 = input[5];
4069*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
4070*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
4071*4bdc9457SAndroid Build Coastguard Worker       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
4072*4bdc9457SAndroid Build Coastguard Worker     }
4073*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i6 = input[6];
4074*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
4075*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
4076*4bdc9457SAndroid Build Coastguard Worker       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
4077*4bdc9457SAndroid Build Coastguard Worker     }
4078*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i7 = input[7];
4079*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
4080*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
4081*4bdc9457SAndroid Build Coastguard Worker       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
4082*4bdc9457SAndroid Build Coastguard Worker     }
4083*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i8 = input[8];
4084*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
4085*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
4086*4bdc9457SAndroid Build Coastguard Worker       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
4087*4bdc9457SAndroid Build Coastguard Worker     }
4088*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i9 = input[9];
4089*4bdc9457SAndroid Build Coastguard Worker     assert(i9 != NULL);
4090*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i9 != zero) {
4091*4bdc9457SAndroid Build Coastguard Worker       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
4092*4bdc9457SAndroid Build Coastguard Worker     }
4093*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i10 = input[10];
4094*4bdc9457SAndroid Build Coastguard Worker     assert(i10 != NULL);
4095*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i10 != zero) {
4096*4bdc9457SAndroid Build Coastguard Worker       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
4097*4bdc9457SAndroid Build Coastguard Worker     }
4098*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i11 = input[11];
4099*4bdc9457SAndroid Build Coastguard Worker     assert(i11 != NULL);
4100*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i11 != zero) {
4101*4bdc9457SAndroid Build Coastguard Worker       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
4102*4bdc9457SAndroid Build Coastguard Worker     }
4103*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i12 = input[12];
4104*4bdc9457SAndroid Build Coastguard Worker     assert(i12 != NULL);
4105*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i12 != zero) {
4106*4bdc9457SAndroid Build Coastguard Worker       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
4107*4bdc9457SAndroid Build Coastguard Worker     }
4108*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i13 = input[13];
4109*4bdc9457SAndroid Build Coastguard Worker     assert(i13 != NULL);
4110*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i13 != zero) {
4111*4bdc9457SAndroid Build Coastguard Worker       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
4112*4bdc9457SAndroid Build Coastguard Worker     }
4113*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i14 = input[14];
4114*4bdc9457SAndroid Build Coastguard Worker     assert(i14 != NULL);
4115*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i14 != zero) {
4116*4bdc9457SAndroid Build Coastguard Worker       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
4117*4bdc9457SAndroid Build Coastguard Worker     }
4118*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i15 = input[15];
4119*4bdc9457SAndroid Build Coastguard Worker     assert(i15 != NULL);
4120*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i15 != zero) {
4121*4bdc9457SAndroid Build Coastguard Worker       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
4122*4bdc9457SAndroid Build Coastguard Worker     }
4123*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i16 = input[16];
4124*4bdc9457SAndroid Build Coastguard Worker     assert(i16 != NULL);
4125*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i16 != zero) {
4126*4bdc9457SAndroid Build Coastguard Worker       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
4127*4bdc9457SAndroid Build Coastguard Worker     }
4128*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i17 = input[17];
4129*4bdc9457SAndroid Build Coastguard Worker     assert(i17 != NULL);
4130*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i17 != zero) {
4131*4bdc9457SAndroid Build Coastguard Worker       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
4132*4bdc9457SAndroid Build Coastguard Worker     }
4133*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i18 = input[18];
4134*4bdc9457SAndroid Build Coastguard Worker     assert(i18 != NULL);
4135*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i18 != zero) {
4136*4bdc9457SAndroid Build Coastguard Worker       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
4137*4bdc9457SAndroid Build Coastguard Worker     }
4138*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i19 = input[19];
4139*4bdc9457SAndroid Build Coastguard Worker     assert(i19 != NULL);
4140*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i19 != zero) {
4141*4bdc9457SAndroid Build Coastguard Worker       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
4142*4bdc9457SAndroid Build Coastguard Worker     }
4143*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i20 = input[20];
4144*4bdc9457SAndroid Build Coastguard Worker     assert(i20 != NULL);
4145*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i20 != zero) {
4146*4bdc9457SAndroid Build Coastguard Worker       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
4147*4bdc9457SAndroid Build Coastguard Worker     }
4148*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i21 = input[21];
4149*4bdc9457SAndroid Build Coastguard Worker     assert(i21 != NULL);
4150*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i21 != zero) {
4151*4bdc9457SAndroid Build Coastguard Worker       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
4152*4bdc9457SAndroid Build Coastguard Worker     }
4153*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i22 = input[22];
4154*4bdc9457SAndroid Build Coastguard Worker     assert(i22 != NULL);
4155*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i22 != zero) {
4156*4bdc9457SAndroid Build Coastguard Worker       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
4157*4bdc9457SAndroid Build Coastguard Worker     }
4158*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i23 = input[23];
4159*4bdc9457SAndroid Build Coastguard Worker     assert(i23 != NULL);
4160*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i23 != zero) {
4161*4bdc9457SAndroid Build Coastguard Worker       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
4162*4bdc9457SAndroid Build Coastguard Worker     }
4163*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i24 = input[24];
4164*4bdc9457SAndroid Build Coastguard Worker     assert(i24 != NULL);
4165*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i24 != zero) {
4166*4bdc9457SAndroid Build Coastguard Worker       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
4167*4bdc9457SAndroid Build Coastguard Worker     }
4168*4bdc9457SAndroid Build Coastguard Worker     input = (const int8_t**) ((uintptr_t) input + input_stride);
4169*4bdc9457SAndroid Build Coastguard Worker 
4170*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
4171*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
4172*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
4173*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4174*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
4175*4bdc9457SAndroid Build Coastguard Worker 
4176*4bdc9457SAndroid Build Coastguard Worker 
4177*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4178*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
4179*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
4180*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
4181*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
4182*4bdc9457SAndroid Build Coastguard Worker 
4183*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4184*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
4185*4bdc9457SAndroid Build Coastguard Worker 
4186*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4187*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
4188*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
4189*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
4190*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
4191*4bdc9457SAndroid Build Coastguard Worker 
4192*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4193*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
4194*4bdc9457SAndroid Build Coastguard Worker 
4195*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4196*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
4197*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
4198*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
4199*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
4200*4bdc9457SAndroid Build Coastguard Worker 
4201*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4202*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
4203*4bdc9457SAndroid Build Coastguard Worker 
4204*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4205*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
4206*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
4207*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
4208*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
4209*4bdc9457SAndroid Build Coastguard Worker 
4210*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4211*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
4212*4bdc9457SAndroid Build Coastguard Worker 
4213*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4214*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
4215*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
4216*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
4217*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
4218*4bdc9457SAndroid Build Coastguard Worker 
4219*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4220*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
4221*4bdc9457SAndroid Build Coastguard Worker 
4222*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4223*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
4224*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
4225*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
4226*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
4227*4bdc9457SAndroid Build Coastguard Worker 
4228*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4229*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
4230*4bdc9457SAndroid Build Coastguard Worker 
4231*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4232*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
4233*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
4234*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
4235*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
4236*4bdc9457SAndroid Build Coastguard Worker 
4237*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4238*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
4239*4bdc9457SAndroid Build Coastguard Worker 
4240*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4241*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
4242*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
4243*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
4244*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
4245*4bdc9457SAndroid Build Coastguard Worker 
4246*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4247*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
4248*4bdc9457SAndroid Build Coastguard Worker 
4249*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4250*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
4251*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
4252*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
4253*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
4254*4bdc9457SAndroid Build Coastguard Worker 
4255*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4256*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
4257*4bdc9457SAndroid Build Coastguard Worker 
4258*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
4259*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
4260*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
4261*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
4262*4bdc9457SAndroid Build Coastguard Worker       i9 += 16;
4263*4bdc9457SAndroid Build Coastguard Worker 
4264*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
4265*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
4266*4bdc9457SAndroid Build Coastguard Worker 
4267*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
4268*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
4269*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
4270*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
4271*4bdc9457SAndroid Build Coastguard Worker       i10 += 16;
4272*4bdc9457SAndroid Build Coastguard Worker 
4273*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
4274*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
4275*4bdc9457SAndroid Build Coastguard Worker 
4276*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
4277*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
4278*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
4279*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
4280*4bdc9457SAndroid Build Coastguard Worker       i11 += 16;
4281*4bdc9457SAndroid Build Coastguard Worker 
4282*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
4283*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
4284*4bdc9457SAndroid Build Coastguard Worker 
4285*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
4286*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
4287*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
4288*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
4289*4bdc9457SAndroid Build Coastguard Worker       i12 += 16;
4290*4bdc9457SAndroid Build Coastguard Worker 
4291*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
4292*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
4293*4bdc9457SAndroid Build Coastguard Worker 
4294*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
4295*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
4296*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
4297*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
4298*4bdc9457SAndroid Build Coastguard Worker       i13 += 16;
4299*4bdc9457SAndroid Build Coastguard Worker 
4300*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
4301*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
4302*4bdc9457SAndroid Build Coastguard Worker 
4303*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
4304*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
4305*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
4306*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
4307*4bdc9457SAndroid Build Coastguard Worker       i14 += 16;
4308*4bdc9457SAndroid Build Coastguard Worker 
4309*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
4310*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
4311*4bdc9457SAndroid Build Coastguard Worker 
4312*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
4313*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
4314*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
4315*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
4316*4bdc9457SAndroid Build Coastguard Worker       i15 += 16;
4317*4bdc9457SAndroid Build Coastguard Worker 
4318*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
4319*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
4320*4bdc9457SAndroid Build Coastguard Worker 
4321*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
4322*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
4323*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
4324*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
4325*4bdc9457SAndroid Build Coastguard Worker       i16 += 16;
4326*4bdc9457SAndroid Build Coastguard Worker 
4327*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
4328*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
4329*4bdc9457SAndroid Build Coastguard Worker 
4330*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
4331*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
4332*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
4333*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
4334*4bdc9457SAndroid Build Coastguard Worker       i17 += 16;
4335*4bdc9457SAndroid Build Coastguard Worker 
4336*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
4337*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
4338*4bdc9457SAndroid Build Coastguard Worker 
4339*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
4340*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
4341*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
4342*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
4343*4bdc9457SAndroid Build Coastguard Worker       i18 += 16;
4344*4bdc9457SAndroid Build Coastguard Worker 
4345*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
4346*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
4347*4bdc9457SAndroid Build Coastguard Worker 
4348*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
4349*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
4350*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
4351*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
4352*4bdc9457SAndroid Build Coastguard Worker       i19 += 16;
4353*4bdc9457SAndroid Build Coastguard Worker 
4354*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
4355*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
4356*4bdc9457SAndroid Build Coastguard Worker 
4357*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
4358*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
4359*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
4360*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
4361*4bdc9457SAndroid Build Coastguard Worker       i20 += 16;
4362*4bdc9457SAndroid Build Coastguard Worker 
4363*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
4364*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
4365*4bdc9457SAndroid Build Coastguard Worker 
4366*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
4367*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
4368*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
4369*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
4370*4bdc9457SAndroid Build Coastguard Worker       i21 += 16;
4371*4bdc9457SAndroid Build Coastguard Worker 
4372*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
4373*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
4374*4bdc9457SAndroid Build Coastguard Worker 
4375*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
4376*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
4377*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
4378*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
4379*4bdc9457SAndroid Build Coastguard Worker       i22 += 16;
4380*4bdc9457SAndroid Build Coastguard Worker 
4381*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
4382*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
4383*4bdc9457SAndroid Build Coastguard Worker 
4384*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
4385*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
4386*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
4387*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
4388*4bdc9457SAndroid Build Coastguard Worker       i23 += 16;
4389*4bdc9457SAndroid Build Coastguard Worker 
4390*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
4391*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
4392*4bdc9457SAndroid Build Coastguard Worker 
4393*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
4394*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
4395*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
4396*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
4397*4bdc9457SAndroid Build Coastguard Worker       i24 += 16;
4398*4bdc9457SAndroid Build Coastguard Worker 
4399*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
4400*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
4401*4bdc9457SAndroid Build Coastguard Worker 
4402*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
4403*4bdc9457SAndroid Build Coastguard Worker 
4404*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
4405*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
4406*4bdc9457SAndroid Build Coastguard Worker 
4407*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4408*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
4409*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
4410*4bdc9457SAndroid Build Coastguard Worker 
4411*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4412*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
4413*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
4414*4bdc9457SAndroid Build Coastguard Worker 
4415*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
4416*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
4417*4bdc9457SAndroid Build Coastguard Worker 
4418*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4419*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4420*4bdc9457SAndroid Build Coastguard Worker 
4421*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4422*4bdc9457SAndroid Build Coastguard Worker 
4423*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
4424*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4425*4bdc9457SAndroid Build Coastguard Worker 
4426*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4427*4bdc9457SAndroid Build Coastguard Worker       output += 16;
4428*4bdc9457SAndroid Build Coastguard Worker     }
4429*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
4430*4bdc9457SAndroid Build Coastguard Worker       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
4431*4bdc9457SAndroid Build Coastguard Worker       do {
4432*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4433*4bdc9457SAndroid Build Coastguard Worker 
4434*4bdc9457SAndroid Build Coastguard Worker 
4435*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4436*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
4437*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
4438*4bdc9457SAndroid Build Coastguard Worker 
4439*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4440*4bdc9457SAndroid Build Coastguard Worker 
4441*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4442*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
4443*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
4444*4bdc9457SAndroid Build Coastguard Worker 
4445*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4446*4bdc9457SAndroid Build Coastguard Worker 
4447*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4448*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
4449*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
4450*4bdc9457SAndroid Build Coastguard Worker 
4451*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4452*4bdc9457SAndroid Build Coastguard Worker 
4453*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4454*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
4455*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
4456*4bdc9457SAndroid Build Coastguard Worker 
4457*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4458*4bdc9457SAndroid Build Coastguard Worker 
4459*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4460*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
4461*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
4462*4bdc9457SAndroid Build Coastguard Worker 
4463*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4464*4bdc9457SAndroid Build Coastguard Worker 
4465*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4466*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
4467*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
4468*4bdc9457SAndroid Build Coastguard Worker 
4469*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4470*4bdc9457SAndroid Build Coastguard Worker 
4471*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4472*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
4473*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
4474*4bdc9457SAndroid Build Coastguard Worker 
4475*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4476*4bdc9457SAndroid Build Coastguard Worker 
4477*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4478*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
4479*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
4480*4bdc9457SAndroid Build Coastguard Worker 
4481*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4482*4bdc9457SAndroid Build Coastguard Worker 
4483*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4484*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
4485*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
4486*4bdc9457SAndroid Build Coastguard Worker 
4487*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4488*4bdc9457SAndroid Build Coastguard Worker 
4489*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i9));
4490*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk9x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144)));
4491*4bdc9457SAndroid Build Coastguard Worker         i9 += 8;
4492*4bdc9457SAndroid Build Coastguard Worker 
4493*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
4494*4bdc9457SAndroid Build Coastguard Worker 
4495*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i10));
4496*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk10x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160)));
4497*4bdc9457SAndroid Build Coastguard Worker         i10 += 8;
4498*4bdc9457SAndroid Build Coastguard Worker 
4499*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
4500*4bdc9457SAndroid Build Coastguard Worker 
4501*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i11));
4502*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk11x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176)));
4503*4bdc9457SAndroid Build Coastguard Worker         i11 += 8;
4504*4bdc9457SAndroid Build Coastguard Worker 
4505*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
4506*4bdc9457SAndroid Build Coastguard Worker 
4507*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i12));
4508*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk12x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192)));
4509*4bdc9457SAndroid Build Coastguard Worker         i12 += 8;
4510*4bdc9457SAndroid Build Coastguard Worker 
4511*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
4512*4bdc9457SAndroid Build Coastguard Worker 
4513*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i13));
4514*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk13x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208)));
4515*4bdc9457SAndroid Build Coastguard Worker         i13 += 8;
4516*4bdc9457SAndroid Build Coastguard Worker 
4517*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
4518*4bdc9457SAndroid Build Coastguard Worker 
4519*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i14));
4520*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk14x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224)));
4521*4bdc9457SAndroid Build Coastguard Worker         i14 += 8;
4522*4bdc9457SAndroid Build Coastguard Worker 
4523*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
4524*4bdc9457SAndroid Build Coastguard Worker 
4525*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i15));
4526*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk15x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240)));
4527*4bdc9457SAndroid Build Coastguard Worker         i15 += 8;
4528*4bdc9457SAndroid Build Coastguard Worker 
4529*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
4530*4bdc9457SAndroid Build Coastguard Worker 
4531*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i16));
4532*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk16x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256)));
4533*4bdc9457SAndroid Build Coastguard Worker         i16 += 8;
4534*4bdc9457SAndroid Build Coastguard Worker 
4535*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
4536*4bdc9457SAndroid Build Coastguard Worker 
4537*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i17));
4538*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk17x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272)));
4539*4bdc9457SAndroid Build Coastguard Worker         i17 += 8;
4540*4bdc9457SAndroid Build Coastguard Worker 
4541*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
4542*4bdc9457SAndroid Build Coastguard Worker 
4543*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i18));
4544*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk18x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288)));
4545*4bdc9457SAndroid Build Coastguard Worker         i18 += 8;
4546*4bdc9457SAndroid Build Coastguard Worker 
4547*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
4548*4bdc9457SAndroid Build Coastguard Worker 
4549*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i19));
4550*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk19x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304)));
4551*4bdc9457SAndroid Build Coastguard Worker         i19 += 8;
4552*4bdc9457SAndroid Build Coastguard Worker 
4553*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
4554*4bdc9457SAndroid Build Coastguard Worker 
4555*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i20));
4556*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk20x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320)));
4557*4bdc9457SAndroid Build Coastguard Worker         i20 += 8;
4558*4bdc9457SAndroid Build Coastguard Worker 
4559*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
4560*4bdc9457SAndroid Build Coastguard Worker 
4561*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i21));
4562*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk21x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336)));
4563*4bdc9457SAndroid Build Coastguard Worker         i21 += 8;
4564*4bdc9457SAndroid Build Coastguard Worker 
4565*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
4566*4bdc9457SAndroid Build Coastguard Worker 
4567*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i22));
4568*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk22x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352)));
4569*4bdc9457SAndroid Build Coastguard Worker         i22 += 8;
4570*4bdc9457SAndroid Build Coastguard Worker 
4571*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
4572*4bdc9457SAndroid Build Coastguard Worker 
4573*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i23));
4574*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk23x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368)));
4575*4bdc9457SAndroid Build Coastguard Worker         i23 += 8;
4576*4bdc9457SAndroid Build Coastguard Worker 
4577*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
4578*4bdc9457SAndroid Build Coastguard Worker 
4579*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i24));
4580*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk24x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384)));
4581*4bdc9457SAndroid Build Coastguard Worker         i24 += 8;
4582*4bdc9457SAndroid Build Coastguard Worker 
4583*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
4584*4bdc9457SAndroid Build Coastguard Worker 
4585*4bdc9457SAndroid Build Coastguard Worker         k += 8;
4586*4bdc9457SAndroid Build Coastguard Worker 
4587*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
4588*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
4589*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
4590*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
4591*4bdc9457SAndroid Build Coastguard Worker 
4592*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
4593*4bdc9457SAndroid Build Coastguard Worker 
4594*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
4595*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
4596*4bdc9457SAndroid Build Coastguard Worker 
4597*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4598*4bdc9457SAndroid Build Coastguard Worker 
4599*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
4600*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4601*4bdc9457SAndroid Build Coastguard Worker 
4602*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
4603*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4604*4bdc9457SAndroid Build Coastguard Worker           output += 8;
4605*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
4606*4bdc9457SAndroid Build Coastguard Worker         } else {
4607*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
4608*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4609*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4610*4bdc9457SAndroid Build Coastguard Worker             output += 4;
4611*4bdc9457SAndroid Build Coastguard Worker           }
4612*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
4613*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4614*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4615*4bdc9457SAndroid Build Coastguard Worker             output += 2;
4616*4bdc9457SAndroid Build Coastguard Worker           }
4617*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
4618*4bdc9457SAndroid Build Coastguard Worker             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4619*4bdc9457SAndroid Build Coastguard Worker             output += 1;
4620*4bdc9457SAndroid Build Coastguard Worker           }
4621*4bdc9457SAndroid Build Coastguard Worker           c = 0;
4622*4bdc9457SAndroid Build Coastguard Worker         }
4623*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
4624*4bdc9457SAndroid Build Coastguard Worker     }
4625*4bdc9457SAndroid Build Coastguard Worker 
4626*4bdc9457SAndroid Build Coastguard Worker     output = (int8_t*) ((uintptr_t) output + output_increment);
4627*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
4628*4bdc9457SAndroid Build Coastguard Worker }
4629*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(size_t channels,size_t output_width,const int8_t ** input,const void * weights,int8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4630*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
4631*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
4632*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
4633*4bdc9457SAndroid Build Coastguard Worker     const int8_t** input,
4634*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
4635*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
4636*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
4637*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
4638*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
4639*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
4640*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4641*4bdc9457SAndroid Build Coastguard Worker {
4642*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
4643*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
4644*4bdc9457SAndroid Build Coastguard Worker 
4645*4bdc9457SAndroid Build Coastguard Worker   do {
4646*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i0 = input[0];
4647*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
4648*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
4649*4bdc9457SAndroid Build Coastguard Worker       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
4650*4bdc9457SAndroid Build Coastguard Worker     }
4651*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i1 = input[1];
4652*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
4653*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
4654*4bdc9457SAndroid Build Coastguard Worker       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
4655*4bdc9457SAndroid Build Coastguard Worker     }
4656*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i2 = input[2];
4657*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
4658*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
4659*4bdc9457SAndroid Build Coastguard Worker       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
4660*4bdc9457SAndroid Build Coastguard Worker     }
4661*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i3 = input[3];
4662*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
4663*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
4664*4bdc9457SAndroid Build Coastguard Worker       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
4665*4bdc9457SAndroid Build Coastguard Worker     }
4666*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i4 = input[4];
4667*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
4668*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
4669*4bdc9457SAndroid Build Coastguard Worker       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
4670*4bdc9457SAndroid Build Coastguard Worker     }
4671*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i5 = input[5];
4672*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
4673*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
4674*4bdc9457SAndroid Build Coastguard Worker       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
4675*4bdc9457SAndroid Build Coastguard Worker     }
4676*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i6 = input[6];
4677*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
4678*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
4679*4bdc9457SAndroid Build Coastguard Worker       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
4680*4bdc9457SAndroid Build Coastguard Worker     }
4681*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i7 = input[7];
4682*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
4683*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
4684*4bdc9457SAndroid Build Coastguard Worker       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
4685*4bdc9457SAndroid Build Coastguard Worker     }
4686*4bdc9457SAndroid Build Coastguard Worker     const int8_t* i8 = input[8];
4687*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
4688*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
4689*4bdc9457SAndroid Build Coastguard Worker       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
4690*4bdc9457SAndroid Build Coastguard Worker     }
4691*4bdc9457SAndroid Build Coastguard Worker     input = (const int8_t**) ((uintptr_t) input + input_stride);
4692*4bdc9457SAndroid Build Coastguard Worker 
4693*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
4694*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
4695*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
4696*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4697*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
4698*4bdc9457SAndroid Build Coastguard Worker 
4699*4bdc9457SAndroid Build Coastguard Worker 
4700*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4701*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
4702*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
4703*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
4704*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
4705*4bdc9457SAndroid Build Coastguard Worker 
4706*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4707*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
4708*4bdc9457SAndroid Build Coastguard Worker 
4709*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4710*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
4711*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
4712*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
4713*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
4714*4bdc9457SAndroid Build Coastguard Worker 
4715*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4716*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
4717*4bdc9457SAndroid Build Coastguard Worker 
4718*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4719*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
4720*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
4721*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
4722*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
4723*4bdc9457SAndroid Build Coastguard Worker 
4724*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4725*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
4726*4bdc9457SAndroid Build Coastguard Worker 
4727*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4728*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
4729*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
4730*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
4731*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
4732*4bdc9457SAndroid Build Coastguard Worker 
4733*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4734*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
4735*4bdc9457SAndroid Build Coastguard Worker 
4736*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4737*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
4738*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
4739*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
4740*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
4741*4bdc9457SAndroid Build Coastguard Worker 
4742*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4743*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
4744*4bdc9457SAndroid Build Coastguard Worker 
4745*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4746*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
4747*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
4748*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
4749*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
4750*4bdc9457SAndroid Build Coastguard Worker 
4751*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4752*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
4753*4bdc9457SAndroid Build Coastguard Worker 
4754*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4755*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
4756*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
4757*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
4758*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
4759*4bdc9457SAndroid Build Coastguard Worker 
4760*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4761*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
4762*4bdc9457SAndroid Build Coastguard Worker 
4763*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4764*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
4765*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
4766*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
4767*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
4768*4bdc9457SAndroid Build Coastguard Worker 
4769*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4770*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
4771*4bdc9457SAndroid Build Coastguard Worker 
4772*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4773*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
4774*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
4775*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
4776*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
4777*4bdc9457SAndroid Build Coastguard Worker 
4778*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4779*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
4780*4bdc9457SAndroid Build Coastguard Worker 
4781*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
4782*4bdc9457SAndroid Build Coastguard Worker 
4783*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
4784*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
4785*4bdc9457SAndroid Build Coastguard Worker 
4786*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
4787*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
4788*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
4789*4bdc9457SAndroid Build Coastguard Worker 
4790*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
4791*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
4792*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
4793*4bdc9457SAndroid Build Coastguard Worker 
4794*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
4795*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
4796*4bdc9457SAndroid Build Coastguard Worker 
4797*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
4798*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
4799*4bdc9457SAndroid Build Coastguard Worker 
4800*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
4801*4bdc9457SAndroid Build Coastguard Worker 
4802*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
4803*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
4804*4bdc9457SAndroid Build Coastguard Worker 
4805*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
4806*4bdc9457SAndroid Build Coastguard Worker       output += 16;
4807*4bdc9457SAndroid Build Coastguard Worker     }
4808*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
4809*4bdc9457SAndroid Build Coastguard Worker       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
4810*4bdc9457SAndroid Build Coastguard Worker       do {
4811*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
4812*4bdc9457SAndroid Build Coastguard Worker 
4813*4bdc9457SAndroid Build Coastguard Worker 
4814*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i0));
4815*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) k));
4816*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
4817*4bdc9457SAndroid Build Coastguard Worker 
4818*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
4819*4bdc9457SAndroid Build Coastguard Worker 
4820*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i1));
4821*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16)));
4822*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
4823*4bdc9457SAndroid Build Coastguard Worker 
4824*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
4825*4bdc9457SAndroid Build Coastguard Worker 
4826*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i2));
4827*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32)));
4828*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
4829*4bdc9457SAndroid Build Coastguard Worker 
4830*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
4831*4bdc9457SAndroid Build Coastguard Worker 
4832*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i3));
4833*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48)));
4834*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
4835*4bdc9457SAndroid Build Coastguard Worker 
4836*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
4837*4bdc9457SAndroid Build Coastguard Worker 
4838*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i4));
4839*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64)));
4840*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
4841*4bdc9457SAndroid Build Coastguard Worker 
4842*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
4843*4bdc9457SAndroid Build Coastguard Worker 
4844*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i5));
4845*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80)));
4846*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
4847*4bdc9457SAndroid Build Coastguard Worker 
4848*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
4849*4bdc9457SAndroid Build Coastguard Worker 
4850*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i6));
4851*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96)));
4852*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
4853*4bdc9457SAndroid Build Coastguard Worker 
4854*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
4855*4bdc9457SAndroid Build Coastguard Worker 
4856*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i7));
4857*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112)));
4858*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
4859*4bdc9457SAndroid Build Coastguard Worker 
4860*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
4861*4bdc9457SAndroid Build Coastguard Worker 
4862*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i8));
4863*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128)));
4864*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
4865*4bdc9457SAndroid Build Coastguard Worker 
4866*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
4867*4bdc9457SAndroid Build Coastguard Worker 
4868*4bdc9457SAndroid Build Coastguard Worker         k += 8;
4869*4bdc9457SAndroid Build Coastguard Worker 
4870*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
4871*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
4872*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
4873*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
4874*4bdc9457SAndroid Build Coastguard Worker 
4875*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
4876*4bdc9457SAndroid Build Coastguard Worker 
4877*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
4878*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
4879*4bdc9457SAndroid Build Coastguard Worker 
4880*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4881*4bdc9457SAndroid Build Coastguard Worker 
4882*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
4883*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4884*4bdc9457SAndroid Build Coastguard Worker 
4885*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
4886*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4887*4bdc9457SAndroid Build Coastguard Worker           output += 8;
4888*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
4889*4bdc9457SAndroid Build Coastguard Worker         } else {
4890*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
4891*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4892*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4893*4bdc9457SAndroid Build Coastguard Worker             output += 4;
4894*4bdc9457SAndroid Build Coastguard Worker           }
4895*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
4896*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4897*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4898*4bdc9457SAndroid Build Coastguard Worker             output += 2;
4899*4bdc9457SAndroid Build Coastguard Worker           }
4900*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
4901*4bdc9457SAndroid Build Coastguard Worker             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4902*4bdc9457SAndroid Build Coastguard Worker             output += 1;
4903*4bdc9457SAndroid Build Coastguard Worker           }
4904*4bdc9457SAndroid Build Coastguard Worker           c = 0;
4905*4bdc9457SAndroid Build Coastguard Worker         }
4906*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
4907*4bdc9457SAndroid Build Coastguard Worker     }
4908*4bdc9457SAndroid Build Coastguard Worker 
4909*4bdc9457SAndroid Build Coastguard Worker     output = (int8_t*) ((uintptr_t) output + output_increment);
4910*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
4911*4bdc9457SAndroid Build Coastguard Worker }
4912*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_f32_vcvt_ukernel__avx2_x16(size_t n,const int8_t * x,float * y,const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])4913*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_f32_vcvt_ukernel__avx2_x16(
4914*4bdc9457SAndroid Build Coastguard Worker     size_t n,
4915*4bdc9457SAndroid Build Coastguard Worker     const int8_t* x,
4916*4bdc9457SAndroid Build Coastguard Worker     float* y,
4917*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4918*4bdc9457SAndroid Build Coastguard Worker {
4919*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
4920*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(int8_t) == 0);
4921*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
4922*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
4923*4bdc9457SAndroid Build Coastguard Worker 
4924*4bdc9457SAndroid Build Coastguard Worker   const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
4925*4bdc9457SAndroid Build Coastguard Worker   const __m256 vscale = _mm256_load_ps(params->avx.scale);
4926*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4927*4bdc9457SAndroid Build Coastguard Worker     __m256i vx01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
4928*4bdc9457SAndroid Build Coastguard Worker     __m256i vx89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
4929*4bdc9457SAndroid Build Coastguard Worker     x += 16;
4930*4bdc9457SAndroid Build Coastguard Worker 
4931*4bdc9457SAndroid Build Coastguard Worker     vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
4932*4bdc9457SAndroid Build Coastguard Worker     vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
4933*4bdc9457SAndroid Build Coastguard Worker 
4934*4bdc9457SAndroid Build Coastguard Worker     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
4935*4bdc9457SAndroid Build Coastguard Worker     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
4936*4bdc9457SAndroid Build Coastguard Worker 
4937*4bdc9457SAndroid Build Coastguard Worker     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
4938*4bdc9457SAndroid Build Coastguard Worker     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
4939*4bdc9457SAndroid Build Coastguard Worker 
4940*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy01234567);
4941*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4942*4bdc9457SAndroid Build Coastguard Worker     y += 16;
4943*4bdc9457SAndroid Build Coastguard Worker   }
4944*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
4945*4bdc9457SAndroid Build Coastguard Worker     __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
4946*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_add_epi32(vx, vminus_zero_point);
4947*4bdc9457SAndroid Build Coastguard Worker     x += 8;
4948*4bdc9457SAndroid Build Coastguard Worker 
4949*4bdc9457SAndroid Build Coastguard Worker     __m256 vy = _mm256_cvtepi32_ps(vx);
4950*4bdc9457SAndroid Build Coastguard Worker     vy = _mm256_mul_ps(vy, vscale);
4951*4bdc9457SAndroid Build Coastguard Worker 
4952*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy);
4953*4bdc9457SAndroid Build Coastguard Worker     y += 8;
4954*4bdc9457SAndroid Build Coastguard Worker   }
4955*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
4956*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(int8_t));
4957*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(int8_t));
4958*4bdc9457SAndroid Build Coastguard Worker 
4959*4bdc9457SAndroid Build Coastguard Worker     __m256i vx = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) x));
4960*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_add_epi32(vx, vminus_zero_point);
4961*4bdc9457SAndroid Build Coastguard Worker 
4962*4bdc9457SAndroid Build Coastguard Worker     __m256 vy = _mm256_cvtepi32_ps(vx);
4963*4bdc9457SAndroid Build Coastguard Worker     vy = _mm256_mul_ps(vy, vscale);
4964*4bdc9457SAndroid Build Coastguard Worker 
4965*4bdc9457SAndroid Build Coastguard Worker     __m128 vy_lo = _mm256_castps256_ps128(vy);
4966*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(int8_t))) {
4967*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(y, vy_lo);
4968*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm256_extractf128_ps(vy, 1);
4969*4bdc9457SAndroid Build Coastguard Worker       y += 4;
4970*4bdc9457SAndroid Build Coastguard Worker     }
4971*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(int8_t))) {
4972*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy_lo);
4973*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4974*4bdc9457SAndroid Build Coastguard Worker       y += 2;
4975*4bdc9457SAndroid Build Coastguard Worker     }
4976*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(int8_t))) {
4977*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy_lo);
4978*4bdc9457SAndroid Build Coastguard Worker     }
4979*4bdc9457SAndroid Build Coastguard Worker   }
4980*4bdc9457SAndroid Build Coastguard Worker }
4981*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])4982*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
4983*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
4984*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
4985*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
4986*4bdc9457SAndroid Build Coastguard Worker     const int8_t* restrict a,
4987*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
4988*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
4989*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
4990*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
4991*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
4992*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4993*4bdc9457SAndroid Build Coastguard Worker {
4994*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
4995*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
4996*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
4997*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
4998*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(int8_t) == 0);
4999*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5000*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5001*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5002*4bdc9457SAndroid Build Coastguard Worker 
5003*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
5004*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a0 = a;
5005*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
5006*4bdc9457SAndroid Build Coastguard Worker 
5007*4bdc9457SAndroid Build Coastguard Worker   do {
5008*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
5009*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
5010*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5011*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
5012*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
5013*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5014*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
5015*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
5016*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5017*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
5018*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
5019*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5020*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
5021*4bdc9457SAndroid Build Coastguard Worker 
5022*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
5023*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
5024*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5025*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
5026*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
5027*4bdc9457SAndroid Build Coastguard Worker 
5028*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5029*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
5030*4bdc9457SAndroid Build Coastguard Worker 
5031*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5032*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
5033*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
5034*4bdc9457SAndroid Build Coastguard Worker 
5035*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5036*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
5037*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
5038*4bdc9457SAndroid Build Coastguard Worker 
5039*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5040*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
5041*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
5042*4bdc9457SAndroid Build Coastguard Worker 
5043*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5044*4bdc9457SAndroid Build Coastguard Worker 
5045*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const int8_t*) w + 64);
5046*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(int8_t);
5047*4bdc9457SAndroid Build Coastguard Worker     }
5048*4bdc9457SAndroid Build Coastguard Worker 
5049*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5050*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5051*4bdc9457SAndroid Build Coastguard Worker 
5052*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5053*4bdc9457SAndroid Build Coastguard Worker 
5054*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5055*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5056*4bdc9457SAndroid Build Coastguard Worker 
5057*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5058*4bdc9457SAndroid Build Coastguard Worker 
5059*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5060*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5061*4bdc9457SAndroid Build Coastguard Worker 
5062*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5063*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5064*4bdc9457SAndroid Build Coastguard Worker 
5065*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5066*4bdc9457SAndroid Build Coastguard Worker 
5067*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5068*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
5069*4bdc9457SAndroid Build Coastguard Worker 
5070*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5071*4bdc9457SAndroid Build Coastguard Worker 
5072*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
5073*4bdc9457SAndroid Build Coastguard Worker 
5074*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5075*4bdc9457SAndroid Build Coastguard Worker 
5076*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
5077*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5078*4bdc9457SAndroid Build Coastguard Worker 
5079*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
5080*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
5081*4bdc9457SAndroid Build Coastguard Worker 
5082*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5083*4bdc9457SAndroid Build Coastguard Worker 
5084*4bdc9457SAndroid Build Coastguard Worker       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
5085*4bdc9457SAndroid Build Coastguard Worker 
5086*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5087*4bdc9457SAndroid Build Coastguard Worker     } else {
5088*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5089*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
5090*4bdc9457SAndroid Build Coastguard Worker 
5091*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5092*4bdc9457SAndroid Build Coastguard Worker 
5093*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
5094*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
5095*4bdc9457SAndroid Build Coastguard Worker       }
5096*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5097*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
5098*4bdc9457SAndroid Build Coastguard Worker 
5099*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5100*4bdc9457SAndroid Build Coastguard Worker 
5101*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
5102*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
5103*4bdc9457SAndroid Build Coastguard Worker       }
5104*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5105*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
5106*4bdc9457SAndroid Build Coastguard Worker       }
5107*4bdc9457SAndroid Build Coastguard Worker 
5108*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5109*4bdc9457SAndroid Build Coastguard Worker     }
5110*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5111*4bdc9457SAndroid Build Coastguard Worker }
5112*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5113*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
5114*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5115*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5116*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5117*4bdc9457SAndroid Build Coastguard Worker     const int8_t* restrict a,
5118*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
5119*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
5120*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
5121*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5122*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5123*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5124*4bdc9457SAndroid Build Coastguard Worker {
5125*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5126*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
5127*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5128*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5129*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(int8_t) == 0);
5130*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5131*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5132*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5133*4bdc9457SAndroid Build Coastguard Worker 
5134*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
5135*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a0 = a;
5136*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
5137*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
5138*4bdc9457SAndroid Build Coastguard Worker   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
5139*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
5140*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
5141*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
5142*4bdc9457SAndroid Build Coastguard Worker   }
5143*4bdc9457SAndroid Build Coastguard Worker   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
5144*4bdc9457SAndroid Build Coastguard Worker   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
5145*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
5146*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
5147*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
5148*4bdc9457SAndroid Build Coastguard Worker   }
5149*4bdc9457SAndroid Build Coastguard Worker 
5150*4bdc9457SAndroid Build Coastguard Worker   do {
5151*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
5152*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
5153*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5154*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
5155*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
5156*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5157*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
5158*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
5159*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5160*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
5161*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
5162*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5163*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
5164*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
5165*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
5166*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
5167*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
5168*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
5169*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
5170*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
5171*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
5172*4bdc9457SAndroid Build Coastguard Worker 
5173*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
5174*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
5175*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5176*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
5177*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
5178*4bdc9457SAndroid Build Coastguard Worker       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
5179*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
5180*4bdc9457SAndroid Build Coastguard Worker       a1 += 8;
5181*4bdc9457SAndroid Build Coastguard Worker       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
5182*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
5183*4bdc9457SAndroid Build Coastguard Worker       a2 += 8;
5184*4bdc9457SAndroid Build Coastguard Worker 
5185*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5186*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
5187*4bdc9457SAndroid Build Coastguard Worker 
5188*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5189*4bdc9457SAndroid Build Coastguard Worker       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
5190*4bdc9457SAndroid Build Coastguard Worker       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
5191*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
5192*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
5193*4bdc9457SAndroid Build Coastguard Worker 
5194*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5195*4bdc9457SAndroid Build Coastguard Worker       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
5196*4bdc9457SAndroid Build Coastguard Worker       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
5197*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
5198*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
5199*4bdc9457SAndroid Build Coastguard Worker 
5200*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5201*4bdc9457SAndroid Build Coastguard Worker       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
5202*4bdc9457SAndroid Build Coastguard Worker       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
5203*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
5204*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
5205*4bdc9457SAndroid Build Coastguard Worker 
5206*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5207*4bdc9457SAndroid Build Coastguard Worker       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
5208*4bdc9457SAndroid Build Coastguard Worker       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
5209*4bdc9457SAndroid Build Coastguard Worker 
5210*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const int8_t*) w + 64);
5211*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(int8_t);
5212*4bdc9457SAndroid Build Coastguard Worker     }
5213*4bdc9457SAndroid Build Coastguard Worker 
5214*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5215*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5216*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
5217*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
5218*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
5219*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
5220*4bdc9457SAndroid Build Coastguard Worker 
5221*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5222*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
5223*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
5224*4bdc9457SAndroid Build Coastguard Worker 
5225*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5226*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5227*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
5228*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
5229*4bdc9457SAndroid Build Coastguard Worker 
5230*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5231*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
5232*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
5233*4bdc9457SAndroid Build Coastguard Worker 
5234*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5235*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5236*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
5237*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
5238*4bdc9457SAndroid Build Coastguard Worker 
5239*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5240*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5241*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
5242*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
5243*4bdc9457SAndroid Build Coastguard Worker 
5244*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5245*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
5246*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
5247*4bdc9457SAndroid Build Coastguard Worker 
5248*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5249*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
5250*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
5251*4bdc9457SAndroid Build Coastguard Worker 
5252*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5253*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5254*4bdc9457SAndroid Build Coastguard Worker 
5255*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
5256*4bdc9457SAndroid Build Coastguard Worker 
5257*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5258*4bdc9457SAndroid Build Coastguard Worker 
5259*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
5260*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5261*4bdc9457SAndroid Build Coastguard Worker 
5262*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
5263*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
5264*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
5265*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
5266*4bdc9457SAndroid Build Coastguard Worker 
5267*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5268*4bdc9457SAndroid Build Coastguard Worker       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
5269*4bdc9457SAndroid Build Coastguard Worker       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
5270*4bdc9457SAndroid Build Coastguard Worker 
5271*4bdc9457SAndroid Build Coastguard Worker       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
5272*4bdc9457SAndroid Build Coastguard Worker       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
5273*4bdc9457SAndroid Build Coastguard Worker       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
5274*4bdc9457SAndroid Build Coastguard Worker 
5275*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5276*4bdc9457SAndroid Build Coastguard Worker     } else {
5277*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5278*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
5279*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
5280*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
5281*4bdc9457SAndroid Build Coastguard Worker 
5282*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5283*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
5284*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
5285*4bdc9457SAndroid Build Coastguard Worker 
5286*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
5287*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
5288*4bdc9457SAndroid Build Coastguard Worker       }
5289*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5290*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
5291*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
5292*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
5293*4bdc9457SAndroid Build Coastguard Worker 
5294*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5295*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
5296*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
5297*4bdc9457SAndroid Build Coastguard Worker 
5298*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
5299*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
5300*4bdc9457SAndroid Build Coastguard Worker       }
5301*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5302*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
5303*4bdc9457SAndroid Build Coastguard Worker         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
5304*4bdc9457SAndroid Build Coastguard Worker         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
5305*4bdc9457SAndroid Build Coastguard Worker       }
5306*4bdc9457SAndroid Build Coastguard Worker 
5307*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5308*4bdc9457SAndroid Build Coastguard Worker     }
5309*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5310*4bdc9457SAndroid Build Coastguard Worker }
5311*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5312*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
5313*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5314*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5315*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5316*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
5317*4bdc9457SAndroid Build Coastguard Worker     const int8_t** restrict a,
5318*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
5319*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
5320*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5321*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5322*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
5323*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
5324*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5325*4bdc9457SAndroid Build Coastguard Worker {
5326*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5327*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
5328*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5329*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5330*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
5331*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (1 * sizeof(void*)) == 0);
5332*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(int8_t) == 0);
5333*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5334*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5335*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5336*4bdc9457SAndroid Build Coastguard Worker 
5337*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
5338*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
5339*4bdc9457SAndroid Build Coastguard Worker 
5340*4bdc9457SAndroid Build Coastguard Worker   do {
5341*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
5342*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
5343*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5344*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
5345*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
5346*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5347*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
5348*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
5349*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5350*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
5351*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
5352*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5353*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
5354*4bdc9457SAndroid Build Coastguard Worker 
5355*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
5356*4bdc9457SAndroid Build Coastguard Worker     do {
5357*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a0 = a[0];
5358*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
5359*4bdc9457SAndroid Build Coastguard Worker         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
5360*4bdc9457SAndroid Build Coastguard Worker       }
5361*4bdc9457SAndroid Build Coastguard Worker       a += 1;
5362*4bdc9457SAndroid Build Coastguard Worker 
5363*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
5364*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
5365*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5366*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
5367*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
5368*4bdc9457SAndroid Build Coastguard Worker 
5369*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5370*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
5371*4bdc9457SAndroid Build Coastguard Worker 
5372*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5373*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
5374*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
5375*4bdc9457SAndroid Build Coastguard Worker 
5376*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5377*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
5378*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
5379*4bdc9457SAndroid Build Coastguard Worker 
5380*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5381*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
5382*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
5383*4bdc9457SAndroid Build Coastguard Worker 
5384*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5385*4bdc9457SAndroid Build Coastguard Worker 
5386*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int8_t*) w + 64);
5387*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(int8_t);
5388*4bdc9457SAndroid Build Coastguard Worker       }
5389*4bdc9457SAndroid Build Coastguard Worker       p -= 1 * sizeof(void*);
5390*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
5391*4bdc9457SAndroid Build Coastguard Worker 
5392*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5393*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5394*4bdc9457SAndroid Build Coastguard Worker 
5395*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5396*4bdc9457SAndroid Build Coastguard Worker 
5397*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5398*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5399*4bdc9457SAndroid Build Coastguard Worker 
5400*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5401*4bdc9457SAndroid Build Coastguard Worker 
5402*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5403*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5404*4bdc9457SAndroid Build Coastguard Worker 
5405*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5406*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5407*4bdc9457SAndroid Build Coastguard Worker 
5408*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5409*4bdc9457SAndroid Build Coastguard Worker 
5410*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5411*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
5412*4bdc9457SAndroid Build Coastguard Worker 
5413*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5414*4bdc9457SAndroid Build Coastguard Worker 
5415*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
5416*4bdc9457SAndroid Build Coastguard Worker 
5417*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5418*4bdc9457SAndroid Build Coastguard Worker 
5419*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
5420*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5421*4bdc9457SAndroid Build Coastguard Worker 
5422*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
5423*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
5424*4bdc9457SAndroid Build Coastguard Worker 
5425*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5426*4bdc9457SAndroid Build Coastguard Worker 
5427*4bdc9457SAndroid Build Coastguard Worker       a = (const int8_t**restrict) ((uintptr_t) a - ks);
5428*4bdc9457SAndroid Build Coastguard Worker 
5429*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5430*4bdc9457SAndroid Build Coastguard Worker     } else {
5431*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5432*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
5433*4bdc9457SAndroid Build Coastguard Worker 
5434*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5435*4bdc9457SAndroid Build Coastguard Worker 
5436*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
5437*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
5438*4bdc9457SAndroid Build Coastguard Worker       }
5439*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5440*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
5441*4bdc9457SAndroid Build Coastguard Worker 
5442*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5443*4bdc9457SAndroid Build Coastguard Worker 
5444*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
5445*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
5446*4bdc9457SAndroid Build Coastguard Worker       }
5447*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5448*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
5449*4bdc9457SAndroid Build Coastguard Worker       }
5450*4bdc9457SAndroid Build Coastguard Worker 
5451*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5452*4bdc9457SAndroid Build Coastguard Worker     }
5453*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5454*4bdc9457SAndroid Build Coastguard Worker }
5455*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5456*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
5457*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
5458*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
5459*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
5460*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
5461*4bdc9457SAndroid Build Coastguard Worker     const int8_t** restrict a,
5462*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
5463*4bdc9457SAndroid Build Coastguard Worker     int8_t* restrict c,
5464*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
5465*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
5466*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
5467*4bdc9457SAndroid Build Coastguard Worker     const int8_t* zero,
5468*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5469*4bdc9457SAndroid Build Coastguard Worker {
5470*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
5471*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
5472*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
5473*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
5474*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
5475*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (3 * sizeof(void*)) == 0);
5476*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(int8_t) == 0);
5477*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
5478*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
5479*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
5480*4bdc9457SAndroid Build Coastguard Worker 
5481*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
5482*4bdc9457SAndroid Build Coastguard Worker   int8_t* c0 = c;
5483*4bdc9457SAndroid Build Coastguard Worker   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
5484*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
5485*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
5486*4bdc9457SAndroid Build Coastguard Worker   }
5487*4bdc9457SAndroid Build Coastguard Worker   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
5488*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
5489*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
5490*4bdc9457SAndroid Build Coastguard Worker   }
5491*4bdc9457SAndroid Build Coastguard Worker 
5492*4bdc9457SAndroid Build Coastguard Worker   do {
5493*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
5494*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
5495*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
5496*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
5497*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
5498*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
5499*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
5500*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
5501*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
5502*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
5503*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
5504*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
5505*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
5506*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
5507*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
5508*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
5509*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
5510*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
5511*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
5512*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
5513*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
5514*4bdc9457SAndroid Build Coastguard Worker 
5515*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
5516*4bdc9457SAndroid Build Coastguard Worker     do {
5517*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a0 = a[0];
5518*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
5519*4bdc9457SAndroid Build Coastguard Worker         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
5520*4bdc9457SAndroid Build Coastguard Worker       }
5521*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a1 = a[1];
5522*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
5523*4bdc9457SAndroid Build Coastguard Worker         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
5524*4bdc9457SAndroid Build Coastguard Worker       }
5525*4bdc9457SAndroid Build Coastguard Worker       const int8_t* restrict a2 = a[2];
5526*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
5527*4bdc9457SAndroid Build Coastguard Worker         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
5528*4bdc9457SAndroid Build Coastguard Worker       }
5529*4bdc9457SAndroid Build Coastguard Worker       a += 3;
5530*4bdc9457SAndroid Build Coastguard Worker 
5531*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
5532*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
5533*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
5534*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
5535*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
5536*4bdc9457SAndroid Build Coastguard Worker         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
5537*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
5538*4bdc9457SAndroid Build Coastguard Worker         a1 += 8;
5539*4bdc9457SAndroid Build Coastguard Worker         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
5540*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
5541*4bdc9457SAndroid Build Coastguard Worker         a2 += 8;
5542*4bdc9457SAndroid Build Coastguard Worker 
5543*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
5544*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
5545*4bdc9457SAndroid Build Coastguard Worker 
5546*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
5547*4bdc9457SAndroid Build Coastguard Worker         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
5548*4bdc9457SAndroid Build Coastguard Worker         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
5549*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
5550*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
5551*4bdc9457SAndroid Build Coastguard Worker 
5552*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
5553*4bdc9457SAndroid Build Coastguard Worker         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
5554*4bdc9457SAndroid Build Coastguard Worker         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
5555*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
5556*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
5557*4bdc9457SAndroid Build Coastguard Worker 
5558*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
5559*4bdc9457SAndroid Build Coastguard Worker         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
5560*4bdc9457SAndroid Build Coastguard Worker         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
5561*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
5562*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
5563*4bdc9457SAndroid Build Coastguard Worker 
5564*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
5565*4bdc9457SAndroid Build Coastguard Worker         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
5566*4bdc9457SAndroid Build Coastguard Worker         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
5567*4bdc9457SAndroid Build Coastguard Worker 
5568*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int8_t*) w + 64);
5569*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(int8_t);
5570*4bdc9457SAndroid Build Coastguard Worker       }
5571*4bdc9457SAndroid Build Coastguard Worker       p -= 3 * sizeof(void*);
5572*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
5573*4bdc9457SAndroid Build Coastguard Worker 
5574*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
5575*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
5576*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
5577*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
5578*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
5579*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
5580*4bdc9457SAndroid Build Coastguard Worker 
5581*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
5582*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
5583*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
5584*4bdc9457SAndroid Build Coastguard Worker 
5585*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5586*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
5587*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
5588*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
5589*4bdc9457SAndroid Build Coastguard Worker 
5590*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
5591*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
5592*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
5593*4bdc9457SAndroid Build Coastguard Worker 
5594*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
5595*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
5596*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
5597*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
5598*4bdc9457SAndroid Build Coastguard Worker 
5599*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
5600*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
5601*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
5602*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
5603*4bdc9457SAndroid Build Coastguard Worker 
5604*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
5605*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
5606*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
5607*4bdc9457SAndroid Build Coastguard Worker 
5608*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
5609*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
5610*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
5611*4bdc9457SAndroid Build Coastguard Worker 
5612*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5613*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
5614*4bdc9457SAndroid Build Coastguard Worker 
5615*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
5616*4bdc9457SAndroid Build Coastguard Worker 
5617*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
5618*4bdc9457SAndroid Build Coastguard Worker 
5619*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
5620*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
5621*4bdc9457SAndroid Build Coastguard Worker 
5622*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
5623*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
5624*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
5625*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
5626*4bdc9457SAndroid Build Coastguard Worker 
5627*4bdc9457SAndroid Build Coastguard Worker       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
5628*4bdc9457SAndroid Build Coastguard Worker       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
5629*4bdc9457SAndroid Build Coastguard Worker       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5630*4bdc9457SAndroid Build Coastguard Worker 
5631*4bdc9457SAndroid Build Coastguard Worker       a = (const int8_t**restrict) ((uintptr_t) a - ks);
5632*4bdc9457SAndroid Build Coastguard Worker 
5633*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
5634*4bdc9457SAndroid Build Coastguard Worker     } else {
5635*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
5636*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
5637*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
5638*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
5639*4bdc9457SAndroid Build Coastguard Worker 
5640*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
5641*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
5642*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
5643*4bdc9457SAndroid Build Coastguard Worker 
5644*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
5645*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
5646*4bdc9457SAndroid Build Coastguard Worker       }
5647*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
5648*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
5649*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
5650*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
5651*4bdc9457SAndroid Build Coastguard Worker 
5652*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
5653*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
5654*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
5655*4bdc9457SAndroid Build Coastguard Worker 
5656*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
5657*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
5658*4bdc9457SAndroid Build Coastguard Worker       }
5659*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
5660*4bdc9457SAndroid Build Coastguard Worker         *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
5661*4bdc9457SAndroid Build Coastguard Worker         *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
5662*4bdc9457SAndroid Build Coastguard Worker         *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
5663*4bdc9457SAndroid Build Coastguard Worker       }
5664*4bdc9457SAndroid Build Coastguard Worker 
5665*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
5666*4bdc9457SAndroid Build Coastguard Worker     }
5667*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
5668*4bdc9457SAndroid Build Coastguard Worker }
5669*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(size_t n,const int8_t * input_a,const int8_t * input_b,int8_t * output,const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5670*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
5671*4bdc9457SAndroid Build Coastguard Worker     size_t n,
5672*4bdc9457SAndroid Build Coastguard Worker     const int8_t* input_a,
5673*4bdc9457SAndroid Build Coastguard Worker     const int8_t* input_b,
5674*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
5675*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5676*4bdc9457SAndroid Build Coastguard Worker {
5677*4bdc9457SAndroid Build Coastguard Worker   const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
5678*4bdc9457SAndroid Build Coastguard Worker   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
5679*4bdc9457SAndroid Build Coastguard Worker   const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
5680*4bdc9457SAndroid Build Coastguard Worker   const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
5681*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
5682*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
5683*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
5684*4bdc9457SAndroid Build Coastguard Worker 
5685*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5686*4bdc9457SAndroid Build Coastguard Worker     const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
5687*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
5688*4bdc9457SAndroid Build Coastguard Worker     const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5689*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
5690*4bdc9457SAndroid Build Coastguard Worker     input_a += 16;
5691*4bdc9457SAndroid Build Coastguard Worker     input_b += 16;
5692*4bdc9457SAndroid Build Coastguard Worker 
5693*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
5694*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
5695*4bdc9457SAndroid Build Coastguard Worker 
5696*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
5697*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
5698*4bdc9457SAndroid Build Coastguard Worker 
5699*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
5700*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
5701*4bdc9457SAndroid Build Coastguard Worker 
5702*4bdc9457SAndroid Build Coastguard Worker     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5703*4bdc9457SAndroid Build Coastguard Worker 
5704*4bdc9457SAndroid Build Coastguard Worker     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5705*4bdc9457SAndroid Build Coastguard Worker 
5706*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5707*4bdc9457SAndroid Build Coastguard Worker 
5708*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5709*4bdc9457SAndroid Build Coastguard Worker 
5710*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5711*4bdc9457SAndroid Build Coastguard Worker     output += 16;
5712*4bdc9457SAndroid Build Coastguard Worker   }
5713*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
5714*4bdc9457SAndroid Build Coastguard Worker     do {
5715*4bdc9457SAndroid Build Coastguard Worker       const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
5716*4bdc9457SAndroid Build Coastguard Worker       const __m256i vb01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
5717*4bdc9457SAndroid Build Coastguard Worker       input_a += 8;
5718*4bdc9457SAndroid Build Coastguard Worker       input_b += 8;
5719*4bdc9457SAndroid Build Coastguard Worker 
5720*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
5721*4bdc9457SAndroid Build Coastguard Worker 
5722*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
5723*4bdc9457SAndroid Build Coastguard Worker 
5724*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
5725*4bdc9457SAndroid Build Coastguard Worker 
5726*4bdc9457SAndroid Build Coastguard Worker       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
5727*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5728*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5729*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5730*4bdc9457SAndroid Build Coastguard Worker 
5731*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5732*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5733*4bdc9457SAndroid Build Coastguard Worker         output += 8;
5734*4bdc9457SAndroid Build Coastguard Worker         n -= 8 * sizeof(int8_t);
5735*4bdc9457SAndroid Build Coastguard Worker       } else {
5736*4bdc9457SAndroid Build Coastguard Worker         if (n & (4 * sizeof(int8_t))) {
5737*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si32(output, vout0123456701234567);
5738*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5739*4bdc9457SAndroid Build Coastguard Worker           output += 4;
5740*4bdc9457SAndroid Build Coastguard Worker         }
5741*4bdc9457SAndroid Build Coastguard Worker         if (n & (2 * sizeof(int8_t))) {
5742*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si16(output, vout0123456701234567);
5743*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5744*4bdc9457SAndroid Build Coastguard Worker           output += 2;
5745*4bdc9457SAndroid Build Coastguard Worker         }
5746*4bdc9457SAndroid Build Coastguard Worker         if (n & (1 * sizeof(int8_t))) {
5747*4bdc9457SAndroid Build Coastguard Worker           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5748*4bdc9457SAndroid Build Coastguard Worker         }
5749*4bdc9457SAndroid Build Coastguard Worker         n = 0;
5750*4bdc9457SAndroid Build Coastguard Worker       }
5751*4bdc9457SAndroid Build Coastguard Worker     } while (n != 0);
5752*4bdc9457SAndroid Build Coastguard Worker   }
5753*4bdc9457SAndroid Build Coastguard Worker }
5754*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(size_t n,const int8_t * input_a,const int8_t * input_b,int8_t * output,const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])5755*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
5756*4bdc9457SAndroid Build Coastguard Worker     size_t n,
5757*4bdc9457SAndroid Build Coastguard Worker     const int8_t* input_a,
5758*4bdc9457SAndroid Build Coastguard Worker     const int8_t* input_b,
5759*4bdc9457SAndroid Build Coastguard Worker     int8_t* output,
5760*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5761*4bdc9457SAndroid Build Coastguard Worker {
5762*4bdc9457SAndroid Build Coastguard Worker   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
5763*4bdc9457SAndroid Build Coastguard Worker   const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
5764*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
5765*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
5766*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
5767*4bdc9457SAndroid Build Coastguard Worker 
5768*4bdc9457SAndroid Build Coastguard Worker   const __m256i vbias = _mm256_add_epi32(
5769*4bdc9457SAndroid Build Coastguard Worker     _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
5770*4bdc9457SAndroid Build Coastguard Worker     _mm256_load_si256((const __m256i*) params->avx2.bias));
5771*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5772*4bdc9457SAndroid Build Coastguard Worker     const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
5773*4bdc9457SAndroid Build Coastguard Worker     const __m256i va89ABCDEF = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5774*4bdc9457SAndroid Build Coastguard Worker     input_a += 16;
5775*4bdc9457SAndroid Build Coastguard Worker 
5776*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
5777*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
5778*4bdc9457SAndroid Build Coastguard Worker 
5779*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
5780*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
5781*4bdc9457SAndroid Build Coastguard Worker 
5782*4bdc9457SAndroid Build Coastguard Worker     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
5783*4bdc9457SAndroid Build Coastguard Worker 
5784*4bdc9457SAndroid Build Coastguard Worker     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
5785*4bdc9457SAndroid Build Coastguard Worker 
5786*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5787*4bdc9457SAndroid Build Coastguard Worker 
5788*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5789*4bdc9457SAndroid Build Coastguard Worker 
5790*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5791*4bdc9457SAndroid Build Coastguard Worker     output += 16;
5792*4bdc9457SAndroid Build Coastguard Worker   }
5793*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
5794*4bdc9457SAndroid Build Coastguard Worker     do {
5795*4bdc9457SAndroid Build Coastguard Worker       const __m256i va01234567 = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
5796*4bdc9457SAndroid Build Coastguard Worker       input_a += 8;
5797*4bdc9457SAndroid Build Coastguard Worker 
5798*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
5799*4bdc9457SAndroid Build Coastguard Worker 
5800*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
5801*4bdc9457SAndroid Build Coastguard Worker 
5802*4bdc9457SAndroid Build Coastguard Worker       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
5803*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5804*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5805*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5806*4bdc9457SAndroid Build Coastguard Worker 
5807*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5808*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5809*4bdc9457SAndroid Build Coastguard Worker         output += 8;
5810*4bdc9457SAndroid Build Coastguard Worker         n -= 8 * sizeof(int8_t);
5811*4bdc9457SAndroid Build Coastguard Worker       } else {
5812*4bdc9457SAndroid Build Coastguard Worker         if (n & (4 * sizeof(int8_t))) {
5813*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si32(output, vout0123456701234567);
5814*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5815*4bdc9457SAndroid Build Coastguard Worker           output += 4;
5816*4bdc9457SAndroid Build Coastguard Worker         }
5817*4bdc9457SAndroid Build Coastguard Worker         if (n & (2 * sizeof(int8_t))) {
5818*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si16(output, vout0123456701234567);
5819*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5820*4bdc9457SAndroid Build Coastguard Worker           output += 2;
5821*4bdc9457SAndroid Build Coastguard Worker         }
5822*4bdc9457SAndroid Build Coastguard Worker         if (n & (1 * sizeof(int8_t))) {
5823*4bdc9457SAndroid Build Coastguard Worker           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5824*4bdc9457SAndroid Build Coastguard Worker         }
5825*4bdc9457SAndroid Build Coastguard Worker         n = 0;
5826*4bdc9457SAndroid Build Coastguard Worker       }
5827*4bdc9457SAndroid Build Coastguard Worker     } while (n != 0);
5828*4bdc9457SAndroid Build Coastguard Worker   }
5829*4bdc9457SAndroid Build Coastguard Worker }
5830*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_vcvt_ukernel__avx2_x32(size_t n,const int8_t * x,int8_t * y,const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])5831*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vcvt_ukernel__avx2_x32(
5832*4bdc9457SAndroid Build Coastguard Worker     size_t n,
5833*4bdc9457SAndroid Build Coastguard Worker     const int8_t* x,
5834*4bdc9457SAndroid Build Coastguard Worker     int8_t* y,
5835*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5836*4bdc9457SAndroid Build Coastguard Worker {
5837*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
5838*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(int8_t) == 0);
5839*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
5840*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
5841*4bdc9457SAndroid Build Coastguard Worker 
5842*4bdc9457SAndroid Build Coastguard Worker   const __m256i vinput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.input_zero_point);
5843*4bdc9457SAndroid Build Coastguard Worker   const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
5844*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
5845*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5846*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5847*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (x + 16)));
5848*4bdc9457SAndroid Build Coastguard Worker     x += 32;
5849*4bdc9457SAndroid Build Coastguard Worker 
5850*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_sub_epi16(vinput_zero_point, vacc0);
5851*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_sub_epi16(vinput_zero_point, vacc1);
5852*4bdc9457SAndroid Build Coastguard Worker 
5853*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_slli_epi16(vacc0, 7);
5854*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_slli_epi16(vacc1, 7);
5855*4bdc9457SAndroid Build Coastguard Worker 
5856*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_mulhrs_epi16(vacc0, vmultiplier);
5857*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_mulhrs_epi16(vacc1, vmultiplier);
5858*4bdc9457SAndroid Build Coastguard Worker 
5859*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_adds_epi16(vacc0, voutput_zero_point);
5860*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_adds_epi16(vacc1, voutput_zero_point);
5861*4bdc9457SAndroid Build Coastguard Worker 
5862*4bdc9457SAndroid Build Coastguard Worker     __m256i vy0 = _mm256_packs_epi16(vacc0, vacc1);
5863*4bdc9457SAndroid Build Coastguard Worker 
5864*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_permute4x64_epi64(vy0, _MM_SHUFFLE(3, 1, 2, 0));
5865*4bdc9457SAndroid Build Coastguard Worker 
5866*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy0);
5867*4bdc9457SAndroid Build Coastguard Worker     y += 32;
5868*4bdc9457SAndroid Build Coastguard Worker   }
5869*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5870*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5871*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
5872*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
5873*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
5874*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
5875*4bdc9457SAndroid Build Coastguard Worker     x += 16;
5876*4bdc9457SAndroid Build Coastguard Worker 
5877*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
5878*4bdc9457SAndroid Build Coastguard Worker     const __m128i vy = _mm_packs_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
5879*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) y, vy);
5880*4bdc9457SAndroid Build Coastguard Worker     y += 16;
5881*4bdc9457SAndroid Build Coastguard Worker   }
5882*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
5883*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(int8_t));
5884*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 15 * sizeof(int8_t));
5885*4bdc9457SAndroid Build Coastguard Worker 
5886*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5887*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
5888*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
5889*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
5890*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
5891*4bdc9457SAndroid Build Coastguard Worker 
5892*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
5893*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
5894*4bdc9457SAndroid Build Coastguard Worker     if (n & (8 * sizeof(int8_t))) {
5895*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) y, vy);
5896*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_unpackhi_epi64(vy, vy);
5897*4bdc9457SAndroid Build Coastguard Worker       y += 8;
5898*4bdc9457SAndroid Build Coastguard Worker     }
5899*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(int8_t))) {
5900*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
5901*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
5902*4bdc9457SAndroid Build Coastguard Worker       y += 4;
5903*4bdc9457SAndroid Build Coastguard Worker     }
5904*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(int8_t))) {
5905*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
5906*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
5907*4bdc9457SAndroid Build Coastguard Worker       y += 2;
5908*4bdc9457SAndroid Build Coastguard Worker     }
5909*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(int8_t))) {
5910*4bdc9457SAndroid Build Coastguard Worker       *y = (int8_t) _mm_extract_epi8(vy, 0);
5911*4bdc9457SAndroid Build Coastguard Worker     }
5912*4bdc9457SAndroid Build Coastguard Worker   }
5913*4bdc9457SAndroid Build Coastguard Worker }
5914*4bdc9457SAndroid Build Coastguard Worker 
xnn_qs8_vlrelu_ukernel__avx2_x32(size_t n,const int8_t * x,int8_t * y,const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])5915*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vlrelu_ukernel__avx2_x32(
5916*4bdc9457SAndroid Build Coastguard Worker     size_t n,
5917*4bdc9457SAndroid Build Coastguard Worker     const int8_t* x,
5918*4bdc9457SAndroid Build Coastguard Worker     int8_t* y,
5919*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5920*4bdc9457SAndroid Build Coastguard Worker {
5921*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
5922*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(int8_t) == 0);
5923*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
5924*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
5925*4bdc9457SAndroid Build Coastguard Worker 
5926*4bdc9457SAndroid Build Coastguard Worker   const __m256i vinput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.input_zero_point);
5927*4bdc9457SAndroid Build Coastguard Worker   const __m256i vpositive_multiplier = _mm256_load_si256((const __m256i*) params->avx2.positive_multiplier);
5928*4bdc9457SAndroid Build Coastguard Worker   const __m256i vnegative_multiplier = _mm256_load_si256((const __m256i*) params->avx2.negative_multiplier);
5929*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
5930*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5931*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5932*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (x + 16)));
5933*4bdc9457SAndroid Build Coastguard Worker     x += 32;
5934*4bdc9457SAndroid Build Coastguard Worker 
5935*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier0 = _mm256_cmpgt_epi16(vacc0, vinput_zero_point);
5936*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_sub_epi16(vinput_zero_point, vacc0);
5937*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier1 = _mm256_cmpgt_epi16(vacc1, vinput_zero_point);
5938*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_sub_epi16(vinput_zero_point, vacc1);
5939*4bdc9457SAndroid Build Coastguard Worker 
5940*4bdc9457SAndroid Build Coastguard Worker     vmultiplier0 = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
5941*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_slli_epi16(vacc0, 7);
5942*4bdc9457SAndroid Build Coastguard Worker     vmultiplier1 = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
5943*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_slli_epi16(vacc1, 7);
5944*4bdc9457SAndroid Build Coastguard Worker 
5945*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_mulhrs_epi16(vacc0, vmultiplier0);
5946*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_mulhrs_epi16(vacc1, vmultiplier1);
5947*4bdc9457SAndroid Build Coastguard Worker 
5948*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_adds_epi16(vacc0, voutput_zero_point);
5949*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_adds_epi16(vacc1, voutput_zero_point);
5950*4bdc9457SAndroid Build Coastguard Worker 
5951*4bdc9457SAndroid Build Coastguard Worker     __m256i vy0 = _mm256_packs_epi16(vacc0, vacc1);
5952*4bdc9457SAndroid Build Coastguard Worker 
5953*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_permute4x64_epi64(vy0, _MM_SHUFFLE(3, 1, 2, 0));
5954*4bdc9457SAndroid Build Coastguard Worker 
5955*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy0);
5956*4bdc9457SAndroid Build Coastguard Worker     y += 32;
5957*4bdc9457SAndroid Build Coastguard Worker   }
5958*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5959*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5960*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier = _mm256_cmpgt_epi16(vacc, vinput_zero_point);
5961*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
5962*4bdc9457SAndroid Build Coastguard Worker     vmultiplier = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
5963*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
5964*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
5965*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
5966*4bdc9457SAndroid Build Coastguard Worker     x += 16;
5967*4bdc9457SAndroid Build Coastguard Worker 
5968*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
5969*4bdc9457SAndroid Build Coastguard Worker     const __m128i vy = _mm_packs_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
5970*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) y, vy);
5971*4bdc9457SAndroid Build Coastguard Worker     y += 16;
5972*4bdc9457SAndroid Build Coastguard Worker   }
5973*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
5974*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(int8_t));
5975*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 15 * sizeof(int8_t));
5976*4bdc9457SAndroid Build Coastguard Worker 
5977*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) x));
5978*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier = _mm256_cmpgt_epi16(vacc, vinput_zero_point);
5979*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
5980*4bdc9457SAndroid Build Coastguard Worker     vmultiplier = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
5981*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
5982*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
5983*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
5984*4bdc9457SAndroid Build Coastguard Worker 
5985*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
5986*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packs_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
5987*4bdc9457SAndroid Build Coastguard Worker     if (n & (8 * sizeof(int8_t))) {
5988*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) y, vy);
5989*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_unpackhi_epi64(vy, vy);
5990*4bdc9457SAndroid Build Coastguard Worker       y += 8;
5991*4bdc9457SAndroid Build Coastguard Worker     }
5992*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(int8_t))) {
5993*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
5994*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
5995*4bdc9457SAndroid Build Coastguard Worker       y += 4;
5996*4bdc9457SAndroid Build Coastguard Worker     }
5997*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(int8_t))) {
5998*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
5999*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
6000*4bdc9457SAndroid Build Coastguard Worker       y += 2;
6001*4bdc9457SAndroid Build Coastguard Worker     }
6002*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(int8_t))) {
6003*4bdc9457SAndroid Build Coastguard Worker       *y = (int8_t) _mm_extract_epi8(vy, 0);
6004*4bdc9457SAndroid Build Coastguard Worker     }
6005*4bdc9457SAndroid Build Coastguard Worker   }
6006*4bdc9457SAndroid Build Coastguard Worker }
6007*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6008*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32(
6009*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
6010*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
6011*4bdc9457SAndroid Build Coastguard Worker     const uint8_t** input,
6012*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
6013*4bdc9457SAndroid Build Coastguard Worker     uint8_t* output,
6014*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
6015*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
6016*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
6017*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* zero,
6018*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6019*4bdc9457SAndroid Build Coastguard Worker {
6020*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
6021*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
6022*4bdc9457SAndroid Build Coastguard Worker 
6023*4bdc9457SAndroid Build Coastguard Worker   const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
6024*4bdc9457SAndroid Build Coastguard Worker   do {
6025*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i0 = input[0];
6026*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
6027*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
6028*4bdc9457SAndroid Build Coastguard Worker       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
6029*4bdc9457SAndroid Build Coastguard Worker     }
6030*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i1 = input[1];
6031*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
6032*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
6033*4bdc9457SAndroid Build Coastguard Worker       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
6034*4bdc9457SAndroid Build Coastguard Worker     }
6035*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i2 = input[2];
6036*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
6037*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
6038*4bdc9457SAndroid Build Coastguard Worker       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
6039*4bdc9457SAndroid Build Coastguard Worker     }
6040*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i3 = input[3];
6041*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
6042*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
6043*4bdc9457SAndroid Build Coastguard Worker       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
6044*4bdc9457SAndroid Build Coastguard Worker     }
6045*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i4 = input[4];
6046*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
6047*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
6048*4bdc9457SAndroid Build Coastguard Worker       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
6049*4bdc9457SAndroid Build Coastguard Worker     }
6050*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i5 = input[5];
6051*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
6052*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
6053*4bdc9457SAndroid Build Coastguard Worker       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
6054*4bdc9457SAndroid Build Coastguard Worker     }
6055*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i6 = input[6];
6056*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
6057*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
6058*4bdc9457SAndroid Build Coastguard Worker       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
6059*4bdc9457SAndroid Build Coastguard Worker     }
6060*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i7 = input[7];
6061*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
6062*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
6063*4bdc9457SAndroid Build Coastguard Worker       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
6064*4bdc9457SAndroid Build Coastguard Worker     }
6065*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i8 = input[8];
6066*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
6067*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
6068*4bdc9457SAndroid Build Coastguard Worker       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
6069*4bdc9457SAndroid Build Coastguard Worker     }
6070*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i9 = input[9];
6071*4bdc9457SAndroid Build Coastguard Worker     assert(i9 != NULL);
6072*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i9 != zero) {
6073*4bdc9457SAndroid Build Coastguard Worker       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
6074*4bdc9457SAndroid Build Coastguard Worker     }
6075*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i10 = input[10];
6076*4bdc9457SAndroid Build Coastguard Worker     assert(i10 != NULL);
6077*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i10 != zero) {
6078*4bdc9457SAndroid Build Coastguard Worker       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
6079*4bdc9457SAndroid Build Coastguard Worker     }
6080*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i11 = input[11];
6081*4bdc9457SAndroid Build Coastguard Worker     assert(i11 != NULL);
6082*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i11 != zero) {
6083*4bdc9457SAndroid Build Coastguard Worker       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
6084*4bdc9457SAndroid Build Coastguard Worker     }
6085*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i12 = input[12];
6086*4bdc9457SAndroid Build Coastguard Worker     assert(i12 != NULL);
6087*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i12 != zero) {
6088*4bdc9457SAndroid Build Coastguard Worker       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
6089*4bdc9457SAndroid Build Coastguard Worker     }
6090*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i13 = input[13];
6091*4bdc9457SAndroid Build Coastguard Worker     assert(i13 != NULL);
6092*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i13 != zero) {
6093*4bdc9457SAndroid Build Coastguard Worker       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
6094*4bdc9457SAndroid Build Coastguard Worker     }
6095*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i14 = input[14];
6096*4bdc9457SAndroid Build Coastguard Worker     assert(i14 != NULL);
6097*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i14 != zero) {
6098*4bdc9457SAndroid Build Coastguard Worker       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
6099*4bdc9457SAndroid Build Coastguard Worker     }
6100*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i15 = input[15];
6101*4bdc9457SAndroid Build Coastguard Worker     assert(i15 != NULL);
6102*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i15 != zero) {
6103*4bdc9457SAndroid Build Coastguard Worker       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
6104*4bdc9457SAndroid Build Coastguard Worker     }
6105*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i16 = input[16];
6106*4bdc9457SAndroid Build Coastguard Worker     assert(i16 != NULL);
6107*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i16 != zero) {
6108*4bdc9457SAndroid Build Coastguard Worker       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
6109*4bdc9457SAndroid Build Coastguard Worker     }
6110*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i17 = input[17];
6111*4bdc9457SAndroid Build Coastguard Worker     assert(i17 != NULL);
6112*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i17 != zero) {
6113*4bdc9457SAndroid Build Coastguard Worker       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
6114*4bdc9457SAndroid Build Coastguard Worker     }
6115*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i18 = input[18];
6116*4bdc9457SAndroid Build Coastguard Worker     assert(i18 != NULL);
6117*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i18 != zero) {
6118*4bdc9457SAndroid Build Coastguard Worker       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
6119*4bdc9457SAndroid Build Coastguard Worker     }
6120*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i19 = input[19];
6121*4bdc9457SAndroid Build Coastguard Worker     assert(i19 != NULL);
6122*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i19 != zero) {
6123*4bdc9457SAndroid Build Coastguard Worker       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
6124*4bdc9457SAndroid Build Coastguard Worker     }
6125*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i20 = input[20];
6126*4bdc9457SAndroid Build Coastguard Worker     assert(i20 != NULL);
6127*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i20 != zero) {
6128*4bdc9457SAndroid Build Coastguard Worker       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
6129*4bdc9457SAndroid Build Coastguard Worker     }
6130*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i21 = input[21];
6131*4bdc9457SAndroid Build Coastguard Worker     assert(i21 != NULL);
6132*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i21 != zero) {
6133*4bdc9457SAndroid Build Coastguard Worker       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
6134*4bdc9457SAndroid Build Coastguard Worker     }
6135*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i22 = input[22];
6136*4bdc9457SAndroid Build Coastguard Worker     assert(i22 != NULL);
6137*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i22 != zero) {
6138*4bdc9457SAndroid Build Coastguard Worker       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
6139*4bdc9457SAndroid Build Coastguard Worker     }
6140*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i23 = input[23];
6141*4bdc9457SAndroid Build Coastguard Worker     assert(i23 != NULL);
6142*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i23 != zero) {
6143*4bdc9457SAndroid Build Coastguard Worker       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
6144*4bdc9457SAndroid Build Coastguard Worker     }
6145*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i24 = input[24];
6146*4bdc9457SAndroid Build Coastguard Worker     assert(i24 != NULL);
6147*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i24 != zero) {
6148*4bdc9457SAndroid Build Coastguard Worker       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
6149*4bdc9457SAndroid Build Coastguard Worker     }
6150*4bdc9457SAndroid Build Coastguard Worker     input = (const uint8_t**) ((uintptr_t) input + input_stride);
6151*4bdc9457SAndroid Build Coastguard Worker 
6152*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
6153*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
6154*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
6155*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
6156*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
6157*4bdc9457SAndroid Build Coastguard Worker 
6158*4bdc9457SAndroid Build Coastguard Worker 
6159*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
6160*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
6161*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
6162*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
6163*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
6164*4bdc9457SAndroid Build Coastguard Worker 
6165*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
6166*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
6167*4bdc9457SAndroid Build Coastguard Worker 
6168*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
6169*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
6170*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
6171*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
6172*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
6173*4bdc9457SAndroid Build Coastguard Worker 
6174*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
6175*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
6176*4bdc9457SAndroid Build Coastguard Worker 
6177*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
6178*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
6179*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
6180*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
6181*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
6182*4bdc9457SAndroid Build Coastguard Worker 
6183*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
6184*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
6185*4bdc9457SAndroid Build Coastguard Worker 
6186*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
6187*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
6188*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
6189*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
6190*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
6191*4bdc9457SAndroid Build Coastguard Worker 
6192*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
6193*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
6194*4bdc9457SAndroid Build Coastguard Worker 
6195*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
6196*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
6197*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
6198*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
6199*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
6200*4bdc9457SAndroid Build Coastguard Worker 
6201*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
6202*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
6203*4bdc9457SAndroid Build Coastguard Worker 
6204*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
6205*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
6206*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
6207*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
6208*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
6209*4bdc9457SAndroid Build Coastguard Worker 
6210*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
6211*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
6212*4bdc9457SAndroid Build Coastguard Worker 
6213*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
6214*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
6215*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
6216*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
6217*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
6218*4bdc9457SAndroid Build Coastguard Worker 
6219*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
6220*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
6221*4bdc9457SAndroid Build Coastguard Worker 
6222*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
6223*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
6224*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
6225*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
6226*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
6227*4bdc9457SAndroid Build Coastguard Worker 
6228*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
6229*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
6230*4bdc9457SAndroid Build Coastguard Worker 
6231*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
6232*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
6233*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
6234*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
6235*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
6236*4bdc9457SAndroid Build Coastguard Worker 
6237*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
6238*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
6239*4bdc9457SAndroid Build Coastguard Worker 
6240*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
6241*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
6242*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi9x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i9 + 8)));
6243*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk9x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
6244*4bdc9457SAndroid Build Coastguard Worker       i9 += 16;
6245*4bdc9457SAndroid Build Coastguard Worker 
6246*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
6247*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi9x89ABCDEF, vk9x89ABCDEF));
6248*4bdc9457SAndroid Build Coastguard Worker 
6249*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
6250*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
6251*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi10x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i10 + 8)));
6252*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk10x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
6253*4bdc9457SAndroid Build Coastguard Worker       i10 += 16;
6254*4bdc9457SAndroid Build Coastguard Worker 
6255*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
6256*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi10x89ABCDEF, vk10x89ABCDEF));
6257*4bdc9457SAndroid Build Coastguard Worker 
6258*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
6259*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
6260*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi11x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i11 + 8)));
6261*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk11x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
6262*4bdc9457SAndroid Build Coastguard Worker       i11 += 16;
6263*4bdc9457SAndroid Build Coastguard Worker 
6264*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
6265*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi11x89ABCDEF, vk11x89ABCDEF));
6266*4bdc9457SAndroid Build Coastguard Worker 
6267*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
6268*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
6269*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi12x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i12 + 8)));
6270*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk12x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
6271*4bdc9457SAndroid Build Coastguard Worker       i12 += 16;
6272*4bdc9457SAndroid Build Coastguard Worker 
6273*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
6274*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi12x89ABCDEF, vk12x89ABCDEF));
6275*4bdc9457SAndroid Build Coastguard Worker 
6276*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
6277*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
6278*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi13x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i13 + 8)));
6279*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk13x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
6280*4bdc9457SAndroid Build Coastguard Worker       i13 += 16;
6281*4bdc9457SAndroid Build Coastguard Worker 
6282*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
6283*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi13x89ABCDEF, vk13x89ABCDEF));
6284*4bdc9457SAndroid Build Coastguard Worker 
6285*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
6286*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
6287*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi14x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i14 + 8)));
6288*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk14x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
6289*4bdc9457SAndroid Build Coastguard Worker       i14 += 16;
6290*4bdc9457SAndroid Build Coastguard Worker 
6291*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
6292*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi14x89ABCDEF, vk14x89ABCDEF));
6293*4bdc9457SAndroid Build Coastguard Worker 
6294*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
6295*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
6296*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi15x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i15 + 8)));
6297*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk15x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
6298*4bdc9457SAndroid Build Coastguard Worker       i15 += 16;
6299*4bdc9457SAndroid Build Coastguard Worker 
6300*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
6301*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi15x89ABCDEF, vk15x89ABCDEF));
6302*4bdc9457SAndroid Build Coastguard Worker 
6303*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
6304*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
6305*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi16x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i16 + 8)));
6306*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk16x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
6307*4bdc9457SAndroid Build Coastguard Worker       i16 += 16;
6308*4bdc9457SAndroid Build Coastguard Worker 
6309*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
6310*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi16x89ABCDEF, vk16x89ABCDEF));
6311*4bdc9457SAndroid Build Coastguard Worker 
6312*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
6313*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
6314*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi17x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i17 + 8)));
6315*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk17x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
6316*4bdc9457SAndroid Build Coastguard Worker       i17 += 16;
6317*4bdc9457SAndroid Build Coastguard Worker 
6318*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
6319*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi17x89ABCDEF, vk17x89ABCDEF));
6320*4bdc9457SAndroid Build Coastguard Worker 
6321*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
6322*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
6323*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi18x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i18 + 8)));
6324*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk18x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
6325*4bdc9457SAndroid Build Coastguard Worker       i18 += 16;
6326*4bdc9457SAndroid Build Coastguard Worker 
6327*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
6328*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi18x89ABCDEF, vk18x89ABCDEF));
6329*4bdc9457SAndroid Build Coastguard Worker 
6330*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
6331*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
6332*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi19x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i19 + 8)));
6333*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk19x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
6334*4bdc9457SAndroid Build Coastguard Worker       i19 += 16;
6335*4bdc9457SAndroid Build Coastguard Worker 
6336*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
6337*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi19x89ABCDEF, vk19x89ABCDEF));
6338*4bdc9457SAndroid Build Coastguard Worker 
6339*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
6340*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
6341*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi20x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i20 + 8)));
6342*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk20x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
6343*4bdc9457SAndroid Build Coastguard Worker       i20 += 16;
6344*4bdc9457SAndroid Build Coastguard Worker 
6345*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
6346*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi20x89ABCDEF, vk20x89ABCDEF));
6347*4bdc9457SAndroid Build Coastguard Worker 
6348*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
6349*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
6350*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi21x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i21 + 8)));
6351*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk21x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
6352*4bdc9457SAndroid Build Coastguard Worker       i21 += 16;
6353*4bdc9457SAndroid Build Coastguard Worker 
6354*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
6355*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi21x89ABCDEF, vk21x89ABCDEF));
6356*4bdc9457SAndroid Build Coastguard Worker 
6357*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
6358*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
6359*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi22x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i22 + 8)));
6360*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk22x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
6361*4bdc9457SAndroid Build Coastguard Worker       i22 += 16;
6362*4bdc9457SAndroid Build Coastguard Worker 
6363*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
6364*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi22x89ABCDEF, vk22x89ABCDEF));
6365*4bdc9457SAndroid Build Coastguard Worker 
6366*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
6367*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
6368*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi23x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i23 + 8)));
6369*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk23x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
6370*4bdc9457SAndroid Build Coastguard Worker       i23 += 16;
6371*4bdc9457SAndroid Build Coastguard Worker 
6372*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
6373*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi23x89ABCDEF, vk23x89ABCDEF));
6374*4bdc9457SAndroid Build Coastguard Worker 
6375*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
6376*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
6377*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi24x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i24 + 8)));
6378*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk24x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
6379*4bdc9457SAndroid Build Coastguard Worker       i24 += 16;
6380*4bdc9457SAndroid Build Coastguard Worker 
6381*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
6382*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi24x89ABCDEF, vk24x89ABCDEF));
6383*4bdc9457SAndroid Build Coastguard Worker 
6384*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
6385*4bdc9457SAndroid Build Coastguard Worker 
6386*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
6387*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
6388*4bdc9457SAndroid Build Coastguard Worker 
6389*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6390*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
6391*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
6392*4bdc9457SAndroid Build Coastguard Worker 
6393*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6394*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
6395*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
6396*4bdc9457SAndroid Build Coastguard Worker 
6397*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
6398*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
6399*4bdc9457SAndroid Build Coastguard Worker 
6400*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6401*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6402*4bdc9457SAndroid Build Coastguard Worker 
6403*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6404*4bdc9457SAndroid Build Coastguard Worker 
6405*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
6406*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6407*4bdc9457SAndroid Build Coastguard Worker 
6408*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6409*4bdc9457SAndroid Build Coastguard Worker       output += 16;
6410*4bdc9457SAndroid Build Coastguard Worker     }
6411*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
6412*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
6413*4bdc9457SAndroid Build Coastguard Worker       do {
6414*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
6415*4bdc9457SAndroid Build Coastguard Worker 
6416*4bdc9457SAndroid Build Coastguard Worker 
6417*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
6418*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
6419*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
6420*4bdc9457SAndroid Build Coastguard Worker 
6421*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
6422*4bdc9457SAndroid Build Coastguard Worker 
6423*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
6424*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
6425*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
6426*4bdc9457SAndroid Build Coastguard Worker 
6427*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
6428*4bdc9457SAndroid Build Coastguard Worker 
6429*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
6430*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
6431*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
6432*4bdc9457SAndroid Build Coastguard Worker 
6433*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
6434*4bdc9457SAndroid Build Coastguard Worker 
6435*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
6436*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
6437*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
6438*4bdc9457SAndroid Build Coastguard Worker 
6439*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
6440*4bdc9457SAndroid Build Coastguard Worker 
6441*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
6442*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
6443*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
6444*4bdc9457SAndroid Build Coastguard Worker 
6445*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
6446*4bdc9457SAndroid Build Coastguard Worker 
6447*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
6448*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
6449*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
6450*4bdc9457SAndroid Build Coastguard Worker 
6451*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
6452*4bdc9457SAndroid Build Coastguard Worker 
6453*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
6454*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
6455*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
6456*4bdc9457SAndroid Build Coastguard Worker 
6457*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
6458*4bdc9457SAndroid Build Coastguard Worker 
6459*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
6460*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
6461*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
6462*4bdc9457SAndroid Build Coastguard Worker 
6463*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
6464*4bdc9457SAndroid Build Coastguard Worker 
6465*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
6466*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
6467*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
6468*4bdc9457SAndroid Build Coastguard Worker 
6469*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
6470*4bdc9457SAndroid Build Coastguard Worker 
6471*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi9x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i9));
6472*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk9x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 144))), vk_zero_point);
6473*4bdc9457SAndroid Build Coastguard Worker         i9 += 8;
6474*4bdc9457SAndroid Build Coastguard Worker 
6475*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi9x01234567, vk9x01234567));
6476*4bdc9457SAndroid Build Coastguard Worker 
6477*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi10x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i10));
6478*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk10x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 160))), vk_zero_point);
6479*4bdc9457SAndroid Build Coastguard Worker         i10 += 8;
6480*4bdc9457SAndroid Build Coastguard Worker 
6481*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi10x01234567, vk10x01234567));
6482*4bdc9457SAndroid Build Coastguard Worker 
6483*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi11x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i11));
6484*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk11x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 176))), vk_zero_point);
6485*4bdc9457SAndroid Build Coastguard Worker         i11 += 8;
6486*4bdc9457SAndroid Build Coastguard Worker 
6487*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi11x01234567, vk11x01234567));
6488*4bdc9457SAndroid Build Coastguard Worker 
6489*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi12x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i12));
6490*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk12x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 192))), vk_zero_point);
6491*4bdc9457SAndroid Build Coastguard Worker         i12 += 8;
6492*4bdc9457SAndroid Build Coastguard Worker 
6493*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi12x01234567, vk12x01234567));
6494*4bdc9457SAndroid Build Coastguard Worker 
6495*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi13x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i13));
6496*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk13x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 208))), vk_zero_point);
6497*4bdc9457SAndroid Build Coastguard Worker         i13 += 8;
6498*4bdc9457SAndroid Build Coastguard Worker 
6499*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi13x01234567, vk13x01234567));
6500*4bdc9457SAndroid Build Coastguard Worker 
6501*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi14x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i14));
6502*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk14x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 224))), vk_zero_point);
6503*4bdc9457SAndroid Build Coastguard Worker         i14 += 8;
6504*4bdc9457SAndroid Build Coastguard Worker 
6505*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi14x01234567, vk14x01234567));
6506*4bdc9457SAndroid Build Coastguard Worker 
6507*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi15x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i15));
6508*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk15x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 240))), vk_zero_point);
6509*4bdc9457SAndroid Build Coastguard Worker         i15 += 8;
6510*4bdc9457SAndroid Build Coastguard Worker 
6511*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi15x01234567, vk15x01234567));
6512*4bdc9457SAndroid Build Coastguard Worker 
6513*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi16x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i16));
6514*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk16x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 256))), vk_zero_point);
6515*4bdc9457SAndroid Build Coastguard Worker         i16 += 8;
6516*4bdc9457SAndroid Build Coastguard Worker 
6517*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi16x01234567, vk16x01234567));
6518*4bdc9457SAndroid Build Coastguard Worker 
6519*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi17x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i17));
6520*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk17x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 272))), vk_zero_point);
6521*4bdc9457SAndroid Build Coastguard Worker         i17 += 8;
6522*4bdc9457SAndroid Build Coastguard Worker 
6523*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi17x01234567, vk17x01234567));
6524*4bdc9457SAndroid Build Coastguard Worker 
6525*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi18x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i18));
6526*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk18x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 288))), vk_zero_point);
6527*4bdc9457SAndroid Build Coastguard Worker         i18 += 8;
6528*4bdc9457SAndroid Build Coastguard Worker 
6529*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi18x01234567, vk18x01234567));
6530*4bdc9457SAndroid Build Coastguard Worker 
6531*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi19x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i19));
6532*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk19x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 304))), vk_zero_point);
6533*4bdc9457SAndroid Build Coastguard Worker         i19 += 8;
6534*4bdc9457SAndroid Build Coastguard Worker 
6535*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi19x01234567, vk19x01234567));
6536*4bdc9457SAndroid Build Coastguard Worker 
6537*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi20x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i20));
6538*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk20x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 320))), vk_zero_point);
6539*4bdc9457SAndroid Build Coastguard Worker         i20 += 8;
6540*4bdc9457SAndroid Build Coastguard Worker 
6541*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi20x01234567, vk20x01234567));
6542*4bdc9457SAndroid Build Coastguard Worker 
6543*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi21x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i21));
6544*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk21x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 336))), vk_zero_point);
6545*4bdc9457SAndroid Build Coastguard Worker         i21 += 8;
6546*4bdc9457SAndroid Build Coastguard Worker 
6547*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi21x01234567, vk21x01234567));
6548*4bdc9457SAndroid Build Coastguard Worker 
6549*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi22x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i22));
6550*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk22x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 352))), vk_zero_point);
6551*4bdc9457SAndroid Build Coastguard Worker         i22 += 8;
6552*4bdc9457SAndroid Build Coastguard Worker 
6553*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi22x01234567, vk22x01234567));
6554*4bdc9457SAndroid Build Coastguard Worker 
6555*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi23x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i23));
6556*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk23x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 368))), vk_zero_point);
6557*4bdc9457SAndroid Build Coastguard Worker         i23 += 8;
6558*4bdc9457SAndroid Build Coastguard Worker 
6559*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi23x01234567, vk23x01234567));
6560*4bdc9457SAndroid Build Coastguard Worker 
6561*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi24x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i24));
6562*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk24x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 384))), vk_zero_point);
6563*4bdc9457SAndroid Build Coastguard Worker         i24 += 8;
6564*4bdc9457SAndroid Build Coastguard Worker 
6565*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
6566*4bdc9457SAndroid Build Coastguard Worker 
6567*4bdc9457SAndroid Build Coastguard Worker         k += 8;
6568*4bdc9457SAndroid Build Coastguard Worker 
6569*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
6570*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
6571*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
6572*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
6573*4bdc9457SAndroid Build Coastguard Worker 
6574*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
6575*4bdc9457SAndroid Build Coastguard Worker 
6576*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
6577*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
6578*4bdc9457SAndroid Build Coastguard Worker 
6579*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6580*4bdc9457SAndroid Build Coastguard Worker 
6581*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
6582*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6583*4bdc9457SAndroid Build Coastguard Worker 
6584*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
6585*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6586*4bdc9457SAndroid Build Coastguard Worker           output += 8;
6587*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
6588*4bdc9457SAndroid Build Coastguard Worker         } else {
6589*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
6590*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6591*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6592*4bdc9457SAndroid Build Coastguard Worker             output += 4;
6593*4bdc9457SAndroid Build Coastguard Worker           }
6594*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
6595*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6596*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6597*4bdc9457SAndroid Build Coastguard Worker             output += 2;
6598*4bdc9457SAndroid Build Coastguard Worker           }
6599*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
6600*4bdc9457SAndroid Build Coastguard Worker             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6601*4bdc9457SAndroid Build Coastguard Worker             output += 1;
6602*4bdc9457SAndroid Build Coastguard Worker           }
6603*4bdc9457SAndroid Build Coastguard Worker           c = 0;
6604*4bdc9457SAndroid Build Coastguard Worker         }
6605*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
6606*4bdc9457SAndroid Build Coastguard Worker     }
6607*4bdc9457SAndroid Build Coastguard Worker 
6608*4bdc9457SAndroid Build Coastguard Worker     output = (uint8_t*) ((uintptr_t) output + output_increment);
6609*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
6610*4bdc9457SAndroid Build Coastguard Worker }
6611*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6612*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32(
6613*4bdc9457SAndroid Build Coastguard Worker     size_t channels,
6614*4bdc9457SAndroid Build Coastguard Worker     size_t output_width,
6615*4bdc9457SAndroid Build Coastguard Worker     const uint8_t** input,
6616*4bdc9457SAndroid Build Coastguard Worker     const void* weights,
6617*4bdc9457SAndroid Build Coastguard Worker     uint8_t* output,
6618*4bdc9457SAndroid Build Coastguard Worker     size_t input_stride,
6619*4bdc9457SAndroid Build Coastguard Worker     size_t output_increment,
6620*4bdc9457SAndroid Build Coastguard Worker     size_t input_offset,
6621*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* zero,
6622*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6623*4bdc9457SAndroid Build Coastguard Worker {
6624*4bdc9457SAndroid Build Coastguard Worker   assert(channels != 0);
6625*4bdc9457SAndroid Build Coastguard Worker   assert(output_width != 0);
6626*4bdc9457SAndroid Build Coastguard Worker 
6627*4bdc9457SAndroid Build Coastguard Worker   const __m256i vk_zero_point = _mm256_cvtepu16_epi32(_mm_load_si128((const __m128i*) params->fp32_avx2.kernel_zero_point));
6628*4bdc9457SAndroid Build Coastguard Worker   do {
6629*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i0 = input[0];
6630*4bdc9457SAndroid Build Coastguard Worker     assert(i0 != NULL);
6631*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i0 != zero) {
6632*4bdc9457SAndroid Build Coastguard Worker       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
6633*4bdc9457SAndroid Build Coastguard Worker     }
6634*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i1 = input[1];
6635*4bdc9457SAndroid Build Coastguard Worker     assert(i1 != NULL);
6636*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i1 != zero) {
6637*4bdc9457SAndroid Build Coastguard Worker       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
6638*4bdc9457SAndroid Build Coastguard Worker     }
6639*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i2 = input[2];
6640*4bdc9457SAndroid Build Coastguard Worker     assert(i2 != NULL);
6641*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i2 != zero) {
6642*4bdc9457SAndroid Build Coastguard Worker       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
6643*4bdc9457SAndroid Build Coastguard Worker     }
6644*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i3 = input[3];
6645*4bdc9457SAndroid Build Coastguard Worker     assert(i3 != NULL);
6646*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i3 != zero) {
6647*4bdc9457SAndroid Build Coastguard Worker       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
6648*4bdc9457SAndroid Build Coastguard Worker     }
6649*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i4 = input[4];
6650*4bdc9457SAndroid Build Coastguard Worker     assert(i4 != NULL);
6651*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i4 != zero) {
6652*4bdc9457SAndroid Build Coastguard Worker       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
6653*4bdc9457SAndroid Build Coastguard Worker     }
6654*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i5 = input[5];
6655*4bdc9457SAndroid Build Coastguard Worker     assert(i5 != NULL);
6656*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i5 != zero) {
6657*4bdc9457SAndroid Build Coastguard Worker       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
6658*4bdc9457SAndroid Build Coastguard Worker     }
6659*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i6 = input[6];
6660*4bdc9457SAndroid Build Coastguard Worker     assert(i6 != NULL);
6661*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i6 != zero) {
6662*4bdc9457SAndroid Build Coastguard Worker       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
6663*4bdc9457SAndroid Build Coastguard Worker     }
6664*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i7 = input[7];
6665*4bdc9457SAndroid Build Coastguard Worker     assert(i7 != NULL);
6666*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i7 != zero) {
6667*4bdc9457SAndroid Build Coastguard Worker       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
6668*4bdc9457SAndroid Build Coastguard Worker     }
6669*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* i8 = input[8];
6670*4bdc9457SAndroid Build Coastguard Worker     assert(i8 != NULL);
6671*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(i8 != zero) {
6672*4bdc9457SAndroid Build Coastguard Worker       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
6673*4bdc9457SAndroid Build Coastguard Worker     }
6674*4bdc9457SAndroid Build Coastguard Worker     input = (const uint8_t**) ((uintptr_t) input + input_stride);
6675*4bdc9457SAndroid Build Coastguard Worker 
6676*4bdc9457SAndroid Build Coastguard Worker     size_t c = channels;
6677*4bdc9457SAndroid Build Coastguard Worker     const void* w = weights;
6678*4bdc9457SAndroid Build Coastguard Worker     for (; c >= 16; c -= 16) {
6679*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
6680*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc89ABCDEF = _mm256_loadu_si256((const __m256i*) ((const int32_t*) w + 8));
6681*4bdc9457SAndroid Build Coastguard Worker 
6682*4bdc9457SAndroid Build Coastguard Worker 
6683*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
6684*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
6685*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi0x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
6686*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk0x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
6687*4bdc9457SAndroid Build Coastguard Worker       i0 += 16;
6688*4bdc9457SAndroid Build Coastguard Worker 
6689*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
6690*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi0x89ABCDEF, vk0x89ABCDEF));
6691*4bdc9457SAndroid Build Coastguard Worker 
6692*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
6693*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
6694*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi1x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
6695*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk1x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
6696*4bdc9457SAndroid Build Coastguard Worker       i1 += 16;
6697*4bdc9457SAndroid Build Coastguard Worker 
6698*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
6699*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi1x89ABCDEF, vk1x89ABCDEF));
6700*4bdc9457SAndroid Build Coastguard Worker 
6701*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
6702*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
6703*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi2x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
6704*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk2x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
6705*4bdc9457SAndroid Build Coastguard Worker       i2 += 16;
6706*4bdc9457SAndroid Build Coastguard Worker 
6707*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
6708*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi2x89ABCDEF, vk2x89ABCDEF));
6709*4bdc9457SAndroid Build Coastguard Worker 
6710*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
6711*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
6712*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi3x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
6713*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk3x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
6714*4bdc9457SAndroid Build Coastguard Worker       i3 += 16;
6715*4bdc9457SAndroid Build Coastguard Worker 
6716*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
6717*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi3x89ABCDEF, vk3x89ABCDEF));
6718*4bdc9457SAndroid Build Coastguard Worker 
6719*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
6720*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
6721*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi4x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
6722*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk4x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
6723*4bdc9457SAndroid Build Coastguard Worker       i4 += 16;
6724*4bdc9457SAndroid Build Coastguard Worker 
6725*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
6726*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi4x89ABCDEF, vk4x89ABCDEF));
6727*4bdc9457SAndroid Build Coastguard Worker 
6728*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
6729*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
6730*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi5x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
6731*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk5x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
6732*4bdc9457SAndroid Build Coastguard Worker       i5 += 16;
6733*4bdc9457SAndroid Build Coastguard Worker 
6734*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
6735*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi5x89ABCDEF, vk5x89ABCDEF));
6736*4bdc9457SAndroid Build Coastguard Worker 
6737*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
6738*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
6739*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi6x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
6740*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk6x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
6741*4bdc9457SAndroid Build Coastguard Worker       i6 += 16;
6742*4bdc9457SAndroid Build Coastguard Worker 
6743*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
6744*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi6x89ABCDEF, vk6x89ABCDEF));
6745*4bdc9457SAndroid Build Coastguard Worker 
6746*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
6747*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
6748*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi7x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i7 + 8)));
6749*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk7x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
6750*4bdc9457SAndroid Build Coastguard Worker       i7 += 16;
6751*4bdc9457SAndroid Build Coastguard Worker 
6752*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
6753*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi7x89ABCDEF, vk7x89ABCDEF));
6754*4bdc9457SAndroid Build Coastguard Worker 
6755*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
6756*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
6757*4bdc9457SAndroid Build Coastguard Worker       const __m256i vi8x89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (i8 + 8)));
6758*4bdc9457SAndroid Build Coastguard Worker       const __m256i vk8x89ABCDEF = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
6759*4bdc9457SAndroid Build Coastguard Worker       i8 += 16;
6760*4bdc9457SAndroid Build Coastguard Worker 
6761*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
6762*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vi8x89ABCDEF, vk8x89ABCDEF));
6763*4bdc9457SAndroid Build Coastguard Worker 
6764*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
6765*4bdc9457SAndroid Build Coastguard Worker 
6766*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
6767*4bdc9457SAndroid Build Coastguard Worker       __m256 vscaled89ABCDEF = _mm256_cvtepi32_ps(vacc89ABCDEF);
6768*4bdc9457SAndroid Build Coastguard Worker 
6769*4bdc9457SAndroid Build Coastguard Worker       const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
6770*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_mul_ps(vscaled01234567, vscale);
6771*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_mul_ps(vscaled89ABCDEF, vscale);
6772*4bdc9457SAndroid Build Coastguard Worker 
6773*4bdc9457SAndroid Build Coastguard Worker       const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
6774*4bdc9457SAndroid Build Coastguard Worker       vscaled01234567 = _mm256_min_ps(vscaled01234567, voutput_max_less_zero_point);
6775*4bdc9457SAndroid Build Coastguard Worker       vscaled89ABCDEF = _mm256_min_ps(vscaled89ABCDEF, voutput_max_less_zero_point);
6776*4bdc9457SAndroid Build Coastguard Worker 
6777*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
6778*4bdc9457SAndroid Build Coastguard Worker       vacc89ABCDEF = _mm256_cvtps_epi32(vscaled89ABCDEF);
6779*4bdc9457SAndroid Build Coastguard Worker 
6780*4bdc9457SAndroid Build Coastguard Worker       const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
6781*4bdc9457SAndroid Build Coastguard Worker       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
6782*4bdc9457SAndroid Build Coastguard Worker 
6783*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
6784*4bdc9457SAndroid Build Coastguard Worker 
6785*4bdc9457SAndroid Build Coastguard Worker       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
6786*4bdc9457SAndroid Build Coastguard Worker       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
6787*4bdc9457SAndroid Build Coastguard Worker 
6788*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6789*4bdc9457SAndroid Build Coastguard Worker       output += 16;
6790*4bdc9457SAndroid Build Coastguard Worker     }
6791*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNLIKELY(c != 0) {
6792*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
6793*4bdc9457SAndroid Build Coastguard Worker       do {
6794*4bdc9457SAndroid Build Coastguard Worker         __m256i vacc01234567 = _mm256_loadu_si256((const __m256i*) w);
6795*4bdc9457SAndroid Build Coastguard Worker 
6796*4bdc9457SAndroid Build Coastguard Worker 
6797*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi0x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i0));
6798*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk0x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) k)), vk_zero_point);
6799*4bdc9457SAndroid Build Coastguard Worker         i0 += 8;
6800*4bdc9457SAndroid Build Coastguard Worker 
6801*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi0x01234567, vk0x01234567));
6802*4bdc9457SAndroid Build Coastguard Worker 
6803*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi1x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i1));
6804*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk1x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 16))), vk_zero_point);
6805*4bdc9457SAndroid Build Coastguard Worker         i1 += 8;
6806*4bdc9457SAndroid Build Coastguard Worker 
6807*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi1x01234567, vk1x01234567));
6808*4bdc9457SAndroid Build Coastguard Worker 
6809*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi2x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i2));
6810*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk2x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 32))), vk_zero_point);
6811*4bdc9457SAndroid Build Coastguard Worker         i2 += 8;
6812*4bdc9457SAndroid Build Coastguard Worker 
6813*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi2x01234567, vk2x01234567));
6814*4bdc9457SAndroid Build Coastguard Worker 
6815*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi3x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i3));
6816*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk3x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 48))), vk_zero_point);
6817*4bdc9457SAndroid Build Coastguard Worker         i3 += 8;
6818*4bdc9457SAndroid Build Coastguard Worker 
6819*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi3x01234567, vk3x01234567));
6820*4bdc9457SAndroid Build Coastguard Worker 
6821*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi4x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i4));
6822*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk4x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 64))), vk_zero_point);
6823*4bdc9457SAndroid Build Coastguard Worker         i4 += 8;
6824*4bdc9457SAndroid Build Coastguard Worker 
6825*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi4x01234567, vk4x01234567));
6826*4bdc9457SAndroid Build Coastguard Worker 
6827*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi5x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i5));
6828*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk5x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 80))), vk_zero_point);
6829*4bdc9457SAndroid Build Coastguard Worker         i5 += 8;
6830*4bdc9457SAndroid Build Coastguard Worker 
6831*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi5x01234567, vk5x01234567));
6832*4bdc9457SAndroid Build Coastguard Worker 
6833*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi6x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i6));
6834*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk6x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 96))), vk_zero_point);
6835*4bdc9457SAndroid Build Coastguard Worker         i6 += 8;
6836*4bdc9457SAndroid Build Coastguard Worker 
6837*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi6x01234567, vk6x01234567));
6838*4bdc9457SAndroid Build Coastguard Worker 
6839*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi7x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i7));
6840*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk7x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 112))), vk_zero_point);
6841*4bdc9457SAndroid Build Coastguard Worker         i7 += 8;
6842*4bdc9457SAndroid Build Coastguard Worker 
6843*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi7x01234567, vk7x01234567));
6844*4bdc9457SAndroid Build Coastguard Worker 
6845*4bdc9457SAndroid Build Coastguard Worker         const __m256i vi8x01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) i8));
6846*4bdc9457SAndroid Build Coastguard Worker         const __m256i vk8x01234567 = _mm256_sub_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (k + 128))), vk_zero_point);
6847*4bdc9457SAndroid Build Coastguard Worker         i8 += 8;
6848*4bdc9457SAndroid Build Coastguard Worker 
6849*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
6850*4bdc9457SAndroid Build Coastguard Worker 
6851*4bdc9457SAndroid Build Coastguard Worker         k += 8;
6852*4bdc9457SAndroid Build Coastguard Worker 
6853*4bdc9457SAndroid Build Coastguard Worker         __m256 vscaled01234567 = _mm256_cvtepi32_ps(vacc01234567);
6854*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_mul_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.scale));
6855*4bdc9457SAndroid Build Coastguard Worker         vscaled01234567 = _mm256_min_ps(vscaled01234567, _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point));
6856*4bdc9457SAndroid Build Coastguard Worker         vacc01234567 = _mm256_cvtps_epi32(vscaled01234567);
6857*4bdc9457SAndroid Build Coastguard Worker 
6858*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const int32_t*) w + 8);
6859*4bdc9457SAndroid Build Coastguard Worker 
6860*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_avx2.output_zero_point);
6861*4bdc9457SAndroid Build Coastguard Worker         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
6862*4bdc9457SAndroid Build Coastguard Worker 
6863*4bdc9457SAndroid Build Coastguard Worker         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6864*4bdc9457SAndroid Build Coastguard Worker 
6865*4bdc9457SAndroid Build Coastguard Worker         const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_avx2.output_min);
6866*4bdc9457SAndroid Build Coastguard Worker         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6867*4bdc9457SAndroid Build Coastguard Worker 
6868*4bdc9457SAndroid Build Coastguard Worker         if XNN_LIKELY(c >= 8) {
6869*4bdc9457SAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6870*4bdc9457SAndroid Build Coastguard Worker           output += 8;
6871*4bdc9457SAndroid Build Coastguard Worker           c -= 8;
6872*4bdc9457SAndroid Build Coastguard Worker         } else {
6873*4bdc9457SAndroid Build Coastguard Worker           if (c & 4) {
6874*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6875*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6876*4bdc9457SAndroid Build Coastguard Worker             output += 4;
6877*4bdc9457SAndroid Build Coastguard Worker           }
6878*4bdc9457SAndroid Build Coastguard Worker           if (c & 2) {
6879*4bdc9457SAndroid Build Coastguard Worker             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6880*4bdc9457SAndroid Build Coastguard Worker             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6881*4bdc9457SAndroid Build Coastguard Worker             output += 2;
6882*4bdc9457SAndroid Build Coastguard Worker           }
6883*4bdc9457SAndroid Build Coastguard Worker           if (c & 1) {
6884*4bdc9457SAndroid Build Coastguard Worker             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6885*4bdc9457SAndroid Build Coastguard Worker             output += 1;
6886*4bdc9457SAndroid Build Coastguard Worker           }
6887*4bdc9457SAndroid Build Coastguard Worker           c = 0;
6888*4bdc9457SAndroid Build Coastguard Worker         }
6889*4bdc9457SAndroid Build Coastguard Worker       } while (c != 0);
6890*4bdc9457SAndroid Build Coastguard Worker     }
6891*4bdc9457SAndroid Build Coastguard Worker 
6892*4bdc9457SAndroid Build Coastguard Worker     output = (uint8_t*) ((uintptr_t) output + output_increment);
6893*4bdc9457SAndroid Build Coastguard Worker   } while (--output_width != 0);
6894*4bdc9457SAndroid Build Coastguard Worker }
6895*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_f32_vcvt_ukernel__avx2_x16(size_t n,const uint8_t * x,float * y,const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])6896*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_f32_vcvt_ukernel__avx2_x16(
6897*4bdc9457SAndroid Build Coastguard Worker     size_t n,
6898*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* x,
6899*4bdc9457SAndroid Build Coastguard Worker     float* y,
6900*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6901*4bdc9457SAndroid Build Coastguard Worker {
6902*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
6903*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(uint8_t) == 0);
6904*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
6905*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
6906*4bdc9457SAndroid Build Coastguard Worker 
6907*4bdc9457SAndroid Build Coastguard Worker   const __m256i vminus_zero_point = _mm256_load_si256((const __m256i*) params->avx.minus_zero_point);
6908*4bdc9457SAndroid Build Coastguard Worker   const __m256 vscale = _mm256_load_ps(params->avx.scale);
6909*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6910*4bdc9457SAndroid Build Coastguard Worker     __m256i vx01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
6911*4bdc9457SAndroid Build Coastguard Worker     __m256i vx89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (x + 8)));
6912*4bdc9457SAndroid Build Coastguard Worker     x += 16;
6913*4bdc9457SAndroid Build Coastguard Worker 
6914*4bdc9457SAndroid Build Coastguard Worker     vx01234567 = _mm256_add_epi32(vx01234567, vminus_zero_point);
6915*4bdc9457SAndroid Build Coastguard Worker     vx89ABCDEF = _mm256_add_epi32(vx89ABCDEF, vminus_zero_point);
6916*4bdc9457SAndroid Build Coastguard Worker 
6917*4bdc9457SAndroid Build Coastguard Worker     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
6918*4bdc9457SAndroid Build Coastguard Worker     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
6919*4bdc9457SAndroid Build Coastguard Worker 
6920*4bdc9457SAndroid Build Coastguard Worker     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
6921*4bdc9457SAndroid Build Coastguard Worker     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
6922*4bdc9457SAndroid Build Coastguard Worker 
6923*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy01234567);
6924*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y + 8, vy89ABCDEF);
6925*4bdc9457SAndroid Build Coastguard Worker     y += 16;
6926*4bdc9457SAndroid Build Coastguard Worker   }
6927*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
6928*4bdc9457SAndroid Build Coastguard Worker     __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
6929*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_add_epi32(vx, vminus_zero_point);
6930*4bdc9457SAndroid Build Coastguard Worker     x += 8;
6931*4bdc9457SAndroid Build Coastguard Worker 
6932*4bdc9457SAndroid Build Coastguard Worker     __m256 vy = _mm256_cvtepi32_ps(vx);
6933*4bdc9457SAndroid Build Coastguard Worker     vy = _mm256_mul_ps(vy, vscale);
6934*4bdc9457SAndroid Build Coastguard Worker 
6935*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_ps(y, vy);
6936*4bdc9457SAndroid Build Coastguard Worker     y += 8;
6937*4bdc9457SAndroid Build Coastguard Worker   }
6938*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
6939*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(uint8_t));
6940*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 7 * sizeof(uint8_t));
6941*4bdc9457SAndroid Build Coastguard Worker 
6942*4bdc9457SAndroid Build Coastguard Worker     __m256i vx = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) x));
6943*4bdc9457SAndroid Build Coastguard Worker     vx = _mm256_add_epi32(vx, vminus_zero_point);
6944*4bdc9457SAndroid Build Coastguard Worker 
6945*4bdc9457SAndroid Build Coastguard Worker     __m256 vy = _mm256_cvtepi32_ps(vx);
6946*4bdc9457SAndroid Build Coastguard Worker     vy = _mm256_mul_ps(vy, vscale);
6947*4bdc9457SAndroid Build Coastguard Worker 
6948*4bdc9457SAndroid Build Coastguard Worker     __m128 vy_lo = _mm256_castps256_ps128(vy);
6949*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(uint8_t))) {
6950*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_ps(y, vy_lo);
6951*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm256_extractf128_ps(vy, 1);
6952*4bdc9457SAndroid Build Coastguard Worker       y += 4;
6953*4bdc9457SAndroid Build Coastguard Worker     }
6954*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(uint8_t))) {
6955*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_pi((__m64*) y, vy_lo);
6956*4bdc9457SAndroid Build Coastguard Worker       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
6957*4bdc9457SAndroid Build Coastguard Worker       y += 2;
6958*4bdc9457SAndroid Build Coastguard Worker     }
6959*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(uint8_t))) {
6960*4bdc9457SAndroid Build Coastguard Worker       _mm_store_ss(y, vy_lo);
6961*4bdc9457SAndroid Build Coastguard Worker     }
6962*4bdc9457SAndroid Build Coastguard Worker   }
6963*4bdc9457SAndroid Build Coastguard Worker }
6964*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,const uint8_t * restrict a,size_t a_stride,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])6965*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
6966*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
6967*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
6968*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
6969*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* restrict a,
6970*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
6971*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
6972*4bdc9457SAndroid Build Coastguard Worker     uint8_t* restrict c,
6973*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
6974*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
6975*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6976*4bdc9457SAndroid Build Coastguard Worker {
6977*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
6978*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
6979*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
6980*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
6981*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint8_t) == 0);
6982*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
6983*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
6984*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
6985*4bdc9457SAndroid Build Coastguard Worker 
6986*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
6987*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* a0 = a;
6988*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c0 = c;
6989*4bdc9457SAndroid Build Coastguard Worker 
6990*4bdc9457SAndroid Build Coastguard Worker   do {
6991*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6992*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6993*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
6994*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6995*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6996*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
6997*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
6998*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
6999*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
7000*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
7001*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
7002*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
7003*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
7004*4bdc9457SAndroid Build Coastguard Worker 
7005*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
7006*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
7007*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
7008*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
7009*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
7010*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
7011*4bdc9457SAndroid Build Coastguard Worker 
7012*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
7013*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
7014*4bdc9457SAndroid Build Coastguard Worker 
7015*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
7016*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
7017*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
7018*4bdc9457SAndroid Build Coastguard Worker 
7019*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
7020*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
7021*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
7022*4bdc9457SAndroid Build Coastguard Worker 
7023*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
7024*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
7025*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
7026*4bdc9457SAndroid Build Coastguard Worker 
7027*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
7028*4bdc9457SAndroid Build Coastguard Worker 
7029*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const uint8_t*) w + 64);
7030*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(uint8_t);
7031*4bdc9457SAndroid Build Coastguard Worker     }
7032*4bdc9457SAndroid Build Coastguard Worker 
7033*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
7034*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
7035*4bdc9457SAndroid Build Coastguard Worker 
7036*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
7037*4bdc9457SAndroid Build Coastguard Worker 
7038*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
7039*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
7040*4bdc9457SAndroid Build Coastguard Worker 
7041*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
7042*4bdc9457SAndroid Build Coastguard Worker 
7043*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
7044*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
7045*4bdc9457SAndroid Build Coastguard Worker 
7046*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
7047*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
7048*4bdc9457SAndroid Build Coastguard Worker 
7049*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
7050*4bdc9457SAndroid Build Coastguard Worker 
7051*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
7052*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
7053*4bdc9457SAndroid Build Coastguard Worker 
7054*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7055*4bdc9457SAndroid Build Coastguard Worker 
7056*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
7057*4bdc9457SAndroid Build Coastguard Worker 
7058*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
7059*4bdc9457SAndroid Build Coastguard Worker 
7060*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
7061*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
7062*4bdc9457SAndroid Build Coastguard Worker 
7063*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
7064*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
7065*4bdc9457SAndroid Build Coastguard Worker 
7066*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7067*4bdc9457SAndroid Build Coastguard Worker 
7068*4bdc9457SAndroid Build Coastguard Worker       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7069*4bdc9457SAndroid Build Coastguard Worker 
7070*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
7071*4bdc9457SAndroid Build Coastguard Worker     } else {
7072*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
7073*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
7074*4bdc9457SAndroid Build Coastguard Worker 
7075*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
7076*4bdc9457SAndroid Build Coastguard Worker 
7077*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
7078*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
7079*4bdc9457SAndroid Build Coastguard Worker       }
7080*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
7081*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
7082*4bdc9457SAndroid Build Coastguard Worker 
7083*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
7084*4bdc9457SAndroid Build Coastguard Worker 
7085*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
7086*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
7087*4bdc9457SAndroid Build Coastguard Worker       }
7088*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
7089*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
7090*4bdc9457SAndroid Build Coastguard Worker       }
7091*4bdc9457SAndroid Build Coastguard Worker 
7092*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
7093*4bdc9457SAndroid Build Coastguard Worker     }
7094*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
7095*4bdc9457SAndroid Build Coastguard Worker }
7096*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,const uint8_t * restrict a,size_t a_stride,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7097*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
7098*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
7099*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
7100*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
7101*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* restrict a,
7102*4bdc9457SAndroid Build Coastguard Worker     size_t a_stride,
7103*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
7104*4bdc9457SAndroid Build Coastguard Worker     uint8_t* restrict c,
7105*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
7106*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
7107*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7108*4bdc9457SAndroid Build Coastguard Worker {
7109*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
7110*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
7111*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
7112*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
7113*4bdc9457SAndroid Build Coastguard Worker   assert(kc % sizeof(uint8_t) == 0);
7114*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7115*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
7116*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
7117*4bdc9457SAndroid Build Coastguard Worker 
7118*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
7119*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* a0 = a;
7120*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c0 = c;
7121*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
7122*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7123*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
7124*4bdc9457SAndroid Build Coastguard Worker     a1 = a0;
7125*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
7126*4bdc9457SAndroid Build Coastguard Worker   }
7127*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
7128*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7129*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
7130*4bdc9457SAndroid Build Coastguard Worker     a2 = a1;
7131*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
7132*4bdc9457SAndroid Build Coastguard Worker   }
7133*4bdc9457SAndroid Build Coastguard Worker 
7134*4bdc9457SAndroid Build Coastguard Worker   do {
7135*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7136*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7137*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
7138*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7139*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7140*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
7141*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
7142*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
7143*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
7144*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
7145*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
7146*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
7147*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
7148*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
7149*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
7150*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
7151*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
7152*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
7153*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
7154*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
7155*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
7156*4bdc9457SAndroid Build Coastguard Worker 
7157*4bdc9457SAndroid Build Coastguard Worker     size_t k = 0;
7158*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
7159*4bdc9457SAndroid Build Coastguard Worker     while (k < kc) {
7160*4bdc9457SAndroid Build Coastguard Worker       const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
7161*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
7162*4bdc9457SAndroid Build Coastguard Worker       a0 += 8;
7163*4bdc9457SAndroid Build Coastguard Worker       const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
7164*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
7165*4bdc9457SAndroid Build Coastguard Worker       a1 += 8;
7166*4bdc9457SAndroid Build Coastguard Worker       const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
7167*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
7168*4bdc9457SAndroid Build Coastguard Worker       a2 += 8;
7169*4bdc9457SAndroid Build Coastguard Worker 
7170*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
7171*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
7172*4bdc9457SAndroid Build Coastguard Worker 
7173*4bdc9457SAndroid Build Coastguard Worker       vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
7174*4bdc9457SAndroid Build Coastguard Worker       vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
7175*4bdc9457SAndroid Build Coastguard Worker       vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
7176*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
7177*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
7178*4bdc9457SAndroid Build Coastguard Worker 
7179*4bdc9457SAndroid Build Coastguard Worker       vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
7180*4bdc9457SAndroid Build Coastguard Worker       vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
7181*4bdc9457SAndroid Build Coastguard Worker       vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
7182*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
7183*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
7184*4bdc9457SAndroid Build Coastguard Worker 
7185*4bdc9457SAndroid Build Coastguard Worker       vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
7186*4bdc9457SAndroid Build Coastguard Worker       vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
7187*4bdc9457SAndroid Build Coastguard Worker       vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
7188*4bdc9457SAndroid Build Coastguard Worker       const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
7189*4bdc9457SAndroid Build Coastguard Worker       const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
7190*4bdc9457SAndroid Build Coastguard Worker 
7191*4bdc9457SAndroid Build Coastguard Worker       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
7192*4bdc9457SAndroid Build Coastguard Worker       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
7193*4bdc9457SAndroid Build Coastguard Worker       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
7194*4bdc9457SAndroid Build Coastguard Worker 
7195*4bdc9457SAndroid Build Coastguard Worker       w = (const void*) ((const uint8_t*) w + 64);
7196*4bdc9457SAndroid Build Coastguard Worker       k += 8 * sizeof(uint8_t);
7197*4bdc9457SAndroid Build Coastguard Worker     }
7198*4bdc9457SAndroid Build Coastguard Worker 
7199*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
7200*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
7201*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
7202*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
7203*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
7204*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
7205*4bdc9457SAndroid Build Coastguard Worker 
7206*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
7207*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
7208*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
7209*4bdc9457SAndroid Build Coastguard Worker 
7210*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
7211*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
7212*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
7213*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
7214*4bdc9457SAndroid Build Coastguard Worker 
7215*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
7216*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
7217*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
7218*4bdc9457SAndroid Build Coastguard Worker 
7219*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
7220*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
7221*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
7222*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
7223*4bdc9457SAndroid Build Coastguard Worker 
7224*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
7225*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
7226*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
7227*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
7228*4bdc9457SAndroid Build Coastguard Worker 
7229*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
7230*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
7231*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
7232*4bdc9457SAndroid Build Coastguard Worker 
7233*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
7234*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
7235*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
7236*4bdc9457SAndroid Build Coastguard Worker 
7237*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7238*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7239*4bdc9457SAndroid Build Coastguard Worker 
7240*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
7241*4bdc9457SAndroid Build Coastguard Worker 
7242*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
7243*4bdc9457SAndroid Build Coastguard Worker 
7244*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
7245*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
7246*4bdc9457SAndroid Build Coastguard Worker 
7247*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
7248*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
7249*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
7250*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
7251*4bdc9457SAndroid Build Coastguard Worker 
7252*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7253*4bdc9457SAndroid Build Coastguard Worker       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7254*4bdc9457SAndroid Build Coastguard Worker       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7255*4bdc9457SAndroid Build Coastguard Worker 
7256*4bdc9457SAndroid Build Coastguard Worker       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7257*4bdc9457SAndroid Build Coastguard Worker       a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
7258*4bdc9457SAndroid Build Coastguard Worker       a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
7259*4bdc9457SAndroid Build Coastguard Worker 
7260*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
7261*4bdc9457SAndroid Build Coastguard Worker     } else {
7262*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
7263*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
7264*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
7265*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
7266*4bdc9457SAndroid Build Coastguard Worker 
7267*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
7268*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
7269*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
7270*4bdc9457SAndroid Build Coastguard Worker 
7271*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
7272*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
7273*4bdc9457SAndroid Build Coastguard Worker       }
7274*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
7275*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
7276*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
7277*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
7278*4bdc9457SAndroid Build Coastguard Worker 
7279*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
7280*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
7281*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
7282*4bdc9457SAndroid Build Coastguard Worker 
7283*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
7284*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
7285*4bdc9457SAndroid Build Coastguard Worker       }
7286*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
7287*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
7288*4bdc9457SAndroid Build Coastguard Worker         *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
7289*4bdc9457SAndroid Build Coastguard Worker         *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
7290*4bdc9457SAndroid Build Coastguard Worker       }
7291*4bdc9457SAndroid Build Coastguard Worker 
7292*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
7293*4bdc9457SAndroid Build Coastguard Worker     }
7294*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
7295*4bdc9457SAndroid Build Coastguard Worker }
7296*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const uint8_t ** restrict a,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7297*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2(
7298*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
7299*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
7300*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
7301*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
7302*4bdc9457SAndroid Build Coastguard Worker     const uint8_t** restrict a,
7303*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
7304*4bdc9457SAndroid Build Coastguard Worker     uint8_t* restrict c,
7305*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
7306*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
7307*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
7308*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* zero,
7309*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7310*4bdc9457SAndroid Build Coastguard Worker {
7311*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
7312*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 1);
7313*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
7314*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
7315*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
7316*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (1 * sizeof(void*)) == 0);
7317*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(uint8_t) == 0);
7318*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7319*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
7320*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
7321*4bdc9457SAndroid Build Coastguard Worker 
7322*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
7323*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c0 = c;
7324*4bdc9457SAndroid Build Coastguard Worker 
7325*4bdc9457SAndroid Build Coastguard Worker   do {
7326*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7327*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7328*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
7329*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7330*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7331*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
7332*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
7333*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
7334*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
7335*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
7336*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
7337*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
7338*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
7339*4bdc9457SAndroid Build Coastguard Worker 
7340*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
7341*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
7342*4bdc9457SAndroid Build Coastguard Worker     do {
7343*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* restrict a0 = a[0];
7344*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
7345*4bdc9457SAndroid Build Coastguard Worker         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7346*4bdc9457SAndroid Build Coastguard Worker       }
7347*4bdc9457SAndroid Build Coastguard Worker       a += 1;
7348*4bdc9457SAndroid Build Coastguard Worker 
7349*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
7350*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
7351*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
7352*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
7353*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
7354*4bdc9457SAndroid Build Coastguard Worker 
7355*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
7356*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
7357*4bdc9457SAndroid Build Coastguard Worker 
7358*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
7359*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
7360*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
7361*4bdc9457SAndroid Build Coastguard Worker 
7362*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
7363*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
7364*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
7365*4bdc9457SAndroid Build Coastguard Worker 
7366*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
7367*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
7368*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
7369*4bdc9457SAndroid Build Coastguard Worker 
7370*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
7371*4bdc9457SAndroid Build Coastguard Worker 
7372*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const uint8_t*) w + 64);
7373*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(uint8_t);
7374*4bdc9457SAndroid Build Coastguard Worker       }
7375*4bdc9457SAndroid Build Coastguard Worker       p -= 1 * sizeof(void*);
7376*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
7377*4bdc9457SAndroid Build Coastguard Worker 
7378*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
7379*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
7380*4bdc9457SAndroid Build Coastguard Worker 
7381*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
7382*4bdc9457SAndroid Build Coastguard Worker 
7383*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
7384*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
7385*4bdc9457SAndroid Build Coastguard Worker 
7386*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
7387*4bdc9457SAndroid Build Coastguard Worker 
7388*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
7389*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
7390*4bdc9457SAndroid Build Coastguard Worker 
7391*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
7392*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
7393*4bdc9457SAndroid Build Coastguard Worker 
7394*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
7395*4bdc9457SAndroid Build Coastguard Worker 
7396*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
7397*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
7398*4bdc9457SAndroid Build Coastguard Worker 
7399*4bdc9457SAndroid Build Coastguard Worker     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7400*4bdc9457SAndroid Build Coastguard Worker 
7401*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packus_epi16(vacc00x01234567, vacc00x01234567);
7402*4bdc9457SAndroid Build Coastguard Worker 
7403*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
7404*4bdc9457SAndroid Build Coastguard Worker 
7405*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
7406*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
7407*4bdc9457SAndroid Build Coastguard Worker 
7408*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
7409*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
7410*4bdc9457SAndroid Build Coastguard Worker 
7411*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7412*4bdc9457SAndroid Build Coastguard Worker 
7413*4bdc9457SAndroid Build Coastguard Worker       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7414*4bdc9457SAndroid Build Coastguard Worker 
7415*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
7416*4bdc9457SAndroid Build Coastguard Worker     } else {
7417*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
7418*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
7419*4bdc9457SAndroid Build Coastguard Worker 
7420*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
7421*4bdc9457SAndroid Build Coastguard Worker 
7422*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
7423*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
7424*4bdc9457SAndroid Build Coastguard Worker       }
7425*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
7426*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
7427*4bdc9457SAndroid Build Coastguard Worker 
7428*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
7429*4bdc9457SAndroid Build Coastguard Worker 
7430*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
7431*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
7432*4bdc9457SAndroid Build Coastguard Worker       }
7433*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
7434*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
7435*4bdc9457SAndroid Build Coastguard Worker       }
7436*4bdc9457SAndroid Build Coastguard Worker 
7437*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
7438*4bdc9457SAndroid Build Coastguard Worker     }
7439*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
7440*4bdc9457SAndroid Build Coastguard Worker }
7441*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2(size_t mr,size_t nc,size_t kc,size_t ks,const uint8_t ** restrict a,const void * restrict w,uint8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7442*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2(
7443*4bdc9457SAndroid Build Coastguard Worker     size_t mr,
7444*4bdc9457SAndroid Build Coastguard Worker     size_t nc,
7445*4bdc9457SAndroid Build Coastguard Worker     size_t kc,
7446*4bdc9457SAndroid Build Coastguard Worker     size_t ks,
7447*4bdc9457SAndroid Build Coastguard Worker     const uint8_t** restrict a,
7448*4bdc9457SAndroid Build Coastguard Worker     const void* restrict w,
7449*4bdc9457SAndroid Build Coastguard Worker     uint8_t* restrict c,
7450*4bdc9457SAndroid Build Coastguard Worker     size_t cm_stride,
7451*4bdc9457SAndroid Build Coastguard Worker     size_t cn_stride,
7452*4bdc9457SAndroid Build Coastguard Worker     size_t a_offset,
7453*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* zero,
7454*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7455*4bdc9457SAndroid Build Coastguard Worker {
7456*4bdc9457SAndroid Build Coastguard Worker   assert(mr != 0);
7457*4bdc9457SAndroid Build Coastguard Worker   assert(mr <= 3);
7458*4bdc9457SAndroid Build Coastguard Worker   assert(nc != 0);
7459*4bdc9457SAndroid Build Coastguard Worker   assert(kc != 0);
7460*4bdc9457SAndroid Build Coastguard Worker   assert(ks != 0);
7461*4bdc9457SAndroid Build Coastguard Worker   assert(ks % (3 * sizeof(void*)) == 0);
7462*4bdc9457SAndroid Build Coastguard Worker   assert(a_offset % sizeof(uint8_t) == 0);
7463*4bdc9457SAndroid Build Coastguard Worker   assert(a != NULL);
7464*4bdc9457SAndroid Build Coastguard Worker   assert(w != NULL);
7465*4bdc9457SAndroid Build Coastguard Worker   assert(c != NULL);
7466*4bdc9457SAndroid Build Coastguard Worker 
7467*4bdc9457SAndroid Build Coastguard Worker   kc = round_up_po2(kc, 8);
7468*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c0 = c;
7469*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7470*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr < 2) {
7471*4bdc9457SAndroid Build Coastguard Worker     c1 = c0;
7472*4bdc9457SAndroid Build Coastguard Worker   }
7473*4bdc9457SAndroid Build Coastguard Worker   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7474*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNPREDICTABLE(mr <= 2) {
7475*4bdc9457SAndroid Build Coastguard Worker     c2 = c1;
7476*4bdc9457SAndroid Build Coastguard Worker   }
7477*4bdc9457SAndroid Build Coastguard Worker 
7478*4bdc9457SAndroid Build Coastguard Worker   do {
7479*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7480*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7481*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
7482*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7483*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7484*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
7485*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x4 = _mm_cvtsi32_si128(((const int*) w)[4]);
7486*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x5 = _mm_cvtsi32_si128(((const int*) w)[5]);
7487*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
7488*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x6 = _mm_cvtsi32_si128(((const int*) w)[6]);
7489*4bdc9457SAndroid Build Coastguard Worker     const __m128i vbias0x7 = _mm_cvtsi32_si128(((const int*) w)[7]);
7490*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
7491*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01 = vacc0x01;
7492*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x23 = vacc0x23;
7493*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x45 = vacc0x45;
7494*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x67 = vacc0x67;
7495*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01 = vacc0x01;
7496*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x23 = vacc0x23;
7497*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x45 = vacc0x45;
7498*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x67 = vacc0x67;
7499*4bdc9457SAndroid Build Coastguard Worker     w = (const int32_t*) w + 8;
7500*4bdc9457SAndroid Build Coastguard Worker 
7501*4bdc9457SAndroid Build Coastguard Worker     size_t p = ks;
7502*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
7503*4bdc9457SAndroid Build Coastguard Worker     do {
7504*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* restrict a0 = a[0];
7505*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a0 != zero) {
7506*4bdc9457SAndroid Build Coastguard Worker         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7507*4bdc9457SAndroid Build Coastguard Worker       }
7508*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* restrict a1 = a[1];
7509*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a1 != zero) {
7510*4bdc9457SAndroid Build Coastguard Worker         a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
7511*4bdc9457SAndroid Build Coastguard Worker       }
7512*4bdc9457SAndroid Build Coastguard Worker       const uint8_t* restrict a2 = a[2];
7513*4bdc9457SAndroid Build Coastguard Worker       if XNN_UNPREDICTABLE(a2 != zero) {
7514*4bdc9457SAndroid Build Coastguard Worker         a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
7515*4bdc9457SAndroid Build Coastguard Worker       }
7516*4bdc9457SAndroid Build Coastguard Worker       a += 3;
7517*4bdc9457SAndroid Build Coastguard Worker 
7518*4bdc9457SAndroid Build Coastguard Worker       size_t k = 0;
7519*4bdc9457SAndroid Build Coastguard Worker       while (k < kc) {
7520*4bdc9457SAndroid Build Coastguard Worker         const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
7521*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa0 = _mm256_cvtepu8_epi16(va0);
7522*4bdc9457SAndroid Build Coastguard Worker         a0 += 8;
7523*4bdc9457SAndroid Build Coastguard Worker         const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
7524*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa1 = _mm256_cvtepu8_epi16(va1);
7525*4bdc9457SAndroid Build Coastguard Worker         a1 += 8;
7526*4bdc9457SAndroid Build Coastguard Worker         const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
7527*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxa2 = _mm256_cvtepu8_epi16(va2);
7528*4bdc9457SAndroid Build Coastguard Worker         a2 += 8;
7529*4bdc9457SAndroid Build Coastguard Worker 
7530*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
7531*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb01 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb01), vb_zero_point);
7532*4bdc9457SAndroid Build Coastguard Worker 
7533*4bdc9457SAndroid Build Coastguard Worker         vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
7534*4bdc9457SAndroid Build Coastguard Worker         vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
7535*4bdc9457SAndroid Build Coastguard Worker         vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
7536*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
7537*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb23 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb23), vb_zero_point);
7538*4bdc9457SAndroid Build Coastguard Worker 
7539*4bdc9457SAndroid Build Coastguard Worker         vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
7540*4bdc9457SAndroid Build Coastguard Worker         vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
7541*4bdc9457SAndroid Build Coastguard Worker         vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
7542*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb45 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 32));
7543*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb45 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb45), vb_zero_point);
7544*4bdc9457SAndroid Build Coastguard Worker 
7545*4bdc9457SAndroid Build Coastguard Worker         vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
7546*4bdc9457SAndroid Build Coastguard Worker         vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
7547*4bdc9457SAndroid Build Coastguard Worker         vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
7548*4bdc9457SAndroid Build Coastguard Worker         const __m128i vb67 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 48));
7549*4bdc9457SAndroid Build Coastguard Worker         const __m256i vxb67 = _mm256_sub_epi16(_mm256_cvtepu8_epi16(vb67), vb_zero_point);
7550*4bdc9457SAndroid Build Coastguard Worker 
7551*4bdc9457SAndroid Build Coastguard Worker         vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
7552*4bdc9457SAndroid Build Coastguard Worker         vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
7553*4bdc9457SAndroid Build Coastguard Worker         vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
7554*4bdc9457SAndroid Build Coastguard Worker 
7555*4bdc9457SAndroid Build Coastguard Worker         w = (const void*) ((const uint8_t*) w + 64);
7556*4bdc9457SAndroid Build Coastguard Worker         k += 8 * sizeof(uint8_t);
7557*4bdc9457SAndroid Build Coastguard Worker       }
7558*4bdc9457SAndroid Build Coastguard Worker       p -= 3 * sizeof(void*);
7559*4bdc9457SAndroid Build Coastguard Worker     } while (p != 0);
7560*4bdc9457SAndroid Build Coastguard Worker 
7561*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
7562*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
7563*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
7564*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
7565*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
7566*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
7567*4bdc9457SAndroid Build Coastguard Worker 
7568*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
7569*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
7570*4bdc9457SAndroid Build Coastguard Worker     const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
7571*4bdc9457SAndroid Build Coastguard Worker 
7572*4bdc9457SAndroid Build Coastguard Worker     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
7573*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
7574*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
7575*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
7576*4bdc9457SAndroid Build Coastguard Worker 
7577*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
7578*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
7579*4bdc9457SAndroid Build Coastguard Worker     __m256 vscaled2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
7580*4bdc9457SAndroid Build Coastguard Worker 
7581*4bdc9457SAndroid Build Coastguard Worker     const __m256 vscale = _mm256_load_ps(params->fp32_avx2.scale);
7582*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_mul_ps(vscaled0x01234567, vscale);
7583*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_mul_ps(vscaled1x01234567, vscale);
7584*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_mul_ps(vscaled2x01234567, vscale);
7585*4bdc9457SAndroid Build Coastguard Worker 
7586*4bdc9457SAndroid Build Coastguard Worker     const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->fp32_avx2.output_max_less_zero_point);
7587*4bdc9457SAndroid Build Coastguard Worker     vscaled0x01234567 = _mm256_min_ps(vscaled0x01234567, voutput_max_less_zero_point);
7588*4bdc9457SAndroid Build Coastguard Worker     vscaled1x01234567 = _mm256_min_ps(vscaled1x01234567, voutput_max_less_zero_point);
7589*4bdc9457SAndroid Build Coastguard Worker     vscaled2x01234567 = _mm256_min_ps(vscaled2x01234567, voutput_max_less_zero_point);
7590*4bdc9457SAndroid Build Coastguard Worker 
7591*4bdc9457SAndroid Build Coastguard Worker     vacc0x01234567 = _mm256_cvtps_epi32(vscaled0x01234567);
7592*4bdc9457SAndroid Build Coastguard Worker     vacc1x01234567 = _mm256_cvtps_epi32(vscaled1x01234567);
7593*4bdc9457SAndroid Build Coastguard Worker     vacc2x01234567 = _mm256_cvtps_epi32(vscaled2x01234567);
7594*4bdc9457SAndroid Build Coastguard Worker 
7595*4bdc9457SAndroid Build Coastguard Worker     const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.output_zero_point);
7596*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
7597*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
7598*4bdc9457SAndroid Build Coastguard Worker 
7599*4bdc9457SAndroid Build Coastguard Worker     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7600*4bdc9457SAndroid Build Coastguard Worker     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
7601*4bdc9457SAndroid Build Coastguard Worker 
7602*4bdc9457SAndroid Build Coastguard Worker     __m256i vout = _mm256_packus_epi16(vacc01x01234567, vacc22x01234567);
7603*4bdc9457SAndroid Build Coastguard Worker 
7604*4bdc9457SAndroid Build Coastguard Worker     vout = _mm256_max_epu8(vout, _mm256_load_si256((const __m256i*) params->fp32_avx2.output_min));
7605*4bdc9457SAndroid Build Coastguard Worker 
7606*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_lo = _mm256_castsi256_si128(vout);
7607*4bdc9457SAndroid Build Coastguard Worker     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
7608*4bdc9457SAndroid Build Coastguard Worker 
7609*4bdc9457SAndroid Build Coastguard Worker     if (nc >= 8) {
7610*4bdc9457SAndroid Build Coastguard Worker       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
7611*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c1, vout_hi);
7612*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) c0, vout_lo);
7613*4bdc9457SAndroid Build Coastguard Worker 
7614*4bdc9457SAndroid Build Coastguard Worker       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7615*4bdc9457SAndroid Build Coastguard Worker       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7616*4bdc9457SAndroid Build Coastguard Worker       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7617*4bdc9457SAndroid Build Coastguard Worker 
7618*4bdc9457SAndroid Build Coastguard Worker       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7619*4bdc9457SAndroid Build Coastguard Worker 
7620*4bdc9457SAndroid Build Coastguard Worker       nc -= 8;
7621*4bdc9457SAndroid Build Coastguard Worker     } else {
7622*4bdc9457SAndroid Build Coastguard Worker       if (nc & 4) {
7623*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout_lo, 2));
7624*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c1, vout_hi);
7625*4bdc9457SAndroid Build Coastguard Worker         _mm_storeu_si32(c0, vout_lo);
7626*4bdc9457SAndroid Build Coastguard Worker 
7627*4bdc9457SAndroid Build Coastguard Worker         c2 += 4;
7628*4bdc9457SAndroid Build Coastguard Worker         c1 += 4;
7629*4bdc9457SAndroid Build Coastguard Worker         c0 += 4;
7630*4bdc9457SAndroid Build Coastguard Worker 
7631*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi64(vout_lo, 32);
7632*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi64(vout_hi, 32);
7633*4bdc9457SAndroid Build Coastguard Worker       }
7634*4bdc9457SAndroid Build Coastguard Worker       if (nc & 2) {
7635*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout_lo, 4));
7636*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout_hi, 0));
7637*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout_lo, 0));
7638*4bdc9457SAndroid Build Coastguard Worker 
7639*4bdc9457SAndroid Build Coastguard Worker         c2 += 2;
7640*4bdc9457SAndroid Build Coastguard Worker         c1 += 2;
7641*4bdc9457SAndroid Build Coastguard Worker         c0 += 2;
7642*4bdc9457SAndroid Build Coastguard Worker 
7643*4bdc9457SAndroid Build Coastguard Worker         vout_lo = _mm_srli_epi32(vout_lo, 16);
7644*4bdc9457SAndroid Build Coastguard Worker         vout_hi = _mm_srli_epi32(vout_hi, 16);
7645*4bdc9457SAndroid Build Coastguard Worker       }
7646*4bdc9457SAndroid Build Coastguard Worker       if (nc & 1) {
7647*4bdc9457SAndroid Build Coastguard Worker         *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
7648*4bdc9457SAndroid Build Coastguard Worker         *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
7649*4bdc9457SAndroid Build Coastguard Worker         *c0 = (uint8_t) _mm_extract_epi8(vout_lo, 0);
7650*4bdc9457SAndroid Build Coastguard Worker       }
7651*4bdc9457SAndroid Build Coastguard Worker 
7652*4bdc9457SAndroid Build Coastguard Worker       nc = 0;
7653*4bdc9457SAndroid Build Coastguard Worker     }
7654*4bdc9457SAndroid Build Coastguard Worker   } while (nc != 0);
7655*4bdc9457SAndroid Build Coastguard Worker }
7656*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(size_t n,const uint8_t * input_a,const uint8_t * input_b,uint8_t * output,const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7657*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16(
7658*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7659*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* input_a,
7660*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* input_b,
7661*4bdc9457SAndroid Build Coastguard Worker     uint8_t* output,
7662*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7663*4bdc9457SAndroid Build Coastguard Worker {
7664*4bdc9457SAndroid Build Coastguard Worker   const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
7665*4bdc9457SAndroid Build Coastguard Worker   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
7666*4bdc9457SAndroid Build Coastguard Worker   const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
7667*4bdc9457SAndroid Build Coastguard Worker   const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
7668*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
7669*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
7670*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
7671*4bdc9457SAndroid Build Coastguard Worker 
7672*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
7673*4bdc9457SAndroid Build Coastguard Worker     const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
7674*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
7675*4bdc9457SAndroid Build Coastguard Worker     const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
7676*4bdc9457SAndroid Build Coastguard Worker     const __m256i vb89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
7677*4bdc9457SAndroid Build Coastguard Worker     input_a += 16;
7678*4bdc9457SAndroid Build Coastguard Worker     input_b += 16;
7679*4bdc9457SAndroid Build Coastguard Worker 
7680*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
7681*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
7682*4bdc9457SAndroid Build Coastguard Worker 
7683*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
7684*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
7685*4bdc9457SAndroid Build Coastguard Worker 
7686*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
7687*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
7688*4bdc9457SAndroid Build Coastguard Worker 
7689*4bdc9457SAndroid Build Coastguard Worker     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
7690*4bdc9457SAndroid Build Coastguard Worker 
7691*4bdc9457SAndroid Build Coastguard Worker     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
7692*4bdc9457SAndroid Build Coastguard Worker 
7693*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
7694*4bdc9457SAndroid Build Coastguard Worker 
7695*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
7696*4bdc9457SAndroid Build Coastguard Worker 
7697*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
7698*4bdc9457SAndroid Build Coastguard Worker     output += 16;
7699*4bdc9457SAndroid Build Coastguard Worker   }
7700*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7701*4bdc9457SAndroid Build Coastguard Worker     do {
7702*4bdc9457SAndroid Build Coastguard Worker       const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
7703*4bdc9457SAndroid Build Coastguard Worker       const __m256i vb01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_b));
7704*4bdc9457SAndroid Build Coastguard Worker       input_a += 8;
7705*4bdc9457SAndroid Build Coastguard Worker       input_b += 8;
7706*4bdc9457SAndroid Build Coastguard Worker 
7707*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
7708*4bdc9457SAndroid Build Coastguard Worker 
7709*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
7710*4bdc9457SAndroid Build Coastguard Worker 
7711*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
7712*4bdc9457SAndroid Build Coastguard Worker 
7713*4bdc9457SAndroid Build Coastguard Worker       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
7714*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7715*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7716*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
7717*4bdc9457SAndroid Build Coastguard Worker 
7718*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
7719*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7720*4bdc9457SAndroid Build Coastguard Worker         output += 8;
7721*4bdc9457SAndroid Build Coastguard Worker         n -= 8 * sizeof(uint8_t);
7722*4bdc9457SAndroid Build Coastguard Worker       } else {
7723*4bdc9457SAndroid Build Coastguard Worker         if (n & (4 * sizeof(uint8_t))) {
7724*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si32(output, vout0123456701234567);
7725*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7726*4bdc9457SAndroid Build Coastguard Worker           output += 4;
7727*4bdc9457SAndroid Build Coastguard Worker         }
7728*4bdc9457SAndroid Build Coastguard Worker         if (n & (2 * sizeof(uint8_t))) {
7729*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si16(output, vout0123456701234567);
7730*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7731*4bdc9457SAndroid Build Coastguard Worker           output += 2;
7732*4bdc9457SAndroid Build Coastguard Worker         }
7733*4bdc9457SAndroid Build Coastguard Worker         if (n & (1 * sizeof(uint8_t))) {
7734*4bdc9457SAndroid Build Coastguard Worker           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7735*4bdc9457SAndroid Build Coastguard Worker         }
7736*4bdc9457SAndroid Build Coastguard Worker         n = 0;
7737*4bdc9457SAndroid Build Coastguard Worker       }
7738*4bdc9457SAndroid Build Coastguard Worker     } while (n != 0);
7739*4bdc9457SAndroid Build Coastguard Worker   }
7740*4bdc9457SAndroid Build Coastguard Worker }
7741*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(size_t n,const uint8_t * input_a,const uint8_t * input_b,uint8_t * output,const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])7742*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16(
7743*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7744*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* input_a,
7745*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* input_b,
7746*4bdc9457SAndroid Build Coastguard Worker     uint8_t* output,
7747*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7748*4bdc9457SAndroid Build Coastguard Worker {
7749*4bdc9457SAndroid Build Coastguard Worker   const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
7750*4bdc9457SAndroid Build Coastguard Worker   const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
7751*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
7752*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
7753*4bdc9457SAndroid Build Coastguard Worker   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
7754*4bdc9457SAndroid Build Coastguard Worker 
7755*4bdc9457SAndroid Build Coastguard Worker   const __m256i vbias = _mm256_add_epi32(
7756*4bdc9457SAndroid Build Coastguard Worker     _mm256_broadcastd_epi32(_mm_cvtsi32_si128(params->avx2.b_multiplier[0] * (int32_t) *input_b)),
7757*4bdc9457SAndroid Build Coastguard Worker     _mm256_load_si256((const __m256i*) params->avx2.bias));
7758*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
7759*4bdc9457SAndroid Build Coastguard Worker     const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
7760*4bdc9457SAndroid Build Coastguard Worker     const __m256i va89ABCDEF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
7761*4bdc9457SAndroid Build Coastguard Worker     input_a += 16;
7762*4bdc9457SAndroid Build Coastguard Worker 
7763*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
7764*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
7765*4bdc9457SAndroid Build Coastguard Worker 
7766*4bdc9457SAndroid Build Coastguard Worker     vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
7767*4bdc9457SAndroid Build Coastguard Worker     vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
7768*4bdc9457SAndroid Build Coastguard Worker 
7769*4bdc9457SAndroid Build Coastguard Worker     __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
7770*4bdc9457SAndroid Build Coastguard Worker 
7771*4bdc9457SAndroid Build Coastguard Worker     __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
7772*4bdc9457SAndroid Build Coastguard Worker 
7773*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
7774*4bdc9457SAndroid Build Coastguard Worker 
7775*4bdc9457SAndroid Build Coastguard Worker     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
7776*4bdc9457SAndroid Build Coastguard Worker 
7777*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
7778*4bdc9457SAndroid Build Coastguard Worker     output += 16;
7779*4bdc9457SAndroid Build Coastguard Worker   }
7780*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7781*4bdc9457SAndroid Build Coastguard Worker     do {
7782*4bdc9457SAndroid Build Coastguard Worker       const __m256i va01234567 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*) input_a));
7783*4bdc9457SAndroid Build Coastguard Worker       input_a += 8;
7784*4bdc9457SAndroid Build Coastguard Worker 
7785*4bdc9457SAndroid Build Coastguard Worker       __m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
7786*4bdc9457SAndroid Build Coastguard Worker 
7787*4bdc9457SAndroid Build Coastguard Worker       vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
7788*4bdc9457SAndroid Build Coastguard Worker 
7789*4bdc9457SAndroid Build Coastguard Worker       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
7790*4bdc9457SAndroid Build Coastguard Worker       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7791*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7792*4bdc9457SAndroid Build Coastguard Worker       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
7793*4bdc9457SAndroid Build Coastguard Worker 
7794*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
7795*4bdc9457SAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7796*4bdc9457SAndroid Build Coastguard Worker         output += 8;
7797*4bdc9457SAndroid Build Coastguard Worker         n -= 8 * sizeof(uint8_t);
7798*4bdc9457SAndroid Build Coastguard Worker       } else {
7799*4bdc9457SAndroid Build Coastguard Worker         if (n & (4 * sizeof(uint8_t))) {
7800*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si32(output, vout0123456701234567);
7801*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7802*4bdc9457SAndroid Build Coastguard Worker           output += 4;
7803*4bdc9457SAndroid Build Coastguard Worker         }
7804*4bdc9457SAndroid Build Coastguard Worker         if (n & (2 * sizeof(uint8_t))) {
7805*4bdc9457SAndroid Build Coastguard Worker           _mm_storeu_si16(output, vout0123456701234567);
7806*4bdc9457SAndroid Build Coastguard Worker           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7807*4bdc9457SAndroid Build Coastguard Worker           output += 2;
7808*4bdc9457SAndroid Build Coastguard Worker         }
7809*4bdc9457SAndroid Build Coastguard Worker         if (n & (1 * sizeof(uint8_t))) {
7810*4bdc9457SAndroid Build Coastguard Worker           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7811*4bdc9457SAndroid Build Coastguard Worker         }
7812*4bdc9457SAndroid Build Coastguard Worker         n = 0;
7813*4bdc9457SAndroid Build Coastguard Worker       }
7814*4bdc9457SAndroid Build Coastguard Worker     } while (n != 0);
7815*4bdc9457SAndroid Build Coastguard Worker   }
7816*4bdc9457SAndroid Build Coastguard Worker }
7817*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_vcvt_ukernel__avx2_x32(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])7818*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vcvt_ukernel__avx2_x32(
7819*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7820*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* x,
7821*4bdc9457SAndroid Build Coastguard Worker     uint8_t* y,
7822*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7823*4bdc9457SAndroid Build Coastguard Worker {
7824*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7825*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(uint8_t) == 0);
7826*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
7827*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7828*4bdc9457SAndroid Build Coastguard Worker 
7829*4bdc9457SAndroid Build Coastguard Worker   const __m256i vinput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.input_zero_point);
7830*4bdc9457SAndroid Build Coastguard Worker   const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
7831*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
7832*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
7833*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7834*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) (x + 16)));
7835*4bdc9457SAndroid Build Coastguard Worker     x += 32;
7836*4bdc9457SAndroid Build Coastguard Worker 
7837*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_sub_epi16(vinput_zero_point, vacc0);
7838*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_sub_epi16(vinput_zero_point, vacc1);
7839*4bdc9457SAndroid Build Coastguard Worker 
7840*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_slli_epi16(vacc0, 7);
7841*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_slli_epi16(vacc1, 7);
7842*4bdc9457SAndroid Build Coastguard Worker 
7843*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_mulhrs_epi16(vacc0, vmultiplier);
7844*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_mulhrs_epi16(vacc1, vmultiplier);
7845*4bdc9457SAndroid Build Coastguard Worker 
7846*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_adds_epi16(vacc0, voutput_zero_point);
7847*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_adds_epi16(vacc1, voutput_zero_point);
7848*4bdc9457SAndroid Build Coastguard Worker 
7849*4bdc9457SAndroid Build Coastguard Worker     __m256i vy0 = _mm256_packus_epi16(vacc0, vacc1);
7850*4bdc9457SAndroid Build Coastguard Worker 
7851*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_permute4x64_epi64(vy0, _MM_SHUFFLE(3, 1, 2, 0));
7852*4bdc9457SAndroid Build Coastguard Worker 
7853*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy0);
7854*4bdc9457SAndroid Build Coastguard Worker     y += 32;
7855*4bdc9457SAndroid Build Coastguard Worker   }
7856*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
7857*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7858*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
7859*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
7860*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
7861*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
7862*4bdc9457SAndroid Build Coastguard Worker     x += 16;
7863*4bdc9457SAndroid Build Coastguard Worker 
7864*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
7865*4bdc9457SAndroid Build Coastguard Worker     const __m128i vy = _mm_packus_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
7866*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) y, vy);
7867*4bdc9457SAndroid Build Coastguard Worker     y += 16;
7868*4bdc9457SAndroid Build Coastguard Worker   }
7869*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7870*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(uint8_t));
7871*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 15 * sizeof(uint8_t));
7872*4bdc9457SAndroid Build Coastguard Worker 
7873*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7874*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
7875*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
7876*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
7877*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
7878*4bdc9457SAndroid Build Coastguard Worker 
7879*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
7880*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packus_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
7881*4bdc9457SAndroid Build Coastguard Worker     if (n & (8 * sizeof(uint8_t))) {
7882*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) y, vy);
7883*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_unpackhi_epi64(vy, vy);
7884*4bdc9457SAndroid Build Coastguard Worker       y += 8;
7885*4bdc9457SAndroid Build Coastguard Worker     }
7886*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(uint8_t))) {
7887*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
7888*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
7889*4bdc9457SAndroid Build Coastguard Worker       y += 4;
7890*4bdc9457SAndroid Build Coastguard Worker     }
7891*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(uint8_t))) {
7892*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
7893*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
7894*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7895*4bdc9457SAndroid Build Coastguard Worker     }
7896*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(uint8_t))) {
7897*4bdc9457SAndroid Build Coastguard Worker       *y = (uint8_t) _mm_extract_epi8(vy, 0);
7898*4bdc9457SAndroid Build Coastguard Worker     }
7899*4bdc9457SAndroid Build Coastguard Worker   }
7900*4bdc9457SAndroid Build Coastguard Worker }
7901*4bdc9457SAndroid Build Coastguard Worker 
xnn_qu8_vlrelu_ukernel__avx2_x32(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])7902*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vlrelu_ukernel__avx2_x32(
7903*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7904*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* x,
7905*4bdc9457SAndroid Build Coastguard Worker     uint8_t* y,
7906*4bdc9457SAndroid Build Coastguard Worker     const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7907*4bdc9457SAndroid Build Coastguard Worker {
7908*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
7909*4bdc9457SAndroid Build Coastguard Worker   assert(n % sizeof(uint8_t) == 0);
7910*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
7911*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
7912*4bdc9457SAndroid Build Coastguard Worker 
7913*4bdc9457SAndroid Build Coastguard Worker   const __m256i vinput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.input_zero_point);
7914*4bdc9457SAndroid Build Coastguard Worker   const __m256i vpositive_multiplier = _mm256_load_si256((const __m256i*) params->avx2.positive_multiplier);
7915*4bdc9457SAndroid Build Coastguard Worker   const __m256i vnegative_multiplier = _mm256_load_si256((const __m256i*) params->avx2.negative_multiplier);
7916*4bdc9457SAndroid Build Coastguard Worker   const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
7917*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
7918*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7919*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) (x + 16)));
7920*4bdc9457SAndroid Build Coastguard Worker     x += 32;
7921*4bdc9457SAndroid Build Coastguard Worker 
7922*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier0 = _mm256_cmpgt_epi16(vacc0, vinput_zero_point);
7923*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_sub_epi16(vinput_zero_point, vacc0);
7924*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier1 = _mm256_cmpgt_epi16(vacc1, vinput_zero_point);
7925*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_sub_epi16(vinput_zero_point, vacc1);
7926*4bdc9457SAndroid Build Coastguard Worker 
7927*4bdc9457SAndroid Build Coastguard Worker     vmultiplier0 = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
7928*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_slli_epi16(vacc0, 7);
7929*4bdc9457SAndroid Build Coastguard Worker     vmultiplier1 = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
7930*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_slli_epi16(vacc1, 7);
7931*4bdc9457SAndroid Build Coastguard Worker 
7932*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_mulhrs_epi16(vacc0, vmultiplier0);
7933*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_mulhrs_epi16(vacc1, vmultiplier1);
7934*4bdc9457SAndroid Build Coastguard Worker 
7935*4bdc9457SAndroid Build Coastguard Worker     vacc0 = _mm256_adds_epi16(vacc0, voutput_zero_point);
7936*4bdc9457SAndroid Build Coastguard Worker     vacc1 = _mm256_adds_epi16(vacc1, voutput_zero_point);
7937*4bdc9457SAndroid Build Coastguard Worker 
7938*4bdc9457SAndroid Build Coastguard Worker     __m256i vy0 = _mm256_packus_epi16(vacc0, vacc1);
7939*4bdc9457SAndroid Build Coastguard Worker 
7940*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_permute4x64_epi64(vy0, _MM_SHUFFLE(3, 1, 2, 0));
7941*4bdc9457SAndroid Build Coastguard Worker 
7942*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy0);
7943*4bdc9457SAndroid Build Coastguard Worker     y += 32;
7944*4bdc9457SAndroid Build Coastguard Worker   }
7945*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
7946*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7947*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier = _mm256_cmpgt_epi16(vacc, vinput_zero_point);
7948*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
7949*4bdc9457SAndroid Build Coastguard Worker     vmultiplier = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
7950*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
7951*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
7952*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
7953*4bdc9457SAndroid Build Coastguard Worker     x += 16;
7954*4bdc9457SAndroid Build Coastguard Worker 
7955*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
7956*4bdc9457SAndroid Build Coastguard Worker     const __m128i vy = _mm_packus_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
7957*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) y, vy);
7958*4bdc9457SAndroid Build Coastguard Worker     y += 16;
7959*4bdc9457SAndroid Build Coastguard Worker   }
7960*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
7961*4bdc9457SAndroid Build Coastguard Worker     assert(n >= 1 * sizeof(uint8_t));
7962*4bdc9457SAndroid Build Coastguard Worker     assert(n <= 15 * sizeof(uint8_t));
7963*4bdc9457SAndroid Build Coastguard Worker 
7964*4bdc9457SAndroid Build Coastguard Worker     __m256i vacc = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*) x));
7965*4bdc9457SAndroid Build Coastguard Worker     __m256i vmultiplier = _mm256_cmpgt_epi16(vacc, vinput_zero_point);
7966*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_sub_epi16(vinput_zero_point, vacc);
7967*4bdc9457SAndroid Build Coastguard Worker     vmultiplier = _mm256_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
7968*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_slli_epi16(vacc, 7);
7969*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_mulhrs_epi16(vacc, vmultiplier);
7970*4bdc9457SAndroid Build Coastguard Worker     vacc = _mm256_adds_epi16(vacc, voutput_zero_point);
7971*4bdc9457SAndroid Build Coastguard Worker 
7972*4bdc9457SAndroid Build Coastguard Worker     const __m128i vacc_hi = _mm256_extracti128_si256(vacc, 1);
7973*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_packus_epi16(_mm256_castsi256_si128(vacc), vacc_hi);
7974*4bdc9457SAndroid Build Coastguard Worker     if (n & (8 * sizeof(uint8_t))) {
7975*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) y, vy);
7976*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_unpackhi_epi64(vy, vy);
7977*4bdc9457SAndroid Build Coastguard Worker       y += 8;
7978*4bdc9457SAndroid Build Coastguard Worker     }
7979*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(uint8_t))) {
7980*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
7981*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
7982*4bdc9457SAndroid Build Coastguard Worker       y += 4;
7983*4bdc9457SAndroid Build Coastguard Worker     }
7984*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(uint8_t))) {
7985*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
7986*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
7987*4bdc9457SAndroid Build Coastguard Worker       y += 2;
7988*4bdc9457SAndroid Build Coastguard Worker     }
7989*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(uint8_t))) {
7990*4bdc9457SAndroid Build Coastguard Worker       *y = (uint8_t) _mm_extract_epi8(vy, 0);
7991*4bdc9457SAndroid Build Coastguard Worker     }
7992*4bdc9457SAndroid Build Coastguard Worker   }
7993*4bdc9457SAndroid Build Coastguard Worker }
7994*4bdc9457SAndroid Build Coastguard Worker 
xnn_x8_lut_ukernel__avx2_x128(size_t n,const uint8_t * x,uint8_t * y,const uint8_t t[restrict XNN_MIN_ELEMENTS (256)])7995*4bdc9457SAndroid Build Coastguard Worker void xnn_x8_lut_ukernel__avx2_x128(
7996*4bdc9457SAndroid Build Coastguard Worker     size_t n,
7997*4bdc9457SAndroid Build Coastguard Worker     const uint8_t* x,
7998*4bdc9457SAndroid Build Coastguard Worker     uint8_t* y,
7999*4bdc9457SAndroid Build Coastguard Worker     const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
8000*4bdc9457SAndroid Build Coastguard Worker {
8001*4bdc9457SAndroid Build Coastguard Worker   assert(n != 0);
8002*4bdc9457SAndroid Build Coastguard Worker   assert(x != NULL);
8003*4bdc9457SAndroid Build Coastguard Worker   assert(y != NULL);
8004*4bdc9457SAndroid Build Coastguard Worker 
8005*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt0 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) t));
8006*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 16)));
8007*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 32)));
8008*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt3 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 48)));
8009*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt4 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 64)));
8010*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt5 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 80)));
8011*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt6 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 96)));
8012*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt7 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 112)));
8013*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt8 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 128)));
8014*4bdc9457SAndroid Build Coastguard Worker   const __m256i vt9 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 144)));
8015*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtA = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 160)));
8016*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtB = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 176)));
8017*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtC = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 192)));
8018*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtD = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 208)));
8019*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtE = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 224)));
8020*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtF = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) (t + 240)));
8021*4bdc9457SAndroid Build Coastguard Worker 
8022*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable0 = vt0;
8023*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable1 = _mm256_xor_si256(vt0, vt1);
8024*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable2 = _mm256_xor_si256(vt1, vt2);
8025*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable3 = _mm256_xor_si256(vt2, vt3);
8026*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable4 = _mm256_xor_si256(vt3, vt4);
8027*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable5 = _mm256_xor_si256(vt4, vt5);
8028*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable6 = _mm256_xor_si256(vt5, vt6);
8029*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable7 = _mm256_xor_si256(vt6, vt7);
8030*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable8 = _mm256_xor_si256(_mm256_xor_si256(vt7, vt8), vtable0);
8031*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtable9 = _mm256_xor_si256(_mm256_xor_si256(vt8, vt9), vtable1);
8032*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableA = _mm256_xor_si256(_mm256_xor_si256(vt9, vtA), vtable2);
8033*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableB = _mm256_xor_si256(_mm256_xor_si256(vtA, vtB), vtable3);
8034*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableC = _mm256_xor_si256(_mm256_xor_si256(vtB, vtC), vtable4);
8035*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableD = _mm256_xor_si256(_mm256_xor_si256(vtC, vtD), vtable5);
8036*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableE = _mm256_xor_si256(_mm256_xor_si256(vtD, vtE), vtable6);
8037*4bdc9457SAndroid Build Coastguard Worker   const __m256i vtableF = _mm256_xor_si256(_mm256_xor_si256(vtE, vtF), vtable7);
8038*4bdc9457SAndroid Build Coastguard Worker 
8039*4bdc9457SAndroid Build Coastguard Worker   const __m256i voffset = _mm256_set1_epi8(16);
8040*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 128 * sizeof(uint8_t); n -= 128 * sizeof(uint8_t)) {
8041*4bdc9457SAndroid Build Coastguard Worker     __m256i vx0 = _mm256_loadu_si256((const __m256i*) x);
8042*4bdc9457SAndroid Build Coastguard Worker     __m256i vx1 = _mm256_loadu_si256((const __m256i*) (x + 32));
8043*4bdc9457SAndroid Build Coastguard Worker     __m256i vx2 = _mm256_loadu_si256((const __m256i*) (x + 64));
8044*4bdc9457SAndroid Build Coastguard Worker     __m256i vx3 = _mm256_loadu_si256((const __m256i*) (x + 96));
8045*4bdc9457SAndroid Build Coastguard Worker     x += 128;
8046*4bdc9457SAndroid Build Coastguard Worker 
8047*4bdc9457SAndroid Build Coastguard Worker     __m256i vy0 = _mm256_shuffle_epi8(vtable0, vx0);
8048*4bdc9457SAndroid Build Coastguard Worker     __m256i vy1 = _mm256_shuffle_epi8(vtable0, vx1);
8049*4bdc9457SAndroid Build Coastguard Worker     __m256i vy2 = _mm256_shuffle_epi8(vtable0, vx2);
8050*4bdc9457SAndroid Build Coastguard Worker     __m256i vy3 = _mm256_shuffle_epi8(vtable0, vx3);
8051*4bdc9457SAndroid Build Coastguard Worker 
8052*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8053*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8054*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8055*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8056*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable1, vx0));
8057*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable1, vx1));
8058*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable1, vx2));
8059*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable1, vx3));
8060*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8061*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8062*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8063*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8064*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable2, vx0));
8065*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable2, vx1));
8066*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable2, vx2));
8067*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable2, vx3));
8068*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8069*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8070*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8071*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8072*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable3, vx0));
8073*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable3, vx1));
8074*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable3, vx2));
8075*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable3, vx3));
8076*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8077*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8078*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8079*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8080*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable4, vx0));
8081*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable4, vx1));
8082*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable4, vx2));
8083*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable4, vx3));
8084*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8085*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8086*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8087*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8088*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable5, vx0));
8089*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable5, vx1));
8090*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable5, vx2));
8091*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable5, vx3));
8092*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8093*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8094*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8095*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8096*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable6, vx0));
8097*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable6, vx1));
8098*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable6, vx2));
8099*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable6, vx3));
8100*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8101*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8102*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8103*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8104*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable7, vx0));
8105*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable7, vx1));
8106*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable7, vx2));
8107*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable7, vx3));
8108*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_sub_epi8(vx0, voffset);
8109*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_sub_epi8(vx1, voffset);
8110*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_sub_epi8(vx2, voffset);
8111*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_sub_epi8(vx3, voffset);
8112*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable8, vx0));
8113*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable8, vx1));
8114*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable8, vx2));
8115*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable8, vx3));
8116*4bdc9457SAndroid Build Coastguard Worker 
8117*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8118*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8119*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8120*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8121*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtable9, vx0));
8122*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtable9, vx1));
8123*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtable9, vx2));
8124*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtable9, vx3));
8125*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8126*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8127*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8128*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8129*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableA, vx0));
8130*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableA, vx1));
8131*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableA, vx2));
8132*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableA, vx3));
8133*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8134*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8135*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8136*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8137*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableB, vx0));
8138*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableB, vx1));
8139*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableB, vx2));
8140*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableB, vx3));
8141*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8142*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8143*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8144*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8145*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableC, vx0));
8146*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableC, vx1));
8147*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableC, vx2));
8148*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableC, vx3));
8149*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8150*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8151*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8152*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8153*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableD, vx0));
8154*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableD, vx1));
8155*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableD, vx2));
8156*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableD, vx3));
8157*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8158*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8159*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8160*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8161*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableE, vx0));
8162*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableE, vx1));
8163*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableE, vx2));
8164*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableE, vx3));
8165*4bdc9457SAndroid Build Coastguard Worker     vx0 = _mm256_subs_epi8(vx0, voffset);
8166*4bdc9457SAndroid Build Coastguard Worker     vx1 = _mm256_subs_epi8(vx1, voffset);
8167*4bdc9457SAndroid Build Coastguard Worker     vx2 = _mm256_subs_epi8(vx2, voffset);
8168*4bdc9457SAndroid Build Coastguard Worker     vx3 = _mm256_subs_epi8(vx3, voffset);
8169*4bdc9457SAndroid Build Coastguard Worker     vy0 = _mm256_xor_si256(vy0, _mm256_shuffle_epi8(vtableF, vx0));
8170*4bdc9457SAndroid Build Coastguard Worker     vy1 = _mm256_xor_si256(vy1, _mm256_shuffle_epi8(vtableF, vx1));
8171*4bdc9457SAndroid Build Coastguard Worker     vy2 = _mm256_xor_si256(vy2, _mm256_shuffle_epi8(vtableF, vx2));
8172*4bdc9457SAndroid Build Coastguard Worker     vy3 = _mm256_xor_si256(vy3, _mm256_shuffle_epi8(vtableF, vx3));
8173*4bdc9457SAndroid Build Coastguard Worker 
8174*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) y, vy0);
8175*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) (y + 32), vy1);
8176*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) (y + 64), vy2);
8177*4bdc9457SAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i*) (y + 96), vy3);
8178*4bdc9457SAndroid Build Coastguard Worker     y += 128;
8179*4bdc9457SAndroid Build Coastguard Worker   }
8180*4bdc9457SAndroid Build Coastguard Worker   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
8181*4bdc9457SAndroid Build Coastguard Worker     __m128i vx = _mm_loadu_si128((const __m128i*) x);
8182*4bdc9457SAndroid Build Coastguard Worker     x += 16;
8183*4bdc9457SAndroid Build Coastguard Worker 
8184*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
8185*4bdc9457SAndroid Build Coastguard Worker 
8186*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8187*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
8188*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8189*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
8190*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8191*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
8192*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8193*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
8194*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8195*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
8196*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8197*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
8198*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8199*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
8200*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8201*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
8202*4bdc9457SAndroid Build Coastguard Worker 
8203*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8204*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
8205*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8206*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
8207*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8208*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
8209*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8210*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
8211*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8212*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
8213*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8214*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
8215*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8216*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
8217*4bdc9457SAndroid Build Coastguard Worker 
8218*4bdc9457SAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i*) y, vy);
8219*4bdc9457SAndroid Build Coastguard Worker     y += 16;
8220*4bdc9457SAndroid Build Coastguard Worker   }
8221*4bdc9457SAndroid Build Coastguard Worker   if XNN_UNLIKELY(n != 0) {
8222*4bdc9457SAndroid Build Coastguard Worker     __m128i vx = _mm_loadu_si128((const __m128i*) x);
8223*4bdc9457SAndroid Build Coastguard Worker 
8224*4bdc9457SAndroid Build Coastguard Worker     __m128i vy = _mm_shuffle_epi8(_mm256_castsi256_si128(vtable0), vx);
8225*4bdc9457SAndroid Build Coastguard Worker 
8226*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8227*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable1), vx));
8228*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8229*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable2), vx));
8230*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8231*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable3), vx));
8232*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8233*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable4), vx));
8234*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8235*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable5), vx));
8236*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8237*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable6), vx));
8238*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8239*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable7), vx));
8240*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_sub_epi8(vx, _mm256_castsi256_si128(voffset));
8241*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable8), vx));
8242*4bdc9457SAndroid Build Coastguard Worker 
8243*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8244*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtable9), vx));
8245*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8246*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableA), vx));
8247*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8248*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableB), vx));
8249*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8250*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableC), vx));
8251*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8252*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableD), vx));
8253*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8254*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableE), vx));
8255*4bdc9457SAndroid Build Coastguard Worker     vx = _mm_subs_epi8(vx, _mm256_castsi256_si128(voffset));
8256*4bdc9457SAndroid Build Coastguard Worker     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(_mm256_castsi256_si128(vtableF), vx));
8257*4bdc9457SAndroid Build Coastguard Worker 
8258*4bdc9457SAndroid Build Coastguard Worker     if (n & (8 * sizeof(uint8_t))) {
8259*4bdc9457SAndroid Build Coastguard Worker       _mm_storel_epi64((__m128i*) y, vy);
8260*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_unpackhi_epi64(vy, vy);
8261*4bdc9457SAndroid Build Coastguard Worker       y += 8;
8262*4bdc9457SAndroid Build Coastguard Worker     }
8263*4bdc9457SAndroid Build Coastguard Worker     if (n & (4 * sizeof(uint8_t))) {
8264*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si32(y, vy);
8265*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi64(vy, 32);
8266*4bdc9457SAndroid Build Coastguard Worker       y += 4;
8267*4bdc9457SAndroid Build Coastguard Worker     }
8268*4bdc9457SAndroid Build Coastguard Worker     if (n & (2 * sizeof(uint8_t))) {
8269*4bdc9457SAndroid Build Coastguard Worker       _mm_storeu_si16(y, vy);
8270*4bdc9457SAndroid Build Coastguard Worker       vy = _mm_srli_epi32(vy, 16);
8271*4bdc9457SAndroid Build Coastguard Worker       y += 2;
8272*4bdc9457SAndroid Build Coastguard Worker     }
8273*4bdc9457SAndroid Build Coastguard Worker     if (n & (1 * sizeof(uint8_t))) {
8274*4bdc9457SAndroid Build Coastguard Worker       *y = (uint8_t) _mm_extract_epi8(vy, 0);
8275*4bdc9457SAndroid Build Coastguard Worker     }
8276*4bdc9457SAndroid Build Coastguard Worker   }
8277*4bdc9457SAndroid Build Coastguard Worker }
8278