xref: /aosp_15_r20/external/XNNPACK/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/qs8-gavgpool/multipass-sse2.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <emmintrin.h>
13 
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/unaligned.h>
17 
18 
// QS8 global-average-pooling microkernel, multipass ("7p7x") variant, SSE2,
// processing 24 channels per main-loop iteration.
//
// The reduction over `rows` input rows is split into passes of 7 rows each:
//   - the first pass sums rows 0..6 into the int32 scratch `buffer`, seeded
//     with the init bias from `params`;
//   - each middle pass accumulates 7 more rows into `buffer`;
//   - the last pass (at most 7 rows remain) adds the final rows, applies the
//     fp32 requantization scale, clamps, and writes int8 `output`.
//
// Arguments:
//   rows         - number of input rows to reduce (asserted > 7: the
//                  single-pass "7x" kernel handles rows <= 7)
//   channels     - number of channels (innermost dimension)
//   input        - pointer to the first input row
//   input_stride - byte stride between consecutive input rows
//   zero         - zero-filled row substituted for out-of-range row pointers
//                  in the final partial pass
//   buffer       - int32 scratch accumulator; sized for
//                  round_up_po2(channels, 8) elements, 16-byte aligned
//   output       - int8 output, one value per channel
//   params       - quantization parameters (init bias, fp32 scale, output
//                  zero point, output min/max)
//
// NOTE(review): XNN_OOB_READS marks that the kernel may read up to 7 bytes
// past the last valid channel; the channel remainder is processed in groups
// of 8 and the extra lanes are never stored to `output`.
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24(
    size_t rows,
    size_t channels,
    const int8_t* input,
    size_t input_stride,
    const int8_t* zero,
    int32_t* buffer,
    int8_t* output,
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  // Seven row pointers for one pass.
  const int8_t* i0 = input;
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
  // After a pass each pointer has advanced by round_up_po2(channels, 8)
  // bytes; this increment moves it 7 rows down for the next pass.
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);

  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  // First pass: sum rows 0..6 into the scratch buffer, 24 channels at a time.
  for (; c >= 24; c -= 24) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
    const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
    i0 += 24;

    // Sign-extend int8 -> int16: duplicate each byte into both halves of a
    // 16-bit lane, then arithmetic-shift right by 8 (SSE2 has no pmovsxbw).
    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
    const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
    i1 += 24;

    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
    const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
    i2 += 24;

    // Accumulate in int16: 7 rows of int8 cannot overflow (7 * 127 = 889).
    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
    const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
    i3 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
    const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
    i4 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
    const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
    i5 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
    const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

    // Sign-extend int16 -> int32 by interleaving each accumulator with its
    // sign mask (0xFFFF where negative).
    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
    __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

    // Seed the scratch buffer with init_bias on the first pass.
    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
    vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
    vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
    vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);

    _mm_store_si128((__m128i*) b, vacc0123);
    _mm_store_si128((__m128i*) (b + 4), vacc4567);
    _mm_store_si128((__m128i*) (b + 8), vacc89AB);
    _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
    _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
    _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
    b += 24;
  }
  // Channel remainder of the first pass, in groups of 8 (may read/buffer up
  // to 7 lanes past `channels`; covered by XNN_OOB_READS and buffer sizing).
  if XNN_UNLIKELY(c != 0) {
    do {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      b += 8;

      // doz = difference-or-zero (saturating subtract), so c reaches 0
      // even when the remainder is not a multiple of 8.
      c = doz(c, 8);
    } while (c != 0);
  }

  // Middle passes: accumulate 7 more rows into the scratch buffer per pass,
  // until at most 7 rows remain for the final pass.
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
      const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
      i0 += 24;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
      const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
      const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
      const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
      i1 += 24;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
      const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
      const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
      const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
      i2 += 24;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
      const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
      const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
      __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
      const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
      const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
      i3 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
      const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
      const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
      const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
      const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
      i4 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
      const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
      const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
      const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
      const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
      i5 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
      const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
      const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
      const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
      const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
      i6 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
      const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
      const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
      const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
      __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
      __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
      const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
      __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
      __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

      // Middle passes accumulate into the existing buffer contents
      // (first pass already added init_bias).
      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
      vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
      vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
      vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      _mm_store_si128((__m128i*) (b + 8), vacc89AB);
      _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
      _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
      _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
      b += 24;
    }
    if XNN_UNLIKELY(c != 0) {
      do {

        const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
        i0 += 8;

        const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
        i1 += 8;

        const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
        const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
        i2 += 8;

        const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
        const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
        i3 += 8;

        __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
        const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
        const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
        i4 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
        const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
        const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
        i5 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
        const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
        const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
        i6 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
        const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
        const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

        const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
        __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
        __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

        vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
        vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));

        _mm_store_si128((__m128i*) b, vacc0123);
        _mm_store_si128((__m128i*) (b + 4), vacc4567);
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

  // Last pass: 1..7 rows remain.  Redirect row pointers past the end to the
  // zero row so the 7-row accumulation below needs no branches.
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
  // Requantization pipeline: int32 total -> float, multiply by scale, clamp
  // above in float, round-convert back to int32, pack to int16 adding the
  // output zero point with saturation, clamp below, pack to int8.
  for (; channels >= 24; channels -= 24) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
    const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
    i0 += 24;

    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
    const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
    i1 += 24;

    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
    const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
    i2 += 24;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
    const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
    i3 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
    const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
    i4 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
    const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
    i5 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
    const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
    __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

    vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
    vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
    vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
    vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
    vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
    buffer += 24;

    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
    __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
    __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
    __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
    __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);

    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
    vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
    vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
    vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
    vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);

    // Upper clamp happens in float (before adding the zero point) so the
    // int32 -> int16 saturating pack cannot overshoot output_max.
    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
    vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
    vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
    vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
    vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);

    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
    vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
    vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
    vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
    vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);

    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
    __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);

    vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
    vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
    voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);

    __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
    __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);


    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
    _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
    output += 24;
  }
  // Output tail: groups of 8, then sub-8 stores of 4/2/1 bytes.
  if XNN_UNLIKELY(channels != 0) {
    do {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);

      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);

      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);

      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
      vacc4567 = _mm_cvtps_epi32(vfpacc4567);

      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);

      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);

      if XNN_LIKELY(channels >= 8) {
        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
        output += 8;
        channels -= 8;
      } else {
        // Fewer than 8 channels left: store 4, 2, and/or 1 bytes, shifting
        // consumed lanes out of the vector between stores.
        if (channels & 4) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
          output += 4;
        }
        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
        if (channels & 2) {
          unaligned_store_u16(output, (uint16_t) vout0123);
          vout0123 >>= 16;
          output += 2;
        }
        if (channels & 1) {
          *output = (int8_t) vout0123;
          output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}
631 }
632