// Auto-generated file. Do not edit!
// Template: src/qs8-gavgpool/multipass-sse2.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>


void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24(
    size_t rows,
    size_t channels,
    const int8_t* input,
    size_t input_stride,
    const int8_t* zero,
    int32_t* buffer,
    int8_t* output,
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

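  // Set up pointers to the leading 7 input rows. Each pass advances the row
  // pointers by channels rounded up to the 8-element tile, so input_increment
  // moves them from the end of one 7-row group to the start of the next.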
  const int8_t* i0 = input;
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);

  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
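  // First pass: sum the first 7 rows, 24 channels per iteration. int8 inputs
  // are sign-extended to int16 with an unpack + arithmetic right shift (SSE2
  // lacks pmovsxbw); 7 rows of int8 sums cannot overflow int16.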
  for (; c >= 24; c -= 24) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
    const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
    i0 += 24;

    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
    const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
    i1 += 24;

    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
    const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
    i2 += 24;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
    const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
    i3 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
    const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
    i4 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
    const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
    i5 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
    const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

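    // Widen the int16 sums to int32: build the sign mask with a compare
    // against zero and interleave it in (SSE2 lacks pmovsxwd), then seed with
    // init_bias and spill to the scratch buffer.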
    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
    __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
    vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
    vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
    vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);

    _mm_store_si128((__m128i*) b, vacc0123);
    _mm_store_si128((__m128i*) (b + 4), vacc4567);
    _mm_store_si128((__m128i*) (b + 8), vacc89AB);
    _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
    _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
    _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
    b += 24;
  }
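  // Remainder of the first pass: 8 channels per iteration. The last group may
  // read past the requested channel count (the kernel is annotated
  // XNN_OOB_READS); doz() clamps the channel counter at zero.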
  if XNN_UNLIKELY(c != 0) {
    do {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      b += 8;

      c = doz(c, 8);
    } while (c != 0);
  }

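  // Intermediate passes: each iteration consumes another 7 rows and adds their
  // sums to the int32 accumulators already in the scratch buffer.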
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
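    // Same 7-row accumulation as the first pass, except the widened sums are
    // added to the buffered totals instead of init_bias.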
    for (; c >= 24; c -= 24) {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
      const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
      i0 += 24;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
      const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
      const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
      const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
      i1 += 24;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
      const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
      const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
      const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
      i2 += 24;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
      const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
      const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
      __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
      const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
      const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
      i3 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
      const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
      const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
      const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
      const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
      i4 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
      const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
      const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
      const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
      const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
      i5 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
      const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
      const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
      const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
      const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
      i6 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
      const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
      const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
      const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
      __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
      __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
      const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
      __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
      __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
      vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
      vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
      vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      _mm_store_si128((__m128i*) (b + 8), vacc89AB);
      _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
      _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
      _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
      b += 24;
    }
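    // Remainder channels, 8 at a time, accumulated onto the buffer.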
    if XNN_UNLIKELY(c != 0) {
      do {

        const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
        i0 += 8;

        const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
        i1 += 8;

        const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
        const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
        i2 += 8;

        const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
        const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
        i3 += 8;

        __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
        const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
        const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
        i4 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
        const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
        const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
        i5 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
        const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
        const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
        i6 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
        const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
        const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

        const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
        __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
        __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

        vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
        vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));

        _mm_store_si128((__m128i*) b, vacc0123);
        _mm_store_si128((__m128i*) (b + 4), vacc4567);
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

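  // Final pass: between 1 and 7 rows remain. Row pointers past the remaining
  // rows are redirected to the zero vector so they contribute nothing.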
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

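  // Requantization parameters pre-packed in params->fp32_sse2: fp32 scale, the
  // upper clamp bound (expressed relative to the zero point), the output zero
  // point, and the lower clamp bound.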
  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
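  // Final-pass main loop: accumulate the last rows, add the buffered sums,
  // then requantize and store 24 int8 outputs per iteration.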
  for (; channels >= 24; channels -= 24) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
    const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
    i0 += 24;

    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
    const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
    i1 += 24;

    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
    const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
    i2 += 24;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
    const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
    i3 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
    const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
    i4 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
    const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
    i5 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
    const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
    const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
    __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);

    vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
    vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
    vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
    vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
    vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
    buffer += 24;

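    // fp32 requantization: convert to float, scale, clamp from above, and
    // convert back to int32 (_mm_cvtps_epi32 rounds to nearest-even under the
    // default MXCSR rounding mode).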
    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
    __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
    __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
    __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
    __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);

    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
    vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
    vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
    vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
    vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);

    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
    vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
    vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
    vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
    vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);

    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
    vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
    vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
    vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
    vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);

    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
    __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);

    vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
    vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
    voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);

    __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
    __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);


    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
    _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
    output += 24;
  }
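  // Remainder: requantize the last channels in groups of 8; the final partial
  // group is stored 4, 2, and 1 bytes at a time.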
  if XNN_UNLIKELY(channels != 0) {
    do {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);

      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);

      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);

      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
      vacc4567 = _mm_cvtps_epi32(vfpacc4567);

      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);

      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);

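      // Full groups of 8 are stored directly; for the final partial group the
      // low lanes are written out in 4-, 2-, and 1-byte pieces.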
      if XNN_LIKELY(channels >= 8) {
        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
        output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
          output += 4;
        }
        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
        if (channels & 2) {
          unaligned_store_u16(output, (uint16_t) vout0123);
          vout0123 >>= 16;
          output += 2;
        }
        if (channels & 1) {
          *output = (int8_t) vout0123;
          output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}