// Auto-generated file. Do not edit!
// Template: src/f16-gavgpool/multipass-f16c.c.in
// Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <immintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>


xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24(size_t rows,size_t channels,const void * input,size_t input_stride,const void * zero,void * buffer,void * output,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])19 void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24(
20 size_t rows,
21 size_t channels,
22 const void* input,
23 size_t input_stride,
24 const void* zero,
25 void* buffer,
26 void* output,
27 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29 assert(rows > 7);
30 assert(channels != 0);
31
32 const uint16_t* i0 = input;
33 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
34 const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
35 const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
36 const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
37 const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
38 const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
39 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
40
41 uint16_t* b = buffer;
42 size_t c = channels;
43 for (; c >= 24; c -= 24) {
44 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
45 const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
46 const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
47 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
48 const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
49 const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
50
51 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
52 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
53 const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
54 __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_NO_EXC);
55 const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
56 __m128i vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(vi0xGHIJKLMN, vi1xGHIJKLMN), _MM_FROUND_NO_EXC);
57
58 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
59 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
60 const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
61 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_NO_EXC);
62 const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
63 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_NO_EXC);
64 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
65 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
66 const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
67 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_NO_EXC);
68 const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
69 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_NO_EXC);
70 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
71 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
72 const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
73 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_NO_EXC);
74 const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
75 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_NO_EXC);
76 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
77 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
78 const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
79 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_NO_EXC);
80 const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
81 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_NO_EXC);
82 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
83 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_NO_EXC);
84 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_NO_EXC);
85
86 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
87 _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8;
88 _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8;
89 }
90 if XNN_UNLIKELY(c != 0) {
91 do {
92 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
93 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
94 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
95 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
96
97 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
98 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
99 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
100 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
101 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
102 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
103 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
104 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
105 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
106
107 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
108
109 c = doz(c, 8);
110 } while (c != 0);
111 }
112
113 for (rows -= 7; rows > 7; rows -= 7) {
114 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
115 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
116 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
117 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
118 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
119 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
120 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
121
122 uint16_t* b = buffer;
123 size_t c = channels;
124 for (; c >= 24; c -= 24) {
125 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
126 __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) (b + 8));
127 __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) (b + 16));
128
129 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
130 const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
131 const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
132
133 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
134 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
135 const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
136 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_NO_EXC);
137 const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
138 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_NO_EXC);
139 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
140 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
141 const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
142 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_NO_EXC);
143 const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
144 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_NO_EXC);
145 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
146 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
147 const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
148 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_NO_EXC);
149 const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
150 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_NO_EXC);
151 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
152 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
153 const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
154 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_NO_EXC);
155 const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
156 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_NO_EXC);
157 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
158 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
159 const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
160 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_NO_EXC);
161 const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
162 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_NO_EXC);
163 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
164 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
165 const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
166 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_NO_EXC);
167 const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
168 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_NO_EXC);
169 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
170 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_NO_EXC);
171 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_NO_EXC);
172
173 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
174 _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8;
175 _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8;
176 }
177 if XNN_UNLIKELY(c != 0) {
178 do {
179 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
180 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
181
182 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
183 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
184 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
185 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
186 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
187 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
188 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
189 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
190 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
191 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
192 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
193 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
194 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
195
196 _mm_store_si128((__m128i*) b, vacc01234567);
197 b += 8;
198
199 c = doz(c, 8);
200 } while (c != 0);
201 }
202 }
203
204 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
205 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
206 if XNN_UNPREDICTABLE(rows < 2) {
207 i1 = (const uint16_t*) zero;
208 }
209 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
210 if XNN_UNPREDICTABLE(rows <= 2) {
211 i2 = (const uint16_t*) zero;
212 }
213 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
214 if XNN_UNPREDICTABLE(rows < 4) {
215 i3 = (const uint16_t*) zero;
216 }
217 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
218 if XNN_UNPREDICTABLE(rows <= 4) {
219 i4 = (const uint16_t*) zero;
220 }
221 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
222 if XNN_UNPREDICTABLE(rows < 6) {
223 i5 = (const uint16_t*) zero;
224 }
225 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
226 if XNN_UNPREDICTABLE(rows <= 6) {
227 i6 = (const uint16_t*) zero;
228 }
229 uint16_t* o = (uint16_t*) output;
230
231 const __m256 vscale = _mm256_load_ps(params->avx.scale);
232 const __m256 vmin = _mm256_load_ps(params->avx.min);
233 const __m256 vmax = _mm256_load_ps(params->avx.max);
234 for (; channels >= 24; channels -= 24) {
235 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
236 __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
237 __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
238
239 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
240 const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
241 const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
242
243 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
244 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
245 const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
246 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_NO_EXC);
247 const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
248 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_NO_EXC);
249 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
250 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
251 const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
252 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_NO_EXC);
253 const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
254 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_NO_EXC);
255 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
256 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
257 const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
258 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_NO_EXC);
259 const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
260 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_NO_EXC);
261 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
262 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
263 const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
264 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_NO_EXC);
265 const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
266 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_NO_EXC);
267 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
268 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
269 const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
270 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_NO_EXC);
271 const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
272 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_NO_EXC);
273 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
274 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
275 const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
276 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_NO_EXC);
277 const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
278 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_NO_EXC);
279 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
280 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_NO_EXC);
281 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_NO_EXC);
282
283 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
284 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_NO_EXC);
285 vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vscale), _MM_FROUND_NO_EXC);
286
287 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
288 __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin);
289 __m256 voutGHIJKLMN = _mm256_max_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vmin);
290
291 vout01234567 = _mm256_min_ps(vout01234567, vmax);
292 vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax);
293 voutGHIJKLMN = _mm256_min_ps(voutGHIJKLMN, vmax);
294
295 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
296 _mm_storeu_si128((__m128i*) ((uint16_t*) o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_NO_EXC));
297 _mm_storeu_si128((__m128i*) ((uint16_t*) o + 16), _mm256_cvtps_ph(voutGHIJKLMN, _MM_FROUND_NO_EXC));
298 o += 24;
299 }
300 if XNN_UNLIKELY(channels != 0) {
301 do {
302 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
303
304 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
305 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
306 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
307 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
308 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
309 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
310 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
311 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
312 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
313 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
314 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
315 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
316 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
317 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
318
319 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
320 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
321 vout01234567 = _mm256_min_ps(vout01234567, vmax);
322
323 if XNN_LIKELY(channels >= 8) {
324 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
325 o += 8;
326 channels -= 8;
327 } else {
328 __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
329 if (channels & 4) {
330 _mm_storel_epi64((__m128i*) o, vh01234567);
331 o += 4;
332 vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
333 }
334 if (channels & 2) {
335 _mm_storeu_si32(o, vh01234567);
336 o += 2;
337 vh01234567 = _mm_srli_epi64(vh01234567, 32);
338 }
339 if (channels & 1) {
340 *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
341 }
342 channels = 0;
343 }
344 } while (channels != 0);
345 }
346 }
347