// Auto-generated file. Do not edit!
//   Template: src/f16-dwconv2d-chw/5x5p2-neonfp16arith.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4(
    size_t input_height,
    size_t input_width,
    const void* input,
    const void* weights,
    const void* zero,
    void* output,
    uint32_t padding_top,
    const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(__fp16) == 0);
  assert(padding_top == 2);

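  // Load the right-edge mask and the output clamping bounds from the params struct.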
  const uint16x4_t vmask = vld1_u16(params->neonfp16arith.mask);
  const float16x4_t vmax = vld1_dup_f16(&params->neonfp16arith.max);
  const float16x4_t vmin = vld1_dup_f16(&params->neonfp16arith.min);

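  // Weights layout: w[0] is the bias, w[1..25] are the 25 taps of the 5x5 kernel,
  // kept resident in three q-registers plus two lanes of a d-register.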
  const __fp16* w0 = (const __fp16*)weights;
  const float16x8_t vw01234567 = vld1q_f16(w0);
  const float16x8_t vw89ABCDEF = vld1q_f16(w0 + 8);
  const float16x8_t vwGHIJKLMN = vld1q_f16(w0 + 16);
  const float16x4_t vwOP = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 24), vmov_n_u32(0), 0));

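  // Rows are consumed in blocks of 4 pixels; input_decrement (input_width rounded
  // up to a whole block) rewinds a fully consumed row pointer back to its start.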
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(__fp16));

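  // i0 and i1 point at the zero vector to supply the two rows of implicit top padding.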
  const __fp16* i0 = zero;
  const __fp16* i1 = zero;
  const __fp16* i2 = input;
  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
  const __fp16* i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

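  // Four output rows are written per outer-loop iteration.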
  __fp16* o0 = output;
  __fp16* o1 = (__fp16*) ((uintptr_t) o0 + input_width);
  __fp16* o2 = (__fp16*) ((uintptr_t) o1 + input_width);
  __fp16* o3 = (__fp16*) ((uintptr_t) o2 + input_width);

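  // Process four output rows per iteration; when fewer rows remain, the unused
  // input rows read zeros and the extra output pointers alias earlier rows.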
  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

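    // The previous-block registers start at zero: they supply the two pixels of
    // implicit left padding.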
    float16x4_t vi0x0123 = vmov_n_f16(0);
    float16x4_t vi1x0123 = vmov_n_f16(0);
    float16x4_t vi2x0123 = vmov_n_f16(0);
    float16x4_t vi3x0123 = vmov_n_f16(0);
    float16x4_t vi4x0123 = vmov_n_f16(0);
    float16x4_t vi5x0123 = vmov_n_f16(0);
    float16x4_t vi6x0123 = vmov_n_f16(0);
    float16x4_t vi7x0123 = vmov_n_f16(0);

    float16x4_t vi0x4567 = vld1_f16(i0); i0 += 4;
    float16x4_t vi1x4567 = vld1_f16(i1); i1 += 4;
    float16x4_t vi2x4567 = vld1_f16(i2); i2 += 4;
    float16x4_t vi3x4567 = vld1_f16(i3); i3 += 4;
    float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4;
    float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4;
    float16x4_t vi6x4567 = vld1_f16(i6); i6 += 4;
    float16x4_t vi7x4567 = vld1_f16(i7); i7 += 4;

    size_t w = input_width;
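    // Main loop: produce 4 output pixels per row per iteration while more than
    // 8 input pixels remain, so a full unmasked lookahead block can be loaded.
    // Each accumulator starts with the bias (weight lane 0).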
    for (; w > 8 * sizeof(__fp16); w -= 4 * sizeof(__fp16)) {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      const float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      const float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      const float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      const float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      const float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      const float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      const float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      const float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

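      // Recombine the previous and current blocks to form the shifted columns of
      // the 5x5 window (x-1, x-2 below; x+1, x+2 further down).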
      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

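      // Clamp the accumulated results to the [min, max] output range.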
      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(__fp16)) {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

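      // Mask the lookahead block: lanes past the true end of the row are zeroed
      // so out-of-bounds reads do not contribute to the results.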
      vi0x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x89AB)));
      vi1x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x89AB)));
      vi2x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x89AB)));
      vi3x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x89AB)));
      vi4x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x89AB)));
      vi5x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x89AB)));
      vi6x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x89AB)));
      vi7x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x89AB)));

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);


      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;

      w -= 4 * sizeof(__fp16);
    }
    assert(w >= 1 * sizeof(__fp16));
    assert(w <= 4 * sizeof(__fp16));
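    // Single block left: process the final 1..4 pixels of each row.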
    {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      vi0x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x4567)));
      vi1x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x4567)));
      vi2x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x4567)));
      vi3x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x4567)));
      vi4x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x4567)));
      vi5x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x4567)));
      vi6x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x4567)));
      vi7x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x4567)));

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

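      // There is no block to the right of the final one; shift in zeros for the
      // x+1 and x+2 columns instead.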
      const float16x4_t vzero = vmov_n_f16(0);
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vzero, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vzero, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vzero, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vzero, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vzero, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vzero, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vzero, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x5678, vzero, 1);
      const float16x4_t vi1x6789 = vext_f16(vi1x5678, vzero, 1);
      const float16x4_t vi2x6789 = vext_f16(vi2x5678, vzero, 1);
      const float16x4_t vi3x6789 = vext_f16(vi3x5678, vzero, 1);
      const float16x4_t vi4x6789 = vext_f16(vi4x5678, vzero, 1);
      const float16x4_t vi5x6789 = vext_f16(vi5x5678, vzero, 1);
      const float16x4_t vi6x6789 = vext_f16(vi6x5678, vzero, 1);
      const float16x4_t vi7x6789 = vext_f16(vi7x5678, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);


      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

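      // Store 4, 2, and/or 1 remaining pixels, narrowing the vector as the tail shrinks.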
      if XNN_LIKELY(w & (4 * sizeof(__fp16))) {
        vst1_f16(o3, vo3); o3 += 4;
        vst1_f16(o2, vo2); o2 += 4;
        vst1_f16(o1, vo1); o1 += 4;
        vst1_f16(o0, vo0); o0 += 4;
      } else {
        if (w & (2 * sizeof(__fp16))) {
          vst1_lane_u32((void*) o3, vreinterpret_u32_f16(vo3), 0); o3 += 2;
          vst1_lane_u32((void*) o2, vreinterpret_u32_f16(vo2), 0); o2 += 2;
          vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vo1), 0); o1 += 2;
          vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2;

          vo0 = vext_f16(vo0, vo0, 2);
          vo1 = vext_f16(vo1, vo1, 2);
          vo2 = vext_f16(vo2, vo2, 2);
          vo3 = vext_f16(vo3, vo3, 2);
        }
        if (w & (1 * sizeof(__fp16))) {
          vst1_lane_f16(o3, vo3, 0); o3 += 1;
          vst1_lane_f16(o2, vo2, 0); o2 += 1;
          vst1_lane_f16(o1, vo1, 0); o1 += 1;
          vst1_lane_f16(o0, vo0, 0); o0 += 1;
        }
      }
    }

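    // Slide the input window down four rows: the new i0 is the old i4, rewound
    // to its row start by input_decrement.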
    i0 = (const __fp16*) ((uintptr_t) i4 - input_decrement);
    i1 = (const __fp16*) ((uintptr_t) i5 - input_decrement);
    i2 = (const __fp16*) ((uintptr_t) i1 + input_width);
    i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
    i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
    i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
    i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
    i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (__fp16*) ((uintptr_t) o0 + input_width);
    o2 = (__fp16*) ((uintptr_t) o1 + input_width);
    o3 = (__fp16*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}