xref: /aosp_15_r20/external/XNNPACK/src/f16-dwconv2d-chw/gen/5x5p2-minmax-neonfp16arith-4x4.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f16-dwconv2d-chw/5x5p2-neonfp16arith.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/math.h>
16 
17 
18 void xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4(
19     size_t input_height,
20     size_t input_width,
21     const void* input,
22     const void* weights,
23     const void* zero,
24     void* output,
25     uint32_t padding_top,
26     const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27 {
28   assert(input_height != 0);
29   assert(input_width != 0);
30   assert(input_width % sizeof(__fp16) == 0);
31   assert(padding_top == 2);
32 
33   const uint16x4_t vmask = vld1_u16(params->neonfp16arith.mask);
34   const float16x4_t vmax = vld1_dup_f16(&params->neonfp16arith.max);
35   const float16x4_t vmin = vld1_dup_f16(&params->neonfp16arith.min);
36 
37   const __fp16* w0 = (const __fp16*)weights;
38   const float16x8_t vw01234567 = vld1q_f16(w0);
39   const float16x8_t vw89ABCDEF = vld1q_f16(w0 + 8);
40   const float16x8_t vwGHIJKLMN = vld1q_f16(w0 + 16);
41   const float16x4_t vwOP = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 24), vmov_n_u32(0), 0));
42 
43   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(__fp16));
44 
45   const __fp16* i0 = zero;
46   const __fp16* i1 = zero;
47   const __fp16* i2 = input;
48   const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
49   const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
50   const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
51   const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
52   const __fp16* i7 = (const __fp16*) ((uintptr_t) i6 + input_width);
53 
54   __fp16* o0 = output;
55   __fp16* o1 = (__fp16*) ((uintptr_t) o0 + input_width);
56   __fp16* o2 = (__fp16*) ((uintptr_t) o1 + input_width);
57   __fp16* o3 = (__fp16*) ((uintptr_t) o2 + input_width);
58 
59   size_t output_height = input_height;
60   do {
61     if XNN_UNPREDICTABLE(output_height < 2) {
62       i3 = zero;
63       o1 = o0;
64     }
65     if XNN_UNPREDICTABLE(output_height < 3) {
66       i4 = zero;
67       o2 = o1;
68     }
69     if XNN_UNPREDICTABLE(output_height < 4) {
70       i5 = zero;
71       o3 = o2;
72     }
73     if XNN_UNPREDICTABLE(output_height < 5) {
74       i6 = zero;
75     }
76     if XNN_UNPREDICTABLE(output_height < 6) {
77       i7 = zero;
78     }
79 
80     float16x4_t vi0x0123 = vmov_n_f16(0);
81     float16x4_t vi1x0123 = vmov_n_f16(0);
82     float16x4_t vi2x0123 = vmov_n_f16(0);
83     float16x4_t vi3x0123 = vmov_n_f16(0);
84     float16x4_t vi4x0123 = vmov_n_f16(0);
85     float16x4_t vi5x0123 = vmov_n_f16(0);
86     float16x4_t vi6x0123 = vmov_n_f16(0);
87     float16x4_t vi7x0123 = vmov_n_f16(0);
88 
89     float16x4_t vi0x4567 = vld1_f16(i0); i0 += 4;
90     float16x4_t vi1x4567 = vld1_f16(i1); i1 += 4;
91     float16x4_t vi2x4567 = vld1_f16(i2); i2 += 4;
92     float16x4_t vi3x4567 = vld1_f16(i3); i3 += 4;
93     float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4;
94     float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4;
95     float16x4_t vi6x4567 = vld1_f16(i6); i6 += 4;
96     float16x4_t vi7x4567 = vld1_f16(i7); i7 += 4;
97 
98     size_t w = input_width;
99     for (; w > 8 * sizeof(__fp16); w -= 4 * sizeof(__fp16)) {
100       float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
101       float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
102       float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
103       float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);
104 
105       const float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
106       const float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
107       const float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
108       const float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
109       const float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
110       const float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
111       const float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
112       const float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;
113 
114       vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
115       vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
116       vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
117       vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);
118 
119       vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
120       vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
121       vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
122       vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);
123 
124       vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
125       vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
126       vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
127       vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);
128 
129       vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
130       vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
131       vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
132       vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);
133 
134       vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
135       vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
136       vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
137       vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);
138 
139       const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
140       const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
141       const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
142       const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
143       const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
144       const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
145       const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
146       const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);
147 
148       vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
149       vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
150       vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
151       vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);
152 
153       vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
154       vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
155       vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
156       vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);
157 
158       vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
159       vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
160       vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
161       vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);
162 
163       vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
164       vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
165       vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
166       vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);
167 
168       vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
169       vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
170       vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
171       vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);
172 
173       const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
174       vi0x0123 = vi0x4567;
175       const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
176       vi1x0123 = vi1x4567;
177       const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
178       vi2x0123 = vi2x4567;
179       const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
180       vi3x0123 = vi3x4567;
181       const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
182       vi4x0123 = vi4x4567;
183       const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
184       vi5x0123 = vi5x4567;
185       const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
186       vi6x0123 = vi6x4567;
187       const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
188       vi7x0123 = vi7x4567;
189 
190       vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
191       vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
192       vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
193       vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);
194 
195       vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
196       vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
197       vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
198       vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);
199 
200       vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
201       vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
202       vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
203       vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);
204 
205       vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
206       vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
207       vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
208       vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);
209 
210       vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
211       vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
212       vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
213       vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);
214 
215       const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
216       const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
217       const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
218       const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
219       const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
220       const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
221       const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
222       const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);
223 
224       vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
225       vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
226       vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
227       vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);
228 
229       vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
230       vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
231       vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
232       vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);
233 
234       vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
235       vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
236       vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
237       vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);
238 
239       vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
240       vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
241       vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
242       vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);
243 
244       vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
245       vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
246       vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
247       vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);
248 
249       const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
250       vi0x4567 = vi0x89AB;
251       const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
252       vi1x4567 = vi1x89AB;
253       const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
254       vi2x4567 = vi2x89AB;
255       const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
256       vi3x4567 = vi3x89AB;
257       const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
258       vi4x4567 = vi4x89AB;
259       const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
260       vi5x4567 = vi5x89AB;
261       const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
262       vi6x4567 = vi6x89AB;
263       const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
264       vi7x4567 = vi7x89AB;
265 
266       vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
267       vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
268       vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
269       vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);
270 
271       vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
272       vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
273       vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
274       vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);
275 
276       vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
277       vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
278       vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
279       vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);
280 
281       vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
282       vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
283       vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
284       vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);
285 
286       vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
287       vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
288       vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
289       vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);
290 
291 
292       float16x4_t vo0 = vmax_f16(vo0p0, vmin);
293       float16x4_t vo1 = vmax_f16(vo1p0, vmin);
294       float16x4_t vo2 = vmax_f16(vo2p0, vmin);
295       float16x4_t vo3 = vmax_f16(vo3p0, vmin);
296 
297       vo0 = vmin_f16(vo0, vmax);
298       vo1 = vmin_f16(vo1, vmax);
299       vo2 = vmin_f16(vo2, vmax);
300       vo3 = vmin_f16(vo3, vmax);
301 
302       vst1_f16(o3, vo3); o3 += 4;
303       vst1_f16(o2, vo2); o2 += 4;
304       vst1_f16(o1, vo1); o1 += 4;
305       vst1_f16(o0, vo0); o0 += 4;
306     }
307     // Always process the last block of 5..8 pixels.
308     if XNN_LIKELY(w > 4 * sizeof(__fp16)) {
309       float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
310       float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
311       float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
312       float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);
313 
314       float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
315       float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
316       float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
317       float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
318       float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
319       float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
320       float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
321       float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;
322 
323       vi0x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x89AB)));
324       vi1x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x89AB)));
325       vi2x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x89AB)));
326       vi3x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x89AB)));
327       vi4x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x89AB)));
328       vi5x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x89AB)));
329       vi6x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x89AB)));
330       vi7x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x89AB)));
331 
332       vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
333       vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
334       vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
335       vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);
336 
337       vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
338       vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
339       vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
340       vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);
341 
342       vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
343       vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
344       vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
345       vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);
346 
347       vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
348       vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
349       vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
350       vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);
351 
352       vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
353       vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
354       vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
355       vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);
356 
357       const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
358       const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
359       const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
360       const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
361       const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
362       const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
363       const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
364       const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);
365 
366       vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
367       vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
368       vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
369       vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);
370 
371       vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
372       vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
373       vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
374       vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);
375 
376       vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
377       vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
378       vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
379       vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);
380 
381       vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
382       vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
383       vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
384       vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);
385 
386       vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
387       vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
388       vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
389       vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);
390 
391       const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
392       vi0x0123 = vi0x4567;
393       const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
394       vi1x0123 = vi1x4567;
395       const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
396       vi2x0123 = vi2x4567;
397       const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
398       vi3x0123 = vi3x4567;
399       const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
400       vi4x0123 = vi4x4567;
401       const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
402       vi5x0123 = vi5x4567;
403       const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
404       vi6x0123 = vi6x4567;
405       const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
406       vi7x0123 = vi7x4567;
407 
408       vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
409       vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
410       vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
411       vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);
412 
413       vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
414       vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
415       vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
416       vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);
417 
418       vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
419       vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
420       vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
421       vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);
422 
423       vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
424       vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
425       vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
426       vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);
427 
428       vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
429       vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
430       vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
431       vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);
432 
433       const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
434       const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
435       const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
436       const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
437       const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
438       const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
439       const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
440       const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);
441 
442       vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
443       vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
444       vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
445       vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);
446 
447       vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
448       vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
449       vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
450       vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);
451 
452       vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
453       vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
454       vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
455       vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);
456 
457       vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
458       vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
459       vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
460       vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);
461 
462       vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
463       vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
464       vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
465       vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);
466 
467       const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
468       vi0x4567 = vi0x89AB;
469       const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
470       vi1x4567 = vi1x89AB;
471       const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
472       vi2x4567 = vi2x89AB;
473       const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
474       vi3x4567 = vi3x89AB;
475       const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
476       vi4x4567 = vi4x89AB;
477       const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
478       vi5x4567 = vi5x89AB;
479       const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
480       vi6x4567 = vi6x89AB;
481       const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
482       vi7x4567 = vi7x89AB;
483 
484       vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
485       vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
486       vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
487       vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);
488 
489       vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
490       vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
491       vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
492       vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);
493 
494       vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
495       vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
496       vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
497       vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);
498 
499       vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
500       vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
501       vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
502       vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);
503 
504       vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
505       vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
506       vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
507       vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);
508 
509 
510       float16x4_t vo0 = vmax_f16(vo0p0, vmin);
511       float16x4_t vo1 = vmax_f16(vo1p0, vmin);
512       float16x4_t vo2 = vmax_f16(vo2p0, vmin);
513       float16x4_t vo3 = vmax_f16(vo3p0, vmin);
514 
515       vo0 = vmin_f16(vo0, vmax);
516       vo1 = vmin_f16(vo1, vmax);
517       vo2 = vmin_f16(vo2, vmax);
518       vo3 = vmin_f16(vo3, vmax);
519 
520       vst1_f16(o3, vo3); o3 += 4;
521       vst1_f16(o2, vo2); o2 += 4;
522       vst1_f16(o1, vo1); o1 += 4;
523       vst1_f16(o0, vo0); o0 += 4;
524 
525       w -= 4 * sizeof(__fp16);
526     }
527     assert(w >= 1 * sizeof(__fp16));
528     assert(w <= 4 * sizeof(__fp16));
529     {
530       float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
531       float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
532       float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
533       float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);
534 
535       vi0x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x4567)));
536       vi1x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x4567)));
537       vi2x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x4567)));
538       vi3x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x4567)));
539       vi4x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x4567)));
540       vi5x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x4567)));
541       vi6x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x4567)));
542       vi7x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x4567)));
543 
544       vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3);
545       vo1p0 = vfma_laneq_f16(vo1p0, vi1x4567, vw01234567, 3);
546       vo2p0 = vfma_laneq_f16(vo2p0, vi2x4567, vw01234567, 3);
547       vo3p0 = vfma_laneq_f16(vo3p0, vi3x4567, vw01234567, 3);
548 
549       vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
550       vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
551       vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
552       vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);
553 
554       vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
555       vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
556       vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
557       vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);
558 
559       vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2);
560       vo1p0 = vfma_laneq_f16(vo1p0, vi4x4567, vwGHIJKLMN, 2);
561       vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2);
562       vo3p0 = vfma_laneq_f16(vo3p0, vi6x4567, vwGHIJKLMN, 2);
563 
564       vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
565       vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
566       vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
567       vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);
568 
569       const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
570       const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
571       const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
572       const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
573       const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
574       const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
575       const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
576       const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);
577 
578       vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2);
579       vo1p0 = vfma_laneq_f16(vo1p0, vi1x3456, vw01234567, 2);
580       vo2p0 = vfma_laneq_f16(vo2p0, vi2x3456, vw01234567, 2);
581       vo3p0 = vfma_laneq_f16(vo3p0, vi3x3456, vw01234567, 2);
582 
583       vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
584       vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
585       vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
586       vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);
587 
588       vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4);
589       vo1p0 = vfma_laneq_f16(vo1p0, vi3x3456, vw89ABCDEF, 4);
590       vo2p0 = vfma_laneq_f16(vo2p0, vi4x3456, vw89ABCDEF, 4);
591       vo3p0 = vfma_laneq_f16(vo3p0, vi5x3456, vw89ABCDEF, 4);
592 
593       vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
594       vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
595       vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
596       vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);
597 
598       vo0p0 = vfma_laneq_f16(vo0p0, vi4x3456, vwGHIJKLMN, 6);
599       vo1p0 = vfma_laneq_f16(vo1p0, vi5x3456, vwGHIJKLMN, 6);
600       vo2p0 = vfma_laneq_f16(vo2p0, vi6x3456, vwGHIJKLMN, 6);
601       vo3p0 = vfma_laneq_f16(vo3p0, vi7x3456, vwGHIJKLMN, 6);
602 
603       const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
604       const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
605       const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
606       const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
607       const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
608       const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
609       const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
610       const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
611 
612       vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
613       vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
614       vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
615       vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);
616 
617       vo0p0 = vfma_laneq_f16(vo0p0, vi1x2345, vw01234567, 6);
618       vo1p0 = vfma_laneq_f16(vo1p0, vi2x2345, vw01234567, 6);
619       vo2p0 = vfma_laneq_f16(vo2p0, vi3x2345, vw01234567, 6);
620       vo3p0 = vfma_laneq_f16(vo3p0, vi4x2345, vw01234567, 6);
621 
622       vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
623       vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
624       vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
625       vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);
626 
627       vo0p0 = vfma_laneq_f16(vo0p0, vi3x2345, vwGHIJKLMN, 0);
628       vo1p0 = vfma_laneq_f16(vo1p0, vi4x2345, vwGHIJKLMN, 0);
629       vo2p0 = vfma_laneq_f16(vo2p0, vi5x2345, vwGHIJKLMN, 0);
630       vo3p0 = vfma_laneq_f16(vo3p0, vi6x2345, vwGHIJKLMN, 0);
631 
632       vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
633       vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
634       vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
635       vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);
636 
637       const float16x4_t vzero = vmov_n_f16(0);
638       const float16x4_t vi0x5678 = vext_f16(vi0x4567, vzero, 1);
639       const float16x4_t vi1x5678 = vext_f16(vi1x4567, vzero, 1);
640       const float16x4_t vi2x5678 = vext_f16(vi2x4567, vzero, 1);
641       const float16x4_t vi3x5678 = vext_f16(vi3x4567, vzero, 1);
642       const float16x4_t vi4x5678 = vext_f16(vi4x4567, vzero, 1);
643       const float16x4_t vi5x5678 = vext_f16(vi5x4567, vzero, 1);
644       const float16x4_t vi6x5678 = vext_f16(vi6x4567, vzero, 1);
645       const float16x4_t vi7x5678 = vext_f16(vi7x4567, vzero, 1);
646 
647       vo0p0 = vfma_laneq_f16(vo0p0, vi0x5678, vw01234567, 4);
648       vo1p0 = vfma_laneq_f16(vo1p0, vi1x5678, vw01234567, 4);
649       vo2p0 = vfma_laneq_f16(vo2p0, vi2x5678, vw01234567, 4);
650       vo3p0 = vfma_laneq_f16(vo3p0, vi3x5678, vw01234567, 4);
651 
652       vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
653       vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
654       vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
655       vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);
656 
657       vo0p0 = vfma_laneq_f16(vo0p0, vi2x5678, vw89ABCDEF, 6);
658       vo1p0 = vfma_laneq_f16(vo1p0, vi3x5678, vw89ABCDEF, 6);
659       vo2p0 = vfma_laneq_f16(vo2p0, vi4x5678, vw89ABCDEF, 6);
660       vo3p0 = vfma_laneq_f16(vo3p0, vi5x5678, vw89ABCDEF, 6);
661 
662       vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
663       vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
664       vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
665       vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);
666 
667       vo0p0 = vfma_lane_f16(vo0p0, vi4x5678, vwOP, 0);
668       vo1p0 = vfma_lane_f16(vo1p0, vi5x5678, vwOP, 0);
669       vo2p0 = vfma_lane_f16(vo2p0, vi6x5678, vwOP, 0);
670       vo3p0 = vfma_lane_f16(vo3p0, vi7x5678, vwOP, 0);
671 
672       const float16x4_t vi0x6789 = vext_f16(vi0x5678, vzero, 1);
673       const float16x4_t vi1x6789 = vext_f16(vi1x5678, vzero, 1);
674       const float16x4_t vi2x6789 = vext_f16(vi2x5678, vzero, 1);
675       const float16x4_t vi3x6789 = vext_f16(vi3x5678, vzero, 1);
676       const float16x4_t vi4x6789 = vext_f16(vi4x5678, vzero, 1);
677       const float16x4_t vi5x6789 = vext_f16(vi5x5678, vzero, 1);
678       const float16x4_t vi6x6789 = vext_f16(vi6x5678, vzero, 1);
679       const float16x4_t vi7x6789 = vext_f16(vi7x5678, vzero, 1);
680 
681       vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
682       vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
683       vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
684       vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);
685 
686       vo0p0 = vfma_laneq_f16(vo0p0, vi1x6789, vw89ABCDEF, 2);
687       vo1p0 = vfma_laneq_f16(vo1p0, vi2x6789, vw89ABCDEF, 2);
688       vo2p0 = vfma_laneq_f16(vo2p0, vi3x6789, vw89ABCDEF, 2);
689       vo3p0 = vfma_laneq_f16(vo3p0, vi4x6789, vw89ABCDEF, 2);
690 
691       vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
692       vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
693       vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
694       vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);
695 
696       vo0p0 = vfma_laneq_f16(vo0p0, vi3x6789, vwGHIJKLMN, 4);
697       vo1p0 = vfma_laneq_f16(vo1p0, vi4x6789, vwGHIJKLMN, 4);
698       vo2p0 = vfma_laneq_f16(vo2p0, vi5x6789, vwGHIJKLMN, 4);
699       vo3p0 = vfma_laneq_f16(vo3p0, vi6x6789, vwGHIJKLMN, 4);
700 
701       vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
702       vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
703       vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
704       vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);
705 
706 
707       float16x4_t vo0 = vmax_f16(vo0p0, vmin);
708       float16x4_t vo1 = vmax_f16(vo1p0, vmin);
709       float16x4_t vo2 = vmax_f16(vo2p0, vmin);
710       float16x4_t vo3 = vmax_f16(vo3p0, vmin);
711 
712       vo0 = vmin_f16(vo0, vmax);
713       vo1 = vmin_f16(vo1, vmax);
714       vo2 = vmin_f16(vo2, vmax);
715       vo3 = vmin_f16(vo3, vmax);
716 
717       if XNN_LIKELY(w & (4 * sizeof(__fp16))) {
718         vst1_f16(o3, vo3); o3 += 4;
719         vst1_f16(o2, vo2); o2 += 4;
720         vst1_f16(o1, vo1); o1 += 4;
721         vst1_f16(o0, vo0); o0 += 4;
722       } else {
723         if (w & (2 * sizeof(__fp16))) {
724           vst1_lane_u32((void*) o3, vreinterpret_u32_f16(vo3), 0); o3 += 2;
725           vst1_lane_u32((void*) o2, vreinterpret_u32_f16(vo2), 0); o2 += 2;
726           vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vo1), 0); o1 += 2;
727           vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2;
728 
729           vo0 = vext_f16(vo0, vo0, 2);
730           vo1 = vext_f16(vo1, vo1, 2);
731           vo2 = vext_f16(vo2, vo2, 2);
732           vo3 = vext_f16(vo3, vo3, 2);
733         }
734         if (w & (1 * sizeof(__fp16))) {
735           vst1_lane_f16(o3, vo3, 0); o3 += 1;
736           vst1_lane_f16(o2, vo2, 0); o2 += 1;
737           vst1_lane_f16(o1, vo1, 0); o1 += 1;
738           vst1_lane_f16(o0, vo0, 0); o0 += 1;
739         }
740       }
741     }
742 
743     i0 = (const __fp16*) ((uintptr_t) i4 - input_decrement);
744     i1 = (const __fp16*) ((uintptr_t) i5 - input_decrement);
745     i2 = (const __fp16*) ((uintptr_t) i1 + input_width);
746     i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
747     i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
748     i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
749     i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
750     i7 = (const __fp16*) ((uintptr_t) i6 + input_width);
751 
752     o0 = o3;
753     o1 = (__fp16*) ((uintptr_t) o0 + input_width);
754     o2 = (__fp16*) ((uintptr_t) o1 + input_width);
755     o3 = (__fp16*) ((uintptr_t) o2 + input_width);
756 
757     output_height = doz(output_height, 4);
758   } while (output_height != 0);
759 }
760