// Auto-generated file. Do not edit!
//   Template: src/f16-dwconv2d-chw/5x5p2-neonfp16arith.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

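// 5x5 depthwise convolution in CHW layout with padding 2 ("5x5p2"). Judging
// by the code below, each iteration produces 4 output rows of 4 pixels each
// ("4x4") and keeps two partial accumulators per output vector ("acc2"),
// presumably to shorten the FMA dependency chains.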
void xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2(
    size_t input_height,
    size_t input_width,
    const void* input,
    const void* weights,
    const void* zero,
    void* output,
    uint32_t padding_top,
    const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(__fp16) == 0);
  assert(padding_top == 2);

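  // Load the column mask used for the partial rightmost block and the output
  // clamping bounds from the kernel parameters.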
  const uint16x4_t vmask = vld1_u16(params->neonfp16arith.mask);
  const float16x4_t vmax = vld1_dup_f16(&params->neonfp16arith.max);
  const float16x4_t vmin = vld1_dup_f16(&params->neonfp16arith.min);

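  // Weight layout (26 halfwords, inferred from the lane usage below): the
  // bias in lane 0 of vw01234567, then the 25 filter taps k(0,0)..k(4,4) in
  // row-major order; the last two taps land in the 2-element vector vwOP.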
  const __fp16* w0 = (const __fp16*)weights;
  const float16x8_t vw01234567 = vld1q_f16(w0);
  const float16x8_t vw89ABCDEF = vld1q_f16(w0 + 8);
  const float16x8_t vwGHIJKLMN = vld1q_f16(w0 + 16);
  const float16x4_t vwOP = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 24), vmov_n_u32(0), 0));

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(__fp16));

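  // Eight input row pointers feed four output rows (4 + 5 - 1 = 8). With
  // padding_top == 2, the first two rows read from the zero buffer.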
  const __fp16* i0 = zero;
  const __fp16* i1 = zero;
  const __fp16* i2 = input;
  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
  const __fp16* i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

  __fp16* o0 = output;
  __fp16* o1 = (__fp16*) ((uintptr_t) o0 + input_width);
  __fp16* o2 = (__fp16*) ((uintptr_t) o1 + input_width);
  __fp16* o3 = (__fp16*) ((uintptr_t) o2 + input_width);

  size_t output_height = input_height;
  do {
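    // When fewer than 4 output rows remain, redirect the excess input rows to
    // the zero buffer and alias the excess output pointers so stores stay
    // in-bounds.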
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

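    // vi*x0123 hold zeros for the two left-padding columns; vi*x4567 hold the
    // first four pixels of each row.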
    float16x4_t vi0x0123 = vmov_n_f16(0);
    float16x4_t vi1x0123 = vmov_n_f16(0);
    float16x4_t vi2x0123 = vmov_n_f16(0);
    float16x4_t vi3x0123 = vmov_n_f16(0);
    float16x4_t vi4x0123 = vmov_n_f16(0);
    float16x4_t vi5x0123 = vmov_n_f16(0);
    float16x4_t vi6x0123 = vmov_n_f16(0);
    float16x4_t vi7x0123 = vmov_n_f16(0);

    float16x4_t vi0x4567 = vld1_f16(i0); i0 += 4;
    float16x4_t vi1x4567 = vld1_f16(i1); i1 += 4;
    float16x4_t vi2x4567 = vld1_f16(i2); i2 += 4;
    float16x4_t vi3x4567 = vld1_f16(i3); i3 += 4;
    float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4;
    float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4;
    float16x4_t vi6x4567 = vld1_f16(i6); i6 += 4;
    float16x4_t vi7x4567 = vld1_f16(i7); i7 += 4;

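    // Main loop: emit 4 pixels per output row per iteration while more than
    // 8 input pixels remain, so the look-ahead load of the next block is safe.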
    size_t w = input_width;
    for (; w > 8 * sizeof(__fp16); w -= 4 * sizeof(__fp16)) {
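      // Start the first accumulator chain of each output row from the bias
      // (lane 0 of the weights); the second chain starts at the vmul below.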
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

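      // Load the next four pixels of each row ahead of time.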
      const float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      const float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      const float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      const float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      const float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      const float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      const float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      const float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

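      // Center-column taps k(0,2)..k(4,2): output row r accumulates input
      // rows r..r+4.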
      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

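      // Column-1 taps k(r,1) use windows shifted one pixel left.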
      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

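      // Column-0 taps k(r,0); the 0123 registers are consumed for the last
      // time here, so they rotate to the 4567 block.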
      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

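      // Column-3 taps k(r,3) use windows shifted one pixel right.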
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

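      // Column-4 taps k(r,4); the 4567 registers rotate to the look-ahead
      // block for the next iteration.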
      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

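      // Merge the two accumulator chains, clamp to [min, max], and store.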
      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(__fp16)) {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

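      // Zero the lanes of the look-ahead block that fall past the end of the
      // row; the tap schedule below is otherwise the same as the main loop.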
      vi0x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x89AB)));
      vi1x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x89AB)));
      vi2x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x89AB)));
      vi3x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x89AB)));
      vi4x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x89AB)));
      vi5x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x89AB)));
      vi6x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x89AB)));
      vi7x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x89AB)));

      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;

      w -= 4 * sizeof(__fp16);
    }
    assert(w >= 1 * sizeof(__fp16));
    assert(w <= 4 * sizeof(__fp16));
    {
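      // Final block of 1..4 pixels: the data is already in vi*x4567 (masked
      // below); columns past the row end are shifted in as zeros instead of
      // being loaded.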
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      vi0x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x4567)));
      vi1x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x4567)));
      vi2x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x4567)));
      vi3x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x4567)));
      vi4x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x4567)));
      vi5x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x4567)));
      vi6x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x4567)));
      vi7x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x4567)));

      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

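      // No more data to shift in from the right; substitute zeros.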
      const float16x4_t vzero = vmov_n_f16(0);
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vzero, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vzero, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vzero, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vzero, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vzero, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vzero, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vzero, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vzero, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x5678, vzero, 1);
      const float16x4_t vi1x6789 = vext_f16(vi1x5678, vzero, 1);
      const float16x4_t vi2x6789 = vext_f16(vi2x5678, vzero, 1);
      const float16x4_t vi3x6789 = vext_f16(vi3x5678, vzero, 1);
      const float16x4_t vi4x6789 = vext_f16(vi4x5678, vzero, 1);
      const float16x4_t vi5x6789 = vext_f16(vi5x5678, vzero, 1);
      const float16x4_t vi6x6789 = vext_f16(vi6x5678, vzero, 1);
      const float16x4_t vi7x6789 = vext_f16(vi7x5678, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

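      // Store the remaining 1..4 pixels: a full 4-wide store when exactly 4
      // pixels remain, otherwise 2-lane and 1-lane pieces.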
      if XNN_LIKELY(w & (4 * sizeof(__fp16))) {
        vst1_f16(o3, vo3); o3 += 4;
        vst1_f16(o2, vo2); o2 += 4;
        vst1_f16(o1, vo1); o1 += 4;
        vst1_f16(o0, vo0); o0 += 4;
      } else {
        if (w & (2 * sizeof(__fp16))) {
          vst1_lane_u32((void*) o3, vreinterpret_u32_f16(vo3), 0); o3 += 2;
          vst1_lane_u32((void*) o2, vreinterpret_u32_f16(vo2), 0); o2 += 2;
          vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vo1), 0); o1 += 2;
          vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2;

          vo0 = vext_f16(vo0, vo0, 2);
          vo1 = vext_f16(vo1, vo1, 2);
          vo2 = vext_f16(vo2, vo2, 2);
          vo3 = vext_f16(vo3, vo3, 2);
        }
        if (w & (1 * sizeof(__fp16))) {
          vst1_lane_f16(o3, vo3, 0); o3 += 1;
          vst1_lane_f16(o2, vo2, 0); o2 += 1;
          vst1_lane_f16(o1, vo1, 0); o1 += 1;
          vst1_lane_f16(o0, vo0, 0); o0 += 1;
        }
      }
    }

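    // Rotate the input row pointers down by four rows; input_decrement
    // rewinds i4/i5 from their end-of-row position back to the start of that
    // row. The output pointers advance past the four rows just written.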
    i0 = (const __fp16*) ((uintptr_t) i4 - input_decrement);
    i1 = (const __fp16*) ((uintptr_t) i5 - input_decrement);
    i2 = (const __fp16*) ((uintptr_t) i1 + input_width);
    i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
    i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
    i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
    i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
    i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (__fp16*) ((uintptr_t) o0 + input_width);
    o2 = (__fp16*) ((uintptr_t) o1 + input_width);
    o3 = (__fp16*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}