1 // Auto-generated file. Do not edit!
2 // Template: src/f16-dwconv2d-chw/5x5p2-neonfp16arith.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/math.h>
16
17
// Depthwise 2D convolution micro-kernel, CHW layout: 5x5 filter, stride 1,
// "same" padding of 2 ("5x5p2"), computed in fp16 with ARM NEON FP16
// intrinsics. Produces 4 output rows per outer iteration, 4 pixels per
// vector ("4x4"), using 2 partial accumulators per output row ("acc2") to
// shorten the FMA dependency chain.
//
// input:   input_height x input_width fp16 pixels of one channel.
// weights: 1 bias followed by 25 filter taps, packed contiguously as fp16.
// zero:    pointer to a zero-filled row used for implicit padding rows.
// output:  input_height x input_width fp16 pixels.
// params:  lane mask for partial-width stores plus min/max output clamp.
void xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2(
    size_t input_height,
    size_t input_width,
    const void* input,
    const void* weights,
    const void* zero,
    void* output,
    uint32_t padding_top,
    const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(__fp16) == 0);
  assert(padding_top == 2);

  // Mask zeroing lanes past the end of a partial-width vector, and the
  // output clamping bounds.
  const uint16x4_t vmask = vld1_u16(params->neonfp16arith.mask);
  const float16x4_t vmax = vld1_dup_f16(&params->neonfp16arith.max);
  const float16x4_t vmin = vld1_dup_f16(&params->neonfp16arith.min);

  // Weights layout: lane 0 of vw01234567 is the bias; the remaining 25
  // lanes (vw01234567 lanes 1-7, vw89ABCDEF, vwGHIJKLMN, vwOP lanes 0-1)
  // are the 5x5 taps in row-major order.
  const __fp16* w0 = (const __fp16*)weights;
  const float16x8_t vw01234567 = vld1q_f16(w0);
  const float16x8_t vw89ABCDEF = vld1q_f16(w0 + 8);
  const float16x8_t vwGHIJKLMN = vld1q_f16(w0 + 16);
  const float16x4_t vwOP = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 24), vmov_n_u32(0), 0));

  // Amount to rewind a row pointer after walking one row in 4-pixel steps
  // (row length rounded up to a whole vector).
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(__fp16));

  // The window for output row n spans input rows n-2..n+2; i0/i1 start at
  // the zero row to realize the implicit top padding of 2.
  const __fp16* i0 = zero;
  const __fp16* i1 = zero;
  const __fp16* i2 = input;
  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
  const __fp16* i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

  __fp16* o0 = output;
  __fp16* o1 = (__fp16*) ((uintptr_t) o0 + input_width);
  __fp16* o2 = (__fp16*) ((uintptr_t) o1 + input_width);
  __fp16* o3 = (__fp16*) ((uintptr_t) o2 + input_width);

  size_t output_height = input_height;
  do {
    // Clamp the row window at the bottom of the image: rows past the end
    // read from the zero row (implicit bottom padding), and surplus output
    // row pointers are aliased onto the last valid one.
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

    // Implicit left padding of 2: the "previous" 4 pixels start as zeros.
    float16x4_t vi0x0123 = vmov_n_f16(0);
    float16x4_t vi1x0123 = vmov_n_f16(0);
    float16x4_t vi2x0123 = vmov_n_f16(0);
    float16x4_t vi3x0123 = vmov_n_f16(0);
    float16x4_t vi4x0123 = vmov_n_f16(0);
    float16x4_t vi5x0123 = vmov_n_f16(0);
    float16x4_t vi6x0123 = vmov_n_f16(0);
    float16x4_t vi7x0123 = vmov_n_f16(0);

    float16x4_t vi0x4567 = vld1_f16(i0); i0 += 4;
    float16x4_t vi1x4567 = vld1_f16(i1); i1 += 4;
    float16x4_t vi2x4567 = vld1_f16(i2); i2 += 4;
    float16x4_t vi3x4567 = vld1_f16(i3); i3 += 4;
    float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4;
    float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4;
    float16x4_t vi6x4567 = vld1_f16(i6); i6 += 4;
    float16x4_t vi7x4567 = vld1_f16(i7); i7 += 4;

    size_t w = input_width;
    // Main loop: process 4 output pixels per row while more than 8 pixels
    // remain (the look-ahead load of x89AB must stay in-bounds).
    for (; w > 8 * sizeof(__fp16); w -= 4 * sizeof(__fp16)) {
      // Start each accumulator with the bias (weight lane 0).
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      const float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      const float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      const float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      const float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      const float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      const float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      const float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      const float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

      // Center column (filter column 2) against the aligned x4567 pixels.
      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      // Filter column 1: pixels shifted left by one (x3456).
      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

      // Filter column 0: pixels shifted left by two (x2345); the old x4567
      // becomes next iteration's x0123.
      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      // Filter column 3: pixels shifted right by one (x5678).
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

      // Filter column 4: pixels shifted right by two (x6789); the look-ahead
      // x89AB becomes next iteration's x4567.
      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

      // Merge the two partial accumulators, clamp, and store.
      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(__fp16)) {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      float16x4_t vi0x89AB = vld1_f16(i0); i0 += 4;
      float16x4_t vi1x89AB = vld1_f16(i1); i1 += 4;
      float16x4_t vi2x89AB = vld1_f16(i2); i2 += 4;
      float16x4_t vi3x89AB = vld1_f16(i3); i3 += 4;
      float16x4_t vi4x89AB = vld1_f16(i4); i4 += 4;
      float16x4_t vi5x89AB = vld1_f16(i5); i5 += 4;
      float16x4_t vi6x89AB = vld1_f16(i6); i6 += 4;
      float16x4_t vi7x89AB = vld1_f16(i7); i7 += 4;

      // Zero the lanes past the end of the row (this load may have read
      // out-of-row bytes; the kernel is marked XNN_OOB_READS).
      vi0x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x89AB)));
      vi1x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x89AB)));
      vi2x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x89AB)));
      vi3x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x89AB)));
      vi4x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x89AB)));
      vi5x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x89AB)));
      vi6x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x89AB)));
      vi7x89AB = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x89AB)));

      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      vi5x0123 = vi5x4567;
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      vi6x0123 = vi6x4567;
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);
      vi7x0123 = vi7x4567;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vi0x89AB, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vi1x89AB, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vi2x89AB, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vi3x89AB, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vi6x89AB, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vi7x89AB, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float16x4_t vi1x6789 = vext_f16(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float16x4_t vi2x6789 = vext_f16(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float16x4_t vi3x6789 = vext_f16(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;
      const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2);
      vi5x4567 = vi5x89AB;
      const float16x4_t vi6x6789 = vext_f16(vi6x4567, vi6x89AB, 2);
      vi6x4567 = vi6x89AB;
      const float16x4_t vi7x6789 = vext_f16(vi7x4567, vi7x89AB, 2);
      vi7x4567 = vi7x89AB;

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      vst1_f16(o3, vo3); o3 += 4;
      vst1_f16(o2, vo2); o2 += 4;
      vst1_f16(o1, vo1); o1 += 4;
      vst1_f16(o0, vo0); o0 += 4;

      w -= 4 * sizeof(__fp16);
    }
    // Final block of 1..4 pixels: no look-ahead load; the right padding is
    // synthesized from masked/zero vectors.
    assert(w >= 1 * sizeof(__fp16));
    assert(w <= 4 * sizeof(__fp16));
    {
      float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo1p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo2p0 = vdup_laneq_f16(vw01234567, 0);
      float16x4_t vo3p0 = vdup_laneq_f16(vw01234567, 0);

      vi0x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi0x4567)));
      vi1x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi1x4567)));
      vi2x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi2x4567)));
      vi3x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi3x4567)));
      vi4x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi4x4567)));
      vi5x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi5x4567)));
      vi6x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi6x4567)));
      vi7x4567 = vreinterpret_f16_u16(vand_u16(vmask, vreinterpret_u16_f16(vi7x4567)));

      float16x4_t vo0p1 = vmul_laneq_f16(vi0x4567, vw01234567, 3);
      float16x4_t vo1p1 = vmul_laneq_f16(vi1x4567, vw01234567, 3);
      float16x4_t vo2p1 = vmul_laneq_f16(vi2x4567, vw01234567, 3);
      float16x4_t vo3p1 = vmul_laneq_f16(vi3x4567, vw01234567, 3);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x4567, vw89ABCDEF, 0);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x4567, vw89ABCDEF, 0);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x4567, vw89ABCDEF, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x4567, vw89ABCDEF, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x4567, vw89ABCDEF, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x4567, vwGHIJKLMN, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x4567, vwGHIJKLMN, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x4567, vwGHIJKLMN, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x4567, vwGHIJKLMN, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x4567, vwGHIJKLMN, 7);

      const float16x4_t vi0x3456 = vext_f16(vi0x0123, vi0x4567, 3);
      const float16x4_t vi1x3456 = vext_f16(vi1x0123, vi1x4567, 3);
      const float16x4_t vi2x3456 = vext_f16(vi2x0123, vi2x4567, 3);
      const float16x4_t vi3x3456 = vext_f16(vi3x0123, vi3x4567, 3);
      const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3);
      const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3);
      const float16x4_t vi6x3456 = vext_f16(vi6x0123, vi6x4567, 3);
      const float16x4_t vi7x3456 = vext_f16(vi7x0123, vi7x4567, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x3456, vw01234567, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x3456, vw01234567, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x3456, vw01234567, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x3456, vw01234567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x3456, vw01234567, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x3456, vw01234567, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x3456, vw01234567, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x3456, vw89ABCDEF, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x3456, vw89ABCDEF, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x3456, vw89ABCDEF, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x3456, vw89ABCDEF, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x3456, vwGHIJKLMN, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x3456, vwGHIJKLMN, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x3456, vwGHIJKLMN, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi4x3456, vwGHIJKLMN, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi5x3456, vwGHIJKLMN, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi6x3456, vwGHIJKLMN, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi7x3456, vwGHIJKLMN, 6);

      const float16x4_t vi0x2345 = vext_f16(vi0x0123, vi0x4567, 2);
      const float16x4_t vi1x2345 = vext_f16(vi1x0123, vi1x4567, 2);
      const float16x4_t vi2x2345 = vext_f16(vi2x0123, vi2x4567, 2);
      const float16x4_t vi3x2345 = vext_f16(vi3x0123, vi3x4567, 2);
      const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2);
      const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2);
      const float16x4_t vi6x2345 = vext_f16(vi6x0123, vi6x4567, 2);
      const float16x4_t vi7x2345 = vext_f16(vi7x0123, vi7x4567, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x2345, vw01234567, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x2345, vw01234567, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x2345, vw01234567, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x2345, vw01234567, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x2345, vw01234567, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x2345, vw01234567, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x2345, vw01234567, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x2345, vw01234567, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x2345, vw89ABCDEF, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x2345, vw89ABCDEF, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x2345, vw89ABCDEF, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x2345, vw89ABCDEF, 3);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x2345, vwGHIJKLMN, 0);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x2345, vwGHIJKLMN, 0);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x2345, vwGHIJKLMN, 0);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x2345, vwGHIJKLMN, 0);

      vo0p0 = vfma_laneq_f16(vo0p0, vi4x2345, vwGHIJKLMN, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi5x2345, vwGHIJKLMN, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi6x2345, vwGHIJKLMN, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi7x2345, vwGHIJKLMN, 5);

      // Right columns come from shifting in zeros (implicit right padding).
      const float16x4_t vzero = vmov_n_f16(0);
      const float16x4_t vi0x5678 = vext_f16(vi0x4567, vzero, 1);
      const float16x4_t vi1x5678 = vext_f16(vi1x4567, vzero, 1);
      const float16x4_t vi2x5678 = vext_f16(vi2x4567, vzero, 1);
      const float16x4_t vi3x5678 = vext_f16(vi3x4567, vzero, 1);
      const float16x4_t vi4x5678 = vext_f16(vi4x4567, vzero, 1);
      const float16x4_t vi5x5678 = vext_f16(vi5x4567, vzero, 1);
      const float16x4_t vi6x5678 = vext_f16(vi6x4567, vzero, 1);
      const float16x4_t vi7x5678 = vext_f16(vi7x4567, vzero, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi0x5678, vw01234567, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi1x5678, vw01234567, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi2x5678, vw01234567, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi3x5678, vw01234567, 4);

      vo0p0 = vfma_laneq_f16(vo0p0, vi1x5678, vw89ABCDEF, 1);
      vo1p0 = vfma_laneq_f16(vo1p0, vi2x5678, vw89ABCDEF, 1);
      vo2p0 = vfma_laneq_f16(vo2p0, vi3x5678, vw89ABCDEF, 1);
      vo3p0 = vfma_laneq_f16(vo3p0, vi4x5678, vw89ABCDEF, 1);

      vo0p1 = vfma_laneq_f16(vo0p1, vi2x5678, vw89ABCDEF, 6);
      vo1p1 = vfma_laneq_f16(vo1p1, vi3x5678, vw89ABCDEF, 6);
      vo2p1 = vfma_laneq_f16(vo2p1, vi4x5678, vw89ABCDEF, 6);
      vo3p1 = vfma_laneq_f16(vo3p1, vi5x5678, vw89ABCDEF, 6);

      vo0p0 = vfma_laneq_f16(vo0p0, vi3x5678, vwGHIJKLMN, 3);
      vo1p0 = vfma_laneq_f16(vo1p0, vi4x5678, vwGHIJKLMN, 3);
      vo2p0 = vfma_laneq_f16(vo2p0, vi5x5678, vwGHIJKLMN, 3);
      vo3p0 = vfma_laneq_f16(vo3p0, vi6x5678, vwGHIJKLMN, 3);

      vo0p1 = vfma_lane_f16(vo0p1, vi4x5678, vwOP, 0);
      vo1p1 = vfma_lane_f16(vo1p1, vi5x5678, vwOP, 0);
      vo2p1 = vfma_lane_f16(vo2p1, vi6x5678, vwOP, 0);
      vo3p1 = vfma_lane_f16(vo3p1, vi7x5678, vwOP, 0);

      const float16x4_t vi0x6789 = vext_f16(vi0x5678, vzero, 1);
      const float16x4_t vi1x6789 = vext_f16(vi1x5678, vzero, 1);
      const float16x4_t vi2x6789 = vext_f16(vi2x5678, vzero, 1);
      const float16x4_t vi3x6789 = vext_f16(vi3x5678, vzero, 1);
      const float16x4_t vi4x6789 = vext_f16(vi4x5678, vzero, 1);
      const float16x4_t vi5x6789 = vext_f16(vi5x5678, vzero, 1);
      const float16x4_t vi6x6789 = vext_f16(vi6x5678, vzero, 1);
      const float16x4_t vi7x6789 = vext_f16(vi7x5678, vzero, 1);

      vo0p0 = vfma_laneq_f16(vo0p0, vi0x6789, vw01234567, 5);
      vo1p0 = vfma_laneq_f16(vo1p0, vi1x6789, vw01234567, 5);
      vo2p0 = vfma_laneq_f16(vo2p0, vi2x6789, vw01234567, 5);
      vo3p0 = vfma_laneq_f16(vo3p0, vi3x6789, vw01234567, 5);

      vo0p1 = vfma_laneq_f16(vo0p1, vi1x6789, vw89ABCDEF, 2);
      vo1p1 = vfma_laneq_f16(vo1p1, vi2x6789, vw89ABCDEF, 2);
      vo2p1 = vfma_laneq_f16(vo2p1, vi3x6789, vw89ABCDEF, 2);
      vo3p1 = vfma_laneq_f16(vo3p1, vi4x6789, vw89ABCDEF, 2);

      vo0p0 = vfma_laneq_f16(vo0p0, vi2x6789, vw89ABCDEF, 7);
      vo1p0 = vfma_laneq_f16(vo1p0, vi3x6789, vw89ABCDEF, 7);
      vo2p0 = vfma_laneq_f16(vo2p0, vi4x6789, vw89ABCDEF, 7);
      vo3p0 = vfma_laneq_f16(vo3p0, vi5x6789, vw89ABCDEF, 7);

      vo0p1 = vfma_laneq_f16(vo0p1, vi3x6789, vwGHIJKLMN, 4);
      vo1p1 = vfma_laneq_f16(vo1p1, vi4x6789, vwGHIJKLMN, 4);
      vo2p1 = vfma_laneq_f16(vo2p1, vi5x6789, vwGHIJKLMN, 4);
      vo3p1 = vfma_laneq_f16(vo3p1, vi6x6789, vwGHIJKLMN, 4);

      vo0p0 = vfma_lane_f16(vo0p0, vi4x6789, vwOP, 1);
      vo1p0 = vfma_lane_f16(vo1p0, vi5x6789, vwOP, 1);
      vo2p0 = vfma_lane_f16(vo2p0, vi6x6789, vwOP, 1);
      vo3p0 = vfma_lane_f16(vo3p0, vi7x6789, vwOP, 1);

      vo0p0 = vadd_f16(vo0p0, vo0p1);
      vo1p0 = vadd_f16(vo1p0, vo1p1);
      vo2p0 = vadd_f16(vo2p0, vo2p1);
      vo3p0 = vadd_f16(vo3p0, vo3p1);

      float16x4_t vo0 = vmax_f16(vo0p0, vmin);
      float16x4_t vo1 = vmax_f16(vo1p0, vmin);
      float16x4_t vo2 = vmax_f16(vo2p0, vmin);
      float16x4_t vo3 = vmax_f16(vo3p0, vmin);

      vo0 = vmin_f16(vo0, vmax);
      vo1 = vmin_f16(vo1, vmax);
      vo2 = vmin_f16(vo2, vmax);
      vo3 = vmin_f16(vo3, vmax);

      // Store 4, 2+1, 2, or 1 remaining pixels per row.
      if XNN_LIKELY(w & (4 * sizeof(__fp16))) {
        vst1_f16(o3, vo3); o3 += 4;
        vst1_f16(o2, vo2); o2 += 4;
        vst1_f16(o1, vo1); o1 += 4;
        vst1_f16(o0, vo0); o0 += 4;
      } else {
        if (w & (2 * sizeof(__fp16))) {
          vst1_lane_u32((void*) o3, vreinterpret_u32_f16(vo3), 0); o3 += 2;
          vst1_lane_u32((void*) o2, vreinterpret_u32_f16(vo2), 0); o2 += 2;
          vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vo1), 0); o1 += 2;
          vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2;

          vo0 = vext_f16(vo0, vo0, 2);
          vo1 = vext_f16(vo1, vo1, 2);
          vo2 = vext_f16(vo2, vo2, 2);
          vo3 = vext_f16(vo3, vo3, 2);
        }
        if (w & (1 * sizeof(__fp16))) {
          vst1_lane_f16(o3, vo3, 0); o3 += 1;
          vst1_lane_f16(o2, vo2, 0); o2 += 1;
          vst1_lane_f16(o1, vo1, 0); o1 += 1;
          vst1_lane_f16(o0, vo0, 0); o0 += 1;
        }
      }
    }

    // Slide the 8-row window down by 4 rows: i0/i1 restart at the rows i4/i5
    // just finished (rewinding the within-row overshoot).
    i0 = (const __fp16*) ((uintptr_t) i4 - input_decrement);
    i1 = (const __fp16*) ((uintptr_t) i5 - input_decrement);
    i2 = (const __fp16*) ((uintptr_t) i1 + input_width);
    i3 = (const __fp16*) ((uintptr_t) i2 + input_width);
    i4 = (const __fp16*) ((uintptr_t) i3 + input_width);
    i5 = (const __fp16*) ((uintptr_t) i4 + input_width);
    i6 = (const __fp16*) ((uintptr_t) i5 + input_width);
    i7 = (const __fp16*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (__fp16*) ((uintptr_t) o0 + input_width);
    o2 = (__fp16*) ((uintptr_t) o1 + input_width);
    o3 = (__fp16*) ((uintptr_t) o2 + input_width);

    output_height = doz(output_height, 4);
  } while (output_height != 0);
}
772