xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv/up4x9-minmax-aarch64-neonfma-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55(
9#     size_t channels,                   x0, x20
10#     size_t output_width,               x1
11#     const float** input,               x2
12#     const float* weights,              x3, x19
13#     float* output,                     x4
14#     size_t input_stride,               x5
15#     size_t output_increment,           x6
16#     size_t input_offset,               x7
#     const float* zero,                 [sp] -> x17
#     const xnn_f32_minmax_params params [sp + 8] -> (x16)
#     (loaded at entry, before the 64-byte register-save frame is pushed)
19
20# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
21
22# inputs
23# i0  x8
24# i1  x9
25# i2 x10
26# i3 x11
27# i4 x12
28# i5 x13
29# i6 x14
30# i7 x15
31# i8 x16
32
33# weights.  Bias and 9 weights.
34# x19
35
36# accumulators
37# v0-v3
38
39# Input and weight paired values.
40# Inputs are even and weights are odd registers
41# v4  v5
42# v6  v7
43# v10 v11
44# v12 v13
45# v14 v15
46# v16 v17
47# v18 v19
48# v20 v21
49# v22 v23
50# v24 v25
51# v26 v27
52# v28 v29
53
54# Clamp v30 v31
55
56# unused v8 v9
57
58BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55
59
60        # Load zero, params pointer
61        LDP     x17, x16, [sp]
62
63        # Save x19-x20,d10-d15 on stack
64        STP     x19, x20, [sp, -64]!
65        STP     d10, d11, [sp, 16]
66        STP     d12, d13, [sp, 32]
67        STP     d14, d15, [sp, 48]
68
69        # Load min/max values
70        LD2R    {v30.4s, v31.4s}, [x16]
71
720:
73        # Load 9 input pointers
74        LDP     x8, x9, [x2]
75        LDP     x10, x11, [x2, 16]
76        LDP     x12, x13, [x2, 32]
77        LDP     x14, x15, [x2, 48]
78        LDR     x16, [x2, 64]
79
80        CMP     x8, x17                 // if i0 == zero
81        ADD     x8, x8, x7              // i0 += input_offset
82        CSEL    x8, x17, x8, EQ         //   i0 = zero, else += i0 + input_offset
83        CMP     x9, x17                 // if i1 == zero
84        ADD     x9, x9, x7              // i1 += input_offset
85        CSEL    x9, x17, x9, EQ         //   i1 = zero, else += i1 + input_offset
86        CMP     x10, x17                // if i2 == zero
87        ADD     x10, x10, x7            // i2 += input_offset
88        CSEL    x10, x17, x10, EQ       //   i2 = zero, else += i2 + input_offset
89        CMP     x11, x17                // if i3 == zero
90        ADD     x11, x11, x7            // i3 += input_offset
91        CSEL    x11, x17, x11, EQ       //   i3 = zero, else += i3 + input_offset
92        CMP     x12, x17                // if i4 == zero
93        ADD     x12, x12, x7            // i4 += input_offset
94        CSEL    x12, x17, x12, EQ       //   i4 = zero, else += i4 + input_offset
95        CMP     x13, x17                // if i5 == zero
96        ADD     x13, x13, x7            // i5 += input_offset
97        CSEL    x13, x17, x13, EQ       //   i5 = zero, else += i5 + input_offset
98        CMP     x14, x17                // if i6 == zero
99        ADD     x14, x14, x7            // i6 += input_offset
100        CSEL    x14, x17, x14, EQ       //   i6 = zero, else += i6 + input_offset
101        CMP     x15, x17                // if i7 == zero
102        ADD     x15, x15, x7            // i7 += input_offset
103        CSEL    x15, x17, x15, EQ       //   i7 = zero, else += i7 + input_offset
104        CMP     x16, x17                // if i8 == zero
105        ADD     x16, x16, x7            // i8 += input_offset
106        CSEL    x16, x17, x16, EQ       //   i8 = zero, else += i8 + input_offset
107
108        # input += input_stride
109        ADD     x2, x2, x5
110
111        # x20 := c = channels
112        # c -= 8
113        SUBS    x20, x0, 8
114        # x19 := w = weights
115        MOV     x19, x3
116
117        # skip main loop if c < 8
118        B.LO    3f
119
120          # SWP prologue
121
122          # Load vbias.lo
123          LD1     {v0.2S}, [x19], 8
124
125          # Load vbias.hi
126          LD1     {v1.2S}, [x19], 8
127
128          # Load vi0.lo
129          LD1     {v4.2S}, [x8], 8
130
131          # Load vk0.lo
132          LD1     {v5.2S}, [x19], 8
133
134          # Load vi0.hi
135          LD1     {v6.2S}, [x8], 8
136
137          # Load vk0.hi
138          LD1     {v7.2S}, [x19], 8
139
140          # Load vi1.lo
141          LD1     {v28.2S}, [x9], 8
142
143          # Load vk1.lo
144          LD1     {v29.2S}, [x19], 8
145
146          # Load vi1.hi
147          LD1     {v10.2S}, [x9], 8
148
149          # Load vk1.hi
150          LD1     {v11.2S}, [x19], 8
151
152          # Load vi2.lo
153          LD1     {v12.2S}, [x10], 8
154
155          # Load vk2.lo
156          LD1     {v13.2S}, [x19], 8
157
158          # Load vi2.hi
159          LD1     {v14.2S}, [x10], 8
160
161          # Load vk2.hi
162          LD1     {v15.2S}, [x19], 8
163
164          # Load vi3.lo
165          LD1     {v16.2S}, [x11], 8
166
167          # Load vk3.lo
168          LD1     {v17.2S}, [x19], 8
169
170          # Load vi3.hi
171          LD1     {v18.2S}, [x11], 8
172
173          # Load vk3.hi
174          LD1     {v19.2S}, [x19], 8
175
176          # Load vi4.lo
177          LD1     {v20.2S}, [x12], 8
178
179          # Load vk4.lo
180          LD1     {v21.2S}, [x19], 8
181
182          # Load vi4.hi
183          LD1     {v22.2S}, [x12], 8
184
185          # Load vk4.hi
186          LD1     {v23.2S}, [x19], 8
187
188          # Load vi5.lo
189          LD1     {v24.2S}, [x13], 8
190
191          # Load vk5.lo
192          LD1     {v25.2S}, [x19], 8
193
194          # Load vi5.hi
195          LD1     {v26.2S}, [x13], 8
196
197          # Load vk5.hi
198          LD1     {v27.2S}, [x19], 8
199
200          # vacc.lo += vi0.lo * vk0.lo
201          FMLA    v0.2S, v4.2S, v5.2S
202          # Load vi6.lo
203          LD1     {v4.2S}, [x14], 8
204
205          # Load vk6.lo
206          LD1     {v5.2S}, [x19], 8
207
208          # vacc.hi += vi0.hi * vk0.hi
209          FMLA    v1.2S, v6.2S, v7.2S
210          # Load vi6.hi
211          LD1     {v6.2S}, [x14], 8
212
213          # Load vk6.hi
214          LD1     {v7.2S}, [x19], 8
215
216          # vacc.lo += vi1.lo * vk0.lo
217          FMLA    v0.2S, v28.2S, v29.2S
218          # Load vi7.lo
219          LD1     {v28.2S}, [x15], 8
220
221          # Load vk7.lo
222          LD1     {v29.2S}, [x19], 8
223
224          # vacc.hi += vi1.hi * vk0.hi
225          FMLA    v1.2S, v10.2S, v11.2S
226          # Load vi7.hi
227          LD1     {v10.2S}, [x15], 8
228
229          # Load vk7.hi
230          LD1     {v11.2S}, [x19], 8
231
232          # vacc.lo += vi2.lo * vk2.lo
233          FMLA    v0.2S, v12.2S, v13.2S
234          # Load vi8.lo
235          LD1     {v12.2S}, [x16], 8
236
237          # Load vk8.lo
238          LD1     {v13.2S}, [x19], 8
239
240          # vacc.hi += vi2.hi * vk2.hi
241          FMLA    v1.2S, v14.2S, v15.2S
242          # Load vi8.hi
243          LD1     {v14.2S}, [x16], 8
244
245          # Load vk8.hi
246          LD1     {v15.2S}, [x19], 8
247
248          # Load vbias_next.lo
249          LD1     {v2.2S}, [x19], 8
250
251          # Load vbias_next.hi
252          LD1     {v3.2S}, [x19], 8
253
254          # vacc.lo += vi3.lo * vk3.lo
255          FMLA    v0.2S, v16.2S, v17.2S
256          # Load vi0_next.lo
257          LD1     {v16.2S}, [x8], 8
258
259          # Load vk0_next.lo
260          LD1     {v17.2S}, [x19], 8
261
262          # vacc.hi += vi3.hi * vk3.hi
263          FMLA    v1.2S, v18.2S, v19.2S
264          # Load vi0_next.hi
265          LD1     {v18.2S}, [x8], 8
266
267          # Load vk0_next.hi
268          LD1     {v19.2S}, [x19], 8
269
270          # vacc.lo += vi4.lo * vk4.lo
271          FMLA    v0.2S, v20.2S, v21.2S
272          # Load vi1_next.lo
273          LD1     {v20.2S}, [x9], 8
274
275          # Load vk1_next.lo
276          LD1     {v21.2S}, [x19], 8
277
278          # vacc.hi += vi4.hi * vk4.hi
279          FMLA    v1.2S, v22.2S, v23.2S
280          # Load vi1_next.hi
281          LD1     {v22.2S}, [x9], 8
282
283          # Load vk1_next.hi
284          LD1     {v23.2S}, [x19], 8
285
286          # vacc.lo += vi5.lo * vk5.lo
287          FMLA    v0.2S, v24.2S, v25.2S
288          # Load vi2_next.lo
289          LD1     {v24.2S}, [x10], 8
290
291          # Load vk2_next.lo
292          LD1     {v25.2S}, [x19], 8
293
294          # vacc.hi += vi5.hi * vk5.hi
295          FMLA    v1.2S, v26.2S, v27.2S
296          # Load vi2_next.hi
297          LD1     {v26.2S}, [x10], 8
298
299          # Load vk2_next.hi
300          LD1     {v27.2S}, [x19], 8
301
302          # vacc.lo += vi6.lo * vk6.lo
303          FMLA    v0.2S, v4.2S, v5.2S
304          # Load vi3_next.lo
305          LD1     {v4.2S}, [x11], 8
306
307          # Load vk3_next.lo
308          LD1     {v5.2S}, [x19], 8
309
310          # vacc.hi += vi6.hi * vk6.hi
311          FMLA    v1.2S, v6.2S, v7.2S
312          # Load vi3_next.hi
313          LD1     {v6.2S}, [x11], 8
314
315          # Load vk3_next.hi
316          LD1     {v7.2S}, [x19], 8
317
318          # vacc.lo += vi7.lo * vk7.lo
319          FMLA    v0.2S, v28.2S, v29.2S
320          # Load vi4_next.lo
321          LD1     {v28.2S}, [x12], 8
322
323          # Load vk4_next.lo
324          LD1     {v29.2S}, [x19], 8
325
326          # vacc.hi += vi7.hi * vk7.hi
327          FMLA    v1.2S, v10.2S, v11.2S
328          # Load vi4_next.hi
329          LD1     {v10.2S}, [x12], 8
330
331          # Load vk4_next.hi
332          LD1     {v11.2S}, [x19], 8
333
334          # vacc.lo += vi8.lo * vk8.lo
335          FMLA    v0.2S, v12.2S, v13.2S
336          # Load vi5_next.lo
337          LD1     {v12.2S}, [x13], 8
338
339          # Load vk5_next.lo
340          LD1     {v13.2S}, [x19], 8
341
342          # vacc.hi += vi8.hi * vk8.hi
343          FMLA    v1.2S, v14.2S, v15.2S
344          # Load vi5_next.hi
345          LD1     {v14.2S}, [x13], 8
346
347          # Load vk5_next.hi
348          LD1     {v15.2S}, [x19], 8
349
350          # vacc_next.lo += vi0_next.lo * vk0_next.lo
351          FMLA    v2.2S, v16.2S, v17.2S
352          # Load vi6_next.lo
353          LD1     {v16.2S}, [x14], 8
354
355          # vacc.lo = min(vacc.lo, vmin)
356          FMAX    v0.2S, v0.2S, v30.2S
357          # Load vk6_next.lo
358          LD1     {v17.2S}, [x19], 8
359
360          # vacc_next.hi += vi0_next.hi * vk0_next.hi
361          FMLA    v3.2S, v18.2S, v19.2S
362          # Load vi6_next.hi
363          LD1     {v18.2S}, [x14], 8
364
365          # vacc.hi = min(vacc.hi, vmin)
366          FMAX    v1.2S, v1.2S, v30.2S
367          # Load vk6_next.hi
368          LD1     {v19.2S}, [x19], 8
369
370          # vacc_next.lo += vi1_next.lo * vk1_next.lo
371          FMLA    v2.2S, v20.2S, v21.2S
372          # Load vi7_next.lo
373          LD1     {v20.2S}, [x15], 8
374
375          # vacc.lo = max(vacc.lo, vmax)
376          FMIN    v0.2S, v0.2S, v31.2S
377          # Load vk7_next.lo
378          LD1     {v21.2S}, [x19], 8
379
380          # vacc_next.hi += vi1_next.hi * vk1_next.hi
381          FMLA    v3.2S, v22.2S, v23.2S
382          # Load vi7_next.hi
383          LD1     {v22.2S}, [x15], 8
384
385          # vacc.hi = max(vacc.hi, vmax)
386          FMIN    v1.2S, v1.2S, v31.2S
387          # Load vk7_next.hi
388          LD1     {v23.2S}, [x19], 8
389
390          # vacc_next.lo += vi2_next.lo * vk2_next.lo
391          FMLA    v2.2S, v24.2S, v25.2S
392          # Load vi8_next.lo
393          LD1     {v24.2S}, [x16], 8
394
395          # Load vk8_next.lo
396          LD1     {v25.2S}, [x19], 8
397
398          # vacc_next.hi += vi2_next.hi * vk2_next.hi
399          FMLA    v3.2S, v26.2S, v27.2S
400          # Load vi8_next.hi
401          LD1     {v26.2S}, [x16], 8
402
403          # Store vacc
404          STP     d0, d1, [x4], 16
405
406          # c -= 8
407          SUBS    x20, x20, 8
408          # Load vk8_next.hi
409          LD1     {v27.2S}, [x19], 8
410
411          B.LO    2f
412
4131:
414            # SWP iteration
415
416            # Load vbias.lo
417            LD1     {v0.2S}, [x19], 8
418
419            # Load vbias.hi
420            LD1     {v1.2S}, [x19], 8
421
422            # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
423            FMLA    v2.2S, v4.2S, v5.2S
424            # Load vi0.lo
425            LD1     {v4.2S}, [x8], 8
426
427            # Load vk0.lo
428            LD1     {v5.2S}, [x19], 8
429
430            # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
431            FMLA    v3.2S, v6.2S, v7.2S
432            # Load vi0.hi
433            LD1     {v6.2S}, [x8], 8
434
435            # Load vk0.hi
436            LD1     {v7.2S}, [x19], 8
437
438            # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
439            FMLA    v2.2S, v28.2S, v29.2S
440            # Load vi1.lo
441            LD1     {v28.2S}, [x9], 8
442
443            # Load vk1.lo
444            LD1     {v29.2S}, [x19], 8
445
446            # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
447            FMLA    v3.2S, v10.2S, v11.2S
448            # Load vi1.hi
449            LD1     {v10.2S}, [x9], 8
450
451            # Load vk1.hi
452            LD1     {v11.2S}, [x19], 8
453
454            # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
455            FMLA    v2.2S, v12.2S, v13.2S
456            # Load vi2.lo
457            LD1     {v12.2S}, [x10], 8
458
459            # Load vk2.lo
460            LD1     {v13.2S}, [x19], 8
461
462            # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
463            FMLA    v3.2S, v14.2S, v15.2S
464            # Load vi2.hi
465            LD1     {v14.2S}, [x10], 8
466
467            # Load vk2.hi
468            LD1     {v15.2S}, [x19], 8
469
470            # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
471            FMLA    v2.2S, v16.2S, v17.2S
472            # Load vi3.lo
473            LD1     {v16.2S}, [x11], 8
474
475            # Load vk3.lo
476            LD1     {v17.2S}, [x19], 8
477
478            # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
479            FMLA    v3.2S, v18.2S, v19.2S
480            # Load vi3.hi
481            LD1     {v18.2S}, [x11], 8
482
483            # Load vk3.hi
484            LD1     {v19.2S}, [x19], 8
485
486            # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
487            FMLA    v2.2S, v20.2S, v21.2S
488            # Load vi4.lo
489            LD1     {v20.2S}, [x12], 8
490
491            # Load vk4.lo
492            LD1     {v21.2S}, [x19], 8
493
494            # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
495            FMLA    v3.2S, v22.2S, v23.2S
496            # Load vi4.hi
497            LD1     {v22.2S}, [x12], 8
498
499            # Load vk4.hi
500            LD1     {v23.2S}, [x19], 8
501
502            # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
503            FMLA    v2.2S, v24.2S, v25.2S
504            # Load vi5.lo
505            LD1     {v24.2S}, [x13], 8
506
507            # Load vk5.lo
508            LD1     {v25.2S}, [x19], 8
509
510            # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
511            FMLA    v3.2S, v26.2S, v27.2S
512            # Load vi5.hi
513            LD1     {v26.2S}, [x13], 8
514
515            # Load vk5.hi
516            LD1     {v27.2S}, [x19], 8
517
518            # vacc.lo += vi0.lo * vk0.lo
519            FMLA    v0.2S, v4.2S, v5.2S
520            # Load vi6.lo
521            LD1     {v4.2S}, [x14], 8
522
523            # vacc_prev.lo = min(vacc_prev.lo, vmin)
524            FMAX    v2.2S, v2.2S, v30.2S
525            # Load vk6.lo
526            LD1     {v5.2S}, [x19], 8
527
528            # vacc.hi += vi0.hi * vk0.hi
529            FMLA    v1.2S, v6.2S, v7.2S
530            # Load vi6.hi
531            LD1     {v6.2S}, [x14], 8
532
533            # vacc_prev.hi = min(vacc_prev.hi, vmin)
534            FMAX    v3.2S, v3.2S, v30.2S
535            # Load vk6.hi
536            LD1     {v7.2S}, [x19], 8
537
538            # vacc.lo += vi1.lo * vk0.lo
539            FMLA    v0.2S, v28.2S, v29.2S
540            # Load vi7.lo
541            LD1     {v28.2S}, [x15], 8
542
543            # vacc_prev.lo = max(vacc_prev.lo, vmax)
544            FMIN    v2.2S, v2.2S, v31.2S
545            # Load vk7.lo
546            LD1     {v29.2S}, [x19], 8
547
548            # vacc.hi += vi1.hi * vk0.hi
549            FMLA    v1.2S, v10.2S, v11.2S
550            # Load vi7.hi
551            LD1     {v10.2S}, [x15], 8
552
553            # vacc_prev.lo = max(vacc_prev.lo, vmax)
554            FMIN    v3.2S, v3.2S, v31.2S
555            # Load vk7.hi
556            LD1     {v11.2S}, [x19], 8
557
558            # vacc.lo += vi2.lo * vk2.lo
559            FMLA    v0.2S, v12.2S, v13.2S
560            # Load vi8.lo
561            LD1     {v12.2S}, [x16], 8
562
563            # Load vk8.lo
564            LD1     {v13.2S}, [x19], 8
565
566            # vacc.hi += vi2.hi * vk2.hi
567            FMLA    v1.2S, v14.2S, v15.2S
568            # Load vi8.hi
569            LD1     {v14.2S}, [x16], 8
570
571            # Store vacc_prev
572            STP     d2, d3, [x4], 16
573
574            # Load vk8.hi
575            LD1     {v15.2S}, [x19], 8
576
577            # Load vbias_next.lo
578            LD1     {v2.2S}, [x19], 8
579
580            # Load vbias_next.hi
581            LD1     {v3.2S}, [x19], 8
582
583            # vacc.lo += vi3.lo * vk3.lo
584            FMLA    v0.2S, v16.2S, v17.2S
585            # Load vi0_next.lo
586            LD1     {v16.2S}, [x8], 8
587
588            # Load vk0_next.lo
589            LD1     {v17.2S}, [x19], 8
590
591            # vacc.hi += vi3.hi * vk3.hi
592            FMLA    v1.2S, v18.2S, v19.2S
593            # Load vi0_next.hi
594            LD1     {v18.2S}, [x8], 8
595
596            # Load vk0_next.hi
597            LD1     {v19.2S}, [x19], 8
598
599            # vacc.lo += vi4.lo * vk4.lo
600            FMLA    v0.2S, v20.2S, v21.2S
601            # Load vi1_next.lo
602            LD1     {v20.2S}, [x9], 8
603
604            # Load vk1_next.lo
605            LD1     {v21.2S}, [x19], 8
606
607            # vacc.hi += vi4.hi * vk4.hi
608            FMLA    v1.2S, v22.2S, v23.2S
609            # Load vi1_next.hi
610            LD1     {v22.2S}, [x9], 8
611
612            # Load vk1_next.hi
613            LD1     {v23.2S}, [x19], 8
614
615            # vacc.lo += vi5.lo * vk5.lo
616            FMLA    v0.2S, v24.2S, v25.2S
617            # Load vi2_next.lo
618            LD1     {v24.2S}, [x10], 8
619
620            # Load vk2_next.lo
621            LD1     {v25.2S}, [x19], 8
622
623            # vacc.hi += vi5.hi * vk5.hi
624            FMLA    v1.2S, v26.2S, v27.2S
625            # Load vi2_next.hi
626            LD1     {v26.2S}, [x10], 8
627
628            # Load vk2_next.hi
629            LD1     {v27.2S}, [x19], 8
630
631            # vacc.lo += vi6.lo * vk6.lo
632            FMLA    v0.2S, v4.2S, v5.2S
633            # Load vi3_next.lo
634            LD1     {v4.2S}, [x11], 8
635
636            # Load vk3_next.lo
637            LD1     {v5.2S}, [x19], 8
638
639            # vacc.hi += vi6.hi * vk6.hi
640            FMLA    v1.2S, v6.2S, v7.2S
641            # Load vi3_next.hi
642            LD1     {v6.2S}, [x11], 8
643
644            # Load vk3_next.hi
645            LD1     {v7.2S}, [x19], 8
646
647            # vacc.lo += vi7.lo * vk7.lo
648            FMLA    v0.2S, v28.2S, v29.2S
649            # Load vi4_next.lo
650            LD1     {v28.2S}, [x12], 8
651
652            # Load vk4_next.lo
653            LD1     {v29.2S}, [x19], 8
654
655            # vacc.hi += vi7.hi * vk7.hi
656            FMLA    v1.2S, v10.2S, v11.2S
657            # Load vi4_next.hi
658            LD1     {v10.2S}, [x12], 8
659
660            # Load vk4_next.hi
661            LD1     {v11.2S}, [x19], 8
662
663            # vacc.lo += vi8.lo * vk8.lo
664            FMLA    v0.2S, v12.2S, v13.2S
665            # Load vi5_next.lo
666            LD1     {v12.2S}, [x13], 8
667
668            # Load vk5_next.lo
669            LD1     {v13.2S}, [x19], 8
670
671            # vacc.hi += vi8.hi * vk8.hi
672            FMLA    v1.2S, v14.2S, v15.2S
673            # Load vi5_next.hi
674            LD1     {v14.2S}, [x13], 8
675
676            # Load vk5_next.hi
677            LD1     {v15.2S}, [x19], 8
678
679            # vacc_next.lo += vi0_next.lo * vk0_next.lo
680            FMLA    v2.2S, v16.2S, v17.2S
681            # Load vi6_next.lo
682            LD1     {v16.2S}, [x14], 8
683
684            # vacc.lo = min(vacc.lo, vmin)
685            FMAX    v0.2S, v0.2S, v30.2S
686            # Load vk6_next.lo
687            LD1     {v17.2S}, [x19], 8
688
689            # vacc_next.hi += vi0_next.hi * vk0_next.hi
690            FMLA    v3.2S, v18.2S, v19.2S
691            # Load vi6_next.hi
692            LD1     {v18.2S}, [x14], 8
693
694            # vacc.hi = min(vacc.hi, vmin)
695            FMAX    v1.2S, v1.2S, v30.2S
696            # Load vk6_next.hi
697            LD1     {v19.2S}, [x19], 8
698
699            # vacc_next.lo += vi1_next.lo * vk1_next.lo
700            FMLA    v2.2S, v20.2S, v21.2S
701            # Load vi7_next.lo
702            LD1     {v20.2S}, [x15], 8
703
704            # vacc.lo = max(vacc.lo, vmax)
705            FMIN    v0.2S, v0.2S, v31.2S
706            # Load vk7_next.lo
707            LD1     {v21.2S}, [x19], 8
708
709            # vacc_next.hi += vi1_next.hi * vk1_next.hi
710            FMLA    v3.2S, v22.2S, v23.2S
711            # Load vi7_next.hi
712            LD1     {v22.2S}, [x15], 8
713
714            # vacc.hi = max(vacc.hi, vmax)
715            FMIN    v1.2S, v1.2S, v31.2S
716            # Load vk7_next.hi
717            LD1     {v23.2S}, [x19], 8
718
719            # vacc_next.lo += vi2_next.lo * vk2_next.lo
720            FMLA    v2.2S, v24.2S, v25.2S
721            # Load vi8_next.lo
722            LD1     {v24.2S}, [x16], 8
723
724            # Load vk8_next.lo
725            LD1     {v25.2S}, [x19], 8
726
727            # vacc_next.hi += vi2_next.hi * vk2_next.hi
728            FMLA    v3.2S, v26.2S, v27.2S
729            # Load vi8_next.hi
730            LD1     {v26.2S}, [x16], 8
731
732            # Store vacc
733            STP     d0, d1, [x4], 16
734
735            # c -= 8
736            SUBS    x20, x20, 8
737            # Load vk8_next.hi
738            LD1     {v27.2S}, [x19], 8
739
740            B.HS    1b
741
7422:
743          # SWP epilogue
744
745          # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
746          FMLA    v2.2S, v4.2S, v5.2S
747
748          # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
749          FMLA    v3.2S, v6.2S, v7.2S
750
751          # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
752          FMLA    v2.2S, v28.2S, v29.2S
753
754          # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
755          FMLA    v3.2S, v10.2S, v11.2S
756
757          # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
758          FMLA    v2.2S, v12.2S, v13.2S
759
760          # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
761          FMLA    v3.2S, v14.2S, v15.2S
762
763          # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
764          FMLA    v2.2S, v16.2S, v17.2S
765
766          # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
767          FMLA    v3.2S, v18.2S, v19.2S
768
769          # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
770          FMLA    v2.2S, v20.2S, v21.2S
771
772          # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
773          FMLA    v3.2S, v22.2S, v23.2S
774
775          # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
776          FMLA    v2.2S, v24.2S, v25.2S
777
778          # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
779          FMLA    v3.2S, v26.2S, v27.2S
780
781          # vacc_prev.lo = min(vacc_prev.lo, vmin)
782          FMAX    v2.2S, v2.2S, v30.2S
783
784          # vacc_prev.hi = min(vacc_prev.hi, vmin)
785          FMAX    v3.2S, v3.2S, v30.2S
786
787          # vacc_prev.lo = max(vacc_prev.lo, vmax)
788          FMIN    v2.2S, v2.2S, v31.2S
789
790          # vacc_prev.lo = max(vacc_prev.lo, vmax)
791          FMIN    v3.2S, v3.2S, v31.2S
792
793          # Store vacc_prev
794          STP     d2, d3, [x4], 16
795
7963:
797        # Is there a remainder? - 4 channels
798        TBZ     x20, 2, 4f
799
800        LDR     q10, [x8], 16           // load 9 inputs
801        LDP     q0, q1, [x19], 32       // load bias and 9 weights
802        LDR     q11, [x9], 16
803        LDR     q12, [x10], 16
804        LDR     q13, [x11], 16
805        LDR     q14, [x12], 16
806        LDR     q15, [x13], 16
807        LDR     q16, [x14], 16
808        LDR     q17, [x15], 16
809        LDR     q18, [x16], 16
810        LDP     q2, q3, [x19], 32
811        LDP     q4, q5, [x19], 32
812        LDP     q6, q7, [x19], 32
813        LDP     q28, q29, [x19], 32
814
815        FMLA    v0.4S, v1.4S, v10.4S
816        FMLA    v0.4S, v2.4S, v11.4S
817        FMLA    v0.4S, v3.4S, v12.4S
818        FMLA    v0.4S, v4.4S, v13.4S
819        FMLA    v0.4S, v5.4S, v14.4S
820        FMLA    v0.4S, v6.4S, v15.4S
821        FMLA    v0.4S, v7.4S, v16.4S
822        FMLA    v0.4S, v28.4S, v17.4S
823        FMLA    v0.4S, v29.4S, v18.4S
824
825        FMAX    v0.4S, v0.4S, v30.4S
826        FMIN    v0.4S, v0.4S, v31.4S
827
828        STR     q0, [x4], 16
829
8304:
831        # Is there a remainder?- 1 to 3 channels
832        TST     x20, 3
833        B.EQ    6f
834
835        LDR     q10, [x8], 16           // load 9 inputs
836        LDP     q0, q1, [x19], 32       // load bias and 9 weights
837        LDR     q11, [x9], 16
838        LDR     q12, [x10], 16
839        LDR     q13, [x11], 16
840        LDR     q14, [x12], 16
841        LDR     q15, [x13], 16
842        LDR     q16, [x14], 16
843        LDR     q17, [x15], 16
844        LDR     q18, [x16], 16
845        LDP     q2, q3, [x19], 32
846        LDP     q4, q5, [x19], 32
847        LDP     q6, q7, [x19], 32
848        LDP     q28, q29, [x19], 32
849
850        FMLA    v0.4S, v1.4S, v10.4S
851        FMLA    v0.4S, v2.4S, v11.4S
852        FMLA    v0.4S, v3.4S, v12.4S
853        FMLA    v0.4S, v4.4S, v13.4S
854        FMLA    v0.4S, v5.4S, v14.4S
855        FMLA    v0.4S, v6.4S, v15.4S
856        FMLA    v0.4S, v7.4S, v16.4S
857        FMLA    v0.4S, v28.4S, v17.4S
858        FMLA    v0.4S, v29.4S, v18.4S
859
860        FMAX    v0.4S, v0.4S, v30.4S
861        FMIN    v0.4S, v0.4S, v31.4S
862
863        TBZ     x20, 1, 5f
864
865        STR     d0, [x4], 8
866        DUP     d0, v0.D[1]
867        TBZ     x20, 0, 6f
8685:
869        STR     s0, [x4], 4
8706:
871        # output_width -= 1
872        SUBS    x1, x1, 1
873        # output += output_increment
874        ADD     x4, x4, x6
875        # process next pixel if output_width != 0
876        B.NE    0b
877
878        # Restore x19-x20,d10-d15 from stack
879        LDP     d14, d15, [sp, 48]
880        LDP     d12, d13, [sp, 32]
881        LDP     d10, d11, [sp, 16]
882        LDP     x19, x20, [sp], 64
883        RET
884
885END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55
886
887#ifdef __ELF__
888.section ".note.GNU-stack","",%progbits
889#endif
890