xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv/up4x9-minmax-aarch64-neonfma.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
#     size_t channels,                   x0, x20
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3, x19
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               x7
#     const float* zero,                 [sp] -> x17
#     const xnn_f32_minmax_params params [sp + 8] -> (x16)
19
20# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
21
22# inputs
23# i0  x8 v21
24# i1  x9 v22
25# i2 x10 v23
26# i3 x11 v24
27# i4 x12 v25
28# i5 x13 v26
29# i6 x14 v27
30# i7 x15 v28
31# i8 x16 v29
32
33# weights
34# x19 v0 (acc) v1 v2 v3 v4 v5 v6 v7 v16 v17
35
36# Clamp v30 v31
37
38# unused v18 v19 v20
39
// 9-tap depthwise convolution, four channels per vector step, with clamping.
//
// For each of output_width pixels the kernel:
//   1. reads nine row pointers i0..i8 from input[0..8]; a pointer equal to
//      `zero` is used as is, any other pointer is advanced by input_offset;
//   2. advances `input` by input_stride;
//   3. walks the channels four at a time, computing
//          acc = bias + sum(k = 0..8) w[k] * i[k]
//      from a sequential weight stream of ten 4-float vectors per step
//      (bias first, then the nine taps), clamps acc and stores it;
//   4. advances `output` by output_increment.
// The weight pointer restarts from `weights` for every pixel.
//
// Register roles:
//   x8..x16   i0..i8 row pointers (x16 first carries the params pointer)
//   x17       zero
//   x19       w, running weight pointer
//   x20       c, channel counter (kept 4 below the true remaining count)
//   v0        bias / accumulator      v1..v7, v16, v17  the nine taps
//   v21..v29  input vectors           v30, v31          clamp bounds

// Rebase one row pointer:
//     ptr = (ptr == zero) ? zero : ptr + input_offset
// ADD does not write the flags, so CSEL still sees the result of CMP.
.macro DWCONV_UP4X9_REBASE ptr
        CMP     \ptr, x17
        ADD     \ptr, \ptr, x7
        CSEL    \ptr, x17, \ptr, EQ
.endm

// One 4-channel step up to, but not including, the clamp.
// Loads 16 bytes from each of the nine row pointers and 160 bytes of weights
// (all post-incremented), then folds the nine products into v0.
// Does not touch the condition flags.
.macro DWCONV_UP4X9_ACCUMULATE
        LDR     q21, [x8], 16
        LDP     q0, q1, [x19], 32
        LDR     q22, [x9], 16
        LDR     q23, [x10], 16
        LDR     q24, [x11], 16
        LDR     q25, [x12], 16
        LDR     q26, [x13], 16
        LDR     q27, [x14], 16
        LDR     q28, [x15], 16
        LDR     q29, [x16], 16
        LDP     q2, q3, [x19], 32
        LDP     q4, q5, [x19], 32
        LDP     q6, q7, [x19], 32
        LDP     q16, q17, [x19], 32

        FMLA    v0.4S, v1.4S, v21.4S
        FMLA    v0.4S, v2.4S, v22.4S
        FMLA    v0.4S, v3.4S, v23.4S
        FMLA    v0.4S, v4.4S, v24.4S
        FMLA    v0.4S, v5.4S, v25.4S
        FMLA    v0.4S, v6.4S, v26.4S
        FMLA    v0.4S, v7.4S, v27.4S
        FMLA    v0.4S, v16.4S, v28.4S
        FMLA    v0.4S, v17.4S, v29.4S
.endm

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        // The two stack arguments sit at the entry sp: x17 = zero, x16 = params.
        // They are fetched before the push below moves sp.
        LDP     x17, x16, [sp]

        // x19 and x20 are callee-saved; they serve as w and c in this kernel.
        STP     x19, x20, [sp, -16]!

        // Broadcast the two consecutive floats at params across all lanes:
        // v30 is the lower bound (applied with FMAX), v31 the upper (FMIN).
        LD2R    {v30.4s, v31.4s}, [x16]

0:
        // Per-pixel setup: fetch i0..i8 into x8..x16 (x16 is free again now
        // that the clamp bounds have been loaded).
        LDP     x8, x9, [x2]
        LDP     x10, x11, [x2, 16]
        LDP     x12, x13, [x2, 32]
        LDP     x14, x15, [x2, 48]
        LDR     x16, [x2, 64]

        // Apply input_offset to every row pointer that is not `zero`.
        DWCONV_UP4X9_REBASE x8
        DWCONV_UP4X9_REBASE x9
        DWCONV_UP4X9_REBASE x10
        DWCONV_UP4X9_REBASE x11
        DWCONV_UP4X9_REBASE x12
        DWCONV_UP4X9_REBASE x13
        DWCONV_UP4X9_REBASE x14
        DWCONV_UP4X9_REBASE x15
        DWCONV_UP4X9_REBASE x16

        // input += input_stride
        ADD     x2, x2, x5

        // c = channels - 4. A borrow (LO) means fewer than 4 channels.
        SUBS    x20, x0, 4
        // w = weights. MOV leaves the flags from SUBS untouched.
        MOV     x19, x3

        // Fewer than 4 channels: nothing for the full-width loop to do.
        B.LO    2f

1:
        // Full group of 4 channels.
        DWCONV_UP4X9_ACCUMULATE
        // c -= 4. The clamp and store below do not alter the flags,
        // so B.HS at the bottom still tests this subtraction.
        SUBS    x20, x20, 4

        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S
        STR     q0, [x4], 16
        // Loop again while at least 4 more channels remain.
        B.HS    1b

2:
        // x20 is congruent to channels modulo 4 here, so its low two bits are
        // the number of leftover channels (0 to 3).
        TST     x20, 3
        B.EQ    4f

        // Leftover of 1 to 3 channels: a full 4-lane step is still loaded and
        // computed; only the stores below are narrowed.
        DWCONV_UP4X9_ACCUMULATE

        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S

        // Bit 1 clear: exactly one channel left.
        TBZ     x20, 1, 3f

        // Store lanes 0-1, then bring lanes 2-3 down into d0.
        STR     d0, [x4], 8
        DUP     d0, v0.D[1]
        // Bit 0 clear: exactly two channels, nothing more to store.
        TBZ     x20, 0, 4f
3:
        // Store the single remaining lane.
        STR     s0, [x4], 4
4:
        // output_width -= 1
        SUBS    x1, x1, 1
        // output += output_increment (ADD keeps the flags for B.NE)
        ADD     x4, x4, x6
        // Next pixel unless output_width has reached zero.
        B.NE    0b

        // Restore the callee-saved pair and return.
        LDP     x19, x20, [sp], 16
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
187