// xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv/up4x9-minmax-aarch64-neonfma.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
//     size_t channels,                   x0, x20
//     size_t output_width,               x1
//     const float** input,               x2
//     const float* weights,              x3, x19
//     float* output,                     x4
//     size_t input_stride,               x5
//     size_t output_increment,           x6
//     size_t input_offset,               x7
//     const float* zero,                 [sp] on entry     -> x17
//     const xnn_f32_minmax_params params [sp + 8] on entry -> (x16)
//
// For every output pixel the kernel reads 9 input row pointers, and for each
// group of 4 channels computes
//     acc = bias + sum(k = 0..8) weight[k] * input[k]
// clamps acc to the two values loaded from params, and stores the result.
// A trailing group of 1 to 3 channels is computed as a full vector and only
// the valid lanes are stored.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches x19 and x20 only, and saves/restores both.

// inputs (pointer register, data register)
// i0  x8 v21
// i1  x9 v22
// i2 x10 v23
// i3 x11 v24
// i4 x12 v25
// i5 x13 v26
// i6 x14 v27
// i7 x15 v28
// i8 x16 v29

// weights
// x19 v0 (acc) v1 v2 v3 v4 v5 v6 v7 v16 v17

// Clamp v30 v31
// v30 = first float at params, applied with FMAX (lower bound)
// v31 = second float at params, applied with FMIN (upper bound)

// unused v18 v19 v20

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        // Load the two stack-passed arguments before sp is adjusted:
        // x17 = zero, x16 = params pointer.
        LDP     x17, x16, [sp]

        // Save callee-saved x19, x20 on the stack (used below for w and c).
        STP     x19, x20, [sp, -16]!

        // Broadcast the two clamp values: v30 = params[0], v31 = params[1].
        // x16 is not needed as the params pointer after this and is reused
        // for i8 below.
        LD2R    {v30.4s, v31.4s}, [x16]

0:
        // Start of one output pixel: load 9 input pointers.
        LDP     x8, x9, [x2]
        LDP     x10, x11, [x2, 16]
        LDP     x12, x13, [x2, 32]
        LDP     x14, x15, [x2, 48]
        LDR     x16, [x2, 64]

        // A pointer equal to zero is left unchanged; every other pointer gets
        // input_offset added. ADD does not write the flags, so each CSEL
        // still sees the result of the CMP two instructions earlier.
        CMP     x8, x17                 // if i0 == zero
        ADD     x8, x8, x7              //   candidate i0 + input_offset
        CSEL    x8, x17, x8, EQ         //   i0 = zero, else i0 + input_offset
        CMP     x9, x17                 // if i1 == zero
        ADD     x9, x9, x7              //   candidate i1 + input_offset
        CSEL    x9, x17, x9, EQ         //   i1 = zero, else i1 + input_offset
        CMP     x10, x17                // if i2 == zero
        ADD     x10, x10, x7            //   candidate i2 + input_offset
        CSEL    x10, x17, x10, EQ       //   i2 = zero, else i2 + input_offset
        CMP     x11, x17                // if i3 == zero
        ADD     x11, x11, x7            //   candidate i3 + input_offset
        CSEL    x11, x17, x11, EQ       //   i3 = zero, else i3 + input_offset
        CMP     x12, x17                // if i4 == zero
        ADD     x12, x12, x7            //   candidate i4 + input_offset
        CSEL    x12, x17, x12, EQ       //   i4 = zero, else i4 + input_offset
        CMP     x13, x17                // if i5 == zero
        ADD     x13, x13, x7            //   candidate i5 + input_offset
        CSEL    x13, x17, x13, EQ       //   i5 = zero, else i5 + input_offset
        CMP     x14, x17                // if i6 == zero
        ADD     x14, x14, x7            //   candidate i6 + input_offset
        CSEL    x14, x17, x14, EQ       //   i6 = zero, else i6 + input_offset
        CMP     x15, x17                // if i7 == zero
        ADD     x15, x15, x7            //   candidate i7 + input_offset
        CSEL    x15, x17, x15, EQ       //   i7 = zero, else i7 + input_offset
        CMP     x16, x17                // if i8 == zero
        ADD     x16, x16, x7            //   candidate i8 + input_offset
        CSEL    x16, x17, x16, EQ       //   i8 = zero, else i8 + input_offset

        // input += input_stride
        ADD     x2, x2, x5

        // x20 := c = channels
        // c -= 4
        SUBS    x20, x0, 4
        // x19 := w = weights (restarted from the beginning for every pixel)
        MOV     x19, x3

        // Skip the main loop if channels < 4: B.LO is taken when the SUBS
        // above borrowed. MOV does not write the flags.
        B.LO    2f
1:
        // Main loop: 4 channels per iteration.
        LDR     q21, [x8], 16           // load 9 inputs
        LDP     q0, q1, [x19], 32       // load bias and 9 weights
        LDR     q22, [x9], 16
        LDR     q23, [x10], 16
        LDR     q24, [x11], 16
        LDR     q25, [x12], 16
        LDR     q26, [x13], 16
        LDR     q27, [x14], 16
        LDR     q28, [x15], 16
        LDR     q29, [x16], 16
        LDP     q2, q3, [x19], 32
        LDP     q4, q5, [x19], 32
        LDP     q6, q7, [x19], 32
        LDP     q16, q17, [x19], 32

        // v0 starts as the bias and accumulates weight[k] * input[k].
        FMLA    v0.4S, v1.4S, v21.4S
        FMLA    v0.4S, v2.4S, v22.4S
        FMLA    v0.4S, v3.4S, v23.4S
        FMLA    v0.4S, v4.4S, v24.4S
        FMLA    v0.4S, v5.4S, v25.4S
        FMLA    v0.4S, v6.4S, v26.4S
        FMLA    v0.4S, v7.4S, v27.4S
        FMLA    v0.4S, v16.4S, v28.4S
        FMLA    v0.4S, v17.4S, v29.4S
        // c -= 4. The flags set here are consumed by B.HS below; FMAX, FMIN
        // and STR do not write them.
        SUBS    x20, x20, 4

        // Clamp and store 4 outputs.
        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S
        STR     q0, [x4], 16
        // Loop while the SUBS above did not borrow, i.e. at least 4 channels
        // are still left.
        B.HS    1b

2:
        // Is there a remainder? - 1 to 3 channels.
        // x20 holds (channels left - 4) here, so its low two bits equal the
        // number of channels left.
        TST     x20, 3
        B.EQ    4f

        // Compute a full 4-lane result for the remainder; only the valid
        // lanes are stored below.
        // NOTE(review): this reads 16 bytes from every input row and 160 bytes
        // of weights even when fewer than 4 channels remain - presumably the
        // caller guarantees that over-read is safe; confirm.
        LDR     q21, [x8], 16           // load 9 inputs
        LDP     q0, q1, [x19], 32       // load bias and 9 weights
        LDR     q22, [x9], 16
        LDR     q23, [x10], 16
        LDR     q24, [x11], 16
        LDR     q25, [x12], 16
        LDR     q26, [x13], 16
        LDR     q27, [x14], 16
        LDR     q28, [x15], 16
        LDR     q29, [x16], 16
        LDP     q2, q3, [x19], 32
        LDP     q4, q5, [x19], 32
        LDP     q6, q7, [x19], 32
        LDP     q16, q17, [x19], 32

        FMLA    v0.4S, v1.4S, v21.4S
        FMLA    v0.4S, v2.4S, v22.4S
        FMLA    v0.4S, v3.4S, v23.4S
        FMLA    v0.4S, v4.4S, v24.4S
        FMLA    v0.4S, v5.4S, v25.4S
        FMLA    v0.4S, v6.4S, v26.4S
        FMLA    v0.4S, v7.4S, v27.4S
        FMLA    v0.4S, v16.4S, v28.4S
        FMLA    v0.4S, v17.4S, v29.4S

        FMAX    v0.4S, v0.4S, v30.4S
        FMIN    v0.4S, v0.4S, v31.4S

        // Bit 1 clear: exactly 1 channel left, store a single float.
        TBZ     x20, 1, 3f

        // Bit 1 set: store 2 floats, then move lanes 2-3 down to lanes 0-1.
        STR     d0, [x4], 8
        DUP     d0, v0.D[1]
        // Bit 0 clear: exactly 2 channels were left, nothing more to store.
        TBZ     x20, 0, 4f
3:
        // Store the last single float (channel 1 of 1, or channel 3 of 3).
        STR     s0, [x4], 4
4:
        // output_width -= 1
        SUBS    x1, x1, 1
        // output += output_increment (ADD leaves the SUBS flags intact)
        ADD     x4, x4, x6
        // process next pixel if output_width != 0
        B.NE    0b

        // Restore x19,x20 from stack
        LDP     x19, x20, [sp], 16
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

// On ELF targets, emit an empty .note.GNU-stack section so the linker does not
// mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif