xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
9#     size_t mr,                         (x0) - unused.  mr = 1
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const float*restrict w,            x5
15#     float*restrict c,                  x6
16#     size_t cm_stride,                  (x7) - unused
17#     size_t cn_stride,                  [sp] -> x10
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointer
25# x8  a0
26
27# C pointer
28# x6  c0
29
30# A53 based on a53/75 but with LD64
31
32BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53
33
        # Overview: for each group of 8 output columns, start from the 8 bias
        # floats read from w, accumulate a0[k] * w[k][0..7] for every A pointer
        # read from the indirection buffer a, clamp the sums and store them at c0.
        #
        # Register roles inside the body:
        #   x0      k  - byte counter over kc for the current A pointer
        #   x9      p  - bytes of indirection pointers still to process
        #   x8      a0 - current A pointer (first used to hold the params pointer)
        #   v0, v1  4 floats of A each
        #   v16-v17 accumulators fed by A lanes 0 and 2 (initialised with the bias)
        #   v18-v19 accumulators fed by A lanes 1 and 3 (initialised to zero),
        #           added into v16-v17 before clamping
        #   v20-v27 weights: 4 k-steps x 8 columns
        #   v30     first float of params, applied with FMAX (lower bound)
        #   v31     second float of params, applied with FMIN (upper bound)
        #
        # Only caller-saved registers are used and sp is never modified, so
        # there is no prologue or epilogue stack traffic.

34        # Load cn_stride, a_offset
35        LDP     x10, x11, [sp]
36
37        # Load zero, params pointer
38        LDP     x12, x8, [sp, 16]
39
40        # Load min/max values: two consecutive floats at params, each
        # broadcast to all four lanes (v30 = first, v31 = second).
41        LD2R    {v30.4s, v31.4s}, [x8]
42
        # nc loop entry: one pass per group of up to 8 output columns.
430:
44        # Load initial bias from w into accumulators
45        LDP     q16, q17, [x5], 32
46        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
47        $if PREFETCH:
48          PRFM    PLDL1KEEP, [x5, 64]
49        MOVI    v19.4s, 0
50        $if PREFETCH:
51          PRFM    PLDL1KEEP, [x5, 128]
52          PRFM    PLDL1KEEP, [x5, 192]
53          PRFM    PLDL1KEEP, [x5, 256]
54          PRFM    PLDL1KEEP, [x5, 320]
55          PRFM    PLDL1KEEP, [x5, 384]
56          PRFM    PLDL1KEEP, [x5, 448]
57          PRFM    PLDL1KEEP, [x5, 512]
58          PRFM    PLDL1KEEP, [x5, 576]
59
60        MOV     x9, x3                  // p = ks
61
        # ks loop entry: one pass per pointer in the indirection buffer.
621:
63        # Load next A pointer
64        LDR     x8, [x4], 8
65
        # The offset is added unconditionally and then discarded by CSEL when
        # the pointer equals zero, so the zero buffer is never offset.
66        CMP     x8, x12                 // if a0 == zero
67        ADD     x8, x8, x11             // a0 += a_offset
68        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 + a_offset
69
70        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        # If not, go straight to the remainder code at 5.
71        SUBS    x0, x2, 32              // k = kc - 32
72        B.LO    5f
73
74        # Prologue
75        # Read first block of A (4 floats) and B (4 k-steps x 8 floats).
76        LDP     q20, q21, [x5], 32
77        LDP     q22, q23, [x5], 32
78        LDP     q24, q25, [x5], 32
79        LDP     q26, q27, [x5], 32
80        LDR     q0, [x8], 16
81
82        # Are there at least 8 more floats (32 bytes)?  If yes run the main
        # loop, otherwise only the epilogue at 3 is needed.
83        SUBS    x0, x0, 32
84        B.LO    3f
85
86        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: each LDR of a weight register refills a register
        # that an earlier FMLA in the same block has already consumed.
        # NOTE(review): the strict FMLA/LDR alternation is presumably tuned for
        # Cortex-A53 issue rules - do not reorder without measuring.
872:
88        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
89        FMLA    v16.4s, v20.4s, v0.s[0]
90        LDR     q1, [x8], 16
91        FMLA    v17.4s, v21.4s, v0.s[0]
92        LDR     q20, [x5], 16
93        FMLA    v18.4s, v22.4s, v0.s[1]
94        LDR     q21, [x5], 16
95        FMLA    v19.4s, v23.4s, v0.s[1]
96        LDR     q22, [x5], 16
97        FMLA    v16.4s, v24.4s, v0.s[2]
98        LDR     q23, [x5], 16
99        FMLA    v17.4s, v25.4s, v0.s[2]
100        LDR     q24, [x5], 16
101        FMLA    v18.4s, v26.4s, v0.s[3]
102        LDR     q25, [x5], 16
103        FMLA    v19.4s, v27.4s, v0.s[3]
104        LDR     q26, [x5], 16
105        LDR     q27, [x5], 16
106
107        $if PREFETCH:
108          PRFM    PLDL1KEEP, [x5, 384]
109          PRFM    PLDL1KEEP, [x5, 448]
110          PRFM    PLDL1KEEP, [x5, 512]
111          PRFM    PLDL1KEEP, [x5, 576]
112
113        # Second block of 4.  FMA for second 4, loads for 1st block of 4
        # of the next iteration (or of the epilogue).
114        FMLA    v16.4s, v20.4s, v1.s[0]
115        LDR     q0, [x8], 16
116        FMLA    v17.4s, v21.4s, v1.s[0]
117        LDR     q20, [x5], 16
118        FMLA    v18.4s, v22.4s, v1.s[1]
119        LDR     q21, [x5], 16
120        FMLA    v19.4s, v23.4s, v1.s[1]
121        LDR     q22, [x5], 16
122        FMLA    v16.4s, v24.4s, v1.s[2]
123        LDR     q23, [x5], 16
124        FMLA    v17.4s, v25.4s, v1.s[2]
125        LDR     q24, [x5], 16
126        FMLA    v18.4s, v26.4s, v1.s[3]
127        LDR     q25, [x5], 16
128        FMLA    v19.4s, v27.4s, v1.s[3]
        # The flags set by this SUBS are consumed by B.HS below; the two LDRs
        # in between do not modify flags.
129        SUBS    x0, x0, 32
130        LDR     q26, [x5], 16
131        LDR     q27, [x5], 16
132        B.HS    2b
133
1343:
135        # Epilogue: the last full group of 8 floats of A.  A (v0) and B
        # (v20-v27) for the first block were loaded by the prologue or by the
        # final main loop iteration.
136
137        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
138        FMLA    v16.4s, v20.4s, v0.s[0]
139        LDR     q1, [x8], 16
140        FMLA    v17.4s, v21.4s, v0.s[0]
141        LDR     q20, [x5], 16
142        FMLA    v18.4s, v22.4s, v0.s[1]
143        LDR     q21, [x5], 16
144        FMLA    v19.4s, v23.4s, v0.s[1]
145        LDR     q22, [x5], 16
146        FMLA    v16.4s, v24.4s, v0.s[2]
147        LDR     q23, [x5], 16
148        FMLA    v17.4s, v25.4s, v0.s[2]
149        LDR     q24, [x5], 16
150        FMLA    v18.4s, v26.4s, v0.s[3]
151        LDR     q25, [x5], 16
152        FMLA    v19.4s, v27.4s, v0.s[3]
153        LDR     q26, [x5], 16
154
155        # Second block of 4.  Only the last weight load (q27) remains;
        # nothing is preloaded for a following block.
156        FMLA    v16.4s, v20.4s, v1.s[0]
157        LDR     q27, [x5], 16
158        FMLA    v17.4s, v21.4s, v1.s[0]
159        FMLA    v18.4s, v22.4s, v1.s[1]
160        FMLA    v19.4s, v23.4s, v1.s[1]
161        FMLA    v16.4s, v24.4s, v1.s[2]
162        FMLA    v17.4s, v25.4s, v1.s[2]
        # x0 has had only multiples of 32 subtracted from kc, so its low 5
        # bits still equal those of kc.  TST is issued early; the FMLAs that
        # follow do not modify flags.
163        TST     x0, 31
164        FMLA    v18.4s, v26.4s, v1.s[3]
165        FMLA    v19.4s, v27.4s, v1.s[3]
166        # Is there a remainder?- 1 to 7 floats of A (4 to 28 bytes)
167        B.NE    5f
168
1694:
170        # ks loop: repeat while indirection pointers remain
171        SUBS    x9, x9, 8               // p -= MR * sizeof(void*)
172        B.HI    1b
173
        # Merge the two sets of accumulators.
174        FADD    v16.4s, v16.4s, v18.4s
175        FADD    v17.4s, v17.4s, v19.4s
176
177        # Clamp: FMAX against v30 first, then FMIN against v31
178        FMAX    v16.4s, v16.4s, v30.4s
179        FMAX    v17.4s, v17.4s, v30.4s
180        FMIN    v16.4s, v16.4s, v31.4s
181        FMIN    v17.4s, v17.4s, v31.4s
182
183        # Store full 1 x 8, or branch to the partial store if nc < 8
184        SUBS    x1, x1, 8
185        B.LO    8f
186
        # Post-index by cn_stride so c0 points at the next column group.
187        ST1     {v16.16b, v17.16b}, [x6], x10
        # Rewind the indirection pointer for the next column group.
188        SUB     x4, x4, x3              // a -= ks
189
190        # nc loop: B.HI still sees the flags from SUBS x1 above, because
        # ST1 and SUB do not modify flags.
191        B.HI    0b
192
193        RET
194
1955:
196        # Remainder handling, entered with 1 to 7 floats of A left (or with
        # kc below 32 bytes).  Bits 4, 3 and 2 of x0 select 4, 2 and 1 floats.
        # Is there a remainder?- 4 floats of A (16 bytes)
197        TBZ     x0, 4, 6f
198
199        # Remainder- 4 floats of A (16 bytes)
200        LDR     q20, [x5], 16
201        LDR     q21, [x5], 16
202        LDR     q0, [x8], 16
203        FMLA    v16.4s, v20.4s, v0.s[0]
204        FMLA    v17.4s, v21.4s, v0.s[0]
205        LDR     q22, [x5], 16
206        LDR     q23, [x5], 16
207        LDR     q24, [x5], 16
208        LDR     q25, [x5], 16
209        LDR     q26, [x5], 16
210        LDR     q27, [x5], 16
211        FMLA    v18.4s, v22.4s, v0.s[1]
212        FMLA    v19.4s, v23.4s, v0.s[1]
213        FMLA    v16.4s, v24.4s, v0.s[2]
214        FMLA    v17.4s, v25.4s, v0.s[2]
215        FMLA    v18.4s, v26.4s, v0.s[3]
216        FMLA    v19.4s, v27.4s, v0.s[3]
217
2186:
        # Is there a remainder?- 2 floats of A (8 bytes)
219        TBZ     x0, 3, 7f
220        # Remainder- 2 floats of A (8 bytes)
221        LDR     q20, [x5], 16
222        LDR     q21, [x5], 16
223        LDR     d0, [x8], 8
224        FMLA    v16.4s, v20.4s, v0.s[0]
225        FMLA    v17.4s, v21.4s, v0.s[0]
226        LDR     q22, [x5], 16
227        LDR     q23, [x5], 16
228        FMLA    v18.4s, v22.4s, v0.s[1]
229        FMLA    v19.4s, v23.4s, v0.s[1]
2307:
        # Is there a remainder?- 1 float of A (4 bytes).  If not, rejoin the
        # ks loop at 4.
231        TBZ     x0, 2, 4b
232        # Remainder- 1 float of A (4 bytes)
233        LDR     q20, [x5], 16
234        LDR     q21, [x5], 16
235        LDR     s0, [x8], 4
236        FMLA    v16.4s, v20.4s, v0.s[0]
237        FMLA    v17.4s, v21.4s, v0.s[0]
238        B       4b
239
2408:
241        # Store the final partial group of 1 to 7 columns.  x1 holds nc - 8
        # here, so its low 3 bits equal those of the remaining nc.
        # Bit 2: store 4 floats, then move the upper 4 results down into v16.
242        TBZ     x1, 2, 9f
243        STR     q16, [x6], 16
244        MOV     v16.16b, v17.16b
245
2469:
        # Bit 1: store 2 floats, then move the high half of v16 into d16.
247        TBZ     x1, 1, 10f
248        STR     d16, [x6], 8
249        DUP     d16, v16.d[1]
250
25110:
        # Bit 0: store the last single float.
252        TBZ     x1, 0, 11f
253        STR     s16, [x6], 4
25411:
255        RET
256
257END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53
258
259#ifdef __ELF__
260.section ".note.GNU-stack","",%progbits
261#endif
262