xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# LINT.IfChange
9# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
10#     size_t mr,                         (x0) - unused.  mr = 1
11#     size_t nc,                         x1
12#     size_t kc,                         x2 / x0
13#     size_t ks,                         x3 / x9
14#     const float**restrict a,           x4
15#     const float*restrict w,            x5
16#     float*restrict c,                  x6
17#     size_t cm_stride,                  (x7) - unused
18#     size_t cn_stride,                  [sp] -> x10
19#     size_t a_offset,                   [sp + 8] -> x11
20#     const float* zero,                 [sp + 16] -> x12
21#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
22
23# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
24
25# A pointer
26# x8  a0
27
28# C pointer
29# x6  c0
30
31BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
32        # Per nc step: acc = bias; for each A pointer, acc += a0[k] * w[k][0..7]; clamp; store up to 8 floats.
33        # Load cn_stride, a_offset
34        LDP     x10, x11, [sp]          // x10 = cn_stride, x11 = a_offset
35
36        # Load zero, params pointer
37        LDP     x12, x8, [sp, 16]       // x12 = zero, x8 = params
38
39        # Load min/max values
40        LD2R    {v30.4s, v31.4s}, [x8]  // v30 = min, v31 = max (each replicated to all 4 lanes)
41
420:
43        # Load initial bias from w into accumulators
44        LDP     q16, q17, [x5], 32      // v16/v17 = 8 bias floats, w += 32
45        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
46        $if PREFETCH:
47          PRFM    PLDL1KEEP, [x5]
48        MOVI    v19.4s, 0               // v18/v19 are added into v16/v17 after the ks loop
49        $if PREFETCH:
50          PRFM    PLDL1KEEP, [x5, 64]
51          PRFM    PLDL1KEEP, [x5, 128]
52          PRFM    PLDL1KEEP, [x5, 192]
53
54        MOV     x9, x3                  // p = ks
55
561:
57        # Load next A pointer
58        LDR     x8, [x4], 8             // a0 = *a, a += 8 bytes
59
60        CMP     x8, x12                 // if a0 == zero
61        ADD     x8, x8, x11             // a0 += a_offset
62        CSEL    x8, x12, x8, EQ         //   a0 = zero, else keep a0 + a_offset
63
64        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
65        SUBS    x0, x2, 32              // k = kc - 32
66        B.LO    4f                      // kc < 32: remainder handling only
67
68        # Prologue - first 4 floats of A (16 bytes)
69        # Read first block of A and B.
70        LDP     q20, q21, [x5], 32      // 8 floats of B, paired with v0.s[0]
71        LDP     q22, q23, [x5], 32      // 8 floats of B, paired with v0.s[1]
72        LDP     q24, q25, [x5], 32      // 8 floats of B, paired with v0.s[2]
73        LDP     q26, q27, [x5], 32      // 8 floats of B, paired with v0.s[3]
74        LDR     q0, [x8], 16            // 4 floats of A
75
76        # Are there at least 8 more floats?  If so, run the main loop
77        SUBS    x0, x0, 32              // k -= 32
78        B.LO    3f                      // kc < 64: epilogue only
79
80        # Main loop - 8 floats of A (32 bytes)
812:
82        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
83        FMLA    v16.4s, v20.4s, v0.s[0]
84        LDR     q1, [x8], 16            // next 4 floats of A
85        FMLA    v17.4s, v21.4s, v0.s[0]
86        LDP     q20, q21, [x5], 32
87        FMLA    v18.4s, v22.4s, v0.s[1]
88        FMLA    v19.4s, v23.4s, v0.s[1]
89        LDP     q22, q23, [x5], 32
90        FMLA    v16.4s, v24.4s, v0.s[2]
91        FMLA    v17.4s, v25.4s, v0.s[2]
92        LDP     q24, q25, [x5], 32
93        $if PREFETCH:
94          PRFM    PLDL1KEEP, [x5, 128]
95        FMLA    v18.4s, v26.4s, v0.s[3]
96        $if PREFETCH:
97          PRFM    PLDL1KEEP, [x5, 256]
98        FMLA    v19.4s, v27.4s, v0.s[3]
99        LDP     q26, q27, [x5], 32
100
101        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
102        FMLA    v16.4s, v20.4s, v1.s[0]
103        LDR     q0, [x8], 16            // 4 floats of A for the next iteration or the epilogue
104        FMLA    v17.4s, v21.4s, v1.s[0]
105        LDP     q20, q21, [x5], 32
106        FMLA    v18.4s, v22.4s, v1.s[1]
107        FMLA    v19.4s, v23.4s, v1.s[1]
108        LDP     q22, q23, [x5], 32
109        FMLA    v16.4s, v24.4s, v1.s[2]
110        FMLA    v17.4s, v25.4s, v1.s[2]
111        LDP     q24, q25, [x5], 32
112        $if PREFETCH:
113          PRFM    PLDL1KEEP, [x5, 128]
114        FMLA    v18.4s, v26.4s, v1.s[3]
115        $if PREFETCH:
116          PRFM    PLDL1KEEP, [x5, 256]
117        FMLA    v19.4s, v27.4s, v1.s[3]
118        SUBS    x0, x0, 32              // k -= 32
119        LDP     q26, q27, [x5], 32
120        B.HS    2b                      // repeat while the SUBS above did not borrow
121
1223:
123        # Epilogue
124
125        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
126        FMLA    v16.4s, v20.4s, v0.s[0]
127        LDR     q1, [x8], 16            // last 4 floats of A handled by the unrolled code
128        FMLA    v17.4s, v21.4s, v0.s[0]
129        LDP     q20, q21, [x5], 32
130        FMLA    v18.4s, v22.4s, v0.s[1]
131        FMLA    v19.4s, v23.4s, v0.s[1]
132        LDP     q22, q23, [x5], 32
133        FMLA    v16.4s, v24.4s, v0.s[2]
134        FMLA    v17.4s, v25.4s, v0.s[2]
135        LDP     q24, q25, [x5], 32
136        $if PREFETCH:
137          PRFM    PLDL1KEEP, [x5, 128]
138        FMLA    v18.4s, v26.4s, v0.s[3]
139        $if PREFETCH:
140          PRFM    PLDL1KEEP, [x5, 256]
141        FMLA    v19.4s, v27.4s, v0.s[3]
142        LDP     q26, q27, [x5], 32
143
144        # Second block of 4.  no loads
145        FMLA    v16.4s, v20.4s, v1.s[0]
146        FMLA    v17.4s, v21.4s, v1.s[0]
147        FMLA    v18.4s, v22.4s, v1.s[1]
148        FMLA    v19.4s, v23.4s, v1.s[1]
149        FMLA    v16.4s, v24.4s, v1.s[2]
150        FMLA    v17.4s, v25.4s, v1.s[2]
151        FMLA    v18.4s, v26.4s, v1.s[3]
152        FMLA    v19.4s, v27.4s, v1.s[3]
153
1544:
155        # Is there a remainder?- 4 floats of A (16 bytes)
156        TBNZ    x0, 4, 6f               // k differs from kc by a multiple of 32, so bits 2-4 match kc
157        # Is there a remainder?- 2 floats of A (8 bytes)
158        TBNZ    x0, 3, 7f
159        # Is there a remainder?- 1 float of A (4 bytes)
160        TBNZ    x0, 2, 9f
161
1625:
163        # ks loop
164        SUBS    x9, x9, 8               // ks -= MR * sizeof(void*)
165        B.HI    1b                      // loop while p > 0
166
167        FADD    v16.4s, v16.4s, v18.4s  // fold the second accumulator set into the first
168        FADD    v17.4s, v17.4s, v19.4s
169
170        # Clamp
171        FMAX    v16.4s, v16.4s, v30.4s  // lower bound (min)
172        FMAX    v17.4s, v17.4s, v30.4s
173        FMIN    v16.4s, v16.4s, v31.4s  // upper bound (max)
174        FMIN    v17.4s, v17.4s, v31.4s
175
176        # Store full 1 x 8
177        SUBS    x1, x1, 8               // nc -= 8
178        B.LO    10f                     // fewer than 8 floats left: partial store
179
180        STP     q16, q17, [x6]          // store 8 floats at c0
181        ADD     x6, x6, x10             // c0 += cn_stride
182
183        SUB     x4, x4, x3              // a -= ks
184
185        # nc loop
186        B.HI    0b                      // flags still come from SUBS x1 above (STP/ADD/SUB leave them alone)
187
188        RET
189
1906:
191        # Remainder- 4 floats of A (16 bytes)
192        LDP     q20, q21, [x5], 32
193        LDR     q0, [x8], 16            // 4 floats of A
194        FMLA    v16.4s, v20.4s, v0.s[0]
195        FMLA    v17.4s, v21.4s, v0.s[0]
196        LDP     q22, q23, [x5], 32
197        LDP     q24, q25, [x5], 32
198        LDP     q26, q27, [x5], 32
199        FMLA    v18.4s, v22.4s, v0.s[1]
200        FMLA    v19.4s, v23.4s, v0.s[1]
201        FMLA    v16.4s, v24.4s, v0.s[2]
202        FMLA    v17.4s, v25.4s, v0.s[2]
203        FMLA    v18.4s, v26.4s, v0.s[3]
204        FMLA    v19.4s, v27.4s, v0.s[3]
205
206        TBZ     x0, 3, 8f               // no 2-float remainder: go check for a single float
2077:
208        # Remainder- 2 floats of A (8 bytes)
209        LDP     q20, q21, [x5], 32
210        LDR     d0, [x8], 8             // 2 floats of A
211        FMLA    v16.4s, v20.4s, v0.s[0]
212        FMLA    v17.4s, v21.4s, v0.s[0]
213        LDP     q22, q23, [x5], 32
214        FMLA    v18.4s, v22.4s, v0.s[1]
215        FMLA    v19.4s, v23.4s, v0.s[1]
2168:
217        TBZ     x0, 2, 5b               // no 1-float remainder: back to the ks loop
2189:
219        # Remainder- 1 float of A (4 bytes)
220        LDP     q20, q21, [x5], 32
221        LDR     s0, [x8], 4             // 1 float of A
222        FMLA    v16.4s, v20.4s, v0.s[0]
223        FMLA    v17.4s, v21.4s, v0.s[0]
224        B       5b
225
22610:
227        # Store odd channels
228        TBZ     x1, 2, 11f              // x1 = nc - 8 here; its low 3 bits equal those of nc
229        STR     q16, [x6], 16           // store 4 floats
230        MOV     v16.16b, v17.16b        // the next 4 floats now sit in v16
231
23211:
233        TBZ     x1, 1, 12f
234        STR     d16, [x6], 8            // store 2 floats
235        DUP     d16, v16.d[1]           // move the upper 2 floats of v16 to the low half
236
23712:
238        TBZ     x1, 0, 13f
239        STR     s16, [x6], 4            // store the last float
24013:
241        RET
242
243END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
244# LINT.ThenChange(1x8-aarch64-neonfma-cortex-a75.cc)
245
246#ifdef __ELF__
247.section ".note.GNU-stack","",%progbits  // empty note section; conventionally tells GNU linkers this object does not need an executable stack
248#endif
249