xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53(
13#     size_t mr,                (x0) - unused.  mr = 1
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          (x4) - unused
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         (x7) - unused
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointer
27# x3  a0
28
29# C pointer
30# x6  c0
31
32# Clamp v4 (min) v5 (max)
33
34# A53 based on A57/A75 but with LD64
35
36BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53
37
        # Overview
        #   Computes one row of output, 8 columns at a time. For each group of
        #   8 columns the accumulators start from 8 floats read from w, then for
        #   every float of A the next 8 floats of w are multiplied by that A
        #   element and accumulated. The result is clamped to [v4, v5] and
        #   stored to c.
        #
        # Register use in this function
        #   x0        remaining-k counter in bytes, biased negative (see label 3)
        #   x1        nc, decremented by 8 per column group
        #   x2        kc in bytes (never modified)
        #   x3        a0, rewound by kc after each column group
        #   x5        w, only ever advanced
        #   x6        c0
        #   x8        params pointer, x14 cn_stride
        #   v0 v1     elements of A
        #   v20-v27   weights from w
        #   v16 v17   accumulators, v18 v19 second accumulator set
        #   v4 v5     clamp bounds
        #   Only [sp] is read from the stack; nothing is written to it.
        #
        # Local labels
        #   0 column-group loop   1 main loop     2 epilogue
        #   3 remainder dispatch  4 reduce, clamp and store
        #   5/6/8 remainder of 4/2/1 floats of A  7 join point
        #   9-12 partial store when fewer than 8 columns remain

38        # Load cn_stride, params pointer
39        LDP     x14, x8, [sp]
40
41        # Load min/max values
        # LD2R reads two consecutive floats at [x8] and replicates the first
        # into every lane of v4 and the second into every lane of v5.
42        LD2R    {v4.4s, v5.4s}, [x8]
430:
44        # Load initial bias from w into accumulators
45        LDP     q16, q17, [x5], 32
46
47        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
48        PRFM    PLDL1KEEP, [x5, 64]
49        MOVI    v19.4s, 0
50        PRFM    PLDL1KEEP, [x5, 128]
51        PRFM    PLDL1KEEP, [x5, 192]
52        PRFM    PLDL1KEEP, [x5, 256]
53        PRFM    PLDL1KEEP, [x5, 320]
54        PRFM    PLDL1KEEP, [x5, 384]
55        PRFM    PLDL1KEEP, [x5, 448]
56        PRFM    PLDL1KEEP, [x5, 512]
57        PRFM    PLDL1KEEP, [x5, 576]
58
59        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
60        SUBS    x0, x2, 32              // k = kc - 32
61
        # Fewer than 32 bytes of A: skip straight to the remainder handling.
62        B.LO    3f
63
64        # Prologue - first 4 floats of A (16 bytes)
65        # Read first block of 1 A and B.
        # The four LDP pairs fetch 8 vectors of w (128 bytes), enough for the
        # 4 A elements held in q0.
66        LDP     q20, q21, [x5], 32
67        LDP     q22, q23, [x5], 32
68        LDP     q24, q25, [x5], 32
69        LDP     q26, q27, [x5], 32
70        LDR     q0, [x3], 16
71
72        # Are there at least 32 more bytes of A?  If yes, run the main loop;
        # otherwise go directly to the epilogue.
73        SUBS    x0, x0, 32
74        B.LO    2f
75
76        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA consumes a weight register loaded earlier; the LDR that
        # refills a given register is placed after the FMLA that last read it.
771:
78        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
79        FMLA    v16.4s, v20.4s, v0.s[0]
80        LDR     q1, [x3], 16
81        FMLA    v17.4s, v21.4s, v0.s[0]
82        LDR     q20, [x5], 16
83        FMLA    v18.4s, v22.4s, v0.s[1]
84        LDR     q21, [x5], 16
85        FMLA    v19.4s, v23.4s, v0.s[1]
86        LDR     q22, [x5], 16
87        FMLA    v16.4s, v24.4s, v0.s[2]
88        LDR     q23, [x5], 16
89        FMLA    v17.4s, v25.4s, v0.s[2]
90        LDR     q24, [x5], 16
91        FMLA    v18.4s, v26.4s, v0.s[3]
92        LDR     q25, [x5], 16
93        FMLA    v19.4s, v27.4s, v0.s[3]
94        LDR     q26, [x5], 16
95        LDR     q27, [x5], 16
96
97        PRFM    PLDL1KEEP, [x5, 384]
98        PRFM    PLDL1KEEP, [x5, 448]
99        PRFM    PLDL1KEEP, [x5, 512]
100        PRFM    PLDL1KEEP, [x5, 576]
101
102        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
103        FMLA    v16.4s, v20.4s, v1.s[0]
104        LDR     q0, [x3], 16
105        FMLA    v17.4s, v21.4s, v1.s[0]
106        LDR     q20, [x5], 16
107        FMLA    v18.4s, v22.4s, v1.s[1]
108        LDR     q21, [x5], 16
109        FMLA    v19.4s, v23.4s, v1.s[1]
110        LDR     q22, [x5], 16
111        FMLA    v16.4s, v24.4s, v1.s[2]
112        LDR     q23, [x5], 16
113        FMLA    v17.4s, v25.4s, v1.s[2]
114        LDR     q24, [x5], 16
115        FMLA    v18.4s, v26.4s, v1.s[3]
116        LDR     q25, [x5], 16
117        FMLA    v19.4s, v27.4s, v1.s[3]
        # The flags set here are consumed by B.HS below; the two LDRs in
        # between do not modify flags.
118        SUBS    x0, x0, 32
119        LDR     q26, [x5], 16
120        LDR     q27, [x5], 16
121        B.HS    1b
122
1232:
124        # Epilogue
        # Same as one main-loop iteration, but nothing is loaded for a
        # following iteration.
125
126        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
127        FMLA    v16.4s, v20.4s, v0.s[0]
128        LDR     q1, [x3], 16
129        FMLA    v17.4s, v21.4s, v0.s[0]
130        LDR     q20, [x5], 16
131        FMLA    v18.4s, v22.4s, v0.s[1]
132        LDR     q21, [x5], 16
133        FMLA    v19.4s, v23.4s, v0.s[1]
134        LDR     q22, [x5], 16
135        FMLA    v16.4s, v24.4s, v0.s[2]
136        LDR     q23, [x5], 16
137        FMLA    v17.4s, v25.4s, v0.s[2]
138        LDR     q24, [x5], 16
139        FMLA    v18.4s, v26.4s, v0.s[3]
140        LDR     q25, [x5], 16
141        FMLA    v19.4s, v27.4s, v0.s[3]
142        LDR     q26, [x5], 16
143
144        # Second block of 4.  Only the last weight vector (q27) is still loaded.
145        FMLA    v16.4s, v20.4s, v1.s[0]
146        LDR     q27, [x5], 16
147        FMLA    v17.4s, v21.4s, v1.s[0]
148        FMLA    v18.4s, v22.4s, v1.s[1]
149        FMLA    v19.4s, v23.4s, v1.s[1]
150        FMLA    v16.4s, v24.4s, v1.s[2]
151        FMLA    v17.4s, v25.4s, v1.s[2]
152        FMLA    v18.4s, v26.4s, v1.s[3]
153        FMLA    v19.4s, v27.4s, v1.s[3]
154
1553:
        # x0 equals kc minus a multiple of 32 at this point, so bits 4, 3 and 2
        # of x0 are the same as those of kc and select the 16, 8 and 4 byte
        # remainders.
156        # Is there a remainder?- 4 floats of A (16 bytes)
157        TBNZ    x0, 4, 5f
158        # Is there a remainder?- 2 floats of A (8 bytes)
159        TBNZ    x0, 3, 6f
160        # Is there a remainder?- 1 float of A (4 bytes)
161        TBNZ    x0, 2, 8f
162
1634:
        # Fold the second accumulator set into the first.
164        FADD    v16.4s, v16.4s, v18.4s
165        FADD    v17.4s, v17.4s, v19.4s
166
167        # Clamp
        # FMAX against v4 applies the lower bound, FMIN against v5 the upper.
        # The SUBS in between sets the flags used by both B.LO and B.HI below;
        # none of the instructions up to B.HI modify flags.
168        FMAX    v16.4s, v16.4s, v4.4s
169        SUBS    x1, x1, 8
170        FMAX    v17.4s, v17.4s, v4.4s
171        FMIN    v16.4s, v16.4s, v5.4s
172        FMIN    v17.4s, v17.4s, v5.4s
173
174        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial-store path instead.
175        B.LO    9f
176
        # Store 8 floats and advance c0 by cn_stride.
177        ST1     {v16.16b, v17.16b}, [x6], x14
178        SUB     x3,  x3, x2             // a0 -= kc
179
        # Loop again while columns remain (nc was greater than 8 before SUBS).
180        B.HI    0b
181
182        RET
183
1845:
185        # Remainder- 4 floats of A (16 bytes)
186        LDR     q20, [x5], 16
187        LDR     q21, [x5], 16
188        LDR     q0, [x3], 16
189        FMLA    v16.4s, v20.4s, v0.s[0]
190        FMLA    v17.4s, v21.4s, v0.s[0]
191        LDR     q22, [x5], 16
192        LDR     q23, [x5], 16
193        LDR     q24, [x5], 16
194        LDR     q25, [x5], 16
195        LDR     q26, [x5], 16
196        LDR     q27, [x5], 16
197        FMLA    v18.4s, v22.4s, v0.s[1]
198        FMLA    v19.4s, v23.4s, v0.s[1]
199        FMLA    v16.4s, v24.4s, v0.s[2]
200        FMLA    v17.4s, v25.4s, v0.s[2]
201        FMLA    v18.4s, v26.4s, v0.s[3]
202        FMLA    v19.4s, v27.4s, v0.s[3]
203
        # Skip the 2-float step when bit 3 of the counter is clear.
204        TBZ     x0, 3, 7f
2056:
206        # Remainder- 2 floats of A (8 bytes)
207        LDR     q20, [x5], 16
208        LDR     q21, [x5], 16
209        LDR     d0, [x3], 8
210        FMLA    v16.4s, v20.4s, v0.s[0]
211        FMLA    v17.4s, v21.4s, v0.s[0]
212        LDR     q22, [x5], 16
213        LDR     q23, [x5], 16
214        FMLA    v18.4s, v22.4s, v0.s[1]
215        FMLA    v19.4s, v23.4s, v0.s[1]
2167:
        # No single trailing float: go reduce, clamp and store.
217        TBZ     x0, 2, 4b
2188:
219        # Remainder- 1 float of A (4 bytes)
220        LDR     q20, [x5], 16
221        LDR     q21, [x5], 16
222        LDR     s0, [x3], 4
223        FMLA    v16.4s, v20.4s, v0.s[0]
224        FMLA    v17.4s, v21.4s, v0.s[0]
225        B       4b
226
227        # Store the remaining columns (fewer than 8 left)
        # x1 holds nc - 8 here; subtracting 8 leaves bits 2, 1 and 0 unchanged,
        # so they still give the number of columns to store as 4 + 2 + 1.
        # After each partial store the not-yet-stored lanes are moved down to
        # the bottom of v16.
2289:
229        TBZ     x1, 2, 10f
230        STR     q16, [x6], 16
231        MOV     v16.16b, v17.16b
232
23310:
234        TBZ     x1, 1, 11f
235        STR     d16, [x6], 8
236        DUP     d16, v16.d[1]
237
23811:
239        TBZ     x1, 0, 12f
240        STR     s16, [x6]
24112:
242        RET
243
244END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53
245
246#ifdef __ELF__
247.section ".note.GNU-stack","",%progbits
248#endif
249