xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
13#     size_t mr,                (x0) - unused.  mr = 1
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          (x4) - unused
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         (x7) - unused
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointer
27# x3  a0
28
29# C pointer
30# x6  c0
31
32# Clamp v4 v5
33
34# A53 based on A57/A75 but with LD64
35
36BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53
37
        # Overview: for each group of 8 output columns the accumulators v16/v17
        # start from 8 floats read from w, every float of the A row is multiplied
        # by the next 8 floats of w and accumulated, the sums are clamped against
        # v4 and v5, and the result is stored through c0 (x6).
        # kc is handled as a byte count (32 bytes = 8 floats of A).
        # v18/v19 form a second accumulator set that is added into v16/v17 at 4.
        # Registers written: x0, x1, x3, x5, x6, x8, x14, v0, v1, v4, v5, v16-v27.
        # None of them is in d8-d15 / x19-x30, so nothing is saved or restored.
38        # Load cn_stride ([sp]) into x14 and the params pointer ([sp + 8]) into x8
39        LDP     x14, x8, [sp]
40
41        # Load two consecutive floats from params, each replicated to all 4 lanes:
        # v4 is the FMAX operand (lower bound), v5 is the FMIN operand (upper bound)
42        LD2R    {v4.4s, v5.4s}, [x8]
430:
        # Top of the loop over nc: one pass per group of up to 8 output columns
44        # Load initial bias from w into accumulators
45        LDP     q16, q17, [x5], 32
46
47        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
48        MOVI    v19.4s, 0
49
50        # Are there at least 8 floats (32 bytes) of A for prologue + epilogue?
        # Otherwise branch to 3 and process kc with the remainder code only.
51        SUBS    x0, x2, 32              // k = kc - 32
52
53        B.LO    3f
54
55        # Prologue - preload B for the first 4 floats of A (8 vectors) and
56        # the first 4 floats of A (16 bytes) into v0.
57        LDP     q20, q21, [x5], 32
58        LDP     q22, q23, [x5], 32
59        LDP     q24, q25, [x5], 32
60        LDP     q26, q27, [x5], 32
61        LDR     q0, [x3], 16
62
63        # Is kc at least 64 bytes?  Yes: run the main loop.  No: epilogue only.
64        SUBS    x0, x0, 32
65        B.LO    2f
66
67        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: each block of 4 multiplies with the A and B values
        # loaded by the previous block while loading the values for the next one.
        # NOTE(review): the one-load-per-FMLA interleaving is presumably tuned for
        # Cortex-A53 issue rules - keep the instruction order unchanged.
681:
69        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
70        FMLA    v16.4s, v20.4s, v0.s[0]
71        LDR     q1, [x3], 16
72        FMLA    v17.4s, v21.4s, v0.s[0]
73        LDR     q20, [x5], 16
74        FMLA    v18.4s, v22.4s, v0.s[1]
75        LDR     q21, [x5], 16
76        FMLA    v19.4s, v23.4s, v0.s[1]
77        LDR     q22, [x5], 16
78        FMLA    v16.4s, v24.4s, v0.s[2]
79        LDR     q23, [x5], 16
80        FMLA    v17.4s, v25.4s, v0.s[2]
81        LDR     q24, [x5], 16
82        FMLA    v18.4s, v26.4s, v0.s[3]
83        LDR     q25, [x5], 16
84        FMLA    v19.4s, v27.4s, v0.s[3]
85        LDR     q26, [x5], 16
86        LDR     q27, [x5], 16
87
88
89        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
90        FMLA    v16.4s, v20.4s, v1.s[0]
91        LDR     q0, [x3], 16
92        FMLA    v17.4s, v21.4s, v1.s[0]
93        LDR     q20, [x5], 16
94        FMLA    v18.4s, v22.4s, v1.s[1]
95        LDR     q21, [x5], 16
96        FMLA    v19.4s, v23.4s, v1.s[1]
97        LDR     q22, [x5], 16
98        FMLA    v16.4s, v24.4s, v1.s[2]
99        LDR     q23, [x5], 16
100        FMLA    v17.4s, v25.4s, v1.s[2]
101        LDR     q24, [x5], 16
102        FMLA    v18.4s, v26.4s, v1.s[3]
103        LDR     q25, [x5], 16
104        FMLA    v19.4s, v27.4s, v1.s[3]
        # Loop while at least 32 more bytes of A remain beyond the epilogue share.
        # The LDRs below do not alter the flags tested by B.HS.
105        SUBS    x0, x0, 32
106        LDR     q26, [x5], 16
107        LDR     q27, [x5], 16
108        B.HS    1b
109
1102:
111        # Epilogue - last 8 floats of A handled by the pipelined code.
        # v0 and q20-q27 were already loaded by the prologue or the main loop.
112
113        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
114        FMLA    v16.4s, v20.4s, v0.s[0]
115        LDR     q1, [x3], 16
116        FMLA    v17.4s, v21.4s, v0.s[0]
117        LDR     q20, [x5], 16
118        FMLA    v18.4s, v22.4s, v0.s[1]
119        LDR     q21, [x5], 16
120        FMLA    v19.4s, v23.4s, v0.s[1]
121        LDR     q22, [x5], 16
122        FMLA    v16.4s, v24.4s, v0.s[2]
123        LDR     q23, [x5], 16
124        FMLA    v17.4s, v25.4s, v0.s[2]
125        LDR     q24, [x5], 16
126        FMLA    v18.4s, v26.4s, v0.s[3]
127        LDR     q25, [x5], 16
128        FMLA    v19.4s, v27.4s, v0.s[3]
129        LDR     q26, [x5], 16
130
131        # Second block of 4.  Only the last pending B load (q27) remains.
132        FMLA    v16.4s, v20.4s, v1.s[0]
133        LDR     q27, [x5], 16
134        FMLA    v17.4s, v21.4s, v1.s[0]
135        FMLA    v18.4s, v22.4s, v1.s[1]
136        FMLA    v19.4s, v23.4s, v1.s[1]
137        FMLA    v16.4s, v24.4s, v1.s[2]
138        FMLA    v17.4s, v25.4s, v1.s[2]
139        FMLA    v18.4s, v26.4s, v1.s[3]
140        FMLA    v19.4s, v27.4s, v1.s[3]
141
1423:
        # Only multiples of 32 have been subtracted from kc to form x0, so bits
        # 4, 3 and 2 of x0 equal those of kc and mark 16, 8 and 4 leftover bytes.
143        # Is there a remainder?- 4 floats of A (16 bytes)
144        TBNZ    x0, 4, 5f
145        # Is there a remainder?- 2 floats of A (8 bytes)
146        TBNZ    x0, 3, 6f
147        # Is there a remainder?- 1 float of A (4 bytes)
148        TBNZ    x0, 2, 8f
149
1504:
        # Fold the second accumulator set (v18, v19) into the first (v16, v17)
151        FADD    v16.4s, v16.4s, v18.4s
152        FADD    v17.4s, v17.4s, v19.4s
153
154        # Clamp: FMAX against v4, then FMIN against v5.
        # The SUBS in between counts down nc by the 8 columns of this pass.
155        FMAX    v16.4s, v16.4s, v4.4s
156        SUBS    x1, x1, 8
157        FMAX    v17.4s, v17.4s, v4.4s
158        FMIN    v16.4s, v16.4s, v5.4s
159        FMIN    v17.4s, v17.4s, v5.4s
160
161        # Store full 1 x 8
        # Flags come from SUBS x1 above: fewer than 8 columns left -> partial store at 9
162        B.LO    9f
163
        # Store 8 floats and advance c0 by cn_stride (x14)
164        ST1     {v16.16b, v17.16b}, [x6], x14
165        SUB     x3,  x3, x2             // a0 -= kc
166
        # Flags are still those of SUBS x1 (ST1 and SUB do not set them):
        # columns remain -> process the next group of 8
167        B.HI    0b
168
169        RET
170
1715:
172        # Remainder- 4 floats of A (16 bytes)
173        LDR     q20, [x5], 16
174        LDR     q21, [x5], 16
175        LDR     q0, [x3], 16
176        FMLA    v16.4s, v20.4s, v0.s[0]
177        FMLA    v17.4s, v21.4s, v0.s[0]
178        LDR     q22, [x5], 16
179        LDR     q23, [x5], 16
180        LDR     q24, [x5], 16
181        LDR     q25, [x5], 16
182        LDR     q26, [x5], 16
183        LDR     q27, [x5], 16
184        FMLA    v18.4s, v22.4s, v0.s[1]
185        FMLA    v19.4s, v23.4s, v0.s[1]
186        FMLA    v16.4s, v24.4s, v0.s[2]
187        FMLA    v17.4s, v25.4s, v0.s[2]
188        FMLA    v18.4s, v26.4s, v0.s[3]
189        FMLA    v19.4s, v27.4s, v0.s[3]
190
        # No 2-float remainder: skip ahead to the 1-float check at 7
191        TBZ     x0, 3, 7f
1926:
193        # Remainder- 2 floats of A (8 bytes)
194        LDR     q20, [x5], 16
195        LDR     q21, [x5], 16
196        LDR     d0, [x3], 8
197        FMLA    v16.4s, v20.4s, v0.s[0]
198        FMLA    v17.4s, v21.4s, v0.s[0]
199        LDR     q22, [x5], 16
200        LDR     q23, [x5], 16
201        FMLA    v18.4s, v22.4s, v0.s[1]
202        FMLA    v19.4s, v23.4s, v0.s[1]
2037:
        # No 1-float remainder: accumulation is complete, go clamp and store
204        TBZ     x0, 2, 4b
2058:
206        # Remainder- 1 float of A (4 bytes)
207        LDR     q20, [x5], 16
208        LDR     q21, [x5], 16
209        LDR     s0, [x3], 4
210        FMLA    v16.4s, v20.4s, v0.s[0]
211        FMLA    v17.4s, v21.4s, v0.s[0]
212        B       4b
213
214        # Store odd channels
        # Reached with x1 = remaining nc - 8 (negative).  Its low 3 bits equal those
        # of the remaining nc, so bits 2, 1 and 0 select stores of 4, 2 and 1 floats.
2159:
216        TBZ     x1, 2, 10f
217        STR     q16, [x6], 16
218        MOV     v16.16b, v17.16b        // upper 4 results become the next to store
219
22010:
221        TBZ     x1, 1, 11f
222        STR     d16, [x6], 8
223        DUP     d16, v16.d[1]           // move the upper 2 floats of v16 to the low half
224
22511:
226        TBZ     x1, 0, 12f
227        STR     s16, [x6]
22812:
229        RET
230
231END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53
232
233#ifdef __ELF__
234.section ".note.GNU-stack","",%progbits
235#endif
236