xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                (x0) - unused.  mr = 1
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          (x4) - unused
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         (x7) - unused
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointer
27# x3  a0
28
29# C pointer
30# x6  c0
31
32# Clamp v4 v5
33
34BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75
35        # 1x8 GEMM microkernel: per group of 8 columns, accumulate bias + A*B, clamp with v4/v5, store to C.
36        # Load the two stack arguments: x14 = cn_stride, x8 = params pointer
37        LDP     x14, x8, [sp]
38
39        # Load min/max values: v4 = first float at params (FMAX operand), v5 = second float (FMIN operand)
40        LD2R    {v4.4s, v5.4s}, [x8]
410:
42        # Start of a pass over 8 output columns: load initial bias (8 floats) from w into accumulators v16/v17
43        LDP     q16, q17, [x5], 32
44
45        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
46        PRFM    PLDL1KEEP, [x5]         // prefetch the upcoming data at w
47        MOVI    v19.4s, 0
48        PRFM    PLDL1KEEP, [x5, 64]
49        PRFM    PLDL1KEEP, [x5, 128]
50        PRFM    PLDL1KEEP, [x5, 192]
51
52        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
53        SUBS    x0, x2, 32              // k = kc - 32
54
55        B.LO    3f                      // kc < 32: only the remainder code runs
56
57        # Prologue - first 4 floats of A (16 bytes)
58        # Read first block: 4 floats of A into v0, 4 x 8 floats of B into v20-v27.
59        LDP     q20, q21, [x5], 32
60        LDP     q22, q23, [x5], 32
61        LDP     q24, q25, [x5], 32
62        LDP     q26, q27, [x5], 32
63        LDR     q0, [x3], 16
64
65        # Are there at least 32 more bytes (kc >= 64)?  If yes, do main loop
66        SUBS    x0, x0, 32
67        B.LO    2f                      // no: go straight to the epilogue
68
69        # Main loop - 8 floats of A (32 bytes)
701:
71        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
72        FMLA    v16.4s, v20.4s, v0.s[0] // A lanes 0,2 accumulate in v16/v17; lanes 1,3 in v18/v19
73        LDR     q1, [x3], 16
74        FMLA    v17.4s, v21.4s, v0.s[0]
75        LDP     q20, q21, [x5], 32
76        FMLA    v18.4s, v22.4s, v0.s[1]
77        PRFM    PLDL1KEEP, [x5, 96]     // prefetch B beyond the current load pointer
78        FMLA    v19.4s, v23.4s, v0.s[1]
79        LDP     q22, q23, [x5], 32
80        FMLA    v16.4s, v24.4s, v0.s[2]
81        FMLA    v17.4s, v25.4s, v0.s[2]
82        LDP     q24, q25, [x5], 32
83        FMLA    v18.4s, v26.4s, v0.s[3]
84        FMLA    v19.4s, v27.4s, v0.s[3]
85        LDP     q26, q27, [x5], 32
86
87        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
88        FMLA    v16.4s, v20.4s, v1.s[0]
89        LDR     q0, [x3], 16
90        FMLA    v17.4s, v21.4s, v1.s[0]
91        LDP     q20, q21, [x5], 32
92        FMLA    v18.4s, v22.4s, v1.s[1]
93        FMLA    v19.4s, v23.4s, v1.s[1]
94        LDP     q22, q23, [x5], 32
95        FMLA    v16.4s, v24.4s, v1.s[2]
96        FMLA    v17.4s, v25.4s, v1.s[2]
97        LDP     q24, q25, [x5], 32
98        FMLA    v18.4s, v26.4s, v1.s[3]
99        FMLA    v19.4s, v27.4s, v1.s[3]
100        SUBS    x0, x0, 32              // k -= 32; flags are consumed by B.HS below
101        LDP     q26, q27, [x5], 32
102        B.HS    1b                      // repeat while the subtraction did not borrow
103
1042:
105        # Epilogue - consumes the block already loaded in v0/v20-v27 plus one more block; loads nothing for a next iteration
106
107        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
108        FMLA    v16.4s, v20.4s, v0.s[0]
109        LDR     q1, [x3], 16
110        FMLA    v17.4s, v21.4s, v0.s[0]
111        LDP     q20, q21, [x5], 32
112        FMLA    v18.4s, v22.4s, v0.s[1]
113        FMLA    v19.4s, v23.4s, v0.s[1]
114        LDP     q22, q23, [x5], 32
115        FMLA    v16.4s, v24.4s, v0.s[2]
116        FMLA    v17.4s, v25.4s, v0.s[2]
117        LDP     q24, q25, [x5], 32
118        FMLA    v18.4s, v26.4s, v0.s[3]
119        FMLA    v19.4s, v27.4s, v0.s[3]
120        LDP     q26, q27, [x5], 32
121
122        # Second block of 4.  no loads
123        FMLA    v16.4s, v20.4s, v1.s[0]
124        FMLA    v17.4s, v21.4s, v1.s[0]
125        FMLA    v18.4s, v22.4s, v1.s[1]
126        FMLA    v19.4s, v23.4s, v1.s[1]
127        FMLA    v16.4s, v24.4s, v1.s[2]
128        FMLA    v17.4s, v25.4s, v1.s[2]
129        FMLA    v18.4s, v26.4s, v1.s[3]
130        FMLA    v19.4s, v27.4s, v1.s[3]
131
1323:
133        # Is there a remainder?- 4 floats of A (16 bytes).  x0 is kc minus a multiple of 32, so bits 4..2 equal those of kc
134        TBNZ    x0, 4, 5f
135        # Is there a remainder?- 2 floats of A (8 bytes)
136        TBNZ    x0, 3, 6f
137        # Is there a remainder?- 1 float of A (4 bytes)
138        TBNZ    x0, 2, 8f
139
1404:
141        FADD    v16.4s, v16.4s, v18.4s  // fold the second accumulator set into the first
142        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
143        FADD    v17.4s, v17.4s, v19.4s
144
145        # Clamp: FMAX against v4, then FMIN against v5
146        FMAX    v16.4s, v16.4s, v4.4s
147        FMAX    v17.4s, v17.4s, v4.4s
148        FMIN    v16.4s, v16.4s, v5.4s
149        FMIN    v17.4s, v17.4s, v5.4s
150
151        # Store full 1 x 8, unless fewer than 8 columns were left
152        B.LO    9f                      // nc was below 8: partial store
153
154        STP     q16, q17, [x6]
155        ADD     x6, x6, x14             // c0 += cn_stride
156
157        SUB     x3,  x3, x2             // a0 -= kc
158
159        B.HI    0b                      // columns remain: run another pass
160
161        RET
162
1635:
164        # Remainder- 4 floats of A (16 bytes)
165        LDP     q20, q21, [x5], 32
166        LDR     q0, [x3], 16
167        FMLA    v16.4s, v20.4s, v0.s[0]
168        FMLA    v17.4s, v21.4s, v0.s[0]
169        LDP     q22, q23, [x5], 32
170        LDP     q24, q25, [x5], 32
171        LDP     q26, q27, [x5], 32
172        FMLA    v18.4s, v22.4s, v0.s[1]
173        FMLA    v19.4s, v23.4s, v0.s[1]
174        FMLA    v16.4s, v24.4s, v0.s[2]
175        FMLA    v17.4s, v25.4s, v0.s[2]
176        FMLA    v18.4s, v26.4s, v0.s[3]
177        FMLA    v19.4s, v27.4s, v0.s[3]
178
179        TBZ     x0, 3, 7f               // no 2-float remainder: go test for 1 float
1806:
181        # Remainder- 2 floats of A (8 bytes)
182        LDP     q20, q21, [x5], 32
183        LDR     d0, [x3], 8
184        FMLA    v16.4s, v20.4s, v0.s[0]
185        FMLA    v17.4s, v21.4s, v0.s[0]
186        LDP     q22, q23, [x5], 32
187        FMLA    v18.4s, v22.4s, v0.s[1]
188        FMLA    v19.4s, v23.4s, v0.s[1]
1897:
190        TBZ     x0, 2, 4b               // no 1-float remainder: finish this pass
1918:
192        # Remainder- 1 float of A (4 bytes)
193        LDP     q20, q21, [x5], 32
194        LDR     s0, [x3], 4
195        FMLA    v16.4s, v20.4s, v0.s[0]
196        FMLA    v17.4s, v21.4s, v0.s[0]
197        B       4b
198
199        # Store the leftover columns (fewer than 8): 4, then 2, then 1, selected by bits 2..0 of x1
2009:
201        TBZ     x1, 2, 10f
202        STR     q16, [x6], 16
203        MOV     v16.16b, v17.16b        // bring the upper 4 columns down into v16
204
20510:
206        TBZ     x1, 1, 11f
207        STR     d16, [x6], 8
208        DUP     d16, v16.d[1]           // bring the next 2 columns into the low half
209
21011:
211        TBZ     x1, 0, 12f
212        STR     s16, [x6]
21312:
214        RET
215
216END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75
217
218#ifdef __ELF__
219.section ".note.GNU-stack","",%progbits  // ELF builds only: emit an empty .note.GNU-stack section
220#endif
221