xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/1x12-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
13#     size_t mr,                (x0) - unused.  mr = 1
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          (x4) - unused
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         (x7) - unused
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointer
27# x3  a0
28
29# C pointer
30# x6  c0
31
32# Clamp v2 v3
33
34# A53 based on A57/A75 but with LD64
35
36BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53
        // Single-row GEMM micro-kernel with min/max clamping.
        //
        // One pass (label 0) produces up to 12 output columns:
        //   - v16-v18 are initialised with 12 floats read from w,
        //   - for every float a0[k] covered by kc, the next 12 floats of w are
        //     multiplied by a0[k] and accumulated,
        //   - the result is clamped with v2 (FMAX) and v3 (FMIN),
        //   - a full tile is stored at c0, c0 is advanced by cn_stride and a0 is
        //     rewound by kc; a tile narrower than 12 is stored piecewise (label 9).
        // Passes repeat while more than 12 columns were left before the store.
        //
        // Register use in this function:
        //   x0        kc countdown in bytes (kc minus multiples of 32)
        //   x1        columns of nc still to produce
        //   x2        kc in bytes
        //   x3        a0, x5 = w, x6 = c0
        //   x8        params pointer, x14 = cn_stride
        //   v0, v1    4 floats of A each
        //   v2, v3    clamp bounds
        //   v16-v18   accumulators for A lanes 0 and 2, and for all remainder steps
        //   v5-v7     accumulators for A lanes 1 and 3 in the unrolled code
        //   v20-v31   B values from w, 3 q registers (12 floats) per k step
        // Only x0-x8, x14, v0-v7 and v16-v31 are written, so nothing is saved or
        // restored and sp is only read.
37
38        # Load cn_stride, params pointer
39        LDP     x14, x8, [sp]
40
41        # Load min/max values
42        LD2R    {v2.4s, v3.4s}, [x8]    // v2 = 1st float at params, v3 = 2nd, each replicated to all 4 lanes
        // Start of one pass over a tile of up to 12 output columns.
430:
44        # Load initial bias from w into accumulators
45        LD1     {v16.16b, v17.16b, v18.16b}, [x5], 48    // 12 floats; w += 48 bytes
46
        // v5-v7 start at zero and are added into v16-v18 at label 4.
        // The PRFMs prefetch w at byte offsets 0, 64, 128 and 192 past the bias.
47        MOVI    v5.4s, 0                // second set of C for pipelining FMLA
48        PRFM    PLDL1KEEP, [x5]
49        MOVI    v6.4s, 0
50        PRFM    PLDL1KEEP, [x5, 64]
51        MOVI    v7.4s, 0
52        PRFM    PLDL1KEEP, [x5, 128]
53        PRFM    PLDL1KEEP, [x5, 192]
54
55        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
56        SUBS    x0, x2, 32              // k = kc - 32
57
58        B.LO    3f                      // kc < 32 bytes: skip the unrolled code, use the remainder steps only
59
60        # 16 prologue
61        # Read first block of 1 A and B.
        // Preloads B for 4 k steps (12 q registers, 192 bytes of w) and the
        // first 4 floats of A (16 bytes).
62        LDP     q20, q21, [x5], 32
63        LDP     q22, q23, [x5], 32
64        LDP     q24, q25, [x5], 32
65        LDP     q26, q27, [x5], 32
66        LDP     q28, q29, [x5], 32
67        LDP     q30, q31, [x5], 32
68        LDR     q0, [x3], 16
69
70        # Is there at least 32.  yes do main loop
71        SUBS    x0, x0, 32
72        B.LO    2f                      // kc < 64 bytes in total: go straight to the epilogue
73
74        # Main loop - 8 floats of A (32 bytes)
        // Each iteration consumes 8 floats of A and reloads v20-v31 twice.
        // Every LDR into v20-v31 is placed after the FMLA that reads the old
        // value of that register, so the loads refill B for the following block.
        // Lanes 0 and 2 of A accumulate into v16-v18, lanes 1 and 3 into v5-v7.
751:
76        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
77        FMLA    v16.4s, v20.4s, v0.s[0]
78        LDR     q1, [x3], 16
79        FMLA    v17.4s, v21.4s, v0.s[0]
80        LDR     q20, [x5], 16
81        FMLA    v18.4s, v22.4s, v0.s[0]
82        LDR     q21, [x5], 16
83        FMLA    v5.4s, v23.4s, v0.s[1]
84        LDR     q22, [x5], 16
85        FMLA    v6.4s, v24.4s, v0.s[1]
86        LDR     q23, [x5], 16
87        FMLA    v7.4s, v25.4s, v0.s[1]
88        LDR     q24, [x5], 16
89        FMLA    v16.4s, v26.4s, v0.s[2]
90        LDR     q25, [x5], 16
91        FMLA    v17.4s, v27.4s, v0.s[2]
92        LDR     q26, [x5], 16
93        FMLA    v18.4s, v28.4s, v0.s[2]
94        LDR     q27, [x5], 16
95        FMLA    v5.4s, v29.4s, v0.s[3]
96        LDR     q28, [x5], 16
97        FMLA    v6.4s, v30.4s, v0.s[3]
98        LDR     q29, [x5], 16
99        FMLA    v7.4s, v31.4s, v0.s[3]
100        LDR     q30, [x5], 16
101        LDR     q31, [x5], 16
102
103        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
104        FMLA    v16.4s, v20.4s, v1.s[0]
105        LDR     q0, [x3], 16
106        FMLA    v17.4s, v21.4s, v1.s[0]
107        LDR     q20, [x5], 16
108        FMLA    v18.4s, v22.4s, v1.s[0]
109        LDR     q21, [x5], 16
110        FMLA    v5.4s, v23.4s, v1.s[1]
111        LDR     q22, [x5], 16
112        FMLA    v6.4s, v24.4s, v1.s[1]
113        LDR     q23, [x5], 16
114        FMLA    v7.4s, v25.4s, v1.s[1]
115        LDR     q24, [x5], 16
116        FMLA    v16.4s, v26.4s, v1.s[2]
117        LDR     q25, [x5], 16
118        FMLA    v17.4s, v27.4s, v1.s[2]
119        LDR     q26, [x5], 16
120        FMLA    v18.4s, v28.4s, v1.s[2]
121        LDR     q27, [x5], 16
122        FMLA    v5.4s, v29.4s, v1.s[3]
123        LDR     q28, [x5], 16
124        FMLA    v6.4s, v30.4s, v1.s[3]
125        LDR     q29, [x5], 16
126        FMLA    v7.4s, v31.4s, v1.s[3]
127        LDR     q30, [x5], 16
128        SUBS    x0, x0, 32
129        LDR     q31, [x5], 16
130        B.HS    1b                      // loop while the SUBS did not borrow; the LDR in between leaves the flags alone
131
        // Entered with v0 and v20-v31 already loaded, either by the prologue
        // or by the final main loop iteration. Handles the last 8 floats of A
        // that the unrolled code covers.
1322:
133        # Epilogue
134
135        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
136        FMLA    v16.4s, v20.4s, v0.s[0]
137        LDR     q1, [x3], 16
138        FMLA    v17.4s, v21.4s, v0.s[0]
139        LDR     q20, [x5], 16
140        FMLA    v18.4s, v22.4s, v0.s[0]
141        LDR     q21, [x5], 16
142        FMLA    v5.4s, v23.4s, v0.s[1]
143        LDR     q22, [x5], 16
144        FMLA    v6.4s, v24.4s, v0.s[1]
145        LDR     q23, [x5], 16
146        FMLA    v7.4s, v25.4s, v0.s[1]
147        LDR     q24, [x5], 16
148        FMLA    v16.4s, v26.4s, v0.s[2]
149        LDR     q25, [x5], 16
150        FMLA    v17.4s, v27.4s, v0.s[2]
151        LDR     q26, [x5], 16
152        FMLA    v18.4s, v28.4s, v0.s[2]
153        LDR     q27, [x5], 16
154        FMLA    v5.4s, v29.4s, v0.s[3]
155        LDR     q28, [x5], 16
156        FMLA    v6.4s, v30.4s, v0.s[3]
157        LDR     q29, [x5], 16
158        FMLA    v7.4s, v31.4s, v0.s[3]
159        LDR     q30, [x5], 16
160
161        # Second block of 4.  FMA for second 4, no loads.
        // NOTE: one load does remain in this block. q31, which the final FMLA
        // below reads, is fetched here rather than in the first block. Nothing
        // is preloaded for a following block.
162        FMLA    v16.4s, v20.4s, v1.s[0]
163        LDR     q31, [x5], 16
164        FMLA    v17.4s, v21.4s, v1.s[0]
165        FMLA    v18.4s, v22.4s, v1.s[0]
166        FMLA    v5.4s, v23.4s, v1.s[1]
167        FMLA    v6.4s, v24.4s, v1.s[1]
168        FMLA    v7.4s, v25.4s, v1.s[1]
169        FMLA    v16.4s, v26.4s, v1.s[2]
170        FMLA    v17.4s, v27.4s, v1.s[2]
171        FMLA    v18.4s, v28.4s, v1.s[2]
172        FMLA    v5.4s, v29.4s, v1.s[3]
173        FMLA    v6.4s, v30.4s, v1.s[3]
174        FMLA    v7.4s, v31.4s, v1.s[3]
175
        // Only multiples of 32 have been subtracted from kc to form x0, so
        // bits 4, 3 and 2 of x0 equal those of kc: 16, 8 and 4 leftover bytes
        // of A respectively.
1763:
177        # Is there a remainder?- 4 floats of A (16 bytes)
178        TBNZ    x0, 4, 5f
179        # Is there a remainder?- 2 floats of A (8 bytes)
180        TBNZ    x0, 3, 6f
181        # Is there a remainder?- 1 float of A (4 bytes)
182        TBNZ    x0, 2, 8f
183
        // All of kc has been consumed: fold the second accumulator set into
        // the first, clamp, and store.
1844:
185        FADD    v16.4s, v16.4s, v5.4s
186        FADD    v17.4s, v17.4s, v6.4s
187        FADD    v18.4s, v18.4s, v7.4s
188        SUBS    x1, x1, 12              // nc -= 12; flags are consumed by B.LO and B.HI below, nothing in between writes them
189
190        # Clamp
191        FMAX    v16.4s, v16.4s, v2.4s   // acc = max(acc, v2)
192        FMAX    v17.4s, v17.4s, v2.4s
193        FMAX    v18.4s, v18.4s, v2.4s
194        FMIN    v16.4s, v16.4s, v3.4s   // acc = min(acc, v3)
195        FMIN    v17.4s, v17.4s, v3.4s
196        FMIN    v18.4s, v18.4s, v3.4s
197
198        # Store full 1 x 12
199        B.LO    9f                      // fewer than 12 columns were left: partial store
200
201        ST1     {v16.16b, v17.16b, v18.16b}, [x6], x14    // 12 floats; c0 += cn_stride
202        SUB     x3,  x3, x2             // a0 -= kc
203
204        B.HI    0b                      // columns remain: next tile
205
206        RET                             // exactly 12 columns were left: done
207
        // The remainder steps below accumulate into v16-v18 only.
2085:
209        # Remainder- 4 floats of A (16 bytes)
210        LDR     q0, [x3], 16
211        LDR     q20, [x5], 16
212        LDR     q21, [x5], 16
213        LDR     q22, [x5], 16
214        FMLA    v16.4s, v20.4s, v0.s[0]
215        FMLA    v17.4s, v21.4s, v0.s[0]
216        FMLA    v18.4s, v22.4s, v0.s[0]
217
218        LDR     q20, [x5], 16
219        LDR     q21, [x5], 16
220        LDR     q22, [x5], 16
221        FMLA    v16.4s, v20.4s, v0.s[1]
222        FMLA    v17.4s, v21.4s, v0.s[1]
223        FMLA    v18.4s, v22.4s, v0.s[1]
224
225        LDR     q20, [x5], 16
226        LDR     q21, [x5], 16
227        LDR     q22, [x5], 16
228        FMLA    v16.4s, v20.4s, v0.s[2]
229        FMLA    v17.4s, v21.4s, v0.s[2]
230        FMLA    v18.4s, v22.4s, v0.s[2]
231
232        LDR     q20, [x5], 16
233        LDR     q21, [x5], 16
234        LDR     q22, [x5], 16
235        FMLA    v16.4s, v20.4s, v0.s[3]
236        FMLA    v17.4s, v21.4s, v0.s[3]
237        FMLA    v18.4s, v22.4s, v0.s[3]
238
239        TBZ     x0, 3, 7f               // no 2-float remainder: skip to the 1-float test
2406:
241        # Remainder- 2 floats of A (8 bytes)
242        LDR     d0, [x3], 8
243        LDR     q20, [x5], 16
244        LDR     q21, [x5], 16
245        LDR     q22, [x5], 16
246        FMLA    v16.4s, v20.4s, v0.s[0]
247        FMLA    v17.4s, v21.4s, v0.s[0]
248        FMLA    v18.4s, v22.4s, v0.s[0]
249
250        LDR     q20, [x5], 16
251        LDR     q21, [x5], 16
252        LDR     q22, [x5], 16
253        FMLA    v16.4s, v20.4s, v0.s[1]
254        FMLA    v17.4s, v21.4s, v0.s[1]
255        FMLA    v18.4s, v22.4s, v0.s[1]
2567:
257        TBZ     x0, 2, 4b               // no 1-float remainder: go finish the tile
2588:
259        # Remainder- 1 float of A (4 bytes)
260        LDR     s0, [x3], 4
261        LDR     q20, [x5], 16
262        LDR     q21, [x5], 16
263        LDR     q22, [x5], 16
264        FMLA    v16.4s, v20.4s, v0.s[0]
265        FMLA    v17.4s, v21.4s, v0.s[0]
266        FMLA    v18.4s, v22.4s, v0.s[0]
267        B       4b
268
269        # Store odd channels
        // Reached only when fewer than 12 columns were left. The column count
        // is written out by its bits (8, 4, 2, 1 floats); after each store the
        // not yet stored data is moved down into v16. This path returns
        // without stepping c0 by cn_stride or rewinding a0.
2709:
271        ADD     x1, x1, 12              // undo the SUBS: x1 = columns left, below 12 on this path
272        TBZ     x1, 3, 10f              // bit 3 clear: no group of 8
273        STP     q16, q17, [x6], 32      // 8 floats
274        MOV     v16.16b, v18.16b        // remaining columns come from v18
275
27610:
277        TBZ     x1, 2, 11f              // bit 2 clear: no group of 4
278        STR     q16, [x6], 16           // 4 floats
279        MOV     v16.16b, v17.16b        // next columns come from v17; bits 3 and 2 are never both set since x1 < 12
280
28111:
282        TBZ     x1, 1, 12f              // bit 1 clear: no group of 2
283        STR     d16, [x6], 8            // 2 floats
284        DUP     d16, v16.d[1]           // move the upper 2 floats down
285
28612:
287        TBZ     x1, 0, 13f              // bit 0 clear: nothing left
288        STR     s16, [x6]               // last float
28913:
290        RET
291
292END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53
293
294#ifdef __ELF__
295.section ".note.GNU-stack","",%progbits
296#endif
297