xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x2-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x2-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# Register usage
27# A0  x3 v0  v4
28# A1 x11 v1  v5
29# A2 x12 v2  v6
30# A3  x4 v3  v7
31
32# B   x5 v16 v17 v18 v19 v20 v21 v22 v23
33
34# C0  x6 v24 v25
35# C1  x9 v26 v27
36# C2 x10 v28 v29
37# C3  x7 v30 v31
38
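39# Clamp v4 v5  (v4 = min, v5 = max; v4/v5 also hold A0/A1 data in the main loop and epilogue, so they are reloaded from params before clamping)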
40
# Computes a 4-row by 2-column tile of C per pass. Each row accumulator is
# seeded with 2 floats read from w, accumulates products of A-row elements with
# 2-float vectors read from w, is clamped against the two values loaded from
# params, and is stored through the four C row pointers. The pass repeats while
# nc (x1) still has columns left.
41BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75
42
43        # Load cn_stride (x14) and the params pointer (x8) from the stack
44        LDP     x14, x8, [sp]
45
46        # Load min/max values: LD2R reads two consecutive floats at x8 and
        # replicates the first into v4 (used with FMAX below) and the second
        # into v5 (used with FMIN below).
47        LD2R    {v4.2s, v5.2s}, [x8]
48
49        # Clamp A and C pointers: rows at index >= mr alias the previous row,
        # so they read the same A data and store to the same C address.
50        CMP     x0, 2                   // if mr < 2
51        ADD     x11, x3, x4             // a1 = a0 + a_stride
52        ADD     x9, x6, x7              // c1 = c0 + cm_stride
53        CSEL    x11, x3, x11, LO        //   a1 = a0
54        CSEL    x9, x6, x9, LO          //   c1 = c0
55
56        ADD     x12, x11, x4            // a2 = a1 + a_stride
57        ADD     x10, x9, x7             // c2 = c1 + cm_stride
58                                        // if mr <= 2 (flags are still those of CMP x0, 2 above)
59        CSEL    x12, x11, x12, LS       //   a2 = a1
60        CSEL    x10, x9, x10, LS        //   c2 = c1
61
62        CMP     x0, 4                   // if mr < 4
63        ADD     x4, x12, x4             // a3 = a2 + a_stride
64        ADD     x7, x10, x7             // c3 = c2 + cm_stride
65        CSEL    x4, x12, x4, LO         //   a3 = a2
66        CSEL    x7, x10, x7, LO         //   c3 = c2
67
        # Label 0: start of one pass over k for the current 2 columns.
680:
69        # Load initial bias from w into accumulators.
        # v24/v26/v28/v30 start from the 2 bias floats; v25/v27/v29/v31 start
        # at zero and are added into them at label 3.
70        LDR     d24, [x5], 8
71        MOV     v26.8b, v24.8b
72        MOV     v30.8b, v24.8b
73        MOV     v28.8b, v24.8b
74        MOVI    v25.2s, 0
75        PRFM    PLDL1KEEP, [x5, 64]
76        MOVI    v27.2s, 0
77        PRFM    PLDL1KEEP, [x5, 128]
78        MOVI    v29.2s, 0
79        PRFM    PLDL1KEEP, [x5, 192]
80        MOVI    v31.2s, 0
81        PRFM    PLDL1KEEP, [x5, 256]
82
83        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
84        SUBS    x0, x2, 32              // k = kc - 32
85        B.LO    4f                      // kc < 32 bytes: remainder code only
86
87        # Prologue
88        # Read first block of 4 A and B.
89        LDR     q0,  [x3], 16
90        LDP     d20, d21, [x5], 16
91        LDR     q1, [x11], 16
92        LDR     q2, [x12], 16
93        LDR     q3,  [x4], 16
94        LDP     d22, d23, [x5], 16
95
96        # Is there at least another 32 bytes?  Yes: do main loop.  No: epilogue.
97        SUBS    x0, x0, 32
98        B.LO    2f
99
100        # Main loop - 8 floats of A (32 bytes)
        # Each iteration reads 32 bytes from every A row and 64 bytes of w.
        # The LDR q4/q5 loads below overwrite the clamp values held in v4/v5;
        # they are reloaded from params in the epilogue.
1011:
102        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
103        FMLA    v24.2s, v20.2s, v0.s[0]
104        LDR     q4, [x3], 16
105        FMLA    v26.2s, v20.2s, v1.s[0]
106        FMLA    v28.2s, v20.2s, v2.s[0]
107        LDR     d16, [x5, 0]
108        FMLA    v30.2s, v20.2s, v3.s[0]
109        FMLA    v25.2s, v21.2s, v0.s[1]
110        LDR     q5, [x11], 16
111        FMLA    v27.2s, v21.2s, v1.s[1]
112        FMLA    v29.2s, v21.2s, v2.s[1]
113        LDR     q6, [x12], 16
114        FMLA    v31.2s, v21.2s, v3.s[1]
115        FMLA    v24.2s, v22.2s, v0.s[2]
116        LDR     q7, [x4], 16
117        FMLA    v26.2s, v22.2s, v1.s[2]
118        FMLA    v28.2s, v22.2s, v2.s[2]
119        LDR     d17, [x5, 8]
120        FMLA    v30.2s, v22.2s, v3.s[2]
121        FMLA    v25.2s, v23.2s, v0.s[3]
122        LDR     d18, [x5, 16]
123        FMLA    v27.2s, v23.2s, v1.s[3]
124        FMLA    v29.2s, v23.2s, v2.s[3]
125        LDR     d19, [x5, 24]
126        FMLA    v31.2s, v23.2s, v3.s[3]
127        PRFM    PLDL1KEEP, [x5, 320]
128
129        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
130        FMLA    v24.2s, v16.2s, v4.s[0]
131        LDR     q0, [x3], 16
132        FMLA    v26.2s, v16.2s, v5.s[0]
133        FMLA    v28.2s, v16.2s, v6.s[0]
134        LDR     d20, [x5, 32]
135        FMLA    v30.2s, v16.2s, v7.s[0]
136        FMLA    v25.2s, v17.2s, v4.s[1]
137        LDR     q1, [x11], 16
138        FMLA    v27.2s, v17.2s, v5.s[1]
139        FMLA    v29.2s, v17.2s, v6.s[1]
140        LDR     q2, [x12], 16
141        FMLA    v31.2s, v17.2s, v7.s[1]
142        FMLA    v24.2s, v18.2s, v4.s[2]
143        LDR     q3, [x4], 16
144        FMLA    v26.2s, v18.2s, v5.s[2]
145        FMLA    v28.2s, v18.2s, v6.s[2]
146        LDR     d21, [x5, 40]
147        FMLA    v30.2s, v18.2s, v7.s[2]
148        SUBS    x0, x0, 32              // k -= 32 bytes; flags are consumed by B.HS below
149        FMLA    v25.2s, v19.2s, v4.s[3]
150        LDR     d22, [x5, 48]
151        FMLA    v27.2s, v19.2s, v5.s[3]
152        LDR     d23, [x5, 56]
153        FMLA    v29.2s, v19.2s, v6.s[3]
154        ADD     x5, x5, 64              // step w past the 64 bytes read at offsets 0..56
155        FMLA    v31.2s, v19.2s, v7.s[3]
156        B.HS    1b
157
1582:
159        # Epilogue
        # Uses the A/B block already loaded by the prologue or by the last
        # main-loop iteration, then one final block loaded here.
160        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
161        FMLA    v24.2s, v20.2s, v0.s[0]
162        LDR     q4, [x3], 16
163        FMLA    v26.2s, v20.2s, v1.s[0]
164        FMLA    v28.2s, v20.2s, v2.s[0]
165        LDR     d16, [x5, 0]
166        FMLA    v30.2s, v20.2s, v3.s[0]
167        FMLA    v25.2s, v21.2s, v0.s[1]
168        LDR     q5, [x11], 16
169        FMLA    v27.2s, v21.2s, v1.s[1]
170        FMLA    v29.2s, v21.2s, v2.s[1]
171        LDR     q6, [x12], 16
172        FMLA    v31.2s, v21.2s, v3.s[1]
173        FMLA    v24.2s, v22.2s, v0.s[2]
174        LDR     q7, [x4], 16
175        FMLA    v26.2s, v22.2s, v1.s[2]
176        FMLA    v28.2s, v22.2s, v2.s[2]
177        LDR     d17, [x5, 8]
178        FMLA    v30.2s, v22.2s, v3.s[2]
179        FMLA    v25.2s, v23.2s, v0.s[3]
180        LDR     d18, [x5, 16]
181        FMLA    v27.2s, v23.2s, v1.s[3]
182        FMLA    v29.2s, v23.2s, v2.s[3]
183        LDR     d19, [x5, 24]
184        FMLA    v31.2s, v23.2s, v3.s[3]
185        PRFM    PLDL1KEEP, [x5, 320]
186
187        # Second block of 4.  FMA for second 4, no loads
188        FMLA    v24.2s, v16.2s, v4.s[0]
189        FMLA    v26.2s, v16.2s, v5.s[0]
190        FMLA    v28.2s, v16.2s, v6.s[0]
191        FMLA    v30.2s, v16.2s, v7.s[0]
192        FMLA    v25.2s, v17.2s, v4.s[1]
193        FMLA    v27.2s, v17.2s, v5.s[1]
194        FMLA    v29.2s, v17.2s, v6.s[1]
195        FMLA    v31.2s, v17.2s, v7.s[1]
196        FMLA    v24.2s, v18.2s, v4.s[2]
197        FMLA    v26.2s, v18.2s, v5.s[2]
198        FMLA    v28.2s, v18.2s, v6.s[2]
199        ADDS    x0, x0, 32              // x0 = leftover bytes of k (0..31); flags are consumed by B.NE below
200        FMLA    v30.2s, v18.2s, v7.s[2]
201        FMLA    v25.2s, v19.2s, v4.s[3]
202        ADD     x5, x5, 32              // step w past d16-d19 read at offsets 0..24
203        FMLA    v27.2s, v19.2s, v5.s[3]
204        FMLA    v29.2s, v19.2s, v6.s[3]
205        LD2R    {v4.2s, v5.2s}, [x8]     // Reload min/max values (v4/v5 held A data); last reads of v4/v5 are above
206        FMLA    v31.2s, v19.2s, v7.s[3]
207
208        # Is there a remainder? up to 8 floats (32 bytes)
209        B.NE    4f
210
        # Label 3: fold the two accumulators of each row, clamp, and store.
2113:
212        FADD    v24.2s, v24.2s, v25.2s
213        FADD    v26.2s, v26.2s, v27.2s
214        FADD    v28.2s, v28.2s, v29.2s
215        FADD    v30.2s, v30.2s, v31.2s
216
217        # Clamp: FMAX against v4 (lower bound), then FMIN against v5 (upper bound)
218        FMAX    v24.2s, v24.2s, v4.2s
219        FMAX    v26.2s, v26.2s, v4.2s
220        FMAX    v28.2s, v28.2s, v4.2s
221        FMAX    v30.2s, v30.2s, v4.2s
222        SUBS    x1, x1, 2               // nc -= 2; flags are consumed by B.LO and B.HI below
223        FMIN    v24.2s, v24.2s, v5.2s
224        FMIN    v26.2s, v26.2s, v5.2s
225        FMIN    v28.2s, v28.2s, v5.2s
226        FMIN    v30.2s, v30.2s, v5.2s
227
228        # Store full 4 x 2
229        B.LO    7f                      // fewer than 2 columns were left: store one column at label 7
230
231        STR     d24, [x6]
232        SUB     x3,  x3, x2             // a0 -= kc
233        ADD     x6,  x6, x14            // c0 += cn_stride
234        STR     d26, [x9]
235        SUB     x11, x11, x2            // a1 -= kc
236        ADD     x9,  x9, x14            // c1 += cn_stride
237        STR     d28, [x10]
238        SUB     x12, x12, x2            // a2 -= kc
239        ADD     x10, x10, x14           // c2 += cn_stride
240        STR     d30, [x7]
241        SUB     x4,  x4, x2             // a3 -= kc
242        ADD     x7,  x7, x14            // c3 += cn_stride
243
244        B.HI    0b                      // nc is still > 0 after the subtraction: next 2 columns
245        RET
246
        # Label 4: remainder of k, selected by bits 4, 3 and 2 of x0.
        # Reached from the epilogue (x0 = leftover bytes) or directly when
        # kc < 32 (x0 = kc - 32, whose bits 2-4 equal those of kc).
2474:
248        # Remainder- 4 floats of A (16 bytes)
249        TBZ     x0, 4, 5f
250
251        LDR     q0,  [x3], 16
252        LDP     d20, d21, [x5], 16
253        LDR     q1, [x11], 16
254        LDR     q2, [x12], 16
255        LDR     q3,  [x4], 16
256        LDP     d22, d23, [x5], 16
257        FMLA    v24.2s, v20.2s, v0.s[0]
258        FMLA    v26.2s, v20.2s, v1.s[0]
259        FMLA    v28.2s, v20.2s, v2.s[0]
260        FMLA    v30.2s, v20.2s, v3.s[0]
261        FMLA    v25.2s, v21.2s, v0.s[1]
262        FMLA    v27.2s, v21.2s, v1.s[1]
263        FMLA    v29.2s, v21.2s, v2.s[1]
264        FMLA    v31.2s, v21.2s, v3.s[1]
265        FMLA    v24.2s, v22.2s, v0.s[2]
266        FMLA    v26.2s, v22.2s, v1.s[2]
267        FMLA    v28.2s, v22.2s, v2.s[2]
268        FMLA    v30.2s, v22.2s, v3.s[2]
269        FMLA    v25.2s, v23.2s, v0.s[3]
270        FMLA    v27.2s, v23.2s, v1.s[3]
271        FMLA    v29.2s, v23.2s, v2.s[3]
272        FMLA    v31.2s, v23.2s, v3.s[3]
273
2745:
275        # Remainder- 2 floats of A (8 bytes)
276        TBZ     x0, 3, 6f
277
278        LDR     d0,  [x3], 8
279        LDP     d20, d21, [x5], 16
280        LDR     d1, [x11], 8
281        LDR     d2, [x12], 8
282        LDR     d3,  [x4], 8
283        FMLA    v24.2s, v20.2s, v0.s[0]
284        FMLA    v26.2s, v20.2s, v1.s[0]
285        FMLA    v28.2s, v20.2s, v2.s[0]
286        FMLA    v30.2s, v20.2s, v3.s[0]
287        FMLA    v25.2s, v21.2s, v0.s[1]
288        FMLA    v27.2s, v21.2s, v1.s[1]
289        FMLA    v29.2s, v21.2s, v2.s[1]
290        FMLA    v31.2s, v21.2s, v3.s[1]
291
2926:
293        # Remainder- 1 float of A (4 bytes)
294        TBZ     x0, 2, 3b
295
296        LDR     s0,  [x3], 4
297        LDR     d20, [x5], 8
298        LDR     s1, [x11], 4
299        LDR     s2, [x12], 4
300        LDR     s3,  [x4], 4
301        FMLA    v24.2s, v20.2s, v0.s[0]
302        FMLA    v26.2s, v20.2s, v1.s[0]
303        FMLA    v28.2s, v20.2s, v2.s[0]
304        FMLA    v30.2s, v20.2s, v3.s[0]
305        B       3b
306
307        # Store odd width: only lane 0 (one float) of each row is written
3087:
309        STR     s24,  [x6]
310        STR     s26,  [x9]
311        STR     s28, [x10]
312        STR     s30,  [x7]
        # NOTE(review): no branch to label 10 is visible in this file; it is
        # reached here by fall-through only.
31310:
314        RET
315
316
317END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75
318
319#ifdef __ELF__
320.section ".note.GNU-stack","",%progbits
321#endif
322