xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary vector shadow register
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
54# unused A   v8 v9 v10 v11
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
// xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
//
// Accumulates a tile of up to 4 rows x 8 columns of C in v20-v27 (two
// 4-float vectors per row), clamps it between the two floats loaded from
// params, and stores it. The whole sequence repeats for the next 8 columns
// while nc remains.
//
// Register reuse to keep in mind when reading the body:
//   x0  holds mr on entry, then the remaining-k byte counter, and finally
//       cn_stride (reloaded from the stack at label 3).
//   x4  holds a_stride on entry; once the row pointers are set up it becomes
//       a scratch register for 64-bit loads that are moved into the upper
//       half of a vector register with INS.
//   sp  is lowered by 32 bytes to save d12-d15, so the stack argument
//       cn_stride (at [sp] on entry) is read from [sp, 32].
//
// Numeric local labels:
//   0:    start of one 8-column tile (bias load, prefetch)
//   1:    main loop, 4 floats (16 bytes) of each A row per iteration
//   2:    epilogue, final 4 floats, loads only what it consumes itself
//   3:    clamp and store
//   4:    k remainder of 2 floats
//   5:    k remainder of 1 float
//   6-9:  store of a partial tile (fewer than 8 columns left)
63BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
64
65        # Load params pointer
66        LDR     x8, [sp, 8]
67
        // Rows at index >= mr reuse the previous row's A and C pointers, so
        // the kernel always processes 4 rows without reading or writing
        // outside the mr rows that exist.
68        # Clamp A and C pointers
69        CMP     x0, 2                   // if mr < 2
70        ADD     x9, x3, x4              // a1 = a0 + a_stride
71        ADD     x16, x6, x7             // c1 = c0 + cm_stride
72        CSEL    x9, x3, x9, LO          //   a1 = a0
73        CSEL    x16, x6, x16, LO        //   c1 = c0
74
        // The LS selects below still use the flags of "CMP x0, 2" above;
        // ADD and CSEL do not modify the flags.
75        ADD     x10, x9, x4             // a2 = a1 + a_stride
76        ADD     x17, x16, x7            // c2 = c1 + cm_stride
77                                        // if mr <= 2
78        CSEL    x10, x9, x10, LS        //   a2 = a1
79        CSEL    x17, x16, x17, LS       //   c2 = c1
80
81        CMP     x0, 4                   // if mr < 4
82        ADD     x11, x10, x4            // a3 = a2 + a_stride
83        ADD     x14, x17, x7            // c3 = c2 + cm_stride
84        CSEL    x11, x10, x11, LO       //   a3 = a2
85        CSEL    x14, x17, x14, LO       //   c3 = c2
86
        // LD2R broadcasts the first float at [x8] to all lanes of v6 and the
        // second float to all lanes of v7. v6 is later used with FMAX (lower
        // bound) and v7 with FMIN (upper bound).
87        # Load min/max values
88        LD2R    {v6.4s, v7.4s}, [x8]
89
        // v12-v15 are used below as the second set of B registers, so their
        // low halves d12-d15 are saved here and restored before each RET.
90        # Save d12-d15 on stack
91        STP     d12, d13, [sp, -32]!
92        STP     d14, d15, [sp, 16]
93
940:
        // Start of one 8-column tile. The 8 bias floats read from w are
        // copied into the accumulators of all 4 rows. x5 (w) only ever moves
        // forward: each tile reads its bias and then its weights.
95        # Load initial bias from w into accumulators
96        LDP     q20, q21, [x5], 32
97        MOV     v22.16b, v20.16b
98        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
99        PRFM    PLDL1KEEP,  [x3, 64]
100        MOV     v23.16b, v21.16b
101        PRFM    PLDL1KEEP,  [x9,  0]
102        PRFM    PLDL1KEEP,  [x9, 64]
103        MOV     v24.16b, v20.16b
104        PRFM    PLDL1KEEP, [x10,  0]
105        PRFM    PLDL1KEEP, [x10, 64]
106        MOV     v25.16b, v21.16b
107        PRFM    PLDL1KEEP, [x11,  0]
108        PRFM    PLDL1KEEP, [x11, 64]
109        MOV     v26.16b, v20.16b
110        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
111        MOV     v27.16b, v21.16b
112        PRFM    PLDL1KEEP, [x5,  64]
113        PRFM    PLDL1KEEP, [x5, 128]
114        PRFM    PLDL1KEEP, [x5, 192]
115
        // From here on x0 no longer holds mr; it counts the remaining bytes
        // of k. With kc < 16 the code goes straight to the remainder at
        // label 4.
116        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
117        SUBS    x0, x2, 16              // k = kc - 16
118        B.LO    4f
119
        // Loads the first register set (A in v0/v1, B in v16-v19) without
        // consuming it. The upper half of v19 is left in x4 and inserted by
        // the first instruction group of label 1 or label 2.
        // The flags set by the SUBS below are consumed by "B.LO 2f"; the
        // loads in between do not change them.
120        # Prologue - First group loads, no FMA
121        LDR     d0, [x3], 8              // a0
122        LDP     q16, q17, [x5], 32         // b
123        LDR     d1, [x10], 8             // a2
124        LD1     {v0.d}[1],  [x9], 8       // a1
125        LD1     {v1.d}[1], [x11], 8       // a3
126        SUBS    x0, x0, 16
127        LDR     q18, [x5], 16
128        LDR     d19, [x5], 8
129        LDR     x4, [x5], 8             // ins is in BLOCK 0
130
131        # Is there at least 4 floats (16 bytes) for main loop?
132        B.LO    2f
133
        // Two register sets alternate: while the FMLAs consume one set
        // (v0/v1 with v16-v19, or v3/v4 with v12-v15), the loads fill the
        // other. Each 128-bit B vector is loaded as two 64-bit halves: the
        // low half directly into dN, the high half through x4 followed by
        // INS vN.d[1], x4.
        // One iteration reads 128 bytes of w at fixed offsets from x5 and
        // advances x5 once at the end. The closing B.HS uses the flags of
        // the "SUBS x0, x0, 16" in the first group's BLOCK 5; nothing after
        // it in the loop body writes the flags.
134        # Main loop - 4 floats of A (16 bytes)
135        # 32 FMA + 8 LD64 A + 8 LDR B
1361:
137        # First group of 16 FMA, Second group loads
138        # BLOCK 0
139        FMLA    v20.4s, v16.4s,  v0.s[0]
140        LDR     d3, [x3], 8              // a0
141        FMLA    v22.4s, v16.4s,  v0.s[2]
142        INS     v19.d[1], x4               // b from second group
143        FMLA    v24.4s, v16.4s,  v1.s[0]
144        LDR     x4, [x9], 8               // a1
145
146        # BLOCK 1
147        FMLA    v26.4s, v16.4s,  v1.s[2]
148        LDR     d12, [x5]
149        FMLA    v21.4s, v17.4s,  v0.s[0]
150        INS     v3.d[1], x4                // a1 ins
151        FMLA    v23.4s, v17.4s,  v0.s[2]
152        LDR     x4, [x5, 8]             // b
153
154        # BLOCK 2
155        FMLA    v25.4s, v17.4s,  v1.s[0]
156        LDR     d4, [x10], 8             // a2
157        FMLA    v27.4s, v17.4s,  v1.s[2]
158        INS     v12.d[1], x4            // b  ins
159        FMLA    v20.4s, v18.4s,  v0.s[1]
160        LDR     x4, [x11], 8              // a3
161
162        # BLOCK 3
163        FMLA    v22.4s, v18.4s,  v0.s[3]
164        LDR     d13, [x5, 16]
165        FMLA    v24.4s, v18.4s,  v1.s[1]
166        INS     v4.d[1], x4                // a3 ins
167        FMLA    v26.4s, v18.4s,  v1.s[3]
168        LDR     x4, [x5, 24]
169
170        # BLOCK 4
171        FMLA    v21.4s, v19.4s,  v0.s[1]
172        LDR     d14, [x5, 32]
173        FMLA    v23.4s, v19.4s,  v0.s[3]
174        INS     v13.d[1], x4            // b
175        FMLA    v25.4s, v19.4s,  v1.s[1]
176        LDR     x4, [x5, 40]
177
178        # BLOCK 5
179        # NOPs to ensure 4 cycle LDR lands on next LDR
180        FMLA    v27.4s, v19.4s,  v1.s[3]
181        LDR     d15, [x5, 48]
182        NOP
183        INS     v14.d[1], x4            // b from previous
184        SUBS    x0, x0, 16
185        LDR     x4, [x5, 56]
186
187        # Second group of 16 FMA, First group of loads
188        # BLOCK 0
189        FMLA    v20.4s, v12.4s,  v3.s[0]
190        LDR     d0, [x3], 8              // a0
191        FMLA    v22.4s, v12.4s,  v3.s[2]
192        INS     v15.d[1], x4            // b from previous
193        FMLA    v24.4s, v12.4s,  v4.s[0]
194        LDR     x4, [x9], 8               // a1
195
196        # BLOCK 1
197        FMLA    v26.4s, v12.4s,  v4.s[2]
198        LDR     d16, [x5, 64]
199        FMLA    v21.4s, v13.4s,  v3.s[0]
200        INS     v0.d[1], x4                // a1 ins
201        FMLA    v23.4s, v13.4s,  v3.s[2]
202        LDR     x4, [x5, 72]            // b
203
204        # BLOCK 2
205        FMLA    v25.4s, v13.4s,  v4.s[0]
206        LDR     d1, [x10], 8             // a2
207        FMLA    v27.4s, v13.4s,  v4.s[2]
208        INS     v16.d[1], x4            // b
209        FMLA    v20.4s, v14.4s,  v3.s[1]
210        LDR     x4, [x11], 8              // a3
211
212        # BLOCK 3
213        FMLA    v22.4s, v14.4s,  v3.s[3]
214        LDR     d17, [x5, 80]
215        FMLA    v24.4s, v14.4s,  v4.s[1]
216        INS     v1.d[1], x4                // a3 ins
217        FMLA    v26.4s, v14.4s,  v4.s[3]
218        LDR     x4, [x5, 88]
219
220        # BLOCK 4
221        FMLA    v21.4s, v15.4s,  v3.s[1]
222        LDR     d18, [x5, 96]
223        FMLA    v23.4s, v15.4s,  v3.s[3]
224        INS     v17.d[1], x4            // b
225        FMLA    v25.4s, v15.4s,  v4.s[1]
226        LDR     x4, [x5, 104]
227
228        # BLOCK 5
229        # NOTE that block needs to be 4 cycles for LDR not to stall
230        FMLA    v27.4s, v15.4s,  v4.s[3]
231        LDR     d19, [x5, 112]
232        INS     v18.d[1], x4
233        LDR     x4, [x5, 120]
234        ADD     x5, x5, 128
235        B.HS    1b
236
        // Same first group as the main loop (FMLAs on v0/v1 and v16-v19 while
        // v3/v4 and v12-v15 are loaded), but the second group only computes:
        // nothing is loaded for a further iteration. Only offsets 0..63 from
        // x5 are read, so x5 advances by 64.
237        # Epilogue - 4 floats of A (16 bytes)
238        # 32 FMA + 8 LD64 A + 8 LDR B
2392:
240        # First group of 16 FMA, Second group loads
241        # BLOCK 0
242        FMLA    v20.4s, v16.4s,  v0.s[0]
243        LDR     d3, [x3], 8              // a0
244        FMLA    v22.4s, v16.4s,  v0.s[2]
245        INS     v19.d[1], x4               // b from second group
246        FMLA    v24.4s, v16.4s,  v1.s[0]
247        LDR     x4, [x9], 8               // a1
248
249        # BLOCK 1
250        FMLA    v26.4s, v16.4s,  v1.s[2]
251        LDR     d12, [x5]
252        FMLA    v21.4s, v17.4s,  v0.s[0]
253        INS     v3.d[1], x4                // a1 ins
254        FMLA    v23.4s, v17.4s,  v0.s[2]
255        LDR     x4, [x5, 8]             // b
256
257        # BLOCK 2
258        FMLA    v25.4s, v17.4s,  v1.s[0]
259        LDR     d4, [x10], 8             // a2
260        FMLA    v27.4s, v17.4s,  v1.s[2]
261        INS     v12.d[1], x4            // b  ins
262        FMLA    v20.4s, v18.4s,  v0.s[1]
263        LDR     x4, [x11], 8              // a3
264
265        # BLOCK 3
266        FMLA    v22.4s, v18.4s,  v0.s[3]
267        LDR     d13, [x5, 16]
268        FMLA    v24.4s, v18.4s,  v1.s[1]
269        INS     v4.d[1], x4                // a3 ins
270        FMLA    v26.4s, v18.4s,  v1.s[3]
271        LDR     x4, [x5, 24]
272
273        # BLOCK 4
274        FMLA    v21.4s, v19.4s,  v0.s[1]
275        LDR     d14, [x5, 32]
276        FMLA    v23.4s, v19.4s,  v0.s[3]
277        INS     v13.d[1], x4            // b
278        FMLA    v25.4s, v19.4s,  v1.s[1]
279        LDR     x4, [x5, 40]
280
281        # BLOCK 5
282        # NOPs to ensure 4 cycle LDR lands on next LDR
283        FMLA    v27.4s, v19.4s,  v1.s[3]
284        LDR     d15, [x5, 48]
285        NOP     // fma
286        INS     v14.d[1], x4
287        NOP
288        LDR     x4, [x5, 56]
289
290        # Second group of 16 FMA, no loads
291        # BLOCK 0
292        FMLA    v20.4s, v12.4s,  v3.s[0]
293        FMLA    v22.4s, v12.4s,  v3.s[2]
294        INS     v15.d[1], x4            // b from previous
295        FMLA    v24.4s, v12.4s,  v4.s[0]
296
297        # BLOCK 1
298        FMLA    v26.4s, v12.4s,  v4.s[2]
299        FMLA    v21.4s, v13.4s,  v3.s[0]
300        FMLA    v23.4s, v13.4s,  v3.s[2]
301
302        # BLOCK 2
303        FMLA    v25.4s, v13.4s,  v4.s[0]
304        FMLA    v27.4s, v13.4s,  v4.s[2]
305        FMLA    v20.4s, v14.4s,  v3.s[1]
306
        // TST checks the low 4 bits of the k counter. Only multiples of 16
        // have been subtracted from kc, so these bits equal kc & 15; a
        // non-zero value means 1 to 3 floats of k are still pending. The
        // result is consumed by "B.NE 4f" below (FMLA and ADD leave the
        // flags alone).
307        # BLOCK 3
308        FMLA    v22.4s, v14.4s,  v3.s[3]
309        FMLA    v24.4s, v14.4s,  v4.s[1]
310        FMLA    v26.4s, v14.4s,  v4.s[3]
311        TST     x0, 15
312
313        # BLOCK 4
314        FMLA    v21.4s, v15.4s,  v3.s[1]
315        FMLA    v23.4s, v15.4s,  v3.s[3]
316        FMLA    v25.4s, v15.4s,  v4.s[1]
317        ADD     x5, x5, 64
318
319        # BLOCK 5
320        FMLA    v27.4s, v15.4s,  v4.s[3]
321
322        # Is there a remainder?- 2 floats of A (8 bytes) or less
323        B.NE    4f
324
3253:
        // FMAX against v6 applies the lower bound, FMIN against v7 the upper
        // bound. x0 is reloaded with cn_stride; it sits at [sp, 32] because
        // sp was lowered by 32 when d12-d15 were saved.
        // "SUBS x1, x1, 8" (nc -= 8) sets the flags for both "B.LO 6f"
        // (fewer than 8 columns were left) and "B.HI 0b" (columns remain
        // after this tile); the FMIN, ST1 and SUB instructions in between do
        // not write the flags.
326        # Clamp
327        FMAX    v20.4s, v20.4s, v6.4s
328        # Load cn_stride
329        LDR     x0, [sp, 32]
330        FMAX    v21.4s, v21.4s, v6.4s
331        FMAX    v22.4s, v22.4s, v6.4s
332        FMAX    v23.4s, v23.4s, v6.4s
333        FMAX    v24.4s, v24.4s, v6.4s
334        FMAX    v25.4s, v25.4s, v6.4s
335        FMAX    v26.4s, v26.4s, v6.4s
336        FMAX    v27.4s, v27.4s, v6.4s
337        SUBS    x1, x1, 8
338        FMIN    v20.4s, v20.4s, v7.4s
339        FMIN    v21.4s, v21.4s, v7.4s
340        FMIN    v22.4s, v22.4s, v7.4s
341        FMIN    v23.4s, v23.4s, v7.4s
342        FMIN    v24.4s, v24.4s, v7.4s
343        FMIN    v25.4s, v25.4s, v7.4s
344        FMIN    v26.4s, v26.4s, v7.4s
345        FMIN    v27.4s, v27.4s, v7.4s
346
347        # Store full 4 x 8
348        B.LO    6f
349
        // Each ST1 writes 32 bytes and then post-increments its C pointer by
        // cn_stride (x0). The A pointers were advanced by kc bytes during
        // the tile and are rewound for the next one.
350        ST1     {v20.16b, v21.16b},  [x6], x0
351        SUB     x3,  x3, x2             // a0 -= kc
352        ST1     {v22.16b, v23.16b}, [x16], x0
353        SUB     x9,  x9, x2             // a1 -= kc
354        ST1     {v24.16b, v25.16b}, [x17], x0
355        SUB     x10, x10, x2            // a2 -= kc
356        ST1     {v26.16b, v27.16b}, [x14], x0
357        SUB     x11, x11, x2            // a3 -= kc
358
359        B.HI    0b
360
361        # Restore d12-d15 from stack
362        LDP     d14, d15, [sp, 16]
363        LDP     d12, d13, [sp], 32
364        RET
365
3664:
        // Reached either directly from label 0 (kc < 16, x0 = kc - 16) or
        // after the epilogue. Only bits 3 and 2 of x0 are tested; they equal
        // the same bits of kc because only multiples of 16 were subtracted.
367        # Is there a remainder?- 2 floats of A (8 bytes)
368        TBZ     x0, 3, 5f
369
        // Same lane layout as the main loop: v0 = {a0[0..1], a1[0..1]},
        // v1 = {a2[0..1], a3[0..1]}; v16/v17 are the B row for the first k,
        // v18/v19 for the second.
370        # Remainder- 2 floats of A (8 bytes)
371        LDR     d0,  [x3], 8
372        LDR     q16, [x5], 16
373        LD1     {v0.d}[1], [x9], 8
374        LDR     d1, [x10], 8
375        LD1     {v1.d}[1], [x11], 8
376        LDR     q17, [x5], 16
377        LDR     q18, [x5], 16
378        LDR     q19, [x5], 16
379        FMLA    v20.4s, v16.4s,  v0.s[0]
380        FMLA    v22.4s, v16.4s,  v0.s[2]
381        FMLA    v24.4s, v16.4s,  v1.s[0]
382        FMLA    v26.4s, v16.4s,  v1.s[2]
383        FMLA    v21.4s, v17.4s,  v0.s[0]
384        FMLA    v23.4s, v17.4s,  v0.s[2]
385        FMLA    v25.4s, v17.4s,  v1.s[0]
386        FMLA    v27.4s, v17.4s,  v1.s[2]
387
388        FMLA    v20.4s, v18.4s,  v0.s[1]
389        FMLA    v22.4s, v18.4s,  v0.s[3]
390        FMLA    v24.4s, v18.4s,  v1.s[1]
391        FMLA    v26.4s, v18.4s,  v1.s[3]
392        FMLA    v21.4s, v19.4s,  v0.s[1]
393        FMLA    v23.4s, v19.4s,  v0.s[3]
394        FMLA    v25.4s, v19.4s,  v1.s[1]
395        FMLA    v27.4s, v19.4s,  v1.s[3]
396
397        # Is there a remainder?- 1 float of A (4 bytes)
398        TBZ     x0, 2, 3b
399
4005:
        // The single float of rows 1 and 3 is loaded into lane 2 of v0 / v1
        // so the FMLAs can keep using lane indices [0] and [2], as in the
        // wider paths.
401        # Remainder- 1 float of A (4 bytes)
402        LDR     s0,  [x3], 4
403        LDR     q16, [x5], 16
404        LD1     {v0.s}[2], [x9], 4
405        LDR     s1, [x10], 4
406        LD1     {v1.s}[2], [x11], 4
407        LDR     q17, [x5], 16
408
409        FMLA    v20.4s, v16.4s,  v0.s[0]
410        FMLA    v22.4s, v16.4s,  v0.s[2]
411        FMLA    v24.4s, v16.4s,  v1.s[0]
412        FMLA    v26.4s, v16.4s,  v1.s[2]
413        FMLA    v21.4s, v17.4s,  v0.s[0]
414        FMLA    v23.4s, v17.4s,  v0.s[2]
415        FMLA    v25.4s, v17.4s,  v1.s[0]
416        FMLA    v27.4s, v17.4s,  v1.s[2]
417        B       3b
418
        // x1 holds (remaining columns - 8) here, which is negative, but its
        // low 3 bits equal those of the remaining column count. Bits 2, 1
        // and 0 select a 4-, 2- and 1-column store in turn; after each store
        // the not-yet-written columns are moved down to the low lanes.
419        # Store odd width
4206:
421        TBZ     x1, 2, 7f
422        STR     q20,  [x6], 16
423        MOV     v20.16b, v21.16b
424        STR     q22, [x16], 16
425        MOV     v22.16b, v23.16b
426        STR     q24, [x17], 16
427        MOV     v24.16b, v25.16b
428        STR     q26, [x14], 16
429        MOV     v26.16b, v27.16b
430
4317:
432        TBZ     x1, 1, 8f
433        STR     d20,  [x6], 8
434        STR     d22, [x16], 8
435        DUP     d20, v20.d[1]
436        DUP     d22, v22.d[1]
437        STR     d24, [x17], 8
438        STR     d26, [x14], 8
439        DUP     d24, v24.d[1]
440        DUP     d26, v26.d[1]
441
4428:
443        TBZ     x1, 0, 9f
444        STR     s20,  [x6]
445        STR     s22, [x16]
446        STR     s24, [x17]
447        STR     s26, [x14]
4489:
449        # Restore d12-d15 from stack
450        LDP     d14, d15, [sp, 16]
451        LDP     d12, d13, [sp], 32
452        RET
453
454END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
455
// When __ELF__ is defined, emit an empty .note.GNU-stack section with no
// flags. NOTE(review): by GNU toolchain convention this marks the object as
// not requiring an executable stack - confirm against the linker docs.
456#ifdef __ELF__
457.section ".note.GNU-stack","",%progbits
458#endif
459