xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary vector shadow register
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
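54# unused by this 4x8 kernel (applies to every role listed from here through "C   v30 v31"; x7 holds cm_stride here, not c5): A   v8 v9 v10 v11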
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
# Overview (derived from the code below):
#   - Produces up to 4 rows by 8 columns of f32 output per pass of the outer
#     loop at label 0. Rows at index >= mr alias the row before them through
#     the pointer clamping at the top, so all 4 rows are always computed and
#     stored.
#   - v20-v27 are the accumulators (two q registers per row). They start as a
#     copy of 8 floats read from w, then FMLA adds A * B products for kc bytes
#     of each A row.
#   - Results are clamped with FMAX against v6 and FMIN against v7 (the two
#     floats broadcast from params) before being stored.
#   - d12-d15 are used for B data and must be preserved, so they are spilled
#     to a 32-byte stack frame for the duration of the call.
#   Local labels: label 0 outer loop over column groups, label 1 main loop,
#   label 2 epilogue, label 3 clamp and store, label 4 remainder of 2 floats,
#   label 5 remainder of 1 float, labels 6/7/8 partial-width stores of 4/2/1
#   columns, label 9 restore and return.
63BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53
64
65        # Load params pointer
        # (read before sp is moved by the d12-d15 spill below)
66        LDR     x8, [sp, 8]
67
68        # Clamp A and C pointers
        # Rows at index >= mr reuse the pointers of the row before them, so
        # surplus rows simply duplicate the last valid row.
69        CMP     x0, 2                   // if mr < 2
70        ADD     x9, x3, x4              // a1 = a0 + a_stride
71        ADD     x16, x6, x7             // c1 = c0 + cm_stride
72        CSEL    x9, x3, x9, LO          //   a1 = a0
73        CSEL    x16, x6, x16, LO        //   c1 = c0
74
75        ADD     x10, x9, x4             // a2 = a1 + a_stride
76        ADD     x17, x16, x7            // c2 = c1 + cm_stride
77                                        // if mr <= 2
        # (flags are still those of CMP x0, 2: ADD and CSEL do not write flags)
78        CSEL    x10, x9, x10, LS        //   a2 = a1
79        CSEL    x17, x16, x17, LS       //   c2 = c1
80
81        CMP     x0, 4                   // if mr < 4
82        ADD     x11, x10, x4            // a3 = a2 + a_stride
83        ADD     x14, x17, x7            // c3 = c2 + cm_stride
84        CSEL    x11, x10, x11, LO       //   a3 = a2
85        CSEL    x14, x17, x14, LO       //   c3 = c2
86
87        # Load min/max values
        # LD2R reads two consecutive floats at x8: the first is broadcast to
        # all lanes of v6 (operand of FMAX below), the second to all lanes of
        # v7 (operand of FMIN below).
88        LD2R    {v6.4s, v7.4s}, [x8]
89
90        # Save d12-d15 on stack
        # sp drops by 32 bytes here, which is why cn_stride (originally at
        # [sp]) is later read from [sp, 32].
91        STP     d12, d13, [sp, -32]!
92        STP     d14, d15, [sp, 16]
93
        # Outer loop: one pass per group of up to 8 output columns.
940:
95        # Load initial bias from w into accumulators
        # All four rows start from the same 8 floats (q20/q21 copied into the
        # other three accumulator pairs); the prefetches are interleaved
        # between the copies.
96        LDP     q20, q21, [x5], 32
97        MOV     v22.16b, v20.16b
98        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
99        PRFM    PLDL1KEEP,  [x3, 64]
100        MOV     v23.16b, v21.16b
101        PRFM    PLDL1KEEP,  [x9,  0]
102        PRFM    PLDL1KEEP,  [x9, 64]
103        MOV     v24.16b, v20.16b
104        PRFM    PLDL1KEEP, [x10,  0]
105        PRFM    PLDL1KEEP, [x10, 64]
106        MOV     v25.16b, v21.16b
107        PRFM    PLDL1KEEP, [x11,  0]
108        PRFM    PLDL1KEEP, [x11, 64]
109        MOV     v26.16b, v20.16b
110        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
111        MOV     v27.16b, v21.16b
112        PRFM    PLDL1KEEP, [x5,  64]
113        PRFM    PLDL1KEEP, [x5, 128]
114        PRFM    PLDL1KEEP, [x5, 192]
115
116        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # x0 becomes the running byte counter k. When kc < 16 the subtraction
        # borrows and control goes to the remainder code at label 4, which
        # only tests bits 3 and 2 of x0 (unchanged by subtracting 16).
117        SUBS    x0, x2, 16              // k = kc - 16
118        B.LO    4f
119
120        # Prologue - First group loads, no FMA
        # v0 = {a0 pair, a1 pair}, v1 = {a2 pair, a3 pair}; B goes to v16-v18
        # plus the low half of v19, with the high half of v19 parked in x4.
121        LDR     d0, [x3], 8              // a0
122        LDP     q16, q17, [x5], 32         // b
123        LDR     d1, [x10], 8             // a2
124        LD1     {v0.d}[1],  [x9], 8       // a1
125        LD1     {v1.d}[1], [x11], 8       // a3
126        SUBS    x0, x0, 16
127        LDR     q18, [x5], 16
128        LDR     d19, [x5], 8
129        LDR     x4, [x5], 8             // ins is in BLOCK 0
130
131        # Is there at least 4 floats (16 bytes) for main loop?
        # (tests the flags of the SUBS above; the loads in between do not
        # write flags)
132        B.LO    2f
133
134        # Main loop - 4 floats of A (16 bytes)
135        # 32 FMA + 8 LD64 A + 8 LDR B
        # Lane layout of the A registers: s[0..1] of v0/v3 hold row 0 and
        # s[2..3] hold row 1; s[0..1] of v1/v4 hold row 2 and s[2..3] hold
        # row 3.
        # Every 128-bit register filled in this loop is assembled from a
        # 64-bit LDR into its low half plus a 64-bit LDR into x4 that the
        # following block moves into the high half with INS.
        # The first group consumes v0/v1 with B in v16-v19 while loading
        # v3/v4 and v12-v15; the second group does the reverse.
1361:
137        # First group of 16 FMA, Second group loads
138        # BLOCK 0
139        LDR     d3, [x3], 8              // a0
140        INS     v19.d[1], x4               // b from second group
141        FMLA    v20.4s, v16.4s,  v0.s[0]
142        LDR     x4, [x9], 8               // a1
143        FMLA    v22.4s, v16.4s,  v0.s[2]
144        FMLA    v24.4s, v16.4s,  v1.s[0]
145
146        # BLOCK 1
147        LDR     d12, [x5]
148        INS     v3.d[1], x4                // a1 ins
149        FMLA    v26.4s, v16.4s,  v1.s[2]
150        LDR     x4, [x5, 8]             // b
151        FMLA    v21.4s, v17.4s,  v0.s[0]
152        FMLA    v23.4s, v17.4s,  v0.s[2]
153
154        # BLOCK 2
155        LDR     d4, [x10], 8             // a2
156        INS     v12.d[1], x4            // b  ins
157        FMLA    v25.4s, v17.4s,  v1.s[0]
158        LDR     x4, [x11], 8              // a3
159        FMLA    v27.4s, v17.4s,  v1.s[2]
160        FMLA    v20.4s, v18.4s,  v0.s[1]
161
162        # BLOCK 3
163        LDR     d13, [x5, 16]
164        INS     v4.d[1], x4                // a3 ins
165        FMLA    v22.4s, v18.4s,  v0.s[3]
166        LDR     x4, [x5, 24]
167        FMLA    v24.4s, v18.4s,  v1.s[1]
168        FMLA    v26.4s, v18.4s,  v1.s[3]
169
170        # BLOCK 4
171        LDR     d14, [x5, 32]
172        INS     v13.d[1], x4            // b
173        FMLA    v21.4s, v19.4s,  v0.s[1]
174        LDR     x4, [x5, 40]
175        FMLA    v23.4s, v19.4s,  v0.s[3]
176        FMLA    v25.4s, v19.4s,  v1.s[1]
177
178        # BLOCK 5
179        # NOPs to ensure 4 cycle LDR lands on next LDR
180        LDR     d15, [x5, 48]
181        INS     v14.d[1], x4            // b from previous
182        FMLA    v27.4s, v19.4s,  v1.s[3]
183        LDR     x4, [x5, 56]
184        NOP
185        NOP
186        NOP
187        NOP
188
189        # Second group of 16 FMA, First group of loads
190        # BLOCK 0
191        LDR     d0, [x3], 8              // a0
192        INS     v15.d[1], x4            // b from previous
193        FMLA    v20.4s, v12.4s,  v3.s[0]
194        LDR     x4, [x9], 8               // a1
195        FMLA    v22.4s, v12.4s,  v3.s[2]
196        FMLA    v24.4s, v12.4s,  v4.s[0]
197        PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0
198
199        # BLOCK 1
200        LDR     d16, [x5, 64]
201        INS     v0.d[1], x4                // a1 ins
202        FMLA    v26.4s, v12.4s,  v4.s[2]
203        LDR     x4, [x5, 72]            // b
204        FMLA    v21.4s, v13.4s,  v3.s[0]
205        FMLA    v23.4s, v13.4s,  v3.s[2]
206        PRFM    PLDL1KEEP, [x9, 128]      // Prefetch A1
207
208        # BLOCK 2
209        LDR     d1, [x10], 8             // a2
210        INS     v16.d[1], x4            // b
211        FMLA    v25.4s, v13.4s,  v4.s[0]
212        LDR     x4, [x11], 8              // a3
213        FMLA    v27.4s, v13.4s,  v4.s[2]
214        FMLA    v20.4s, v14.4s,  v3.s[1]
215        PRFM    PLDL1KEEP, [x10, 128]     // Prefetch A2
216
217        # BLOCK 3
218        LDR     d17, [x5, 80]
219        INS     v1.d[1], x4                // a3 ins
220        FMLA    v22.4s, v14.4s,  v3.s[3]
221        LDR     x4, [x5, 88]
222        FMLA    v24.4s, v14.4s,  v4.s[1]
223        FMLA    v26.4s, v14.4s,  v4.s[3]
224        PRFM    PLDL1KEEP, [x11, 128]     // Prefetch A3
225
226        # BLOCK 4
227        LDR     d18, [x5, 96]
228        INS     v17.d[1], x4            // b
229        FMLA    v21.4s, v15.4s,  v3.s[1]
230        LDR     x4, [x5, 104]
231        FMLA    v23.4s, v15.4s,  v3.s[3]
232        FMLA    v25.4s, v15.4s,  v4.s[1]
233        PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B
234
235        # BLOCK 5
236        # NOTE that block needs to be 4 cycles for LDR not to stall
        # The high half of v19 stays in x4 across the loop back-edge (or the
        # fall-through to label 2); BLOCK 0 there performs the INS.
        # All B loads in this iteration used fixed offsets 0..120 from x5,
        # so x5 is stepped by 128 once at the end.
237        LDR     d19, [x5, 112]
238        INS     v18.d[1], x4
239        FMLA    v27.4s, v15.4s,  v4.s[3]
240        LDR     x4, [x5, 120]
241        SUBS    x0, x0, 16
242        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
243        ADD     x5, x5, 128
244        B.HS    1b
245
246        # Epilogue - 4 floats of A (16 bytes)
247        # 32 FMA + 8 LD64 A + 8 LDR B
        # Same first group as the main loop, but the second group only
        # consumes v3/v4 and v12-v15 and issues no further loads.
2482:
249        # First group of 16 FMA, Second group loads
250        # BLOCK 0
251        LDR     d3, [x3], 8              // a0
252        INS     v19.d[1], x4               // b from second group
253        FMLA    v20.4s, v16.4s,  v0.s[0]
254        LDR     x4, [x9], 8               // a1
255        FMLA    v22.4s, v16.4s,  v0.s[2]
256        FMLA    v24.4s, v16.4s,  v1.s[0]
257
258        # BLOCK 1
259        LDR     d12, [x5]
260        INS     v3.d[1], x4                // a1 ins
261        FMLA    v26.4s, v16.4s,  v1.s[2]
262        LDR     x4, [x5, 8]             // b
263        FMLA    v21.4s, v17.4s,  v0.s[0]
264        FMLA    v23.4s, v17.4s,  v0.s[2]
265
266        # BLOCK 2
267        LDR     d4, [x10], 8             // a2
268        INS     v12.d[1], x4            // b  ins
269        FMLA    v25.4s, v17.4s,  v1.s[0]
270        LDR     x4, [x11], 8              // a3
271        FMLA    v27.4s, v17.4s,  v1.s[2]
272        FMLA    v20.4s, v18.4s,  v0.s[1]
273
274        # BLOCK 3
275        LDR     d13, [x5, 16]
276        INS     v4.d[1], x4                // a3 ins
277        FMLA    v22.4s, v18.4s,  v0.s[3]
278        LDR     x4, [x5, 24]
279        FMLA    v24.4s, v18.4s,  v1.s[1]
280        FMLA    v26.4s, v18.4s,  v1.s[3]
281
282        # BLOCK 4
283        LDR     d14, [x5, 32]
284        INS     v13.d[1], x4            // b
285        FMLA    v21.4s, v19.4s,  v0.s[1]
286        LDR     x4, [x5, 40]
287        FMLA    v23.4s, v19.4s,  v0.s[3]
288        FMLA    v25.4s, v19.4s,  v1.s[1]
289
290        # BLOCK 5
291        # NOPs to ensure 4 cycle LDR lands on next LDR
292        LDR     d15, [x5, 48]
293        INS     v14.d[1], x4
294        FMLA    v27.4s, v19.4s,  v1.s[3]
295        LDR     x4, [x5, 56]
296        NOP     // fma
297        NOP
298        NOP     // fma
299        NOP
300
301        # Second group of 16 FMA, no loads
302        # BLOCK 0
303        INS     v15.d[1], x4            // b from previous
304        FMLA    v20.4s, v12.4s,  v3.s[0]
305        FMLA    v22.4s, v12.4s,  v3.s[2]
306        FMLA    v24.4s, v12.4s,  v4.s[0]
307
308        # BLOCK 1
309        FMLA    v26.4s, v12.4s,  v4.s[2]
310        FMLA    v21.4s, v13.4s,  v3.s[0]
311        FMLA    v23.4s, v13.4s,  v3.s[2]
312
313        # BLOCK 2
314        FMLA    v25.4s, v13.4s,  v4.s[0]
315        FMLA    v27.4s, v13.4s,  v4.s[2]
316        FMLA    v20.4s, v14.4s,  v3.s[1]
317
318        # BLOCK 3
319        FMLA    v22.4s, v14.4s,  v3.s[3]
320        FMLA    v24.4s, v14.4s,  v4.s[1]
321        FMLA    v26.4s, v14.4s,  v4.s[3]
        # Nonzero low 4 bits of k mean leftover bytes of A. The result is
        # consumed by the B.NE below; the FMLA and ADD in between do not
        # write flags.
322        TST     x0, 15
323
324        # BLOCK 4
325        FMLA    v21.4s, v15.4s,  v3.s[1]
326        FMLA    v23.4s, v15.4s,  v3.s[3]
327        FMLA    v25.4s, v15.4s,  v4.s[1]
        # Step x5 past the 64 bytes of B that the epilogue read at offsets
        # 0..56.
328        ADD     x5, x5, 64
329
330        # BLOCK 5
331        FMLA    v27.4s, v15.4s,  v4.s[3]
332
333        # Is there a remainder?- 2 floats of A (8 bytes) or less
334        B.NE    4f
335
        # Clamp the 4x8 tile and store it. Also the join point for the
        # remainder paths at labels 4 and 5.
3363:
337        # Clamp
338        FMAX    v20.4s, v20.4s, v6.4s
339        # Load cn_stride
        # (x0 is free to reuse: the k counter is no longer needed here; the
        # argument sits 32 bytes above sp because of the d12-d15 spill)
340        LDR     x0, [sp, 32]
341        FMAX    v21.4s, v21.4s, v6.4s
342        FMAX    v22.4s, v22.4s, v6.4s
343        FMAX    v23.4s, v23.4s, v6.4s
344        FMAX    v24.4s, v24.4s, v6.4s
345        FMAX    v25.4s, v25.4s, v6.4s
346        FMAX    v26.4s, v26.4s, v6.4s
347        FMAX    v27.4s, v27.4s, v6.4s
        # nc -= 8. These flags drive both B.LO 6f and B.HI 0b below; the
        # FMIN, ST1 and SUB instructions in between do not write flags.
348        SUBS    x1, x1, 8
349        FMIN    v20.4s, v20.4s, v7.4s
350        FMIN    v21.4s, v21.4s, v7.4s
351        FMIN    v22.4s, v22.4s, v7.4s
352        FMIN    v23.4s, v23.4s, v7.4s
353        FMIN    v24.4s, v24.4s, v7.4s
354        FMIN    v25.4s, v25.4s, v7.4s
355        FMIN    v26.4s, v26.4s, v7.4s
356        FMIN    v27.4s, v27.4s, v7.4s
357
358        # Store full 4 x 8
        # (fewer than 8 columns were left when the subtraction borrowed:
        # take the partial-width store path at label 6 instead)
359        B.LO    6f
360
        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc so the next column group rereads the same A rows.
361        ST1     {v20.16b, v21.16b},  [x6], x0
362        SUB     x3,  x3, x2             // a0 -= kc
363        ST1     {v22.16b, v23.16b}, [x16], x0
364        SUB     x9,  x9, x2             // a1 -= kc
365        ST1     {v24.16b, v25.16b}, [x17], x0
366        SUB     x10, x10, x2            // a2 -= kc
367        ST1     {v26.16b, v27.16b}, [x14], x0
368        SUB     x11, x11, x2            // a3 -= kc
369
370        B.HI    0b                      // columns remain: next group of 8
371
372        # Restore d12-d15 from stack
373        LDP     d14, d15, [sp, 16]
374        LDP     d12, d13, [sp], 32
375        RET
376
        # Remainder handling. Reached either straight from the kc < 16 check
        # or from the end of the epilogue. Only multiples of 16 have been
        # subtracted from x0, so bits 3 and 2 of x0 equal bits 3 and 2 of kc.
3774:
378        # Is there a remainder?- 2 floats of A (8 bytes)
379        TBZ     x0, 3, 5f
380
381        # Remainder- 2 floats of A (8 bytes)
        # Two floats per row and four q registers of B (v16-v19).
382        LDR     d0,  [x3], 8
383        LDR     q16, [x5], 16
384        LD1     {v0.d}[1], [x9], 8
385        LDR     d1, [x10], 8
386        LD1     {v1.d}[1], [x11], 8
387        LDR     q17, [x5], 16
388        LDR     q18, [x5], 16
389        LDR     q19, [x5], 16
390        FMLA    v20.4s, v16.4s,  v0.s[0]
391        FMLA    v22.4s, v16.4s,  v0.s[2]
392        FMLA    v24.4s, v16.4s,  v1.s[0]
393        FMLA    v26.4s, v16.4s,  v1.s[2]
394        FMLA    v21.4s, v17.4s,  v0.s[0]
395        FMLA    v23.4s, v17.4s,  v0.s[2]
396        FMLA    v25.4s, v17.4s,  v1.s[0]
397        FMLA    v27.4s, v17.4s,  v1.s[2]
398
399        FMLA    v20.4s, v18.4s,  v0.s[1]
400        FMLA    v22.4s, v18.4s,  v0.s[3]
401        FMLA    v24.4s, v18.4s,  v1.s[1]
402        FMLA    v26.4s, v18.4s,  v1.s[3]
403        FMLA    v21.4s, v19.4s,  v0.s[1]
404        FMLA    v23.4s, v19.4s,  v0.s[3]
405        FMLA    v25.4s, v19.4s,  v1.s[1]
406        FMLA    v27.4s, v19.4s,  v1.s[3]
407
408        # Is there a remainder?- 1 float of A (4 bytes)
409        TBZ     x0, 2, 3b
410
4115:
412        # Remainder- 1 float of A (4 bytes)
        # One float per row goes into lanes s[0] and s[2] of v0/v1, matching
        # the lane indices used by the FMLA instructions below; B is two q
        # registers (v16, v17).
413        LDR     s0,  [x3], 4
414        LDR     q16, [x5], 16
415        LD1     {v0.s}[2], [x9], 4
416        LDR     s1, [x10], 4
417        LD1     {v1.s}[2], [x11], 4
418        LDR     q17, [x5], 16
419
420        FMLA    v20.4s, v16.4s,  v0.s[0]
421        FMLA    v22.4s, v16.4s,  v0.s[2]
422        FMLA    v24.4s, v16.4s,  v1.s[0]
423        FMLA    v26.4s, v16.4s,  v1.s[2]
424        FMLA    v21.4s, v17.4s,  v0.s[0]
425        FMLA    v23.4s, v17.4s,  v0.s[2]
426        FMLA    v25.4s, v17.4s,  v1.s[0]
427        FMLA    v27.4s, v17.4s,  v1.s[2]
428        B       3b
429
430        # Store odd width
        # x1 holds nc - 8 here (it borrowed), but subtracting 8 leaves the low
        # 3 bits untouched, so bits 2, 1 and 0 still select 4, 2 and 1
        # remaining columns. After each partial store the not yet written
        # lanes are moved down to the bottom of the same register.
4316:
432        TBZ     x1, 2, 7f
433        STR     q20,  [x6], 16
434        MOV     v20.16b, v21.16b
435        STR     q22, [x16], 16
436        MOV     v22.16b, v23.16b
437        STR     q24, [x17], 16
438        MOV     v24.16b, v25.16b
439        STR     q26, [x14], 16
440        MOV     v26.16b, v27.16b
441
4427:
443        TBZ     x1, 1, 8f
444        STR     d20,  [x6], 8
445        STR     d22, [x16], 8
446        DUP     d20, v20.d[1]
447        DUP     d22, v22.d[1]
448        STR     d24, [x17], 8
449        STR     d26, [x14], 8
450        DUP     d24, v24.d[1]
451        DUP     d26, v26.d[1]
452
4538:
454        TBZ     x1, 0, 9f
455        STR     s20,  [x6]
456        STR     s22, [x16]
457        STR     s24, [x17]
458        STR     s26, [x14]
4599:
460        # Restore d12-d15 from stack
461        LDP     d14, d15, [sp, 16]
462        LDP     d12, d13, [sp], 32
463        RET
464
465END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53
466
// On ELF targets, emit an empty .note.GNU-stack section with no flags. GNU
// toolchains read such a section as a declaration that this object does not
// need an executable stack.
467#ifdef __ELF__
468.section ".note.GNU-stack","",%progbits
469#endif
470