xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const void*restrict w,             x5
15#     uint8_t*restrict c,                x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> x10
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointers
25# x13 a0
26# x14 a1
27# x15 a2
28#  x8 a3
29
30# C pointers
31#  x6 c0
32# x16 c1
33# x17 c2
34#  x7 c3
35
36# x19 temporary vector shadow register
37
38# Vector register usage
39# A0  v0     v3
40# A1  v0[1]  v3[1]
41# A2  v1     v4
42# A3  v1[1]  v4[1]
43
44# B   v12 v13 v14 v15 second set of B
45# B   v16 v17 v18 v19 first set
46# C   v20 v21
47# C   v22 v23
48# C   v24 v25
49# C   v26 v27
50# Clamp v6 v7
51
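52# unused A   v8 v9 v10 v11
# NOTE: the entries below describe rows 4 and 5 (a4/a5, c4/c5, v2/v5, v28-v31),
# which this 4x8 kernel does not have. In this kernel x12 holds zero, x4 holds
# a, and x13 and x7 are a0 and c3 as listed above.
53# x12 a4
54#  x4 a5
55# x13 c4
56#  x7 c5
57# A4  v2     v5
58# A5  v2[1]  v5[1]
59# C   v28 v29
60# C   v30 v31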
61
# xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
#
# Accumulates a 4-row x 8-column tile of f32 results in v20-v27, clamps it
# with FMAX against v6 and FMIN against v7 (both loaded from params), and
# stores the rows through c0-c3 (x6, x16, x17, x7).
#
# Loop structure (numeric local labels):
#   0:    per output tile - seed all four rows from the 8 floats at w (x5),
#         then p = ks
#   1:    per group of 4 A pointers read from a (x4); a pointer equal to zero
#         (x12) is used as-is, any other pointer gets a_offset (x11) added
#   2:    main loop, 4 floats of every A row per pass, software pipelined
#   3:    epilogue, finishes the loads started by the prologue / main loop
#   4:    dispatch to the 2-float (6:) and 1-float (7:) remainders
#   5:    ks loop tail, clamp, full-width store, nc loop
#   8-11: partial-width store (4, 2, then 1 column) and return
#
# x19 and d12-d15 are saved in a 48-byte stack frame and restored on both
# return paths.
#
# NOTE(review): every line of this listing starts with what looks like a fused
# listing line number (label `0:` reads `910:`, and so on) - presumably
# extraction residue that has to be stripped before assembling; confirm
# against the upstream file.
62BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
63
64        # Clamp C pointers: a row at or beyond mr aliases the previous row
65        CMP     x0, 2                   // if mr < 2
66        ADD     x16, x6, x7             // c1 = c0 + cm_stride
67        CSEL    x16, x6, x16, LO        //   c1 = c0
68
69        ADD     x17, x16, x7            // c2 = c1 + cm_stride
70                                        // if mr <= 2 (flags still from CMP x0, 2)
71        CSEL    x17, x16, x17, LS       //   c2 = c1
72
73        CMP     x0, 4                   // if mr < 4
74        ADD     x7, x17, x7             // c3 = c2 + cm_stride
75        CSEL    x7, x17, x7, LO         //   c3 = c2
76
77        # Load cn_stride, a_offset
78        LDP     x10, x11, [sp]
79
80        # Load zero, params pointer
81        LDP     x12, x8, [sp, 16]
82
83        # Load min/max values: first float at params splatted into v6, second into v7
84        LD2R    {v6.4s, v7.4s}, [x8]
85
86        # Save x19, d12-d15 on stack (the stack arguments were read above, before sp moves)
87        STP     d12, d13, [sp, -48]!
88        STP     d14, d15, [sp, 16]
89        STR     x19,      [sp, 32]
90
910:
92        # Load initial bias from w into accumulators (rows 1-3 copy row 0)
        # NOTE(review): on the first pass x13-x15 have not been loaded yet and
        # x8 still holds the params pointer, so the A prefetches below use
        # whatever those registers contain; they are PRFM hints only -
        # presumably harmless, confirm.
93        LDP     q20, q21, [x5], 32
94        MOV     v22.16b, v20.16b
95        PRFM    PLDL1KEEP,  [x13,  0]   // Prefetch A
96        PRFM    PLDL1KEEP,  [x13, 64]
97        MOV     v23.16b, v21.16b
98        PRFM    PLDL1KEEP,  [x14,  0]
99        PRFM    PLDL1KEEP,  [x14, 64]
100        MOV     v24.16b, v20.16b
101        PRFM    PLDL1KEEP, [x15,  0]
102        PRFM    PLDL1KEEP, [x15, 64]
103        MOV     v25.16b, v21.16b
104        PRFM    PLDL1KEEP, [x8,  0]
105        PRFM    PLDL1KEEP, [x8, 64]
106        MOV     v26.16b, v20.16b
107        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
108        PRFM    PLDL1KEEP, [x5,  64]
109        MOV     v27.16b, v21.16b
110        PRFM    PLDL1KEEP, [x5, 128]
111        PRFM    PLDL1KEEP, [x5, 192]
112
113        MOV     x9, x3                  // p = ks
114
1151:
116        # Load next 4 A pointers (x4 advances by 32 bytes)
117        LDP     x13, x14, [x4], 16
118        LDP     x15, x8, [x4], 16
119
        # The ADD between CMP and CSEL does not touch the flags, so each CSEL
        # still sees the result of the compare against zero.
120        CMP     x13, x12                // if a0 == zero
121        ADD     x13, x13, x11           // a0 += a_offset
122        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
123        CMP     x14, x12                // if a1 == zero
124        ADD     x14, x14, x11           // a1 += a_offset
125        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
126        CMP     x15, x12                // if a2 == zero
127        ADD     x15, x15, x11           // a2 += a_offset
128        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
129        CMP     x8, x12                 // if a3 == zero
130        ADD     x8, x8, x11             // a3 += a_offset
131        CSEL    x8, x12, x8, EQ         //   a3 = zero, else a3 + a_offset
132
133        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
134        SUBS    x0, x2, 16              // k = kc - 16
135        B.LO    4f
136
137        # Prologue - First group loads, no FMA
        # v0 = {a0[0], a0[1], a1[0], a1[1]}, v1 = {a2[0], a2[1], a3[0], a3[1]}
        # v16-v19 = 16 floats of B; the high half of v19 is parked in x19
138        LDR     d0, [x13], 8            // a0
139        LDP     q16, q17, [x5], 32        // b
140        LDR     d1, [x15], 8            // a2
141        LD1     {v0.d}[1],  [x14], 8     // a1
142        LD1     {v1.d}[1], [x8], 8       // a3
143        SUBS    x0, x0, 16
144        LDR     q18, [x5], 16
145        LDR     d19, [x5], 8
146        LDR     x19, [x5], 8            // high half of v19; its INS is in BLOCK 0
147
148        # Is there at least 4 floats (16 bytes) for main loop?
149        B.LO    3f
150
151        # Main loop - 4 floats of A (16 bytes)
152        # 32 FMA + 8 LD64 A + 16 LD64 B (8 B vectors, loaded as 64-bit halves)
        # Each upper 64-bit half is loaded into x19 and merged with INS one
        # block later.
1532:
154        # First group of 16 FMA, Second group loads
155        # BLOCK 0
156        FMLA    v20.4s, v16.4s,  v0.s[0]
157        LDR     d3, [x13], 8              // a0
158        FMLA    v22.4s, v16.4s,  v0.s[2]
159        INS     v19.d[1], x19               // b from second group
160        FMLA    v24.4s, v16.4s,  v1.s[0]
161        LDR     x19, [x14], 8              // a1
162
163        # BLOCK 1
164        FMLA    v26.4s, v16.4s,  v1.s[2]
165        LDR     d12, [x5]
166        FMLA    v21.4s, v17.4s,  v0.s[0]
167        INS     v3.d[1], x19                // a1 ins
168        FMLA    v23.4s, v17.4s,  v0.s[2]
169        LDR     x19, [x5, 8]            // b
170
171        # BLOCK 2
172        FMLA    v25.4s, v17.4s,  v1.s[0]
173        LDR     d4, [x15], 8              // a2
174        FMLA    v27.4s, v17.4s,  v1.s[2]
175        INS     v12.d[1], x19           // b  ins
176        FMLA    v20.4s, v18.4s,  v0.s[1]
177        LDR     x19, [x8], 8               // a3
178
179        # BLOCK 3
180        FMLA    v22.4s, v18.4s,  v0.s[3]
181        LDR     d13, [x5, 16]
182        FMLA    v24.4s, v18.4s,  v1.s[1]
183        INS     v4.d[1], x19                // a3 ins
184        FMLA    v26.4s, v18.4s,  v1.s[3]
185        LDR     x19, [x5, 24]
186
187        # BLOCK 4
188        FMLA    v21.4s, v19.4s,  v0.s[1]
189        LDR     d14, [x5, 32]
190        FMLA    v23.4s, v19.4s,  v0.s[3]
191        INS     v13.d[1], x19           // b
192        FMLA    v25.4s, v19.4s,  v1.s[1]
193        LDR     x19, [x5, 40]
194
195        # BLOCK 5
196        # NOPs to ensure 4 cycle LDR lands on next LDR
197        FMLA    v27.4s, v19.4s,  v1.s[3]
198        LDR     d15, [x5, 48]
199        NOP
200        INS     v14.d[1], x19           // b from previous
201        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS at the loop end
202        LDR     x19, [x5, 56]
203
204        # Second group of 16 FMA, First group of loads
205        # BLOCK 0
206        FMLA    v20.4s, v12.4s,  v3.s[0]
207        LDR     d0, [x13], 8              // a0
208        FMLA    v22.4s, v12.4s,  v3.s[2]
209        INS     v15.d[1], x19           // b from previous
210        FMLA    v24.4s, v12.4s,  v4.s[0]
211        LDR     x19, [x14], 8              // a1
212
213        # BLOCK 1
214        FMLA    v26.4s, v12.4s,  v4.s[2]
215        LDR     d16, [x5, 64]
216        FMLA    v21.4s, v13.4s,  v3.s[0]
217        INS     v0.d[1], x19               // a1 ins
218        FMLA    v23.4s, v13.4s,  v3.s[2]
219        LDR     x19, [x5, 72]           // b
220
221        # BLOCK 2
222        FMLA    v25.4s, v13.4s,  v4.s[0]
223        LDR     d1, [x15], 8             // a2
224        FMLA    v27.4s, v13.4s,  v4.s[2]
225        INS     v16.d[1], x19           // b
226        FMLA    v20.4s, v14.4s,  v3.s[1]
227        LDR     x19, [x8], 8             // a3
228
229        # BLOCK 3
230        FMLA    v22.4s, v14.4s,  v3.s[3]
231        LDR     d17, [x5, 80]
232        FMLA    v24.4s, v14.4s,  v4.s[1]
233        INS     v1.d[1], x19               // a3 ins
234        FMLA    v26.4s, v14.4s,  v4.s[3]
235        LDR     x19, [x5, 88]
236
237        # BLOCK 4
238        FMLA    v21.4s, v15.4s,  v3.s[1]
239        LDR     d18, [x5, 96]
240        FMLA    v23.4s, v15.4s,  v3.s[3]
241        INS     v17.d[1], x19           // b
242        FMLA    v25.4s, v15.4s,  v4.s[1]
243        LDR     x19, [x5, 104]
244
245        # BLOCK 5
246        # NOTE that block needs to be 4 cycles for LDR not to stall
247        FMLA    v27.4s, v15.4s,  v4.s[3]
248        LDR     d19, [x5, 112]
249        INS     v18.d[1], x19
250        LDR     x19, [x5, 120]          // high half of v19; merged in BLOCK 0 of the next pass or the epilogue
251        ADD     x5, x5, 128             // w += the 128 bytes of B read through offsets above
252        B.HS    2b                      // loop while the SUBS above did not borrow
253
254        # Epilogue - 4 floats of A (16 bytes)
255        # 32 FMA + 4 LD64 A + 8 LD64 B (4 B vectors, loaded as 64-bit halves)
2563:
257        # First group of 16 FMA, Second group loads
258        # BLOCK 0
259        LDR     d3, [x13], 8              // a0
260        INS     v19.d[1], x19              // b from second group
261        FMLA    v20.4s, v16.4s,  v0.s[0]
262        LDR     x19, [x14], 8              // a1
263        FMLA    v22.4s, v16.4s,  v0.s[2]
264        FMLA    v24.4s, v16.4s,  v1.s[0]
265
266        # BLOCK 1
267        LDR     d12, [x5]
268        INS     v3.d[1], x19               // a1 ins
269        FMLA    v26.4s, v16.4s,  v1.s[2]
270        LDR     x19, [x5, 8]            // b
271        FMLA    v21.4s, v17.4s,  v0.s[0]
272        FMLA    v23.4s, v17.4s,  v0.s[2]
273
274        # BLOCK 2
275        LDR     d4, [x15], 8             // a2
276        INS     v12.d[1], x19           // b  ins
277        FMLA    v25.4s, v17.4s,  v1.s[0]
278        LDR     x19, [x8], 8             // a3
279        FMLA    v27.4s, v17.4s,  v1.s[2]
280        FMLA    v20.4s, v18.4s,  v0.s[1]
281
282        # BLOCK 3
283        LDR     d13, [x5, 16]
284        INS     v4.d[1], x19               // a3 ins
285        FMLA    v22.4s, v18.4s,  v0.s[3]
286        LDR     x19, [x5, 24]
287        FMLA    v24.4s, v18.4s,  v1.s[1]
288        FMLA    v26.4s, v18.4s,  v1.s[3]
289
290        # BLOCK 4
291        LDR     d14, [x5, 32]
292        INS     v13.d[1], x19           // b
293        FMLA    v21.4s, v19.4s,  v0.s[1]
294        LDR     x19, [x5, 40]
295        FMLA    v23.4s, v19.4s,  v0.s[3]
296        FMLA    v25.4s, v19.4s,  v1.s[1]
297
298        # BLOCK 5
299        # NOPs to ensure 4 cycle LDR lands on next LDR
300        LDR     d15, [x5, 48]
301        INS     v14.d[1], x19
302        FMLA    v27.4s, v19.4s,  v1.s[3]
303        LDR     x19, [x5, 56]
304        NOP     // fma
305        NOP
306        NOP     // fma
307        NOP
308
309        # Second group of 16 FMA, no loads
310        # BLOCK 0
311        INS     v15.d[1], x19           // b from previous
312        FMLA    v20.4s, v12.4s,  v3.s[0]
313        FMLA    v22.4s, v12.4s,  v3.s[2]
314        FMLA    v24.4s, v12.4s,  v4.s[0]
315
316        # BLOCK 1
317        FMLA    v26.4s, v12.4s,  v4.s[2]
318        FMLA    v21.4s, v13.4s,  v3.s[0]
319        FMLA    v23.4s, v13.4s,  v3.s[2]
320
321        # BLOCK 2
322        FMLA    v25.4s, v13.4s,  v4.s[0]
323        FMLA    v27.4s, v13.4s,  v4.s[2]
324        FMLA    v20.4s, v14.4s,  v3.s[1]
325
326        # BLOCK 3
327        FMLA    v22.4s, v14.4s,  v3.s[3]
328        FMLA    v24.4s, v14.4s,  v4.s[1]
329        FMLA    v26.4s, v14.4s,  v4.s[3]
330
331        # BLOCK 4
332        FMLA    v21.4s, v15.4s,  v3.s[1]
333        FMLA    v23.4s, v15.4s,  v3.s[3]
334        FMLA    v25.4s, v15.4s,  v4.s[1]
335        ADD     x5, x5, 64              // w += the 64 bytes of B read through offsets in this epilogue
336
337        # BLOCK 5
338        FMLA    v27.4s, v15.4s,  v4.s[3]
339
3404:
        # x0 went negative here, but its bits 3 and 2 still equal those of kc
        # because only multiples of 16 were subtracted.
341        # Is there a remainder?- 2 floats of A (8 bytes)
342        TBNZ    x0, 3, 6f
343        # Is there a remainder?- 1 float of A (4 bytes)
344        TBNZ    x0, 2, 7f
3455:
346        # ks loop
347        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
348        B.HI    1b
349
350        # Clamp: FMAX against v6, then FMIN against v7
351        FMAX    v20.4s, v20.4s, v6.4s
352        FMAX    v21.4s, v21.4s, v6.4s
353        FMAX    v22.4s, v22.4s, v6.4s
354        FMAX    v23.4s, v23.4s, v6.4s
355        FMAX    v24.4s, v24.4s, v6.4s
356        FMAX    v25.4s, v25.4s, v6.4s
357        FMAX    v26.4s, v26.4s, v6.4s
358        FMAX    v27.4s, v27.4s, v6.4s
359        FMIN    v20.4s, v20.4s, v7.4s
360        FMIN    v21.4s, v21.4s, v7.4s
361        FMIN    v22.4s, v22.4s, v7.4s
362        FMIN    v23.4s, v23.4s, v7.4s
363        FMIN    v24.4s, v24.4s, v7.4s
364        FMIN    v25.4s, v25.4s, v7.4s
365        FMIN    v26.4s, v26.4s, v7.4s
366        FMIN    v27.4s, v27.4s, v7.4s
367
368        # Store full 4 x 8
369        SUBS    x1, x1, 8               // nc -= 8; LO means fewer than 8 columns were left
370        B.LO    8f
371
        # Rows are stored from c3 down to c0; each C pointer then advances by
        # cn_stride.
372        STP     q26, q27, [x7]
373        ADD     x7, x7, x10
374        STP     q24, q25, [x17]
375        ADD     x17, x17, x10
376        STP     q22, q23, [x16]
377        ADD     x16, x16, x10
378        STP     q20, q21,  [x6]
379        ADD     x6,  x6, x10
380
381        SUB     x4, x4, x3              // a -= ks
382
383        # nc loop (flags are still those of SUBS x1, x1, 8 above)
384        B.HI    0b
385
386        # Restore x19, d12-d15 from stack
387        LDR     x19,      [sp, 32]
388        LDP     d14, d15, [sp, 16]
389        LDP     d12, d13, [sp], 48
390        RET
391
392        # Remainder - 2 floats of A (8 bytes)
393        # 16 FMA + 4 LD64 A + 2 LDP B
3946:
395        LDR     d0,  [x13], 8
396        LDP     q16,  q17, [x5], 32
397        LD1     {v0.d}[1], [x14], 8
398        LDR     d1, [x15], 8
399        LD1     {v1.d}[1], [x8], 8
400        LDP     q18,  q19, [x5], 32
401        FMLA    v20.4s, v16.4s,  v0.s[0]
402        FMLA    v22.4s, v16.4s,  v0.s[2]
403        FMLA    v24.4s, v16.4s,  v1.s[0]
404        FMLA    v26.4s, v16.4s,  v1.s[2]
405        FMLA    v21.4s, v17.4s,  v0.s[0]
406        FMLA    v23.4s, v17.4s,  v0.s[2]
407        FMLA    v25.4s, v17.4s,  v1.s[0]
408        FMLA    v27.4s, v17.4s,  v1.s[2]
409
410        FMLA    v20.4s, v18.4s,  v0.s[1]
411        FMLA    v22.4s, v18.4s,  v0.s[3]
412        FMLA    v24.4s, v18.4s,  v1.s[1]
413        FMLA    v26.4s, v18.4s,  v1.s[3]
414        FMLA    v21.4s, v19.4s,  v0.s[1]
415        FMLA    v23.4s, v19.4s,  v0.s[3]
416        FMLA    v25.4s, v19.4s,  v1.s[1]
417        FMLA    v27.4s, v19.4s,  v1.s[3]
418
419        # Is there a remainder?- 1 float of A (4 bytes)
420        TBZ     x0, 2, 5b
421
4227:
423        # Remainder- 1 float of A (4 bytes)
        # a0 and a2 land in lane 0 of v0 / v1, a1 and a3 in lane 2, matching
        # the lane layout used by the wider paths.
424        LDR     s0,  [x13], 4
425        LDP     q16,  q17, [x5], 32
426        LD1     {v0.s}[2], [x14], 4
427        LDR     s1, [x15], 4
428        LD1     {v1.s}[2], [x8], 4
429
430        FMLA    v20.4s, v16.4s,  v0.s[0]
431        FMLA    v22.4s, v16.4s,  v0.s[2]
432        FMLA    v24.4s, v16.4s,  v1.s[0]
433        FMLA    v26.4s, v16.4s,  v1.s[2]
434        FMLA    v21.4s, v17.4s,  v0.s[0]
435        FMLA    v23.4s, v17.4s,  v0.s[2]
436        FMLA    v25.4s, v17.4s,  v1.s[0]
437        FMLA    v27.4s, v17.4s,  v1.s[2]
438        B       5b
439
440        # Store odd width
        # x1 holds nc - 8 here, so its low three bits equal those of the
        # remaining column count.
4418:
        # Bit 2 set: store 4 columns per row, then move the upper 4 columns down
442        TBZ     x1, 2, 9f
443        STR     q26,  [x7], 16
444        MOV     v26.16b, v27.16b
445        STR     q24, [x17], 16
446        MOV     v24.16b, v25.16b
447        STR     q22, [x16], 16
448        MOV     v22.16b, v23.16b
449        STR     q20,  [x6], 16
450        MOV     v20.16b, v21.16b
4519:
        # Bit 1 set: store 2 columns per row, then move the next 2 columns down
452        TBZ     x1, 1, 10f
453        STR     d26,  [x7], 8
454        STR     d24, [x17], 8
455        DUP     d26, v26.d[1]
456        DUP     d24, v24.d[1]
457        STR     d22, [x16], 8
458        STR     d20,  [x6], 8
459        DUP     d22, v22.d[1]
460        DUP     d20, v20.d[1]
461
46210:
        # Bit 0 set: store one final column per row
463        TBZ     x1, 0, 11f
464        STR     s26,  [x7]
465        STR     s24, [x17]
466        STR     s22, [x16]
467        STR     s20,  [x6]
46811:
469        # Restore x19, d12-d15 from stack
470        LDR     x19,      [sp, 32]
471        LDP     d14, d15, [sp, 16]
472        LDP     d12, d13, [sp], 48
473        RET
474
475END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
476
477#ifdef __ELF__
478.section ".note.GNU-stack","",%progbits
479#endif
480