xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> (x0)
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
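38# x4 a_stride on entry; once the A row pointers are built it is reused as a scratch GPR that carries the upper 64 bits of a vector (LDR x4, ... followed by INS vN.d[1], x4)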
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
54# unused A   v8 v9 v10 v11
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
63BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53
64
        # Entry and one-time setup: fetch the stack arguments, derive the per-row
        # A and C pointers, broadcast the clamp bounds and save d12-d15.
        # Stack arguments are read before sp is moved, so the offsets below are
        # relative to the caller's sp.
65        $if INC:
66          # Load acc, params pointer
67          LDP     x15, x8, [sp, 8]
68        $else:
69          # Load params pointer
70          LDR     x8, [sp, 8]
71
72        # Clamp A and C pointers
        # A row whose index is >= mr gets the same A and C pointer as the row
        # before it, so it repeats that row's loads, arithmetic and stores.
73        CMP     x0, 2                   // if mr < 2
74        ADD     x9, x3, x4              // a1 = a0 + a_stride
75        ADD     x16, x6, x7             // c1 = c0 + cm_stride
76        CSEL    x9, x3, x9, LO          //   a1 = a0
77        CSEL    x16, x6, x16, LO        //   c1 = c0
78
        # The LS selects below reuse the flags of CMP x0, 2 above; ADD and CSEL
        # do not modify the flags.
79        ADD     x10, x9, x4             // a2 = a1 + a_stride
80        ADD     x17, x16, x7            // c2 = c1 + cm_stride
81                                        // if mr <= 2
82        CSEL    x10, x9, x10, LS        //   a2 = a1
83        CSEL    x17, x16, x17, LS       //   c2 = c1
84
85        CMP     x0, 4                   // if mr < 4
86        ADD     x11, x10, x4            // a3 = a2 + a_stride
87        ADD     x14, x17, x7            // c3 = c2 + cm_stride
88        CSEL    x11, x10, x11, LO       //   a3 = a2
89        CSEL    x14, x17, x14, LO       //   c3 = c2
90
91        # Load min/max values
        # LD2R reads two consecutive floats at params and replicates each one
        # across a vector: v6 is later the FMAX operand (lower bound) and v7 the
        # FMIN operand (upper bound).
92        LD2R    {v6.4s, v7.4s}, [x8]
93
94        # Save d12-d15 on stack
        # The kernel uses v12-v15 for the second set of B, and d8-d15 must be
        # preserved. The pre-indexed STP lowers sp by 32 bytes, so the caller's
        # stack arguments sit 32 bytes further from sp afterwards (cn_stride is
        # read from [sp, 32] at label 3).
95        STP     d12, d13, [sp, -32]!
96        STP     d14, d15, [sp, 16]
97
980:
        # Top of the column loop: label 3 branches back here while output columns
        # remain. Each pass (re)initialises the eight accumulators v20-v27
        # (4 rows x 8 floats) and walks the whole K dimension.
99        $if INC:
100          # Load initial accumulators
          # Four LDP of two q registers each read 128 bytes from acc; x15 is
          # post-incremented by 32 per LDP.
101          LDP     q20, q21, [x15], 32
102          LDP     q22, q23, [x15], 32
103          LDP     q24, q25, [x15], 32
104          LDP     q26, q27, [x15], 32
105          $if PREFETCH:
106            PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
107          $if PREFETCH:
108            PRFM    PLDL1KEEP,  [x3, 64]
109          $if PREFETCH:
110            PRFM    PLDL1KEEP,  [x9,  0]
111          $if PREFETCH:
112            PRFM    PLDL1KEEP,  [x9, 64]
113          $if PREFETCH:
114            PRFM    PLDL1KEEP, [x10,  0]
115          $if PREFETCH:
116            PRFM    PLDL1KEEP, [x10, 64]
117          $if PREFETCH:
118            PRFM    PLDL1KEEP, [x11,  0]
119          $if PREFETCH:
120            PRFM    PLDL1KEEP, [x11, 64]
121          $if PREFETCH:
122            PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
123          $if PREFETCH:
124            PRFM    PLDL1KEEP, [x5,  64]
125          $if PREFETCH:
126            PRFM    PLDL1KEEP, [x5, 128]
127          $if PREFETCH:
128            PRFM    PLDL1KEEP, [x5, 192]
129        $else:
130          # Load initial bias from w into accumulators
          # The 8 floats loaded into v20/v21 are copied into v22-v27, so all
          # four rows start from the same values. The MOVs are interleaved with
          # the optional prefetches.
131          LDP     q20, q21, [x5], 32
132          MOV     v22.16b, v20.16b
133          $if PREFETCH:
134            PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
135          $if PREFETCH:
136            PRFM    PLDL1KEEP,  [x3, 64]
137          MOV     v23.16b, v21.16b
138          $if PREFETCH:
139            PRFM    PLDL1KEEP,  [x9,  0]
140          $if PREFETCH:
141            PRFM    PLDL1KEEP,  [x9, 64]
142          MOV     v24.16b, v20.16b
143          $if PREFETCH:
144            PRFM    PLDL1KEEP, [x10,  0]
145          $if PREFETCH:
146            PRFM    PLDL1KEEP, [x10, 64]
147          MOV     v25.16b, v21.16b
148          $if PREFETCH:
149            PRFM    PLDL1KEEP, [x11,  0]
150          $if PREFETCH:
151            PRFM    PLDL1KEEP, [x11, 64]
152          MOV     v26.16b, v20.16b
153          $if PREFETCH:
154            PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
155          MOV     v27.16b, v21.16b
156          $if PREFETCH:
157            PRFM    PLDL1KEEP, [x5,  64]
158          $if PREFETCH:
159            PRFM    PLDL1KEEP, [x5, 128]
160          $if PREFETCH:
161            PRFM    PLDL1KEEP, [x5, 192]
162
163        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # From here on x0 is the byte counter k. x2 keeps the original kc so the
        # A pointers can be rewound after the stores at label 3. With fewer than
        # 16 bytes of K the pipelined path is skipped and label 4 handles it.
164        SUBS    x0, x2, 16              // k = kc - 16
165        B.LO    4f
166
167        # Prologue - First group loads, no FMA
        # Resulting layout: v0 = {a0[0], a0[1], a1[0], a1[1]} and
        # v1 = {a2[0], a2[1], a3[0], a3[1]}. v16-v18 are loaded whole; v19 is
        # split into d19 (low half) and x4 (high half), and the halves are joined
        # by the INS that opens BLOCK 0 at label 1 or label 2.
168        LDR     d0, [x3], 8              // a0
169        LDP     q16, q17, [x5], 32         // b
170        LDR     d1, [x10], 8             // a2
171        LD1     {v0.d}[1],  [x9], 8       // a1
172        LD1     {v1.d}[1], [x11], 8       // a3
173        SUBS    x0, x0, 16
174        LDR     q18, [x5], 16
175        LDR     d19, [x5], 8
176        LDR     x4, [x5], 8             // ins is in BLOCK 0
177
178        # Is there at least 4 floats (16 bytes) for main loop?
        # The branch tests the flags of the SUBS three instructions up; the LDR
        # instructions in between leave the flags unchanged. When taken, only the
        # epilogue at label 2 runs.
179        B.LO    2f
180
181        # Main loop - 4 floats of A (16 bytes)
182        # 32 FMA + 8 LD64 A + 8 LDR B
        # Software-pipelined: the first group multiplies A in v0/v1 by B in
        # v16-v19 while loading the next A into v3/v4 and the next B into
        # v12-v15; the second group does the reverse.
        # Every vector is fetched as two 64-bit pieces: LDR dN for the low half,
        # then LDR x4 followed one block later by INS vN.d[1], x4 for the high
        # half. For A the low half is one row and the high half the next row.
        # NOTE(review): B is read with 16 64-bit loads per iteration (8 whole
        # vectors), which is presumably what "8 LDR B" above counts.
        # The instruction order goes with the cycle-timing notes in BLOCK 5;
        # treat it as deliberate and re-measure before reordering anything.
1831:
184        # First group of 16 FMA, Second group loads
185        # BLOCK 0
186        LDR     d3, [x3], 8              // a0
187        INS     v19.d[1], x4               // b from second group
188        FMLA    v20.4s, v16.4s,  v0.s[0]
189        LDR     x4, [x9], 8               // a1
190        FMLA    v22.4s, v16.4s,  v0.s[2]
191        FMLA    v24.4s, v16.4s,  v1.s[0]
192
193        # BLOCK 1
194        LDR     d12, [x5]
195        INS     v3.d[1], x4                // a1 ins
196        FMLA    v26.4s, v16.4s,  v1.s[2]
197        LDR     x4, [x5, 8]             // b
198        FMLA    v21.4s, v17.4s,  v0.s[0]
199        FMLA    v23.4s, v17.4s,  v0.s[2]
200
201        # BLOCK 2
202        LDR     d4, [x10], 8             // a2
203        INS     v12.d[1], x4            // b  ins
204        FMLA    v25.4s, v17.4s,  v1.s[0]
205        LDR     x4, [x11], 8              // a3
206        FMLA    v27.4s, v17.4s,  v1.s[2]
207        FMLA    v20.4s, v18.4s,  v0.s[1]
208
209        # BLOCK 3
210        LDR     d13, [x5, 16]
211        INS     v4.d[1], x4                // a3 ins
212        FMLA    v22.4s, v18.4s,  v0.s[3]
213        LDR     x4, [x5, 24]
214        FMLA    v24.4s, v18.4s,  v1.s[1]
215        FMLA    v26.4s, v18.4s,  v1.s[3]
216
217        # BLOCK 4
218        LDR     d14, [x5, 32]
219        INS     v13.d[1], x4            // b
220        FMLA    v21.4s, v19.4s,  v0.s[1]
221        LDR     x4, [x5, 40]
222        FMLA    v23.4s, v19.4s,  v0.s[3]
223        FMLA    v25.4s, v19.4s,  v1.s[1]
224
225        # BLOCK 5
226        # NOPs to ensure 4 cycle LDR lands on next LDR
227        LDR     d15, [x5, 48]
228        INS     v14.d[1], x4            // b from previous
229        FMLA    v27.4s, v19.4s,  v1.s[3]
230        LDR     x4, [x5, 56]
231        NOP
232        NOP
233        NOP
234        NOP
235
236        # Second group of 16 FMA, First group of loads
        # Consumes v3/v4 and v12-v15 loaded above and refills v0/v1 and v16-v19
        # from B offsets 64..120 for the next iteration (or for the epilogue).
237        # BLOCK 0
238        LDR     d0, [x3], 8              // a0
239        INS     v15.d[1], x4            // b from previous
240        FMLA    v20.4s, v12.4s,  v3.s[0]
241        LDR     x4, [x9], 8               // a1
242        FMLA    v22.4s, v12.4s,  v3.s[2]
243        FMLA    v24.4s, v12.4s,  v4.s[0]
244        $if PREFETCH:
245          PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0
246
247        # BLOCK 1
248        LDR     d16, [x5, 64]
249        INS     v0.d[1], x4                // a1 ins
250        FMLA    v26.4s, v12.4s,  v4.s[2]
251        LDR     x4, [x5, 72]            // b
252        FMLA    v21.4s, v13.4s,  v3.s[0]
253        FMLA    v23.4s, v13.4s,  v3.s[2]
254        $if PREFETCH:
255          PRFM    PLDL1KEEP, [x9, 128]      // Prefetch A1
256
257        # BLOCK 2
258        LDR     d1, [x10], 8             // a2
259        INS     v16.d[1], x4            // b
260        FMLA    v25.4s, v13.4s,  v4.s[0]
261        LDR     x4, [x11], 8              // a3
262        FMLA    v27.4s, v13.4s,  v4.s[2]
263        FMLA    v20.4s, v14.4s,  v3.s[1]
264        $if PREFETCH:
265          PRFM    PLDL1KEEP, [x10, 128]     // Prefetch A2
266
267        # BLOCK 3
268        LDR     d17, [x5, 80]
269        INS     v1.d[1], x4                // a3 ins
270        FMLA    v22.4s, v14.4s,  v3.s[3]
271        LDR     x4, [x5, 88]
272        FMLA    v24.4s, v14.4s,  v4.s[1]
273        FMLA    v26.4s, v14.4s,  v4.s[3]
274        $if PREFETCH:
275          PRFM    PLDL1KEEP, [x11, 128]     // Prefetch A3
276
277        # BLOCK 4
278        LDR     d18, [x5, 96]
279        INS     v17.d[1], x4            // b
280        FMLA    v21.4s, v15.4s,  v3.s[1]
281        LDR     x4, [x5, 104]
282        FMLA    v23.4s, v15.4s,  v3.s[3]
283        FMLA    v25.4s, v15.4s,  v4.s[1]
284        $if PREFETCH:
285          PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B
286
287        # BLOCK 5
288        # NOTE that block needs to be 4 cycles for LDR not to stall
        # The high half of v19 stays in x4 across the back edge and is inserted
        # by BLOCK 0 of the next iteration or of the epilogue.
289        LDR     d19, [x5, 112]
290        INS     v18.d[1], x4
291        FMLA    v27.4s, v15.4s,  v4.s[3]
292        LDR     x4, [x5, 120]
        # Loop bookkeeping: k drops by 16 bytes and x5 moves past the 128 bytes
        # of B read above through offsets 0..120. B.HS tests the SUBS flags,
        # which PRFM and ADD leave unchanged.
293        SUBS    x0, x0, 16
294        $if PREFETCH:
295          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
296        ADD     x5, x5, 128
297        B.HS    1b
298
299        # Epilogue - 4 floats of A (16 bytes)
300        # 32 FMA + 8 LD64 A + 8 LDR B
        # NOTE(review): the counts above look copied from the main loop. This
        # epilogue issues 32 FMLA but only 4 64-bit loads of A and 8 64-bit loads
        # of B (4 vectors), because its second group loads nothing.
        # Drains the pipeline: the first group matches the main loop's first
        # group; the second group only consumes what the first group loaded.
3012:
302        # First group of 16 FMA, Second group loads
303        # BLOCK 0
304        LDR     d3, [x3], 8              // a0
305        INS     v19.d[1], x4               // b from second group
306        FMLA    v20.4s, v16.4s,  v0.s[0]
307        LDR     x4, [x9], 8               // a1
308        FMLA    v22.4s, v16.4s,  v0.s[2]
309        FMLA    v24.4s, v16.4s,  v1.s[0]
310
311        # BLOCK 1
312        LDR     d12, [x5]
313        INS     v3.d[1], x4                // a1 ins
314        FMLA    v26.4s, v16.4s,  v1.s[2]
315        LDR     x4, [x5, 8]             // b
316        FMLA    v21.4s, v17.4s,  v0.s[0]
317        FMLA    v23.4s, v17.4s,  v0.s[2]
318
319        # BLOCK 2
320        LDR     d4, [x10], 8             // a2
321        INS     v12.d[1], x4            // b  ins
322        FMLA    v25.4s, v17.4s,  v1.s[0]
323        LDR     x4, [x11], 8              // a3
324        FMLA    v27.4s, v17.4s,  v1.s[2]
325        FMLA    v20.4s, v18.4s,  v0.s[1]
326
327        # BLOCK 3
328        LDR     d13, [x5, 16]
329        INS     v4.d[1], x4                // a3 ins
330        FMLA    v22.4s, v18.4s,  v0.s[3]
331        LDR     x4, [x5, 24]
332        FMLA    v24.4s, v18.4s,  v1.s[1]
333        FMLA    v26.4s, v18.4s,  v1.s[3]
334
335        # BLOCK 4
336        LDR     d14, [x5, 32]
337        INS     v13.d[1], x4            // b
338        FMLA    v21.4s, v19.4s,  v0.s[1]
339        LDR     x4, [x5, 40]
340        FMLA    v23.4s, v19.4s,  v0.s[3]
341        FMLA    v25.4s, v19.4s,  v1.s[1]
342
343        # BLOCK 5
344        # NOPs to ensure 4 cycle LDR lands on next LDR
345        LDR     d15, [x5, 48]
346        INS     v14.d[1], x4
347        FMLA    v27.4s, v19.4s,  v1.s[3]
348        LDR     x4, [x5, 56]
349        NOP     // fma
350        NOP
351        NOP     // fma
352        NOP
353
354        # Second group of 16 FMA, no loads
355        # BLOCK 0
356        INS     v15.d[1], x4            // b from previous
357        FMLA    v20.4s, v12.4s,  v3.s[0]
358        FMLA    v22.4s, v12.4s,  v3.s[2]
359        FMLA    v24.4s, v12.4s,  v4.s[0]
360
361        # BLOCK 1
362        FMLA    v26.4s, v12.4s,  v4.s[2]
363        FMLA    v21.4s, v13.4s,  v3.s[0]
364        FMLA    v23.4s, v13.4s,  v3.s[2]
365
366        # BLOCK 2
367        FMLA    v25.4s, v13.4s,  v4.s[0]
368        FMLA    v27.4s, v13.4s,  v4.s[2]
369        FMLA    v20.4s, v14.4s,  v3.s[1]
370
371        # BLOCK 3
372        FMLA    v22.4s, v14.4s,  v3.s[3]
373        FMLA    v24.4s, v14.4s,  v4.s[1]
374        FMLA    v26.4s, v14.4s,  v4.s[3]
        # The low 4 bits of k equal those of kc (k only ever changed by 16), so
        # a non-zero result means leftover bytes of A. The FMLA and ADD that
        # follow do not write the flags, so the B.NE at the end of this section
        # still sees this test.
375        TST     x0, 15
376
377        # BLOCK 4
378        FMLA    v21.4s, v15.4s,  v3.s[1]
379        FMLA    v23.4s, v15.4s,  v3.s[3]
380        FMLA    v25.4s, v15.4s,  v4.s[1]
        # Step x5 past the 64 bytes of B this epilogue read through offsets 0..56.
381        ADD     x5, x5, 64
382
383        # BLOCK 5
384        FMLA    v27.4s, v15.4s,  v4.s[3]
385
386        # Is there a remainder?- 2 floats of A (8 bytes) or less
387        B.NE    4f
388
3893:
390        # Clamp
        # FMAX against v6 applies the lower bound and FMIN against v7 the upper
        # bound to all eight accumulators. The scalar instructions (LDR, SUBS)
        # are interleaved with the vector ones.
391        FMAX    v20.4s, v20.4s, v6.4s
392        # Load cn_stride
        # The offset is 32 because sp was lowered by 32 bytes when d12-d15 were
        # saved at function entry; x0 is free because the K loop is finished.
393        LDR     x0, [sp, 32]
394        FMAX    v21.4s, v21.4s, v6.4s
395        FMAX    v22.4s, v22.4s, v6.4s
396        FMAX    v23.4s, v23.4s, v6.4s
397        FMAX    v24.4s, v24.4s, v6.4s
398        FMAX    v25.4s, v25.4s, v6.4s
399        FMAX    v26.4s, v26.4s, v6.4s
400        FMAX    v27.4s, v27.4s, v6.4s
        # The flags set by this SUBS drive both B.LO 6f and B.HI 0b further
        # down; FMIN, ST1 and SUB in between leave the flags unchanged.
401        SUBS    x1, x1, 8
402        FMIN    v20.4s, v20.4s, v7.4s
403        FMIN    v21.4s, v21.4s, v7.4s
404        FMIN    v22.4s, v22.4s, v7.4s
405        FMIN    v23.4s, v23.4s, v7.4s
406        FMIN    v24.4s, v24.4s, v7.4s
407        FMIN    v25.4s, v25.4s, v7.4s
408        FMIN    v26.4s, v26.4s, v7.4s
409        FMIN    v27.4s, v27.4s, v7.4s
410
411        # Store full 4 x 8
        # Branch away when fewer than 8 columns were left (nc < 8 before the
        # SUBS); label 6 then stores a partial row and returns.
412        B.LO    6f
413
414        $if INC:
          # Rows are stored in the order c3, c2, c1, c0. Each ST1 post-increments
          # its C pointer by cn_stride (x0); each SUB rewinds an A pointer by kc
          # (x2) so the next column group re-reads the same rows of A.
415          ST1     {v26.16b, v27.16b}, [x14], x0
416          SUB     x3,  x3, x2             // a0 -= kc
417          ST1     {v24.16b, v25.16b}, [x17], x0
418          SUB     x9,  x9, x2             // a1 -= kc
419          ST1     {v22.16b, v23.16b}, [x16], x0
420          SUB     x10, x10, x2            // a2 -= kc
421          ST1     {v20.16b, v21.16b},  [x6], x0
422          SUB     x11, x11, x2            // a3 -= kc
423        $else:
          # Rows are stored in the order c0, c1, c2, c3. Each ST1 post-increments
          # its C pointer by cn_stride (x0); each SUB rewinds an A pointer by kc
          # (x2) so the next column group re-reads the same rows of A.
424          ST1     {v20.16b, v21.16b},  [x6], x0
425          SUB     x3,  x3, x2             // a0 -= kc
426          ST1     {v22.16b, v23.16b}, [x16], x0
427          SUB     x9,  x9, x2             // a1 -= kc
428          ST1     {v24.16b, v25.16b}, [x17], x0
429          SUB     x10, x10, x2            // a2 -= kc
430          ST1     {v26.16b, v27.16b}, [x14], x0
431          SUB     x11, x11, x2            // a3 -= kc
432
        # Loop back to label 0 while columns remain (nc was greater than 8 before
        # the SUBS); otherwise fall through to the normal return.
433        B.HI    0b
434
435        # Restore d12-d15 from stack
        # Mirrors the two STP at entry; the post-indexed LDP puts sp back.
436        LDP     d14, d15, [sp, 16]
437        LDP     d12, d13, [sp], 32
438        RET
439
4404:
        # K remainder. Reached either straight from label 0 (kc below 16 bytes)
        # or from the epilogue (kc not a multiple of 16). Either way the low
        # 4 bits of x0 equal those of kc: bit 3 selects the 2-float step and
        # bit 2 the 1-float step.
        # NOTE(review): when bit 3 is clear the 1-float step at label 5 runs
        # without testing bit 2, which assumes kc is a non-zero multiple of
        # 4 bytes - confirm against the caller.
441        # Is there a remainder?- 2 floats of A (8 bytes)
442        TBZ     x0, 3, 5f
443
444        # Remainder- 2 floats of A (8 bytes)
        # Same register layout as the prologue: v0 = {a0 pair, a1 pair},
        # v1 = {a2 pair, a3 pair}. Four B vectors are consumed: v16/v17 with
        # lanes 0 and 2, then v18/v19 with lanes 1 and 3.
445        LDR     d0,  [x3], 8
446        LDR     q16, [x5], 16
447        LD1     {v0.d}[1], [x9], 8
448        LDR     d1, [x10], 8
449        LD1     {v1.d}[1], [x11], 8
450        LDR     q17, [x5], 16
451        LDR     q18, [x5], 16
452        LDR     q19, [x5], 16
453        FMLA    v20.4s, v16.4s,  v0.s[0]
454        FMLA    v22.4s, v16.4s,  v0.s[2]
455        FMLA    v24.4s, v16.4s,  v1.s[0]
456        FMLA    v26.4s, v16.4s,  v1.s[2]
457        FMLA    v21.4s, v17.4s,  v0.s[0]
458        FMLA    v23.4s, v17.4s,  v0.s[2]
459        FMLA    v25.4s, v17.4s,  v1.s[0]
460        FMLA    v27.4s, v17.4s,  v1.s[2]
461
462        FMLA    v20.4s, v18.4s,  v0.s[1]
463        FMLA    v22.4s, v18.4s,  v0.s[3]
464        FMLA    v24.4s, v18.4s,  v1.s[1]
465        FMLA    v26.4s, v18.4s,  v1.s[3]
466        FMLA    v21.4s, v19.4s,  v0.s[1]
467        FMLA    v23.4s, v19.4s,  v0.s[3]
468        FMLA    v25.4s, v19.4s,  v1.s[1]
469        FMLA    v27.4s, v19.4s,  v1.s[3]
470
471        # Is there a remainder?- 1 float of A (4 bytes)
472        TBZ     x0, 2, 3b
473
4745:
475        # Remainder- 1 float of A (4 bytes)
        # One float per row: a0 and a2 land in lane 0, a1 and a3 in lane 2, so
        # the lane indices match those used by the 2-float code. Only two B
        # vectors (a single K step) are consumed.
476        LDR     s0,  [x3], 4
477        LDR     q16, [x5], 16
478        LD1     {v0.s}[2], [x9], 4
479        LDR     s1, [x10], 4
480        LD1     {v1.s}[2], [x11], 4
481        LDR     q17, [x5], 16
482
483        FMLA    v20.4s, v16.4s,  v0.s[0]
484        FMLA    v22.4s, v16.4s,  v0.s[2]
485        FMLA    v24.4s, v16.4s,  v1.s[0]
486        FMLA    v26.4s, v16.4s,  v1.s[2]
487        FMLA    v21.4s, v17.4s,  v0.s[0]
488        FMLA    v23.4s, v17.4s,  v0.s[2]
489        FMLA    v25.4s, v17.4s,  v1.s[0]
490        FMLA    v27.4s, v17.4s,  v1.s[2]
491        B       3b
492
493        # Store odd width
        # Reached when fewer than 8 columns were left. x1 holds nc - 8, whose
        # low 3 bits equal those of the remaining column count; bits 2, 1 and 0
        # select stores of 4, 2 and 1 floats per row.
        # After each partial store the data not yet written is moved down into
        # the low part of v20/v22/v24/v26, so every step stores from the same
        # registers. This path returns instead of looping to label 0.
4946:
495        TBZ     x1, 2, 7f
        # Store 4 floats per row, then move the second vector of each row down.
496        $if INC:
497          STR     q26, [x14], 16
498          MOV     v26.16b, v27.16b
499          STR     q24, [x17], 16
500          MOV     v24.16b, v25.16b
501          STR     q22, [x16], 16
502          MOV     v22.16b, v23.16b
503          STR     q20,  [x6], 16
504          MOV     v20.16b, v21.16b
505        $else:
506          STR     q20,  [x6], 16
507          MOV     v20.16b, v21.16b
508          STR     q22, [x16], 16
509          MOV     v22.16b, v23.16b
510          STR     q24, [x17], 16
511          MOV     v24.16b, v25.16b
512          STR     q26, [x14], 16
513          MOV     v26.16b, v27.16b
514
5157:
516        TBZ     x1, 1, 8f
        # Store 2 floats per row; DUP dN, vN.d[1] then moves the upper 64 bits
        # of each vector into its low half.
517        $if INC:
518          STR     d26, [x14], 8
519          STR     d24, [x17], 8
520          DUP     d26, v26.d[1]
521          DUP     d24, v24.d[1]
522          STR     d22, [x16], 8
523          STR     d20,  [x6], 8
524          DUP     d22, v22.d[1]
525          DUP     d20, v20.d[1]
526        $else:
527          STR     d20,  [x6], 8
528          STR     d22, [x16], 8
529          DUP     d20, v20.d[1]
530          DUP     d22, v22.d[1]
531          STR     d24, [x17], 8
532          STR     d26, [x14], 8
533          DUP     d24, v24.d[1]
534          DUP     d26, v26.d[1]
535
5368:
537        TBZ     x1, 0, 9f
        # Store the last single float per row; the pointers are not advanced
        # because nothing follows.
538        $if INC:
539          STR     s26, [x14]
540          STR     s24, [x17]
541          STR     s22, [x16]
542          STR     s20,  [x6]
543        $else:
544          STR     s20,  [x6]
545          STR     s22, [x16]
546          STR     s24, [x17]
547          STR     s26, [x14]
5489:
549        # Restore d12-d15 from stack
        # Same restore sequence as the return after the full-width store.
550        LDP     d14, d15, [sp, 16]
551        LDP     d12, d13, [sp], 32
552        RET
553
554END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53
555
556#ifdef __ELF__
# Emits an empty .note.GNU-stack section, for ELF objects only. GNU toolchains
# read this as a statement that the object does not need an executable stack.
557.section ".note.GNU-stack","",%progbits
558#endif
559