xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> (x0)
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary vector shadow register (arrives holding a_stride, which is only needed while the A pointers are computed)
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
54# unused A   v8 v9 v10 v11
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
63BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
64
        // Overview: for every group of 8 output columns (outer loop, label 0) the
        // kernel seeds a 4x8 accumulator tile in v20-v27, accumulates A*B over kc
        // bytes of each A row in 16-byte steps (main loop, label 1; epilogue,
        // label 2) plus 8- and 4-byte remainders (labels 4 and 5), clamps the tile
        // with v6/v7 (label 3) and stores it through c0-c3, either as a full 4x8
        // tile or as a partial-width tail (labels 6-9).
65        $if INC:
66          # Load acc, params pointer
67          LDP     x15, x8, [sp, 8]        // x15 = acc from [sp + 8], x8 = params from [sp + 16]
68        $else:
69          # Load params pointer
70          LDR     x8, [sp, 8]             // x8 = params from [sp + 8]
71
72        # Clamp A and C pointers
73        CMP     x0, 2                   // if mr < 2
74        ADD     x9, x3, x4              // a1 = a0 + a_stride
75        ADD     x16, x6, x7             // c1 = c0 + cm_stride
76        CSEL    x9, x3, x9, LO          //   a1 = a0
77        CSEL    x16, x6, x16, LO        //   c1 = c0
78
79        ADD     x10, x9, x4             // a2 = a1 + a_stride
80        ADD     x17, x16, x7            // c2 = c1 + cm_stride
81                                        // if mr <= 2 (flags still come from CMP x0, 2: ADD and CSEL do not set flags)
82        CSEL    x10, x9, x10, LS        //   a2 = a1
83        CSEL    x17, x16, x17, LS       //   c2 = c1
84
85        CMP     x0, 4                   // if mr < 4
86        ADD     x11, x10, x4            // a3 = a2 + a_stride
87        ADD     x14, x17, x7            // c3 = c2 + cm_stride
88        CSEL    x11, x10, x11, LO       //   a3 = a2
89        CSEL    x14, x17, x14, LO       //   c3 = c2
90
91        # Load min/max values
92        LD2R    {v6.4s, v7.4s}, [x8]    // v6 = 1st 32-bit element at params, replicated (lower bound, applied with FMAX); v7 = 2nd element, replicated (upper bound, applied with FMIN)
93
94        # Save d12-d15 on stack
95        STP     d12, d13, [sp, -32]!    // pre-decrement: sp -= 32, so the entry-time stack arguments are now 32 bytes further up
96        STP     d14, d15, [sp, 16]      // second half of the 32-byte save area
97
        // Outer loop over groups of 8 columns; re-entered by B.HI 0b after a full 4x8 store.
980:
99        $if INC:
100          # Load initial accumulators
101          LDP     q20, q21, [x15], 32     // row 0 accumulators, acc += 32 bytes
102          LDP     q22, q23, [x15], 32     // row 1 accumulators
103          LDP     q24, q25, [x15], 32     // row 2 accumulators
104          LDP     q26, q27, [x15], 32     // row 3 accumulators
105          PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
106          PRFM    PLDL1KEEP,  [x3, 64]
107          PRFM    PLDL1KEEP,  [x9,  0]
108          PRFM    PLDL1KEEP,  [x9, 64]
109          PRFM    PLDL1KEEP, [x10,  0]
110          PRFM    PLDL1KEEP, [x10, 64]
111          PRFM    PLDL1KEEP, [x11,  0]
112          PRFM    PLDL1KEEP, [x11, 64]
113          PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
114          PRFM    PLDL1KEEP, [x5,  64]
115          PRFM    PLDL1KEEP, [x5, 128]
116          PRFM    PLDL1KEEP, [x5, 192]
117        $else:
118          # Load initial bias from w into accumulators
119          LDP     q20, q21, [x5], 32      // 8 bias floats become row 0, w += 32 bytes
120          MOV     v22.16b, v20.16b        // row 1, columns 0-3 = same bias
121          PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
122          PRFM    PLDL1KEEP,  [x3, 64]
123          MOV     v23.16b, v21.16b        // row 1, columns 4-7
124          PRFM    PLDL1KEEP,  [x9,  0]
125          PRFM    PLDL1KEEP,  [x9, 64]
126          MOV     v24.16b, v20.16b        // row 2, columns 0-3
127          PRFM    PLDL1KEEP, [x10,  0]
128          PRFM    PLDL1KEEP, [x10, 64]
129          MOV     v25.16b, v21.16b        // row 2, columns 4-7
130          PRFM    PLDL1KEEP, [x11,  0]
131          PRFM    PLDL1KEEP, [x11, 64]
132          MOV     v26.16b, v20.16b        // row 3, columns 0-3
133          PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
134          MOV     v27.16b, v21.16b        // row 3, columns 4-7
135          PRFM    PLDL1KEEP, [x5,  64]
136          PRFM    PLDL1KEEP, [x5, 128]
137          PRFM    PLDL1KEEP, [x5, 192]
138
139        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
140        SUBS    x0, x2, 16              // k = kc - 16
141        B.LO    4f                      // kc < 16: only the remainder paths run
142
143        # Prologue - First group loads, no FMA
144        LDR     d0, [x3], 8              // a0
145        LDP     q16, q17, [x5], 32         // b
146        LDR     d1, [x10], 8             // a2
147        LD1     {v0.d}[1],  [x9], 8       // a1
148        LD1     {v1.d}[1], [x11], 8       // a3
149        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.LO 2f below (the loads in between do not set flags)
150        LDR     q18, [x5], 16           // b
151        LDR     d19, [x5], 8            // b, low half of v19
152        LDR     x4, [x5], 8             // b, high half of v19 staged in x4; the INS is in BLOCK 0
153
154        # Is there at least 4 floats (16 bytes) for main loop?
155        B.LO    2f                      // kc < 32: skip the main loop and finish in the epilogue
156
        // Lane layout of the A registers, as established by the loads above:
        //   v0.s[0..1] = a0, v0.s[2..3] = a1, v1.s[0..1] = a2, v1.s[2..3] = a3
        // The second group uses v3/v4 with the same layout and B in v12-v15.
157        # Main loop - 4 floats of A (16 bytes)
158        # 32 FMA + 8 LD64 A + 8 LDR B
1591:
160        # First group of 16 FMA, Second group loads
161        # BLOCK 0
162        FMLA    v20.4s, v16.4s,  v0.s[0]
163        LDR     d3, [x3], 8              // a0
164        FMLA    v22.4s, v16.4s,  v0.s[2]
165        INS     v19.d[1], x4               // b from second group
166        FMLA    v24.4s, v16.4s,  v1.s[0]
167        LDR     x4, [x9], 8               // a1
168
169        # BLOCK 1
170        FMLA    v26.4s, v16.4s,  v1.s[2]
171        LDR     d12, [x5]               // b, low half of v12
172        FMLA    v21.4s, v17.4s,  v0.s[0]
173        INS     v3.d[1], x4                // a1 ins
174        FMLA    v23.4s, v17.4s,  v0.s[2]
175        LDR     x4, [x5, 8]             // b
176
177        # BLOCK 2
178        FMLA    v25.4s, v17.4s,  v1.s[0]
179        LDR     d4, [x10], 8             // a2
180        FMLA    v27.4s, v17.4s,  v1.s[2]
181        INS     v12.d[1], x4            // b  ins
182        FMLA    v20.4s, v18.4s,  v0.s[1]
183        LDR     x4, [x11], 8              // a3
184
185        # BLOCK 3
186        FMLA    v22.4s, v18.4s,  v0.s[3]
187        LDR     d13, [x5, 16]           // b, low half of v13
188        FMLA    v24.4s, v18.4s,  v1.s[1]
189        INS     v4.d[1], x4                // a3 ins
190        FMLA    v26.4s, v18.4s,  v1.s[3]
191        LDR     x4, [x5, 24]            // b, high half of v13
192
193        # BLOCK 4
194        FMLA    v21.4s, v19.4s,  v0.s[1]
195        LDR     d14, [x5, 32]           // b, low half of v14
196        FMLA    v23.4s, v19.4s,  v0.s[3]
197        INS     v13.d[1], x4            // b
198        FMLA    v25.4s, v19.4s,  v1.s[1]
199        LDR     x4, [x5, 40]            // b, high half of v14
200
201        # BLOCK 5
202        # NOPs to ensure 4 cycle LDR lands on next LDR
203        FMLA    v27.4s, v19.4s,  v1.s[3]
204        LDR     d15, [x5, 48]           // b, low half of v15
205        NOP
206        INS     v14.d[1], x4            // b from previous
207        SUBS    x0, x0, 16              // k -= 16; flags stay live until B.HS 1b (nothing below sets flags)
208        LDR     x4, [x5, 56]            // b, high half of v15
209
210        # Second group of 16 FMA, First group of loads
211        # BLOCK 0
212        FMLA    v20.4s, v12.4s,  v3.s[0]
213        LDR     d0, [x3], 8              // a0
214        FMLA    v22.4s, v12.4s,  v3.s[2]
215        INS     v15.d[1], x4            // b from previous
216        FMLA    v24.4s, v12.4s,  v4.s[0]
217        LDR     x4, [x9], 8               // a1
218
219        # BLOCK 1
220        FMLA    v26.4s, v12.4s,  v4.s[2]
221        LDR     d16, [x5, 64]           // b, low half of v16 (first set for the next pass)
222        FMLA    v21.4s, v13.4s,  v3.s[0]
223        INS     v0.d[1], x4                // a1 ins
224        FMLA    v23.4s, v13.4s,  v3.s[2]
225        LDR     x4, [x5, 72]            // b
226
227        # BLOCK 2
228        FMLA    v25.4s, v13.4s,  v4.s[0]
229        LDR     d1, [x10], 8             // a2
230        FMLA    v27.4s, v13.4s,  v4.s[2]
231        INS     v16.d[1], x4            // b
232        FMLA    v20.4s, v14.4s,  v3.s[1]
233        LDR     x4, [x11], 8              // a3
234
235        # BLOCK 3
236        FMLA    v22.4s, v14.4s,  v3.s[3]
237        LDR     d17, [x5, 80]           // b, low half of v17
238        FMLA    v24.4s, v14.4s,  v4.s[1]
239        INS     v1.d[1], x4                // a3 ins
240        FMLA    v26.4s, v14.4s,  v4.s[3]
241        LDR     x4, [x5, 88]            // b, high half of v17
242
243        # BLOCK 4
244        FMLA    v21.4s, v15.4s,  v3.s[1]
245        LDR     d18, [x5, 96]           // b, low half of v18
246        FMLA    v23.4s, v15.4s,  v3.s[3]
247        INS     v17.d[1], x4            // b
248        FMLA    v25.4s, v15.4s,  v4.s[1]
249        LDR     x4, [x5, 104]           // b, high half of v18
250
251        # BLOCK 5
252        # NOTE that block needs to be 4 cycles for LDR not to stall
253        FMLA    v27.4s, v15.4s,  v4.s[3]
254        LDR     d19, [x5, 112]          // b, low half of v19
255        INS     v18.d[1], x4            // b
256        LDR     x4, [x5, 120]           // b, high half of v19; inserted by BLOCK 0 of the next pass or of the epilogue
257        ADD     x5, x5, 128             // w += 128 bytes, covering the [x5, 0..120] loads of this iteration
258        B.HS    1b                      // loop while the SUBS in the first BLOCK 5 did not borrow
259
260        # Epilogue - 4 floats of A (16 bytes)
261        # 32 FMA + 8 LD64 A + 8 LDR B
2622:
263        # First group of 16 FMA, Second group loads
264        # BLOCK 0
265        FMLA    v20.4s, v16.4s,  v0.s[0]
266        LDR     d3, [x3], 8              // a0
267        FMLA    v22.4s, v16.4s,  v0.s[2]
268        INS     v19.d[1], x4               // b from second group
269        FMLA    v24.4s, v16.4s,  v1.s[0]
270        LDR     x4, [x9], 8               // a1
271
272        # BLOCK 1
273        FMLA    v26.4s, v16.4s,  v1.s[2]
274        LDR     d12, [x5]               // b, low half of v12
275        FMLA    v21.4s, v17.4s,  v0.s[0]
276        INS     v3.d[1], x4                // a1 ins
277        FMLA    v23.4s, v17.4s,  v0.s[2]
278        LDR     x4, [x5, 8]             // b
279
280        # BLOCK 2
281        FMLA    v25.4s, v17.4s,  v1.s[0]
282        LDR     d4, [x10], 8             // a2
283        FMLA    v27.4s, v17.4s,  v1.s[2]
284        INS     v12.d[1], x4            // b  ins
285        FMLA    v20.4s, v18.4s,  v0.s[1]
286        LDR     x4, [x11], 8              // a3
287
288        # BLOCK 3
289        FMLA    v22.4s, v18.4s,  v0.s[3]
290        LDR     d13, [x5, 16]           // b, low half of v13
291        FMLA    v24.4s, v18.4s,  v1.s[1]
292        INS     v4.d[1], x4                // a3 ins
293        FMLA    v26.4s, v18.4s,  v1.s[3]
294        LDR     x4, [x5, 24]            // b, high half of v13
295
296        # BLOCK 4
297        FMLA    v21.4s, v19.4s,  v0.s[1]
298        LDR     d14, [x5, 32]           // b, low half of v14
299        FMLA    v23.4s, v19.4s,  v0.s[3]
300        INS     v13.d[1], x4            // b
301        FMLA    v25.4s, v19.4s,  v1.s[1]
302        LDR     x4, [x5, 40]            // b, high half of v14
303
304        # BLOCK 5
305        # NOPs to ensure 4 cycle LDR lands on next LDR
306        FMLA    v27.4s, v19.4s,  v1.s[3]
307        LDR     d15, [x5, 48]           // b, low half of v15
308        NOP     // fma
309        INS     v14.d[1], x4            // b from previous
310        NOP
311        LDR     x4, [x5, 56]            // b, high half of v15
312
313        # Second group of 16 FMA, no loads
314        # BLOCK 0
315        FMLA    v20.4s, v12.4s,  v3.s[0]
316        FMLA    v22.4s, v12.4s,  v3.s[2]
317        INS     v15.d[1], x4            // b from previous
318        FMLA    v24.4s, v12.4s,  v4.s[0]
319
320        # BLOCK 1
321        FMLA    v26.4s, v12.4s,  v4.s[2]
322        FMLA    v21.4s, v13.4s,  v3.s[0]
323        FMLA    v23.4s, v13.4s,  v3.s[2]
324
325        # BLOCK 2
326        FMLA    v25.4s, v13.4s,  v4.s[0]
327        FMLA    v27.4s, v13.4s,  v4.s[2]
328        FMLA    v20.4s, v14.4s,  v3.s[1]
329
330        # BLOCK 3
331        FMLA    v22.4s, v14.4s,  v3.s[3]
332        FMLA    v24.4s, v14.4s,  v4.s[1]
333        FMLA    v26.4s, v14.4s,  v4.s[3]
334        TST     x0, 15                  // low 4 bits of k equal those of kc (only multiples of 16 were subtracted); flags feed B.NE 4f
335
336        # BLOCK 4
337        FMLA    v21.4s, v15.4s,  v3.s[1]
338        FMLA    v23.4s, v15.4s,  v3.s[3]
339        FMLA    v25.4s, v15.4s,  v4.s[1]
340        ADD     x5, x5, 64              // w += 64 bytes, covering the [x5, 0..56] loads of this epilogue
341
342        # BLOCK 5
343        FMLA    v27.4s, v15.4s,  v4.s[3]
344
345        # Is there a remainder?- 2 floats of A (8 bytes) or less
346        B.NE    4f                      // taken when TST x0, 15 found a non-zero remainder
347
3483:
349        # Clamp
350        FMAX    v20.4s, v20.4s, v6.4s
351        # Load cn_stride
352        LDR     x0, [sp, 32]            // the entry-time [sp] slot, 32 bytes up because of the d12-d15 save area; k in x0 is no longer needed
353        FMAX    v21.4s, v21.4s, v6.4s
354        FMAX    v22.4s, v22.4s, v6.4s
355        FMAX    v23.4s, v23.4s, v6.4s
356        FMAX    v24.4s, v24.4s, v6.4s
357        FMAX    v25.4s, v25.4s, v6.4s
358        FMAX    v26.4s, v26.4s, v6.4s
359        FMAX    v27.4s, v27.4s, v6.4s
360        SUBS    x1, x1, 8               // nc -= 8; flags feed both B.LO 6f and B.HI 0b (FMIN/ST1/SUB do not set flags)
361        FMIN    v20.4s, v20.4s, v7.4s
362        FMIN    v21.4s, v21.4s, v7.4s
363        FMIN    v22.4s, v22.4s, v7.4s
364        FMIN    v23.4s, v23.4s, v7.4s
365        FMIN    v24.4s, v24.4s, v7.4s
366        FMIN    v25.4s, v25.4s, v7.4s
367        FMIN    v26.4s, v26.4s, v7.4s
368        FMIN    v27.4s, v27.4s, v7.4s
369
370        # Store full 4 x 8
371        B.LO    6f                      // fewer than 8 columns were left: take the partial-width store path
372
373        $if INC:
374          ST1     {v26.16b, v27.16b}, [x14], x0   // row 3, c3 += cn_stride
375          SUB     x3,  x3, x2             // a0 -= kc
376          ST1     {v24.16b, v25.16b}, [x17], x0   // row 2, c2 += cn_stride
377          SUB     x9,  x9, x2             // a1 -= kc
378          ST1     {v22.16b, v23.16b}, [x16], x0   // row 1, c1 += cn_stride
379          SUB     x10, x10, x2            // a2 -= kc
380          ST1     {v20.16b, v21.16b},  [x6], x0   // row 0, c0 += cn_stride
381          SUB     x11, x11, x2            // a3 -= kc
382        $else:
383          ST1     {v20.16b, v21.16b},  [x6], x0   // row 0, c0 += cn_stride
384          SUB     x3,  x3, x2             // a0 -= kc
385          ST1     {v22.16b, v23.16b}, [x16], x0   // row 1, c1 += cn_stride
386          SUB     x9,  x9, x2             // a1 -= kc
387          ST1     {v24.16b, v25.16b}, [x17], x0   // row 2, c2 += cn_stride
388          SUB     x10, x10, x2            // a2 -= kc
389          ST1     {v26.16b, v27.16b}, [x14], x0   // row 3, c3 += cn_stride
390          SUB     x11, x11, x2            // a3 -= kc
391
392        B.HI    0b                      // columns remain (nc was > 8 before the SUBS): process the next group
393
394        # Restore d12-d15 from stack
395        LDP     d14, d15, [sp, 16]
396        LDP     d12, d13, [sp], 32      // post-increment releases the 32-byte save area
397        RET
398
3994:
400        # Is there a remainder?- 2 floats of A (8 bytes)
401        TBZ     x0, 3, 5f               // bit 3 of k (same as bit 3 of kc) clear: no 8-byte step, go to the 4-byte step
402
403        # Remainder- 2 floats of A (8 bytes)
404        LDR     d0,  [x3], 8            // a0
405        LDR     q16, [x5], 16           // b
406        LD1     {v0.d}[1], [x9], 8      // a1
407        LDR     d1, [x10], 8            // a2
408        LD1     {v1.d}[1], [x11], 8     // a3
409        LDR     q17, [x5], 16           // b
410        LDR     q18, [x5], 16           // b
411        LDR     q19, [x5], 16           // b
412        FMLA    v20.4s, v16.4s,  v0.s[0]
413        FMLA    v22.4s, v16.4s,  v0.s[2]
414        FMLA    v24.4s, v16.4s,  v1.s[0]
415        FMLA    v26.4s, v16.4s,  v1.s[2]
416        FMLA    v21.4s, v17.4s,  v0.s[0]
417        FMLA    v23.4s, v17.4s,  v0.s[2]
418        FMLA    v25.4s, v17.4s,  v1.s[0]
419        FMLA    v27.4s, v17.4s,  v1.s[2]
420
421        FMLA    v20.4s, v18.4s,  v0.s[1]
422        FMLA    v22.4s, v18.4s,  v0.s[3]
423        FMLA    v24.4s, v18.4s,  v1.s[1]
424        FMLA    v26.4s, v18.4s,  v1.s[3]
425        FMLA    v21.4s, v19.4s,  v0.s[1]
426        FMLA    v23.4s, v19.4s,  v0.s[3]
427        FMLA    v25.4s, v19.4s,  v1.s[1]
428        FMLA    v27.4s, v19.4s,  v1.s[3]
429
430        # Is there a remainder?- 1 float of A (4 bytes)
431        TBZ     x0, 2, 3b               // bit 2 of k clear: nothing left, go clamp and store
432
4335:
434        # Remainder- 1 float of A (4 bytes)
435        LDR     s0,  [x3], 4            // a0 into v0.s[0]
436        LDR     q16, [x5], 16           // b
437        LD1     {v0.s}[2], [x9], 4      // a1 into lane 2, so the v0.s[2] selector below matches the wider paths
438        LDR     s1, [x10], 4            // a2 into v1.s[0]
439        LD1     {v1.s}[2], [x11], 4     // a3 into lane 2
440        LDR     q17, [x5], 16           // b
441
442        FMLA    v20.4s, v16.4s,  v0.s[0]
443        FMLA    v22.4s, v16.4s,  v0.s[2]
444        FMLA    v24.4s, v16.4s,  v1.s[0]
445        FMLA    v26.4s, v16.4s,  v1.s[2]
446        FMLA    v21.4s, v17.4s,  v0.s[0]
447        FMLA    v23.4s, v17.4s,  v0.s[2]
448        FMLA    v25.4s, v17.4s,  v1.s[0]
449        FMLA    v27.4s, v17.4s,  v1.s[2]
450        B       3b                      // accumulation finished: clamp and store
451
452        # Store odd width
4536:
454        TBZ     x1, 2, 7f               // x1 had only multiples of 8 subtracted, so its low 3 bits are still nc's; bit 2 set means store 4 columns
455        $if INC:
456          STR     q26, [x14], 16          // row 3, 4 columns
457          MOV     v26.16b, v27.16b        // move columns 4-7 down for the narrower stores below
458          STR     q24, [x17], 16          // row 2
459          MOV     v24.16b, v25.16b
460          STR     q22, [x16], 16          // row 1
461          MOV     v22.16b, v23.16b
462          STR     q20,  [x6], 16          // row 0
463          MOV     v20.16b, v21.16b
464        $else:
465          STR     q20,  [x6], 16          // row 0, 4 columns
466          MOV     v20.16b, v21.16b        // move columns 4-7 down for the narrower stores below
467          STR     q22, [x16], 16          // row 1
468          MOV     v22.16b, v23.16b
469          STR     q24, [x17], 16          // row 2
470          MOV     v24.16b, v25.16b
471          STR     q26, [x14], 16          // row 3
472          MOV     v26.16b, v27.16b
473
4747:
475        TBZ     x1, 1, 8f               // bit 1 of nc set means store 2 columns
476        $if INC:
477          STR     d26, [x14], 8           // row 3, 2 columns
478          STR     d24, [x17], 8           // row 2
479          DUP     d26, v26.d[1]           // bring the upper 2 floats down for a possible final single-column store
480          DUP     d24, v24.d[1]
481          STR     d22, [x16], 8           // row 1
482          STR     d20,  [x6], 8           // row 0
483          DUP     d22, v22.d[1]
484          DUP     d20, v20.d[1]
485        $else:
486          STR     d20,  [x6], 8           // row 0, 2 columns
487          STR     d22, [x16], 8           // row 1
488          DUP     d20, v20.d[1]           // bring the upper 2 floats down for a possible final single-column store
489          DUP     d22, v22.d[1]
490          STR     d24, [x17], 8           // row 2
491          STR     d26, [x14], 8           // row 3
492          DUP     d24, v24.d[1]
493          DUP     d26, v26.d[1]
494
4958:
496        TBZ     x1, 0, 9f               // bit 0 of nc set means store 1 column
497        $if INC:
498          STR     s26, [x14]              // row 3, last column
499          STR     s24, [x17]              // row 2
500          STR     s22, [x16]              // row 1
501          STR     s20,  [x6]              // row 0
502        $else:
503          STR     s20,  [x6]              // row 0, last column
504          STR     s22, [x16]              // row 1
505          STR     s24, [x17]              // row 2
506          STR     s26, [x14]              // row 3
5079:
508        # Restore d12-d15 from stack
509        LDP     d14, d15, [sp, 16]
510        LDP     d12, d13, [sp], 32      // post-increment releases the 32-byte save area
511        RET
512
513END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
514
515#ifdef __ELF__
516.section ".note.GNU-stack","",%progbits
517#endif
518