// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
#
# Where two registers are listed, the second one holds the working copy:
# x0 is reused as the k byte counter once mr has been consumed, and x9 is the
# ks byte counter. x8 holds the params pointer only until the clamp values
# have been loaded; after that it is the a3 row pointer.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

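# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Not used by this 4-row kernel. The entries below describe rows 4 and 5,
# which this kernel does not compute (presumably carried over from a 6-row
# variant). In this kernel x12 holds zero, x4 holds a, x13 holds a0 and
# x7 holds c3.
# unused A   v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31
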
# Indirect GEMM micro-kernel producing a tile of up to 4 rows by 8 columns.
# For every group of 8 output columns the accumulators v20-v27 start from the
# 8 floats at the head of w, then for each ks step four A row pointers are
# fetched from the indirection buffer a and kc bytes of each row are
# multiplied against the packed B data that follows in w. The result is
# bounded with FMAX against v6 and FMIN against v7 and stored through c0-c3.
BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

        # Clamp C pointers
        # A row at or beyond mr reuses the pointer of the row before it. The
        # stores further down write c3 first and c0 last, so when pointers
        # alias the lowest row is the one written last.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        # Stack arguments are read here, before sp is moved by the register
        # save below.
        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        # LD2R broadcasts the first params float to every lane of v6 and the
        # second one to every lane of v7.
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save x19, d12-d15 on stack
        STP     d12, d13, [sp, -48]!
        STP     d14, d15, [sp, 16]
        STR     x19,      [sp, 32]

        # nc loop: one pass per group of 8 output columns.
        # NOTE(review): the A prefetches below use x13, x14, x15 and x8, which
        # are only loaded with row pointers at label 1. On the first pass x8
        # still holds the params pointer and x13-x15 hold whatever the caller
        # left there. PRFM is only a hint, so this looks harmless - confirm.
0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP,  [x13,  0]   // Prefetch A
        $if PREFETCH:
          PRFM    PLDL1KEEP,  [x13, 64]
        MOV     v23.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP,  [x14,  0]
        $if PREFETCH:
          PRFM    PLDL1KEEP,  [x14, 64]
        MOV     v24.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15,  0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15, 64]
        MOV     v25.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8,  0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8, 64]
        MOV     v26.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5,  64]
        MOV     v27.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]

        MOV     x9, x3                  // p = ks

        # ks loop: each pass consumes 4 pointers (32 bytes) of the
        # indirection buffer.
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x8, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x11           // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x11           // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x11           // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x8, x12                 // if a3 == zero
        ADD     x8, x8, x11             // a3 += a_offset
        CSEL    x8, x12, x8, EQ         //   a3 = zero, else a3 += a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        # x0 is only ever reduced by 16 from here on, so bits 3 and 2 of x0
        # stay equal to bits 3 and 2 of kc; the remainder tests at label 4
        # rely on that.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Prologue - First group loads, no FMA
        # v0 = a0 in the low 64 bits and a1 in the high 64 bits, v1 = a2 low
        # and a3 high, so lanes 0-1 feed the even row and lanes 2-3 the odd
        # row. The last 8 bytes of B go to x19 and are inserted into v19 by
        # BLOCK 0 of the code that follows.
        LDR     d0, [x13], 8            // a0
        LDP     q16, q17, [x5], 32        // b
        LDR     d1, [x15], 8            // a2
        LD1     {v0.d}[1],  [x14], 8     // a1
        LD1     {v1.d}[1], [x8], 8       // a3
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x19, [x5], 8            // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Every 128-bit vector loaded in the loop is built from a 64-bit LDR
        # into the low half plus a 64-bit LDR into x19 that a later INS moves
        # into the high half.
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x13], 8              // a0
        INS     v19.d[1], x19               // b from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x19, [x14], 8              // a1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19                // a1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]

        # BLOCK 2
        LDR     d4, [x15], 8              // a2
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x8], 8               // a3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v20.4s, v18.4s,  v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19                // a3 ins
        FMLA    v22.4s, v18.4s,  v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19           // b from previous
        FMLA    v27.4s, v19.4s,  v1.s[3]
        LDR     x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x13], 8              // a0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s,  v3.s[0]
        LDR     x19, [x14], 8              // a1
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 128]      // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x19               // a1 ins
        FMLA    v26.4s, v12.4s,  v4.s[2]
        LDR     x19, [x5, 72]           // b
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x14, 128]      // Prefetch A1

        # BLOCK 2
        LDR     d1, [x15], 8             // a2
        INS     v16.d[1], x19           // b
        FMLA    v25.4s, v13.4s,  v4.s[0]
        LDR     x19, [x8], 8             // a3
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v20.4s, v14.4s,  v3.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15, 128]     // Prefetch A2

        # BLOCK 3
        LDR     d17, [x5, 80]
        INS     v1.d[1], x19               // a3 ins
        FMLA    v22.4s, v14.4s,  v3.s[3]
        LDR     x19, [x5, 88]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v4.s[3]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8, 128]     // Prefetch A3

        # BLOCK 4
        LDR     d18, [x5, 96]
        INS     v17.d[1], x19           // b
        FMLA    v21.4s, v15.4s,  v3.s[1]
        LDR     x19, [x5, 104]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR     d19, [x5, 112]
        INS     v18.d[1], x19
        FMLA    v27.4s, v15.4s,  v4.s[3]
        LDR     x19, [x5, 120]
        SUBS    x0, x0, 16
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
        ADD     x5, x5, 128
        B.HS    2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 4 LD64 A + 4 LDR B
        # Same first half as the main loop; the second half only consumes
        # what was loaded and fetches nothing further.
3:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x13], 8              // a0
        INS     v19.d[1], x19              // b from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x19, [x14], 8              // a1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19               // a1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]

        # BLOCK 2
        LDR     d4, [x15], 8             // a2
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x8], 8             // a3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v20.4s, v18.4s,  v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19               // a3 ins
        FMLA    v22.4s, v18.4s,  v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19
        FMLA    v27.4s, v19.4s,  v1.s[3]
        LDR     x19, [x5, 56]
        NOP     // fma
        NOP
        NOP     // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s,  v3.s[0]
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s,  v4.s[2]
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]

        # BLOCK 2
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v20.4s, v14.4s,  v3.s[1]

        # BLOCK 3
        FMLA    v22.4s, v14.4s,  v3.s[3]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v4.s[3]

        # BLOCK 4
        FMLA    v21.4s, v15.4s,  v3.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        ADD     x5, x5, 64

        # BLOCK 5
        FMLA    v27.4s, v15.4s,  v4.s[3]

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ    x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ    x0, 2, 7f
5:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        # v6 is applied with FMAX (lower bound) and v7 with FMIN (upper
        # bound).
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        # The flags set by this SUBS are still live at the B.HI below; the
        # STP, ADD and SUB in between do not modify them.
        SUBS    x1, x1, 8
        B.LO    8f

        STP     q26, q27, [x7]
        ADD     x7, x7, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x19, d12-d15 from stack
        LDR     x19,      [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR     d0,  [x13], 8
        LDP     q16,  q17, [x5], 32
        LD1     {v0.d}[1], [x14], 8
        LDR     d1, [x15], 8
        LD1     {v1.d}[1], [x8], 8
        LDP     q18,  q19, [x5], 32
        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[1]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[1]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]
        FMLA    v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        # The odd rows are loaded into lane 2 so that the s[0] / s[2] lane
        # selection matches the wider paths.
        LDR     s0,  [x13], 4
        LDP     q16,  q17, [x5], 32
        LD1     {v0.s}[2], [x14], 4
        LDR     s1, [x15], 4
        LD1     {v1.s}[2], [x8], 4

        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        B       5b

        # Store odd width
        # x1 has had 8 subtracted, which leaves its low 3 bits equal to those
        # of the remaining column count: bit 2 stores 4 floats, bit 1 stores
        # 2 and bit 0 stores 1. After each partial store the not yet written
        # columns are moved down into the register that is stored next.
8:
        TBZ     x1, 2, 9f
        STR     q26,  [x7], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d26,  [x7], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s26,  [x7]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
11:
        # Restore x19, d12-d15 from stack
        LDR     x19,      [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53
// On ELF targets, declare an empty .note.GNU-stack section. By convention this
// marks the object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif