xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# Register usage
27# A0  x3 v0     v3
28# A1  x9 v0[1]  v3[1]
29# A2 x10 v1     v4
30# A3 x11 v1[1]  v4[1]
31# A4 x12 v2     v5
32# A5  x4 v2[1]  v5[1]
33
34# B   x5 v12 v13 v14 v15 second set of B
35# B      v16 v17 v18 v19 first set
36
37# C   x6 v20 v21
38# C  x16 v22 v23
39# C  x17 v24 v25
40# C  x14 v26 v27
41# C  x13 v28 v29
42# C   x7 v30 v31
43
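44# Clamp v6 (lower bound, used with FMAX) v7 (upper bound, used with FMIN)
45# unused A   v8 v9 v10 v11
46# x8 params pointer on entry, then temporary shadow register for the upper 64 bits of vector loads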
47
// Computes C = clamp(bias + A * B) for up to 6 rows and 8 columns per pass of
// the outer loop (label 0).  w is consumed sequentially through x5: 8 bias
// floats first, then 8 floats of B for every float of an A row.  kc is the
// length of one A row in bytes; nc is the number of output columns.
//
// Local labels:
//   0  start of one 8-column pass (reload bias, restart K loop)
//   1  software-pipelined main loop, 4 floats of A per iteration
//   2  epilogue, drains the pipeline without preloading
//   3  clamp and store
//   4  K remainder of 2 floats      5  K remainder of 1 float
//   6/7/8  partial-width stores of 4 / 2 / 1 columns      9  return
48BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
49
50        # Load params pointer (second stack argument)
51        LDR     x8, [sp, 8]
52
53        # Clamp A and C pointers
        # Rows with index >= mr reuse the A and C pointers of the previous row.
        # Each CMP feeds two CSEL pairs: the ADDs in between do not set flags.
54        CMP     x0, 2                   // if mr < 2
55        ADD     x9, x3, x4              // A1 = a0 + a_stride
56        ADD     x16, x6, x7             // c1 = c0 + cm_stride
57        CSEL    x9, x3, x9, LO          //   a1 = a0
58        CSEL    x16, x6, x16, LO        //   c1 = c0
59
60        ADD     x10, x9, x4             // A2 = a1 + a_stride
61        ADD     x17, x16, x7            // c2 = c1 + cm_stride
62                                        // if mr <= 2 (flags still from CMP x0, 2)
63        CSEL    x10, x9, x10, LS        //   a2 = a1
64        CSEL    x17, x16, x17, LS       //   c2 = c1
65
66        CMP     x0, 4                   // if mr < 4
67        ADD     x11, x10, x4            // A3 = a2 + a_stride
68        ADD     x14, x17, x7            // c3 = c2 + cm_stride
69        CSEL    x11, x10, x11, LO       //   a3 = a2
70        CSEL    x14, x17, x14, LO       //   c3 = c2
71
72        ADD     x12, x11, x4            // A4 = a3 + a_stride
73        ADD     x13, x14, x7            // c4 = c3 + cm_stride
74                                        // if mr <= 4 (flags still from CMP x0, 4)
75        CSEL    x12, x11, x12, LS       //   a4 = a3
76        CSEL    x13, x14, x13, LS       //   c4 = c3
77
78        CMP     x0, 6                   // if mr < 6
        # From here on x4 and x7 no longer hold the strides: they become the
        # row 5 pointers for A and C.
79        ADD     x4, x12, x4             // A5 = a4 + a_stride
80        ADD     x7, x13, x7             // c5 = c4 + cm_stride
81        CSEL    x4, x12, x4, LO         //   a5 = a4
82        CSEL    x7, x13, x7, LO         //   c5 = c4
83
84        # Load min/max values
        # LD2R reads two consecutive floats at params and broadcasts the first
        # to every lane of v6 and the second to every lane of v7.  v6 is used
        # as the FMAX operand (lower bound) and v7 as the FMIN operand (upper
        # bound) at label 3.  x8 is free for reuse after this load.
85        LD2R    {v6.4s, v7.4s}, [x8]
86
87        # Save d12-d15 on stack
        # v12-v15 hold the second set of B and their low halves are
        # callee-saved.  sp drops by 32 here, so the incoming stack arguments
        # are now 32 bytes further from sp.
88        STP     d12, d13, [sp, -32]!
89        STP     d14, d15, [sp, 16]
90
910:
        # Outer loop: one pass per group of up to 8 output columns.
92        # Load initial bias from w into accumulators
        # All six rows start from the same 8 bias floats (v20 = columns 0-3,
        # v21 = columns 4-7).
93        LDP     q20, q21, [x5], 32
94        MOV     v22.16b, v20.16b
95        MOV     v23.16b, v21.16b
96        MOV     v24.16b, v20.16b
97        MOV     v25.16b, v21.16b
98        MOV     v26.16b, v20.16b
99        MOV     v27.16b, v21.16b
100        MOV     v28.16b, v20.16b
101        MOV     v29.16b, v21.16b
102        MOV     v30.16b, v20.16b
103        MOV     v31.16b, v21.16b
104
105        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # Subtracting 16 leaves the low 4 bits of x0 equal to those of kc, so
        # the remainder code can still test bits 3 and 2 after a borrow.
106        SUBS    x0, x2, 16              // k = kc - 16
107        B.LO    4f
108
109        # Prologue - First group loads, no FMA
        # Lane layout of the A registers: the low 64 bits hold two consecutive
        # floats of the even row, the high 64 bits two floats of the odd row,
        # so s[0]/s[1] select rows 0/2/4 and s[2]/s[3] select rows 1/3/5.
110        LDR     d0, [x3], 8               // A0
111        LDP     q16, q17, [x5], 32        // B
112        LDR     d1, [x10], 8              // A2
113        LDR     d2, [x12], 8              // A4
114        LD1     {v0.d}[1],  [x9], 8       // A1
115        LD1     {v1.d}[1], [x11], 8       // A3
116        LD1     {v2.d}[1],  [x4], 8       // A5
        # Flags from this SUBS are consumed by the B.LO below; the loads in
        # between do not modify them.
117        SUBS    x0, x0, 16
118        LDR     q18, [x5], 16
119        LDR     d19, [x5], 8
120        LDR     x8, [x5], 8               // ins is in BLOCK 0
121
122        # Is there at least 4 floats (16 bytes) for main loop?
123        B.LO    2f
124
125        # Main loop - 4 floats of A (16 bytes)
126        # 48 FMA + 12 LD64 A + 8 LDR B (8 B vectors, each as two 64-bit loads)
        # Software pipelined: while the FMAs consume one register set
        # (v0-v2 with v16-v19, or v3-v5 with v12-v15) the loads fill the other.
        # The upper half of each 128-bit value is loaded into x8 and inserted
        # with INS one block later.  B is read at fixed offsets from x5, which
        # advances once per iteration by 128 bytes.
1271:
128        # First group of 24 FMA, Second group loads
129        # BLOCK 0
130        LDR     d3, [x3], 8               // A0
131        INS     v19.d[1], x8              // B from second group
132        FMLA    v20.4s, v16.4s,  v0.s[0]
133        LDR     x8, [x9], 8               // A1
134        FMLA    v22.4s, v16.4s,  v0.s[2]
135        FMLA    v24.4s, v16.4s,  v1.s[0]
136
137        # BLOCK 1
138        LDR     d12, [x5]
139        INS     v3.d[1], x8               // A1 ins
140        FMLA    v26.4s, v16.4s,  v1.s[2]
141        LDR     x8, [x5, 8]               // B
142        FMLA    v28.4s, v16.4s,  v2.s[0]
143        FMLA    v30.4s, v16.4s,  v2.s[2]
144
145        # BLOCK 2
146        LDR     d4, [x10], 8              // A2
147        INS     v12.d[1], x8              // B  ins
148        FMLA    v21.4s, v17.4s,  v0.s[0]
149        LDR     x8, [x11], 8              // A3
150        FMLA    v23.4s, v17.4s,  v0.s[2]
151        FMLA    v25.4s, v17.4s,  v1.s[0]
152
153        # BLOCK 3
154        LDR     d5, [x12], 8              // A4
155        INS     v4.d[1], x8               // A3 ins
156        FMLA    v27.4s, v17.4s,  v1.s[2]
157        LDR     x8, [x4], 8               // A5
158        FMLA    v29.4s, v17.4s,  v2.s[0]
159        FMLA    v31.4s, v17.4s,  v2.s[2]
160
161        # BLOCK 4
162        LDR     d13, [x5, 16]
163        INS     v5.d[1], x8               // A5 ins
164        FMLA    v20.4s, v18.4s,  v0.s[1]
165        LDR     x8, [x5, 24]
166        FMLA    v22.4s, v18.4s,  v0.s[3]
167        FMLA    v24.4s, v18.4s,  v1.s[1]
168
169        # BLOCK 5
170        LDR     d14, [x5, 32]
171        INS     v13.d[1], x8              // B
172        FMLA    v26.4s, v18.4s,  v1.s[3]
173        LDR     x8, [x5, 40]
174        FMLA    v28.4s, v18.4s,  v2.s[1]
175        FMLA    v30.4s, v18.4s,  v2.s[3]
176
177        # BLOCK 6
178        LDR     d15, [x5, 48]
179        INS     v14.d[1], x8              // B
180        FMLA    v21.4s, v19.4s,  v0.s[1]
181        LDR     x8, [x5, 56]
182        FMLA    v23.4s, v19.4s,  v0.s[3]
183        FMLA    v25.4s, v19.4s,  v1.s[1]
184
185        # BLOCK 7
186        INS     v15.d[1], x8
187        FMLA    v27.4s, v19.4s,  v1.s[3]
188        FMLA    v29.4s, v19.4s,  v2.s[1]
189        FMLA    v31.4s, v19.4s,  v2.s[3]
190
191        # Second group of 24 FMA, First group of loads
192        # BLOCK 0
193        LDR     d0, [x3], 8               // A0
194        FMLA    v20.4s, v12.4s,  v3.s[0]
195        LDR     x8, [x9], 8               // A1
196        FMLA    v22.4s, v12.4s,  v3.s[2]
197        FMLA    v24.4s, v12.4s,  v4.s[0]
198
199        # BLOCK 1
200        LDR     d16, [x5, 64]
201        INS     v0.d[1], x8               // A1 ins
202        FMLA    v26.4s, v12.4s,  v4.s[2]
203        LDR     x8, [x5, 72]              // B
204        FMLA    v28.4s, v12.4s,  v5.s[0]
205        FMLA    v30.4s, v12.4s,  v5.s[2]
206
207        # BLOCK 2
208        LDR     d1, [x10], 8              // A2
209        INS     v16.d[1], x8              // B
210        FMLA    v21.4s, v13.4s,  v3.s[0]
211        LDR     x8, [x11], 8              // A3
212        FMLA    v23.4s, v13.4s,  v3.s[2]
213        FMLA    v25.4s, v13.4s,  v4.s[0]
214
215        # BLOCK 3
216        LDR     d2, [x12], 8              // A4
217        INS     v1.d[1], x8               // A3 ins
218        FMLA    v27.4s, v13.4s,  v4.s[2]
219        LDR     x8,  [x4], 8              // A5
220        FMLA    v29.4s, v13.4s,  v5.s[0]
221        FMLA    v31.4s, v13.4s,  v5.s[2]
222
223        # BLOCK 4
224        LDR     d17, [x5, 80]
225        INS     v2.d[1], x8               // A5 ins
226        FMLA    v20.4s, v14.4s,  v3.s[1]
227        LDR     x8, [x5, 88]
228        FMLA    v22.4s, v14.4s,  v3.s[3]
229        FMLA    v24.4s, v14.4s,  v4.s[1]
230
231        # BLOCK 5
232        LDR     d18, [x5, 96]
233        INS     v17.d[1], x8              // B
234        FMLA    v26.4s, v14.4s,  v4.s[3]
235        LDR     x8, [x5, 104]
236        FMLA    v28.4s, v14.4s,  v5.s[1]
237        FMLA    v30.4s, v14.4s,  v5.s[3]
238
239        # BLOCK 6
        # The x8 loaded here is the upper half of v19; it is inserted in
        # BLOCK 0 of the next iteration or of the epilogue.
240        LDR     d19, [x5, 112]
241        INS     v18.d[1], x8              // B
242        FMLA    v21.4s, v15.4s,  v3.s[1]
243        LDR     x8, [x5, 120]
244        FMLA    v23.4s, v15.4s,  v3.s[3]
245        FMLA    v25.4s, v15.4s,  v4.s[1]
246
247        # BLOCK 7
        # The flags set by SUBS survive the FMLAs and the non-flag-setting ADD
        # until the B.HS at the end of the block.
248        SUBS    x0, x0, 16                // LDR lands here
249        FMLA    v27.4s, v15.4s,  v4.s[3]
250        FMLA    v29.4s, v15.4s,  v5.s[1]
251        ADD     x5, x5, 128
252        FMLA    v31.4s, v15.4s,  v5.s[3]
253        B.HS    1b
254
255        # Epilogue - 4 floats of A (16 bytes)
256        # 48 FMA + 6 LD64 A + 8 LD64 B (4 B vectors); nothing is preloaded for a further iteration
2572:
258        # First group of 24 FMA, Second group loads
259        # BLOCK 0
260        LDR     d3, [x3], 8               // A0
261        INS     v19.d[1], x8              // B from second group
262        FMLA    v20.4s, v16.4s,  v0.s[0]
263        LDR     x8, [x9], 8               // A1
264        FMLA    v22.4s, v16.4s,  v0.s[2]
265        FMLA    v24.4s, v16.4s,  v1.s[0]
266
267        # BLOCK 1
268        LDR     d12, [x5]
269        INS     v3.d[1], x8               // A1 ins
270        FMLA    v26.4s, v16.4s,  v1.s[2]
271        LDR     x8, [x5, 8]               // B
272        FMLA    v28.4s, v16.4s,  v2.s[0]
273        FMLA    v30.4s, v16.4s,  v2.s[2]
274
275        # BLOCK 2
276        LDR     d4, [x10], 8              // A2
277        INS     v12.d[1], x8              // B  ins
278        FMLA    v21.4s, v17.4s,  v0.s[0]
279        LDR     x8, [x11], 8              // A3
280        FMLA    v23.4s, v17.4s,  v0.s[2]
281        FMLA    v25.4s, v17.4s,  v1.s[0]
282
283        # BLOCK 3
284        LDR     d5, [x12], 8              // A4
285        INS     v4.d[1], x8               // A3 ins
286        FMLA    v27.4s, v17.4s,  v1.s[2]
287        LDR     x8, [x4], 8               // A5
288        FMLA    v29.4s, v17.4s,  v2.s[0]
289        FMLA    v31.4s, v17.4s,  v2.s[2]
290
291        # BLOCK 4
292        LDR     d13, [x5, 16]
293        INS     v5.d[1], x8               // A5 ins
294        FMLA    v20.4s, v18.4s,  v0.s[1]
295        LDR     x8, [x5, 24]
296        FMLA    v22.4s, v18.4s,  v0.s[3]
297        FMLA    v24.4s, v18.4s,  v1.s[1]
298
299        # BLOCK 5
300        LDR     d14, [x5, 32]
301        INS     v13.d[1], x8              // B
302        FMLA    v26.4s, v18.4s,  v1.s[3]
303        LDR     x8, [x5, 40]
304        FMLA    v28.4s, v18.4s,  v2.s[1]
305        FMLA    v30.4s, v18.4s,  v2.s[3]
306
307        # BLOCK 6
308        LDR     d15, [x5, 48]
309        INS     v14.d[1], x8              // B
310        FMLA    v21.4s, v19.4s,  v0.s[1]
311        LDR     x8, [x5, 56]
312        FMLA    v23.4s, v19.4s,  v0.s[3]
313        FMLA    v25.4s, v19.4s,  v1.s[1]
314
315        # BLOCK 7
316        INS     v15.d[1], x8              // B
317        FMLA    v27.4s, v19.4s,  v1.s[3]
318        FMLA    v29.4s, v19.4s,  v2.s[1]
319        FMLA    v31.4s, v19.4s,  v2.s[3]
320
321        # Second group of 24 FMA, no loads (pipeline drain)
322        # BLOCK 0
323        FMLA    v20.4s, v12.4s,  v3.s[0]
324        FMLA    v22.4s, v12.4s,  v3.s[2]
325        FMLA    v24.4s, v12.4s,  v4.s[0]
326
327        # BLOCK 1
328        FMLA    v26.4s, v12.4s,  v4.s[2]
329        FMLA    v28.4s, v12.4s,  v5.s[0]
330        FMLA    v30.4s, v12.4s,  v5.s[2]
331
332        # BLOCK 2
333        FMLA    v21.4s, v13.4s,  v3.s[0]
334        FMLA    v23.4s, v13.4s,  v3.s[2]
335        FMLA    v25.4s, v13.4s,  v4.s[0]
336
337        # BLOCK 3
338        FMLA    v27.4s, v13.4s,  v4.s[2]
339        FMLA    v29.4s, v13.4s,  v5.s[0]
340        FMLA    v31.4s, v13.4s,  v5.s[2]
341
342        # BLOCK 4
343        FMLA    v20.4s, v14.4s,  v3.s[1]
344        FMLA    v22.4s, v14.4s,  v3.s[3]
345        FMLA    v24.4s, v14.4s,  v4.s[1]
346
347        # BLOCK 5
348        FMLA    v26.4s, v14.4s,  v4.s[3]
349        FMLA    v28.4s, v14.4s,  v5.s[1]
350        FMLA    v30.4s, v14.4s,  v5.s[3]
351        TST     x0, 15                    // leftover bytes = kc & 15; flags consumed by B.NE below
352
353        # BLOCK 6
354        FMLA    v21.4s, v15.4s,  v3.s[1]
355        FMLA    v23.4s, v15.4s,  v3.s[3]
356        FMLA    v25.4s, v15.4s,  v4.s[1]
357        ADD     x5, x5, 64                // skip the 4 B vectors read at fixed offsets above
358
359        # BLOCK 7
360        FMLA    v27.4s, v15.4s,  v4.s[3]
361        FMLA    v29.4s, v15.4s,  v5.s[1]
362        FMLA    v31.4s, v15.4s,  v5.s[3]
363
364        # Is there a remainder?- 2 floats of A (8 bytes) or less
365        B.NE    4f
3663:
367        # Clamp
368        FMAX    v20.4s, v20.4s, v6.4s
369        # Load cn_stride
        # On entry cn_stride was at [sp]; the d12-d15 save lowered sp by 32.
        # x0 is free here because the K counter is no longer needed.
370        LDR     x0, [sp, 32]
371        FMAX    v21.4s, v21.4s, v6.4s
372        FMAX    v22.4s, v22.4s, v6.4s
373        FMAX    v23.4s, v23.4s, v6.4s
374        FMAX    v24.4s, v24.4s, v6.4s
375        FMAX    v25.4s, v25.4s, v6.4s
376        FMAX    v26.4s, v26.4s, v6.4s
377        FMAX    v27.4s, v27.4s, v6.4s
378        FMAX    v28.4s, v28.4s, v6.4s
379        FMAX    v29.4s, v29.4s, v6.4s
380        FMAX    v30.4s, v30.4s, v6.4s
381        FMAX    v31.4s, v31.4s, v6.4s
        # The flags of this SUBS drive both the B.LO (partial store) and the
        # B.HI (loop back) below; nothing in between sets flags.
382        SUBS    x1, x1, 8                 // nc -= 8
383        FMIN    v20.4s, v20.4s, v7.4s
384        FMIN    v21.4s, v21.4s, v7.4s
385        FMIN    v22.4s, v22.4s, v7.4s
386        FMIN    v23.4s, v23.4s, v7.4s
387        FMIN    v24.4s, v24.4s, v7.4s
388        FMIN    v25.4s, v25.4s, v7.4s
389        FMIN    v26.4s, v26.4s, v7.4s
390        FMIN    v27.4s, v27.4s, v7.4s
391        FMIN    v28.4s, v28.4s, v7.4s
392        FMIN    v29.4s, v29.4s, v7.4s
393        FMIN    v30.4s, v30.4s, v7.4s
394        FMIN    v31.4s, v31.4s, v7.4s
395
396        # Store full 6 x 8, unless fewer than 8 columns were left (then go to 6)
397        B.LO    6f
398
        # Each store post-increments its C pointer by cn_stride (x0); each A
        # pointer is rewound by kc so the next pass rereads the same rows.
399        ST1     {v20.16b, v21.16b},  [x6], x0
400        SUB     x3,  x3, x2             // A0 -= kc
401        ST1     {v22.16b, v23.16b}, [x16], x0
402        SUB     x9,  x9, x2             // A1 -= kc
403        ST1     {v24.16b, v25.16b}, [x17], x0
404        SUB     x10, x10, x2            // A2 -= kc
405        ST1     {v26.16b, v27.16b}, [x14], x0
406        SUB     x11, x11, x2            // A3 -= kc
407        ST1     {v28.16b, v29.16b}, [x13], x0
408        SUB     x12, x12, x2            // A4 -= kc
409        ST1     {v30.16b, v31.16b},  [x7], x0
410        SUB     x4,  x4, x2             // A5 -= kc
411
412        B.HI    0b                      // more columns remain (nc was > 8)
413
414        # Restore d12-d15 from stack
415        LDP     d14, d15, [sp, 16]
416        LDP     d12, d13, [sp], 32
417        RET
418
4194:
        # Reached from label 0 when kc < 16 bytes, or from the epilogue when
        # kc is not a multiple of 16 bytes.  Bits 3 and 2 of x0 equal those of
        # kc in both cases.
420        # Is there a remainder?- 2 floats of A (8 bytes)
421        TBZ     x0, 3, 5f
422
423        # Remainder- 2 floats of A (8 bytes)
        # Loads 4 B vectors: v16/v17 for the first float, v18/v19 for the second.
424        LDR     d0,  [x3], 8
425        LDR     q16, [x5], 16
426        LD1     {v0.d}[1], [x9], 8
427        LDR     d1, [x10], 8
428        LD1     {v1.d}[1], [x11], 8
429        LDR     d2, [x12], 8
430        LD1     {v2.d}[1], [x4], 8
431        LDR     q17, [x5], 16
432        LDR     q18, [x5], 16
433        LDR     q19, [x5], 16
434
435        FMLA    v20.4s, v16.4s,  v0.s[0]
436        FMLA    v22.4s, v16.4s,  v0.s[2]
437        FMLA    v24.4s, v16.4s,  v1.s[0]
438        FMLA    v26.4s, v16.4s,  v1.s[2]
439        FMLA    v28.4s, v16.4s,  v2.s[0]
440        FMLA    v30.4s, v16.4s,  v2.s[2]
441        FMLA    v21.4s, v17.4s,  v0.s[0]
442        FMLA    v23.4s, v17.4s,  v0.s[2]
443        FMLA    v25.4s, v17.4s,  v1.s[0]
444        FMLA    v27.4s, v17.4s,  v1.s[2]
445        FMLA    v29.4s, v17.4s,  v2.s[0]
446        FMLA    v31.4s, v17.4s,  v2.s[2]
447
448        FMLA    v20.4s, v18.4s,  v0.s[1]
449        FMLA    v22.4s, v18.4s,  v0.s[3]
450        FMLA    v24.4s, v18.4s,  v1.s[1]
451        FMLA    v26.4s, v18.4s,  v1.s[3]
452        FMLA    v28.4s, v18.4s,  v2.s[1]
453        FMLA    v30.4s, v18.4s,  v2.s[3]
454        FMLA    v21.4s, v19.4s,  v0.s[1]
455        FMLA    v23.4s, v19.4s,  v0.s[3]
456        FMLA    v25.4s, v19.4s,  v1.s[1]
457        FMLA    v27.4s, v19.4s,  v1.s[3]
458        FMLA    v29.4s, v19.4s,  v2.s[1]
459        FMLA    v31.4s, v19.4s,  v2.s[3]
460
461        # Is there a remainder?- 1 float of A (4 bytes)
462        TBZ     x0, 2, 3b
4635:
464        # Remainder- 1 float of A (4 bytes)
        # The even row goes to lane 0 and the odd row to lane 2, matching the
        # s[0] / s[2] selectors of the FMLAs below; lanes 1 and 3 are not read.
465        LDR     s0,  [x3], 4
466        LDR     q16, [x5], 16
467        LD1     {v0.s}[2], [x9], 4
468        LDR     s1, [x10], 4
469        LD1     {v1.s}[2], [x11], 4
470        LDR     s2, [x12], 4
471        LD1     {v2.s}[2], [x4], 4
472        LDR     q17, [x5], 16
473
474        FMLA    v20.4s, v16.4s,  v0.s[0]
475        FMLA    v22.4s, v16.4s,  v0.s[2]
476        FMLA    v24.4s, v16.4s,  v1.s[0]
477        FMLA    v26.4s, v16.4s,  v1.s[2]
478        FMLA    v28.4s, v16.4s,  v2.s[0]
479        FMLA    v30.4s, v16.4s,  v2.s[2]
480        FMLA    v21.4s, v17.4s,  v0.s[0]
481        FMLA    v23.4s, v17.4s,  v0.s[2]
482        FMLA    v25.4s, v17.4s,  v1.s[0]
483        FMLA    v27.4s, v17.4s,  v1.s[2]
484        FMLA    v29.4s, v17.4s,  v2.s[0]
485        FMLA    v31.4s, v17.4s,  v2.s[2]
486        B       3b
487
488        # Store odd width
        # Here x1 = nc - 8 after a borrow, so its low 3 bits equal those of
        # the remaining column count.  Bit 2, bit 1 and bit 0 select stores of
        # 4, 2 and 1 floats per row; after each partial store the not yet
        # stored columns are shifted down to the low lanes.
4896:
490        TBZ     x1, 2, 7f
491        STR     q20,  [x6], 16
492        MOV     v20.16b, v21.16b
493        STR     q22, [x16], 16
494        MOV     v22.16b, v23.16b
495        STR     q24, [x17], 16
496        MOV     v24.16b, v25.16b
497        STR     q26, [x14], 16
498        MOV     v26.16b, v27.16b
499        STR     q28, [x13], 16
500        MOV     v28.16b, v29.16b
501        STR     q30,  [x7], 16
502        MOV     v30.16b, v31.16b
503
5047:
505        TBZ     x1, 1, 8f
506        STR     d20,  [x6], 8
507        STR     d22, [x16], 8
508        DUP     d20, v20.d[1]
509        DUP     d22, v22.d[1]
510        STR     d24, [x17], 8
511        STR     d26, [x14], 8
512        DUP     d24, v24.d[1]
513        DUP     d26, v26.d[1]
514        STR     d28, [x13], 8
515        STR     d30,  [x7], 8
516        DUP     d28, v28.d[1]
517        DUP     d30, v30.d[1]
518
5198:
520        TBZ     x1, 0, 9f
521        STR     s20,  [x6]
522        STR     s22, [x16]
523        STR     s24, [x17]
524        STR     s26, [x14]
525        STR     s28, [x13]
526        STR     s30,  [x7]
5279:
528        # Restore d12-d15 from stack
529        LDP     d14, d15, [sp, 16]
530        LDP     d12, d13, [sp], 32
531        RET
532
533END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
534
535#ifdef __ELF__
536.section ".note.GNU-stack","",%progbits
537#endif
538