// xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Overview
// --------
// Half-precision GEMM micro-kernel with min/max clamping. Each pass of the
// outer loop (label 0) produces a tile of up to 6 rows x 16 columns of C.
//
//   * mr: rows whose index is >= mr reuse the A and C pointers of the
//     previous row (see the CSEL chain below), so they recompute and
//     re-store that row instead of touching other memory.
//   * kc: consumed as a byte count (2 bytes per halffloat). The main path
//     consumes 4 bytes of every A row per step; a trailing 2-byte step
//     (label 4) handles one leftover halffloat.
//   * nc: counted in columns. 16 columns are stored per pass; a final pass
//     with fewer than 16 columns stores 8/4/2/1 columns according to the low
//     four bits of the remaining count (labels 5-9).
//   * w (x5): read strictly sequentially and never rewound. Per pass it
//     supplies 16 bias halffloats followed by 16 halffloats per k step.
//   * params: 32 bits are loaded from the params pointer. Halfword 0 is used
//     as the lower bound (FMAX) and halfword 1 as the upper bound (FMIN).
//   * The function is a leaf: it keeps no stack frame and uses only
//     x0-x14, x16, x17, v0-v6 and v16-v31.
//
// NOTE(review): this file is generated from the template named above, so any
// change presumably has to be mirrored in the template - confirm.

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1  x9 v1
# A2 x10 v2
# A3 x11 v3
# A4 x12 v4
# A5  x4 v5

# B   x5 v16 v17 v18 v19

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x14  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

        # Load params pointer
        // x8 is only a temporary here; it is reloaded with cn_stride below.
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two CSEL pairs: LO for the row computed right after
        // it and LS for the following row. The ADD and LDR instructions in
        // between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        LDR     s6, [x8]                // v6.h[0] = lower bound, v6.h[1] = upper bound

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        // From here on x0 (mr), x4 (a_stride) and x7 (cm_stride) have been
        // repurposed: x0 becomes the k counter, x4 = a5 and x7 = c5.
        LDR     x8, [sp]                // load cn_stride

        // Outer loop: one iteration per tile of up to 16 output columns.
0:
        # Load initial bias from w into accumulators
        // The same 16 bias values seed all six row accumulators.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # Are there at least 2 halffloats (4 bytes)?
        // If not, go straight to the single-halffloat remainder at label 4.
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    4f

        # Prologue - load 4 A and 2 B
        // A0-A3 and the first two B vectors are loaded ahead of use; A4 and
        // A5 are loaded inside the loop body / epilogue.

        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4

        # Are there at least 2 more halffloats for the main loop?
        // If not, only the epilogue (label 2) runs for this tile.
        SUBS    x0, x0, 4
        B.LO    2f

       .p2align 3
        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 8 ld64 B (4 LDR d + 4 LD1 lane)
        // First half: k step 0 using v16/v17, while v18/v19 (B for k step 1),
        // A4 and A5 are loaded. Second half: k step 1 using v18/v19, while the
        // next iteration's v16/v17 and A0-A3 are loaded.
        // NOTE(review): B is fetched as 64-bit halves, presumably to suit the
        // Cortex-A55 load pipeline - confirm against the template.
1:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4              // A4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4              // A5
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     d18, [x5], 8              // B0
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LD1     {v18.d}[1], [x5], 8       // B1
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     d19, [x5], 8              // B2
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LD1     {v19.d}[1], [x5], 8       // B3
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        SUBS    x0, x0, 4                 // flags consumed by B.HS at the loop end

        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     d16, [x5], 8              // B0
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LD1     {v16.d}[1], [x5], 8       // B1
        FMLA    v22.8h, v18.8h,  v1.h[1]
        LDR     d17, [x5], 8              // B2
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LD1     {v17.d}[1], [x5], 8       // B3
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        LDR     s0,  [x3], 4              // A0
        FMLA    v28.8h, v18.8h,  v4.h[1]
        LDR     s1,  [x9], 4              // A1
        FMLA    v29.8h, v19.8h,  v4.h[1]
        LDR     s2, [x10], 4              // A2
        FMLA    v30.8h, v18.8h,  v5.h[1]
        LDR     s3, [x11], 4              // A3
        FMLA    v31.8h, v19.8h,  v5.h[1]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
        // Consumes the A0-A3 and v16/v17 values that are already loaded; only
        // A4, A5 and the B vectors for the second k step are still fetched.
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4              // A4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4              // A5
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     d18, [x5], 8              // B0
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LD1     {v18.d}[1], [x5], 8       // B1
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     d19, [x5], 8              // B2
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LD1     {v19.d}[1], [x5], 8       // B3
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        // x0 differs from kc by a multiple of 4, so bit 1 of x0 equals bit 1
        // of kc: set means 2 bytes (one halffloat) are still unprocessed.
        TBNZ    x0, 1, 4f
3:
        # Clamp
        // v4/v5 are free again here and hold the broadcast bounds.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        // nc -= 16. The flags set here are consumed by B.LO below and by
        // B.HI after the full store; nothing in between modifies them.
        SUBS    x1, x1, 16
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO    5f                      // fewer than 16 columns left: partial store

        // Each store advances its C pointer by cn_stride (x8); each A pointer
        // is rewound by kc so the next column tile re-reads the same rows.
        ST1     {v20.16b, v21.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b                      // columns remain (nc > 0 after the subtract)
        RET

4:
        # Remainder- 1 halffloat of A (2 bytes)
        // Reached from the epilogue when kc has a trailing halffloat, and
        // directly from the top of the tile when kc < 4 bytes.
        LDR     h0,  [x3], 2              // A0
        LDR     q16, [x5], 16             // B
        LDR     q17, [x5], 16             // B
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     h1,  [x9], 2              // A1
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     h2, [x10], 2              // A2
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     h3, [x11], 2              // A3
        FMLA    v26.8h, v16.8h,  v3.h[0]
        LDR     h4, [x12], 2              // A4
        FMLA    v28.8h, v16.8h,  v4.h[0]
        LDR     h5,  [x4], 2              // A5
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        // x1 = nc - 16 is negative here, but its low four bits still equal the
        // number of remaining columns. Bits 3, 2, 1 and 0 select stores of
        // 8, 4, 2 and 1 columns; after each store the not-yet-written lanes
        // are moved down to the bottom of the even-numbered accumulator.
5:
        TBZ     x1, 3, 6f
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b

6:
        TBZ     x1, 2, 7f
        STR     d20,  [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30,  [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s20,  [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h20,  [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

// On ELF targets emit an empty .note.GNU-stack section (conventionally this
// tells the linker that the object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif

338