xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/8x8-minmax-aarch64-neonfp16arith-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/8x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)

#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
# x19 a5
# x20 a6
#  x4 a7

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x21 c5
# x22 c6
#  x7 c7

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# A6   v6
# A7   v7
# B   v16 v17 v18 v19
# C   v24
# C   v25
# C   v26
# C   v27
# C   v28
# C   v29
# C   v30
# C   v31

# Clamp v20 v21
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15
BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

        # Load params pointer
        LDR     x8, [sp, 8]

        # Save x19,x20,x21,x22 on stack
        STP     x19, x20, [sp, -32]!
        STP     x21, x22, [sp, 16]

        # Clamp A and C pointers.  For mr < 8 the unused row pointers are
        # aliased onto the last valid row, so loads/stores stay in bounds
        # (redundant rows recompute/overwrite the same data harmlessly).
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params (min in v20, max in v21, broadcast to all lanes)
        LD2R    {v20.8h, v21.8h}, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x19, x12, x4            // a5 = a4 + a_stride
        ADD     x21, x13, x7            // c5 = c4 + cm_stride
        CSEL    x19, x12, x19, LO       //   a5 = a4
        CSEL    x21, x13, x21, LO       //   c5 = c4

        ADD     x20, x19, x4            // a6 = a5 + a_stride
        ADD     x22, x21, x7            // c6 = c5 + cm_stride
                                        // if mr <= 6
        CSEL    x20, x19, x20, LS       //   a6 = a5
        CSEL    x22, x21, x22, LS       //   c6 = c5

        CMP     x0, 8                   // if mr < 8
        ADD     x4, x20, x4             // a7 = a6 + a_stride
        ADD     x7, x22, x7             // c7 = c6 + cm_stride
        CSEL    x4, x20, x4, LO         //   a7 = a6
        CSEL    x7, x22, x7, LO         //   c7 = c6

        LDR     x8, [sp, 32]            // load cn_stride (original [sp], below the 32-byte spill area)

0:
        # Load initial bias from w into accumulators
        LDR     q24, [x5], 16
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v24.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v24.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 32 FMA + 8 ld64 A + 4 LDR B
1:
        LDR     d0,  [x3], 8
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     d1,  [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x19], 8
        LDR     d6, [x20], 8
        LDR     d7,  [x4], 8
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SUBS    x0, x0, 8
        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]

        FMLA    v24.8h, v17.8h,  v0.h[1]
        FMLA    v25.8h, v17.8h,  v1.h[1]
        FMLA    v26.8h, v17.8h,  v2.h[1]
        FMLA    v27.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v29.8h, v17.8h,  v5.h[1]
        FMLA    v30.8h, v17.8h,  v6.h[1]
        FMLA    v31.8h, v17.8h,  v7.h[1]

        FMLA    v24.8h, v18.8h,  v0.h[2]
        FMLA    v25.8h, v18.8h,  v1.h[2]
        FMLA    v26.8h, v18.8h,  v2.h[2]
        FMLA    v27.8h, v18.8h,  v3.h[2]
        FMLA    v28.8h, v18.8h,  v4.h[2]
        FMLA    v29.8h, v18.8h,  v5.h[2]
        FMLA    v30.8h, v18.8h,  v6.h[2]
        FMLA    v31.8h, v18.8h,  v7.h[2]

        FMLA    v24.8h, v19.8h,  v0.h[3]
        FMLA    v25.8h, v19.8h,  v1.h[3]
        FMLA    v26.8h, v19.8h,  v2.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        FMLA    v28.8h, v19.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v5.h[3]
        FMLA    v30.8h, v19.8h,  v6.h[3]
        FMLA    v31.8h, v19.8h,  v7.h[3]
        B.HS    1b

        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Clamp
        FMAX    v24.8h, v24.8h, v20.8h
        FMAX    v25.8h, v25.8h, v20.8h
        FMAX    v26.8h, v26.8h, v20.8h
        FMAX    v27.8h, v27.8h, v20.8h
        FMAX    v28.8h, v28.8h, v20.8h
        FMAX    v29.8h, v29.8h, v20.8h
        FMAX    v30.8h, v30.8h, v20.8h
        FMAX    v31.8h, v31.8h, v20.8h
        SUBS    x1, x1, 8
        FMIN    v24.8h, v24.8h, v21.8h
        FMIN    v25.8h, v25.8h, v21.8h
        FMIN    v26.8h, v26.8h, v21.8h
        FMIN    v27.8h, v27.8h, v21.8h
        FMIN    v28.8h, v28.8h, v21.8h
        FMIN    v29.8h, v29.8h, v21.8h
        FMIN    v30.8h, v30.8h, v21.8h
        FMIN    v31.8h, v31.8h, v21.8h

        # Store full 8 x 8
        B.LO    6f

        ST1     {v24.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v25.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v26.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v29.16b}, [x21], x8
        SUB     x19, x19, x2            // a5 -= kc
        ST1     {v30.16b}, [x22], x8
        SUB     x20, x20, x2            // a6 -= kc
        ST1     {v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a7 -= kc

        B.HI    0b

        # Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

3:
        TBZ     x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0,  [x3], 4
        LDR     q16,  [x5], 16
        LDR     q17,  [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x19], 4
        LDR     s6, [x20], 4
        LDR     s7,  [x4], 4

        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]

        FMLA    v24.8h, v17.8h,  v0.h[1]
        FMLA    v25.8h, v17.8h,  v1.h[1]
        FMLA    v26.8h, v17.8h,  v2.h[1]
        FMLA    v27.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v29.8h, v17.8h,  v5.h[1]
        FMLA    v30.8h, v17.8h,  v6.h[1]
        FMLA    v31.8h, v17.8h,  v7.h[1]
        TBZ     x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0,  [x3], 2
        LDR     q16,  [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5, [x19], 2
        LDR     h6, [x20], 2
        LDR     h7,  [x4], 2

        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]
        B       2b

        # Store odd width (nc remainder: write 4, then 2, then 1 halffloats,
        # shifting the surviving lanes down after each partial store)
6:
        TBZ     x1, 2, 7f
        STR     d24,  [x6], 8
        STR     d25, [x16], 8
        DUP     d24, v24.d[1]
        DUP     d25, v25.d[1]
        STR     d26, [x17], 8
        STR     d27, [x14], 8
        DUP     d26, v26.d[1]
        DUP     d27, v27.d[1]
        STR     d28, [x13], 8
        STR     d29, [x21], 8
        DUP     d28, v28.d[1]
        DUP     d29, v29.d[1]
        STR     d30, [x22], 8
        STR     d31,  [x7], 8
        DUP     d30, v30.d[1]
        DUP     d31, v31.d[1]
7:
        TBZ     x1, 1, 8f
        STR     s24,  [x6], 4
        STR     s25, [x16], 4
        DUP     s24, v24.s[1]
        DUP     s25, v25.s[1]
        STR     s26, [x17], 4
        STR     s27, [x14], 4
        DUP     s26, v26.s[1]
        DUP     s27, v27.s[1]
        STR     s28, [x13], 4
        STR     s29, [x21], 4
        DUP     s28, v28.s[1]
        DUP     s29, v29.s[1]
        STR     s30, [x22], 4
        STR     s31,  [x7], 4
        DUP     s30, v30.s[1]
        DUP     s31, v31.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h24,  [x6]
        STR     h25, [x16]
        STR     h26, [x17]
        STR     h27, [x14]
        STR     h28, [x13]
        STR     h29, [x21]
        STR     h30, [x22]
        STR     h31,  [x7]
9:
        # Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64
357
#ifdef __ELF__
// Mark the stack non-executable on ELF targets (avoids GNU-stack exec warning).
.section ".note.GNU-stack","",%progbits
#endif
361