xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/6x8-minmax-aarch64-neonfp16arith-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const void*restrict a,    x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     void*restrict c,          x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x8)
22#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# Vector register usage
43# A0   v0
44# A1   v1
45# A2   v2
46# A3   v3
47# A4   v4
48# A5   v5
49# B   v16 v17 v18 v19
50# C   v20
51# C   v22
52# C   v24
53# C   v26
54# C   v28
55# C   v30
56# Clamp params in v6 (min in h[0], max in h[1]); v4/v5 carry A4/A5 during the loop and are reused for the broadcast min/max at clamp time
57# unused A   v8 v9 v10 v11
58# unused B   v12 v13 v14 v15
59
60
61BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
62
63        # Load params pointer
64        LDR     x8, [sp, 8]
65
66        # Clamp A and C pointers
        # For rows beyond mr, alias the row pointer to the previous row via
        # CSEL, so the kernel always computes 6 rows but the extra rows read
        # from, and write back to, addresses that are valid for mr rows.
67        CMP     x0, 2                   // if mr < 2
68        ADD     x9, x3, x4              // a1 = a0 + a_stride
69        ADD     x16, x6, x7             // c1 = c0 + cm_stride
70        CSEL    x9, x3, x9, LO          //   a1 = a0
71        CSEL    x16, x6, x16, LO        //   c1 = c0
72
73        # Load params
        # One 32-bit load picks up both halffloat bounds: min -> v6.h[0],
        # max -> v6.h[1]. They are broadcast into v4/v5 at the clamp (label 2).
74        LDR     s6, [x8]
75
76        ADD     x10, x9, x4             // a2 = a1 + a_stride
77        ADD     x17, x16, x7            // c2 = c1 + cm_stride
78                                        // if mr <= 2
79        CSEL    x10, x9, x10, LS        //   a2 = a1
80        CSEL    x17, x16, x17, LS       //   c2 = c1
81
82        CMP     x0, 4                   // if mr < 4
83        ADD     x11, x10, x4            // a3 = a2 + a_stride
84        ADD     x14, x17, x7            // c3 = c2 + cm_stride
85        CSEL    x11, x10, x11, LO       //   a3 = a2
86        CSEL    x14, x17, x14, LO       //   c3 = c2
87
88        ADD     x12, x11, x4            // a4 = a3 + a_stride
89        ADD     x13, x14, x7            // c4 = c3 + cm_stride
90                                        // if mr <= 4
91        CSEL    x12, x11, x12, LS       //   a4 = a3
92        CSEL    x13, x14, x13, LS       //   c4 = c3
93
94        CMP     x0, 6                   // if mr < 6
95        ADD     x4, x12, x4             // a5 = a4 + a_stride
96        ADD     x7, x13, x7             // c5 = c4 + cm_stride
97        CSEL    x4, x12, x4, LO         //   a5 = a4
98        CSEL    x7, x13, x7, LO         //   c5 = c4
99
100        LDR     x8, [sp]                // load cn_stride
101
        # Outer loop over the N dimension: each pass produces one 6x8 tile of
        # C; nc (x1) is decremented by 8 in the clamp section below.
1020:
103        # Load initial bias from w into accumulators
104        LDR     q20, [x5], 16
105        MOV     v22.16b, v20.16b
106        MOV     v24.16b, v20.16b
107        MOV     v26.16b, v20.16b
108        MOV     v28.16b, v20.16b
109        MOV     v30.16b, v20.16b
110
111         # Is there at least 4 halffloats (8 bytes)?
112        SUBS    x0, x2, 8               // k = kc - 8
113        B.LO    3f
114
115        # Main loop - 4 halffloats of A (8 bytes)
116        # 24 FMA + 6 ld64 A + 4 LDR B
1171:
118        LDR     d0,  [x3], 8
119        LDR     q16, [x5], 16
120        LDR     q17, [x5], 16
121        LDR     d1,  [x9], 8
122        LDR     d2, [x10], 8
123        LDR     d3, [x11], 8
124        LDR     d4, [x12], 8
125        LDR     d5,  [x4], 8
126        LDR     q18, [x5], 16
127        LDR     q19, [x5], 16
128        SUBS    x0, x0, 8
129        FMLA    v20.8h, v16.8h,  v0.h[0]
130        FMLA    v22.8h, v16.8h,  v1.h[0]
131        FMLA    v24.8h, v16.8h,  v2.h[0]
132        FMLA    v26.8h, v16.8h,  v3.h[0]
133        FMLA    v28.8h, v16.8h,  v4.h[0]
134        FMLA    v30.8h, v16.8h,  v5.h[0]
135        FMLA    v20.8h, v17.8h,  v0.h[1]
136        FMLA    v22.8h, v17.8h,  v1.h[1]
137        FMLA    v24.8h, v17.8h,  v2.h[1]
138        FMLA    v26.8h, v17.8h,  v3.h[1]
139        FMLA    v28.8h, v17.8h,  v4.h[1]
140        FMLA    v30.8h, v17.8h,  v5.h[1]
141
142        FMLA    v20.8h, v18.8h,  v0.h[2]
143        FMLA    v22.8h, v18.8h,  v1.h[2]
144        FMLA    v24.8h, v18.8h,  v2.h[2]
145        FMLA    v26.8h, v18.8h,  v3.h[2]
146        FMLA    v28.8h, v18.8h,  v4.h[2]
147        FMLA    v30.8h, v18.8h,  v5.h[2]
148        FMLA    v20.8h, v19.8h,  v0.h[3]
149        FMLA    v22.8h, v19.8h,  v1.h[3]
150        FMLA    v24.8h, v19.8h,  v2.h[3]
151        FMLA    v26.8h, v19.8h,  v3.h[3]
152        FMLA    v28.8h, v19.8h,  v4.h[3]
153        FMLA    v30.8h, v19.8h,  v5.h[3]
154        B.HS    1b
155
        # After the loop x0 has wrapped below zero; bit 2 / bit 1 of x0
        # still encode whether 4-byte / 2-byte tails of kc remain.
156        # Is there a remainder?- 2 halffloats of A (4 bytes)
157        TBNZ    x0, 2, 4f
158        # Is there a remainder?- 1 halffloat of A (2 bytes)
159        TBNZ    x0, 1, 5f
1602:
161        # Clamp
        # Broadcast min (v6.h[0]) / max (v6.h[1]) across v4/v5; these
        # registers are free here because all A loads for the tile are done.
162        DUP     v4.8h, v6.h[0]
163        DUP     v5.8h, v6.h[1]
164        FMAX    v20.8h, v20.8h, v4.8h
165        FMAX    v22.8h, v22.8h, v4.8h
166        FMAX    v24.8h, v24.8h, v4.8h
167        FMAX    v26.8h, v26.8h, v4.8h
168        FMAX    v28.8h, v28.8h, v4.8h
169        FMAX    v30.8h, v30.8h, v4.8h
170        SUBS    x1, x1, 8               // nc -= 8; FMIN/ST1/SUB below leave NZCV intact for B.LO 6f and B.HI 0b
171        FMIN    v20.8h, v20.8h, v5.8h
172        FMIN    v22.8h, v22.8h, v5.8h
173        FMIN    v24.8h, v24.8h, v5.8h
174        FMIN    v26.8h, v26.8h, v5.8h
175        FMIN    v28.8h, v28.8h, v5.8h
176        FMIN    v30.8h, v30.8h, v5.8h
177
178        # Store full 6 x 8
179        B.LO    6f                      // fewer than 8 columns left: partial store
180
181        ST1     {v20.16b},  [x6], x8
182        SUB     x3,  x3, x2             // a0 -= kc
183        ST1     {v22.16b}, [x16], x8
184        SUB     x9,  x9, x2             // a1 -= kc
185        ST1     {v24.16b}, [x17], x8
186        SUB     x10, x10, x2            // a2 -= kc
187        ST1     {v26.16b}, [x14], x8
188        SUB     x11, x11, x2            // a3 -= kc
189        ST1     {v28.16b}, [x13], x8
190        SUB     x12, x12, x2            // a4 -= kc
191        ST1     {v30.16b},  [x7], x8
192        SUB     x4,  x4, x2             // a5 -= kc
193
194        B.HI    0b                      // next 8-column tile (flags from SUBS x1, x1, 8)
195        RET
196
        # kc < 8: no main-loop iterations ran; dispatch straight to the
        # 2-halffloat and/or 1-halffloat remainder code via bits of x0.
1973:
198        TBZ     x0, 2, 5f
1994:
200        # Remainder- 2 halffloats of A (4 bytes)
201        LDR     s0,  [x3], 4
202        LDR     q16, [x5], 16
203        LDR     q17, [x5], 16
204        LDR     s1,  [x9], 4
205        LDR     s2, [x10], 4
206        LDR     s3, [x11], 4
207        LDR     s4, [x12], 4
208        LDR     s5,  [x4], 4
209
210        FMLA    v20.8h, v16.8h,  v0.h[0]
211        FMLA    v22.8h, v16.8h,  v1.h[0]
212        FMLA    v24.8h, v16.8h,  v2.h[0]
213        FMLA    v26.8h, v16.8h,  v3.h[0]
214        FMLA    v28.8h, v16.8h,  v4.h[0]
215        FMLA    v30.8h, v16.8h,  v5.h[0]
216
217        FMLA    v20.8h, v17.8h,  v0.h[1]
218        FMLA    v22.8h, v17.8h,  v1.h[1]
219        FMLA    v24.8h, v17.8h,  v2.h[1]
220        FMLA    v26.8h, v17.8h,  v3.h[1]
221        FMLA    v28.8h, v17.8h,  v4.h[1]
222        FMLA    v30.8h, v17.8h,  v5.h[1]
223        TBZ     x0, 1, 2b               // no 1-halffloat tail: go clamp/store
224
2255:
226        # Remainder- 1 halffloat of A (2 bytes)
227        LDR     h0,  [x3], 2
228        LDR     q16,  [x5], 16
229        LDR     h1,  [x9], 2
230        LDR     h2, [x10], 2
231        LDR     h3, [x11], 2
232        LDR     h4, [x12], 2
233        LDR     h5,  [x4], 2
234        FMLA    v20.8h, v16.8h,  v0.h[0]
235        FMLA    v22.8h, v16.8h,  v1.h[0]
236        FMLA    v24.8h, v16.8h,  v2.h[0]
237        FMLA    v26.8h, v16.8h,  v3.h[0]
238        FMLA    v28.8h, v16.8h,  v4.h[0]
239        FMLA    v30.8h, v16.8h,  v5.h[0]
240        B       2b
241
242        # Store odd width
        # nc < 8 for the last tile: store 4, then 2, then 1 column(s)
        # according to the low bits of nc, shifting the remaining lanes down
        # after each partial store. No pointer rewind: this is the last tile.
2436:
244        TBZ     x1, 2, 7f
245        STR     d20,  [x6], 8
246        STR     d22, [x16], 8
247        DUP     d20, v20.d[1]
248        DUP     d22, v22.d[1]
249        STR     d24, [x17], 8
250        STR     d26, [x14], 8
251        DUP     d24, v24.d[1]
252        DUP     d26, v26.d[1]
253        STR     d28, [x13], 8
254        STR     d30,  [x7], 8
255        DUP     d28, v28.d[1]
256        DUP     d30, v30.d[1]
257
2587:
259        TBZ     x1, 1, 8f
260        STR     s20,  [x6], 4
261        STR     s22, [x16], 4
262        DUP     s20, v20.s[1]
263        DUP     s22, v22.s[1]
264        STR     s24, [x17], 4
265        STR     s26, [x14], 4
266        DUP     s24, v24.s[1]
267        DUP     s26, v26.s[1]
268        STR     s28, [x13], 4
269        STR     s30,  [x7], 4
270        DUP     s28, v28.s[1]
271        DUP     s30, v30.s[1]
272
2738:
274        TBZ     x1, 0, 9f
275        STR     h20,  [x6]
276        STR     h22, [x16]
277        STR     h24, [x17]
278        STR     h26, [x14]
279        STR     h28, [x13]
280        STR     h30,  [x7]
2819:
282        RET
283
284END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
285
286#ifdef __ELF__
287.section ".note.GNU-stack","",%progbits
288#endif
289