// xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/4x8-minmax-aarch64-neonfp16arith-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 (x0 is reused as the k loop counter once mr is consumed)
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B    v20 v21 v22 v23
# C0   v24
# C1   v26
# C2   v28
# C3   v30

# Clamp      v4 (bound applied with FMAX), v5 (bound applied with FMIN)
# unused A   v6 v7 v8 v9 v10 v11
# unused B   v27

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        # Accumulates a tile of up to 4 rows x 8 half-precision columns, clamps
        # it and stores it, then repeats for the next 8 columns until nc is
        # exhausted.
        # Only x0-x14, v0-v5 and v20-v30 are written and nothing is stored to
        # the stack, so the function has no prologue or epilogue.

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values
        # LD2R reads two consecutive halfwords at [x8] and broadcasts each one:
        # v4 is the bound applied with FMAX, v5 the bound applied with FMIN.
        LD2R    {v4.8h, v5.8h}, [x8]

        # Clamp A and C pointers
        # A row index that is >= mr aliases the previous row, so the kernel can
        # always process 4 rows without branching on mr.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        # The flags of CMP x0, 2 are still live here (ADD does not set flags).
        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        # From here on x4 and x7 hold a3 and c3; the strides are no longer kept.
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        # Outer loop: one pass per group of 8 output columns.
0:
        # Load initial bias from w into accumulators
        # The same 8 halffloats seed all four row accumulators.
        LDR     q24, [x5], 16
        MOV     v26.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v30.16b, v24.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes)
        # Each pass reads 8 bytes from every A row and 4 x 16 bytes from w.
        # SUBS sets the flags consumed by B.HS at the bottom of the loop;
        # none of the FMLA instructions in between modify them.
1:
        LDR     d0,  [x3], 8
        LDR     q20,  [x5], 16
        LDR     q21,  [x5], 16
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3,  [x4], 8
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        SUBS    x0, x0, 8
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v24.8h, v21.8h, v0.h[1]
        FMLA    v26.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        FMLA    v24.8h, v22.8h, v0.h[2]
        FMLA    v26.8h, v22.8h, v1.h[2]
        FMLA    v28.8h, v22.8h, v2.h[2]
        FMLA    v30.8h, v22.8h, v3.h[2]
        FMLA    v24.8h, v23.8h, v0.h[3]
        FMLA    v26.8h, v23.8h, v1.h[3]
        FMLA    v28.8h, v23.8h, v2.h[3]
        FMLA    v30.8h, v23.8h, v3.h[3]
        B.HS    1b

        # x0 is now negative, but only multiples of 8 were subtracted from kc,
        # so bits 2 and 1 of x0 still equal bits 2 and 1 of kc, i.e. of the
        # number of A bytes left over.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Clamp
        # SUBS is interleaved with the clamp; its flags feed both B.LO below
        # and B.HI after the stores (FMAX, FMIN, ST1 and SUB leave them intact).
        FMAX    v24.8h, v24.8h, v4.8h
        SUBS    x1, x1, 8
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h

        # Store full 4 x 8
        B.LO    6f

        # Each C pointer advances by cn_stride and each A pointer is rewound by
        # kc bytes so the next group of columns rereads the same A rows.
        ST1     {v24.16b},  [x6], x14
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v26.16b},  [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b},  [x7], x14
        SUB     x4,  x4, x2             // a3 -= kc

        B.HI    0b                      // more columns remain (nc > 0)
        RET

        # Entered when kc < 8 bytes: x0 = kc - 8 has the same low 3 bits as kc.
        # Bit 1 is not tested before falling to 5, so this path presumes
        # kc is a nonzero multiple of 2 bytes - NOTE(review): confirm with callers.
3:
        TBZ     x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0,  [x3], 4
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3,  [x4], 4

        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]

        FMLA    v24.8h, v21.8h, v0.h[1]
        FMLA    v26.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]
        TBZ     x0, 1, 2b               // no single halffloat left: go clamp

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0,  [x3], 2
        LDR     q20, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3 , [x4], 2
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        B       2b

        # Store odd width
        # x1 went below zero in the SUBS at label 2; its low 3 bits still equal
        # the number of columns left (1-7). Each step stores the low part of
        # every accumulator and shifts the next elements down into lane 0.
6:
        TBZ     x1, 2, 7f
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s24,  [x6], 4
        STR     s26,  [x9], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x10], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h24,  [x6]
        STR     h26,  [x9]
        STR     h28, [x10]
        STR     h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

// On ELF targets, emit an empty .note.GNU-stack section; this is the GNU
// toolchain convention for declaring that the object needs no executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
