// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
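#
# In rough scalar terms (a sketch for orientation only; index names are
# illustrative and the packed layout of w is inferred from the loads below,
# i.e. each 16-column tile of w stores 16 fp16 biases followed by 16 fp16
# weights for every k), the kernel computes:
#
#   for (size_t n = 0; n < nc; n += 16) {        // one 16-wide tile of C
#     for (size_t m = 0; m < mr; m++) {          // up to 4 rows
#       for (size_t j = 0; j < 16; j++) {
#         __fp16 acc = bias[n + j];
#         for (size_t k = 0; k < kc / sizeof(__fp16); k++) {
#           acc += a[m][k] * w[n][k][j];
#         }
#         c[m][n + j] = min(max(acc, params.min), params.max);
#       }
#     }
#   }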

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x11 v1
# A2 x12 v2
# A3  x4 v3

# B   x5 v20 v21 v22 v23

# C0  x6 v24 v25
# C1  x9 v26 v27
# C2 x10 v28 v29
# C3  x7 v30 v31

# Clamp v4, v5

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values
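        # LD2R broadcasts the two consecutive fp16 values at params: element 0
        # (min) into every lane of v4 and element 1 (max) into every lane of v5;
        # v4 feeds the FMAX (lower clamp) and v5 the FMIN (upper clamp) below.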
        LD2R    {v4.8h, v5.8h}, [x8]

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2
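        # Rows beyond mr alias the previous row, so the kernel always computes a
        # full 4x16 tile while reading from and writing to valid memory when mr < 4.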

0:
        # Load initial bias from w into accumulators
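        # (v24/v25 receive the 16 bias halffloats; the MOVs below copy them so
        # all four rows start from the same bias values)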
        LDR     q24, [x5], 16
        LDR     q25, [x5], 16
        MOV     v26.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v30.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v29.16b, v25.16b
        MOV     v31.16b, v25.16b

        # Are there at least 2 halffloats (4 bytes)?
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    3f

        .p2align 3
        # Main loop - 2 halffloats of A (4 bytes)
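        # Each iteration consumes a 32-bit pair of A halffloats per row ("ld32")
        # and 64 bytes of packed weights, accumulating two rank-1 updates:
        # v20/v21 scaled by lane 0 of A, then v22/v23 scaled by lane 1.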
1:
        LDR     s0,  [x3], 4
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3,  [x4], 4
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        SUBS    x0, x0, 4
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v27.8h, v21.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v29.8h, v21.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v31.8h, v21.8h, v3.h[0]

        FMLA    v24.8h, v22.8h, v0.h[1]
        FMLA    v25.8h, v23.8h, v0.h[1]
        FMLA    v26.8h, v22.8h, v1.h[1]
        FMLA    v27.8h, v23.8h, v1.h[1]
        FMLA    v28.8h, v22.8h, v2.h[1]
        FMLA    v29.8h, v23.8h, v2.h[1]
        FMLA    v30.8h, v22.8h, v3.h[1]
        FMLA    v31.8h, v23.8h, v3.h[1]
        B.HS    1b

        # Is there a remainder? - 1 halffloat of A (2 bytes)
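        # x0 is negative here; bit 1 is set only if kc leaves 2 trailing bytes
        # (one more halffloat) to process.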
        TBNZ    x0, 1, 3f

2:
        # Clamp
        FMAX    v24.8h, v24.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; sets the flags for B.LO/B.HI below
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 4 x 16
        B.LO    4f                      // partial-width store if nc < 16

        ST1     {v24.16b, v25.16b},  [x6], x14
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v26.16b, v27.16b},  [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b, v29.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x14
        SUB     x4,  x4, x2             // a3 -= kc

        B.HI    0b                      // nc != 0? process the next 16 columns

        RET

        # Remainder - 1 halffloat of A (2 bytes)
3:
        LDR     h0,  [x3], 2
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3,  [x4], 2
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v27.8h, v21.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v29.8h, v21.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v31.8h, v21.8h, v3.h[0]
        B       2b

        # Store odd width
4:
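        # x1 holds nc-16 here (nc < 16), so its low 4 bits give the number of
        # columns left; bits 3..0 select stores of 8, 4, 2 and 1 halffloats.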
        TBZ     x1, 3, 5f               // skip if fewer than 8 columns remain
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b        // shift the remaining columns of row 0 down
        STR     q26, [x9], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

5:
        TBZ     x1, 2, 6f               // skip if fewer than 4 columns remain
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

6:
        TBZ     x1, 1, 7f               // skip if fewer than 2 columns remain
        STR     s24,  [x6], 4
        STR     s26,  [x9], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x10], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

7:
        TBZ     x1, 0, 8f               // skip if no columns remain
        STR     h24,  [x6]
        STR     h26,  [x9]
        STR     h28, [x10]
        STR     h30,  [x7]
8:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif