// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x2-minmax-aarch64-neonfma-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x2-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

// Overview (as implemented below):
//   - nc loop (label 0): for each pair of output columns the accumulators
//     are seeded with 2 bias floats read from w.
//   - ks loop (label 1): 4 A row pointers are read from the indirection
//     array a. A pointer equal to `zero` is used unchanged, every other
//     pointer has a_offset added.
//   - k loop (label 2): consumes 2 floats per A row and 2 x 2 weights from w
//     per iteration. Label 4 consumes a single trailing float per row.
//   - The tile is clamped to the two bounds loaded from params and stored
//     as 2 floats per row, or as 1 float per row for a last odd column
//     (label 5).
//   x0 holds mr only until the C pointers are set up, afterwards it is the
//   k byte counter. x9 is the working copy of ks. w (x5) only ever advances.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// Only x0-x17, v0-v5 and v20-v31 are written here, so nothing is spilled.

# A pointers
# x8  a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# B  v20 v21
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
// Each C row uses two accumulators: the even register (v24/v26/v28/v30)
// takes the bias, the first float of every A pair and the remainder float;
// the odd register (v25/v27/v29/v31) takes the second float of every pair.
// The two are added together once the ks loop has finished.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        // Rows beyond mr alias the previous row, so their stores land on
        // memory that is already being written. The flags of the first CMP
        // feed both the c1 and the c2 select; the ADD and LD2R in between
        // do not modify flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load min/max values
        // First float at [x8] is broadcast to v4 (lower bound, used by FMAX),
        // second float to v5 (upper bound, used by FMIN).
        LD2R    {v4.2s, v5.2s}, [x8]

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        // nc loop: one iteration per pair of output columns
0:
        # Load initial bias from w into accumulators
        LDR     d24, [x5], 8            // 2 bias floats, w += 8
        MOV     v26.8b, v24.8b
        MOV     v28.8b, v24.8b
        MOV     v30.8b, v24.8b
        MOVI    v25.2s, 0               // second accumulator of each row starts at 0
        MOVI    v27.2s, 0
        MOVI    v29.2s, 0
        MOVI    v31.2s, 0

        MOV     x9, x3                  // p = ks

        // ks loop: one iteration per group of 4 A row pointers
1:
        # Load next 4 A pointers
        LDP     x8, x13, [x4], 16
        LDP     x14, x15, [x4], 16

        // The comparison against zero is done on the raw pointer, before
        // a_offset is added; ADD does not touch the flags set by CMP.
        CMP     x8, x12                 // if a0 == zero
        ADD     x8, x8, x11             // a0 += a_offset
        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 + a_offset
        CMP     x13, x12                // if a1 == zero
        ADD     x13, x13, x11           // a1 += a_offset
        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x14, x12                // if a2 == zero
        ADD     x14, x14, x11           // a2 += a_offset
        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x15, x12                // if a3 == zero
        ADD     x15, x15, x11           // a3 += a_offset
        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 2 floats (8 bytes)?
        // NOTE(review): any kc below 8 goes straight to the 1 float path,
        // presumably kc is a non-zero multiple of 4 bytes - confirm in caller.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Main loop - 2 floats of A (8 bytes)
2:
        LDR     d0, [x8], 8             // a0[k], a0[k+1]
        LDP     d20, d21, [x5], 16      // d20 = weights for k, d21 = weights for k+1
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        SUBS    x0, x0, 8               // k -= 8, flags consumed by B.HS below
        FMLA    v24.2s, v20.2s, v0.s[0]
        FMLA    v26.2s, v20.2s, v1.s[0]
        FMLA    v28.2s, v20.2s, v2.s[0]
        FMLA    v30.2s, v20.2s, v3.s[0]
        FMLA    v25.2s, v21.2s, v0.s[1]
        FMLA    v27.2s, v21.2s, v1.s[1]
        FMLA    v29.2s, v21.2s, v2.s[1]
        FMLA    v31.2s, v21.2s, v3.s[1]
        B.HS    2b                      // loop while the subtraction did not borrow

        # Is there a remainder?- 1 float of A (4 bytes)
        // x0 differs from kc by a multiple of 8, so bit 2 of x0 equals
        // bit 2 of kc: set when a single 4 byte float is left over.
        TBNZ    x0, 2, 4f

3:
        # ks loop
        SUBS    x9, x9, 32              // p -= MR * sizeof(void*)
        B.HI    1b                      // more A pointer groups remain

        // Fold the two accumulators of each row into one
        FADD    v24.2s, v24.2s, v25.2s
        FADD    v26.2s, v26.2s, v27.2s
        FADD    v28.2s, v28.2s, v29.2s
        FADD    v30.2s, v30.2s, v31.2s

        # Clamp
        // The nc update is interleaved here. Its flags are consumed by
        // B.LO 5f and again by B.HI 0b; nothing in between sets flags.
        FMAX    v24.2s, v24.2s, v4.2s
        SUBS    x1, x1, 2               // nc -= 2
        FMAX    v26.2s, v26.2s, v4.2s
        FMAX    v28.2s, v28.2s, v4.2s
        FMAX    v30.2s, v30.2s, v4.2s
        FMIN    v24.2s, v24.2s, v5.2s
        FMIN    v26.2s, v26.2s, v5.2s
        FMIN    v28.2s, v28.2s, v5.2s
        FMIN    v30.2s, v30.2s, v5.2s

        # Store full 4 x 2
        B.LO    5f                      // nc was below 2: store odd width instead

        STR     d30, [x7]
        ADD     x7,  x7, x10            // c3 += cn_stride
        STR     d28, [x17]
        ADD     x17, x17, x10           // c2 += cn_stride
        STR     d26, [x16]
        ADD     x16, x16, x10           // c1 += cn_stride
        STR     d24, [x6]
        ADD     x6,  x6, x10            // c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks, undo the ks loop post-increments

        # nc loop
        B.HI    0b                      // columns remain (flags from SUBS x1 above)
        RET

        # Remainder- 1 float of A
4:
        LDR     s0, [x8], 4
        LDR     d20, [x5], 8            // 2 weights for the single remaining k
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v24.2s, v20.2s, v0.s[0]
        FMLA    v26.2s, v20.2s, v1.s[0]
        FMLA    v28.2s, v20.2s, v2.s[0]
        FMLA    v30.2s, v20.2s, v3.s[0]
        B       3b                      // rejoin the ks loop tail

        # Store odd width
        // Only lane 0 of each row is written; this path returns directly.
5:
        STR     s30,  [x7]
        STR     s28, [x17]
        STR     s26, [x16]
        STR     s24,  [x6]
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
197