// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
#include <xnnpack/assembly.h>
11
# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
#
# Computes a tile of up to 4 rows by 8 columns of C per nc iteration. Each
# accumulator row starts from 8 bias floats read from w, accumulates products
# of A elements (reached through the pointer array a, 4 pointers per ks step)
# with further floats read from w, and is clamped to [v4, v5] before the store.
#
# Units, as consumed by the code below:
#   kc  bytes of each A row (8 per main-loop pass, 4 in the remainder)
#   ks  bytes of the pointer array a (32 per group of 4 pointers)
#   nc  floats of output width (8 per full-width store)
# NOTE(review): kc is treated as a non-zero multiple of 4 bytes and mr as
# a value in 1..4; neither is checked here - confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers is touched below and sp is only read, so the
# function has no prologue or epilogue.

# A pointers (reloaded from a on every ks step, then offset by a_offset
# unless equal to zero)
# x8  a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# B  v20 v21 v22 v23
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 (min) v5 (max)

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers: a row index at or beyond mr aliases the row before it
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load min/max values: the first float at params is replicated into
        # v4 (used by FMAX below), the second into v5 (used by FMIN below).
        LD2R    {v4.4s, v5.4s}, [x8]

        # LD2R and ADD leave the flags alone, so LS below still refers to the
        # CMP x0, 2 above.
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        # From here on x0 no longer holds mr; it is reused as the k counter.

0:
        # Load initial bias from w into accumulators
        # (the same 8 floats seed all four rows)
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 4 A pointers (a advances by 32 bytes in total)
        LDP     x8, x13, [x4], 16
        LDP     x14, x15, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added. The ADD is done unconditionally and CSEL picks.
        CMP     x8, x12                 // if a0 == zero
        ADD     x8, x8, x11             // a0 += a_offset
        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 + a_offset
        CMP     x13, x12                // if a1 == zero
        ADD     x13, x13, x11           // a1 += a_offset
        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x14, x12                // if a2 == zero
        ADD     x14, x14, x11           // a2 += a_offset
        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x15, x12                // if a3 == zero
        ADD     x15, x15, x11           // a3 += a_offset
        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset

        # Are there at least 2 floats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f                      // kc < 8: only the 1-float remainder

        # Main loop - 2 floats of A (8 bytes)
        # Each pass also reads 16 floats (64 bytes) from w: 8 per A element.

2:
        LDR     d0, [x8], 8
        LDP     q20, q21, [x5], 32
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        FMLA    v24.4s, v20.4s, v0.s[0]
        FMLA    v25.4s, v21.4s, v0.s[0]
        FMLA    v26.4s, v20.4s, v1.s[0]
        FMLA    v27.4s, v21.4s, v1.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v24.4s, v22.4s, v0.s[1]
        FMLA    v25.4s, v23.4s, v0.s[1]
        FMLA    v26.4s, v22.4s, v1.s[1]
        FMLA    v27.4s, v23.4s, v1.s[1]
        SUBS    x0, x0, 8               // k -= 8; FMLA does not alter flags
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]
        B.HS    2b                      // no borrow: at least 8 more bytes

        # Is there a remainder?- 1 float of A (4 bytes)
        # On loop exit x0 is negative. With kc a multiple of 4 it is either
        # -8 (nothing left, bit 2 clear) or -4 (one float left, bit 2 set).
        TBNZ    x0, 2, 4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b                      // pointer groups remain

        # Clamp
        FMAX    v24.4s, v24.4s, v4.4s
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS    x1, x1, 8               // nc -= 8
        B.LO    5f                      // fewer than 8 columns were left

        # Rows are written c3 first and c0 last; when clamped C pointers
        # alias, the lower-numbered row therefore overwrites the higher one.
        STP     q30, q31,  [x7]
        ADD     x7,  x7, x10
        STP     q28, q29, [x17]
        ADD     x17, x17, x10
        STP     q26, q27, [x16]
        ADD     x16, x16, x10
        STP     q24, q25,  [x6]
        ADD     x6,  x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # STP, ADD and SUB leave the flags alone, so HI still refers to the
        # SUBS on x1 above: loop while columns remain.
        B.HI    0b
        RET

        # Remainder- 1 float of A
        # Reads 8 floats (32 bytes) from w and uses lane 0 of each A register.
4:
        LDR     s0, [x8], 4
        LDP     q20, q21, [x5], 32
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v24.4s, v20.4s, v0.s[0]
        FMLA    v25.4s, v21.4s, v0.s[0]
        FMLA    v26.4s, v20.4s, v1.s[0]
        FMLA    v27.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        B       3b

        # Store odd width
        # x1 holds (remaining nc) - 8 here, whose low 3 bits equal those of
        # the remaining column count; bits 2, 1 and 0 select 4, 2 and 1 floats.
5:
        TBZ     x1, 2, 6f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b        // shift columns 4..7 down to 0..3
        STR     q28, [x17], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x16], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b

6:
        TBZ     x1, 1, 7f
        STR     d30, [x7], 8
        STR     d28, [x17], 8
        DUP     d30, v30.d[1]           // move upper 2 floats to the low half
        DUP     d28, v28.d[1]
        STR     d26, [x16], 8
        STR     d24, [x6], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]

7:
        TBZ     x1, 0, 8f
        STR     s30,  [x7]
        STR     s28, [x17]
        STR     s26, [x16]
        STR     s24,  [x6]
8:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64
233
# Emit an empty .note.GNU-stack section on ELF targets, which tells the linker
# that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
237