xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/4x8-aarch64-neonfma-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const float*restrict w,            x5
19#     float*restrict c,                  x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x8  a0
30# x13 a1
31# x14 a2
32# x15 a3
33
34# C pointers
35# x6  c0
36# x16 c1
37# x17 c2
38# x7  c3 / cm_stride
39
40# Vector register usage
41# A0  v0
42# A1  v1
43# A2  v2
44# A3  v3
45# B  v16 v17 v18 v19 v20 v21 v22 v23
46# C  v24 v25
47# C  v26 v27
48# C  v28 v29
49# C  v30 v31
50# Clamp v4 v5
51
52BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128
        # Overview: for each group of 8 output columns (nc loop, label 0) the
        # accumulators v24-v31 are initialised from w, then for every group of
        # 4 A pointers (ks loop, label 1) kc bytes of each A row are multiplied
        # against w with FMLA.  The result is clamped with v4/v5 and stored:
        # v24/v25 -> c0 (x6), v26/v27 -> c1 (x16), v28/v29 -> c2 (x17),
        # v30/v31 -> c3 (x7).
        # Only x8-x17, v0-v5 and v16-v31 are written besides the argument
        # registers, and sp is only read.
53
54        # Load cn_stride, a_offset
55        LDP     x10, x11, [sp]
56
57        # Load zero, params pointer
58        LDP     x12, x8, [sp, 16]
59
60        # Clamp C pointers
        # Rows at index >= mr alias the previous row's pointer.
61        CMP     x0, 2                   // if mr < 2
62        ADD     x16, x6, x7             // c1 = c0 + cm_stride
63        CSEL    x16, x6, x16, LO        //   c1 = c0
64
65        # Load min/max values
        # Two consecutive floats at params are replicated across all lanes:
        # the first into v4 (used with FMAX below), the second into v5 (used
        # with FMIN below).  x8 is free afterwards and is reused as a0.
66        LD2R    {v4.4s, v5.4s}, [x8]
67
        # The CSEL below still uses the flags of "CMP x0, 2" above; the
        # intervening LD2R and ADD do not write flags.
68        ADD     x17, x16, x7            // c2 = c1 + cm_stride
69                                        // if mr <= 2
70        CSEL    x17, x16, x17, LS       //   c2 = c1
71
72        CMP     x0, 4                   // if mr < 4
73        ADD     x7, x17, x7             // c3 = c2 + cm_stride
74        CSEL    x7, x17, x7, LO         //   c3 = c2
75
        # nc loop entry: one iteration per group of 8 output columns.
760:
77        # Load initial bias from w into accumulators
        # 8 floats are read from w (x5, post-incremented by 32 bytes) into row 0
        # and copied to rows 1-3.
78        LDP     q24, q25, [x5], 32
79        MOV     v26.16b, v24.16b
80        MOV     v27.16b, v25.16b
81        MOV     v28.16b, v24.16b
82        MOV     v29.16b, v25.16b
83        MOV     v30.16b, v24.16b
84        MOV     v31.16b, v25.16b
85
86        MOV     x9, x3                  // p = ks
87
        # ks loop entry: each iteration consumes 4 pointers (32 bytes) from a.
881:
89        # Load next 4 A pointers
90        LDP     x8, x13, [x4], 16
91        LDP     x14, x15, [x4], 16
92
        # A pointer equal to zero (x12) is used as is; any other pointer is
        # advanced by a_offset (x11).
93        CMP     x8, x12                 // if a0 == zero
94        ADD     x8, x8, x11             // a0 += a_offset
95        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 + a_offset
96        CMP     x13, x12                // if a1 == zero
97        ADD     x13, x13, x11           // a1 += a_offset
98        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
99        CMP     x14, x12                // if a2 == zero
100        ADD     x14, x14, x11           // a2 += a_offset
101        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
102        CMP     x15, x12                // if a3 == zero
103        ADD     x15, x15, x11           // a3 += a_offset
104        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset
105
106        # Is there at least 4 floats (16 bytes)?
        # x0 (mr) is not read again after the C pointer clamp; from here on it
        # holds k, the number of bytes of kc still to process minus 16.
107        SUBS    x0, x2, 16              // k = kc - 16
108        B.LO    4f
109
110        # Main loop - 4 floats of A (16 bytes)
        # Each iteration reads 16 bytes from every A row and 4 x 32 = 128 bytes
        # of w: v20/v21, v22/v23, v16/v17 and v18/v19 hold the 8 B values that
        # are multiplied by lanes 0, 1, 2 and 3 of the A vectors respectively.
        # The B loads are interleaved with the FMLAs - presumably to overlap
        # load latency with arithmetic (NOTE(review): scheduling intent is not
        # stated in this file; keep the instruction order).
1112:
112        LDR     q0,  [x8], 16
113        LDP     q20, q21, [x5], 32
114        LDR     q1, [x13], 16
115        LDR     q2, [x14], 16
116        LDR     q3, [x15], 16
117        FMLA    v24.4s, v20.4s, v0.s[0]
118        FMLA    v25.4s, v21.4s, v0.s[0]
119        FMLA    v26.4s, v20.4s, v1.s[0]
120        FMLA    v27.4s, v21.4s, v1.s[0]
121        LDP     q22, q23, [x5], 32
122        FMLA    v28.4s, v20.4s, v2.s[0]
123        FMLA    v29.4s, v21.4s, v2.s[0]
124        FMLA    v30.4s, v20.4s, v3.s[0]
125        FMLA    v31.4s, v21.4s, v3.s[0]
126        LDP     q16, q17, [x5], 32
127        FMLA    v24.4s, v22.4s, v0.s[1]
128        FMLA    v25.4s, v23.4s, v0.s[1]
129        FMLA    v26.4s, v22.4s, v1.s[1]
130        FMLA    v27.4s, v23.4s, v1.s[1]
131        LDP     q18, q19, [x5], 32
132        FMLA    v28.4s, v22.4s, v2.s[1]
133        FMLA    v29.4s, v23.4s, v2.s[1]
134        FMLA    v30.4s, v22.4s, v3.s[1]
135        FMLA    v31.4s, v23.4s, v3.s[1]
136        FMLA    v24.4s, v16.4s, v0.s[2]
137        FMLA    v25.4s, v17.4s, v0.s[2]
138        FMLA    v26.4s, v16.4s, v1.s[2]
139        FMLA    v27.4s, v17.4s, v1.s[2]
140        FMLA    v28.4s, v16.4s, v2.s[2]
141        FMLA    v29.4s, v17.4s, v2.s[2]
142        FMLA    v30.4s, v16.4s, v3.s[2]
143        FMLA    v31.4s, v17.4s, v3.s[2]
144        FMLA    v24.4s, v18.4s, v0.s[3]
145        FMLA    v25.4s, v19.4s, v0.s[3]
146        FMLA    v26.4s, v18.4s, v1.s[3]
147        FMLA    v27.4s, v19.4s, v1.s[3]
148        FMLA    v28.4s, v18.4s, v2.s[3]
149        FMLA    v29.4s, v19.4s, v2.s[3]
150        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
151        FMLA    v30.4s, v18.4s, v3.s[3]
152        FMLA    v31.4s, v19.4s, v3.s[3]
153        B.HS    2b                      // repeat while the subtraction did not borrow
154
155        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # k has wrapped below zero here, but only whole multiples of 16 were
        # subtracted from kc, so its low 4 bits still equal kc & 15.
156        TST     x0, 15
157        B.NE    4f
1583:
159        # ks loop
160        SUBS    x9, x9, 32              // p -= MR * sizeof(void*) = 4 * 8
161        B.HI    1b                      // repeat while p was above 32 (unsigned)
162
163        # Clamp
        # Lower bound v4 is applied first (FMAX), then upper bound v5 (FMIN).
164        FMAX    v24.4s, v24.4s, v4.4s
165        FMAX    v25.4s, v25.4s, v4.4s
166        FMAX    v26.4s, v26.4s, v4.4s
167        FMAX    v27.4s, v27.4s, v4.4s
168        FMAX    v28.4s, v28.4s, v4.4s
169        FMAX    v29.4s, v29.4s, v4.4s
170        FMAX    v30.4s, v30.4s, v4.4s
171        FMAX    v31.4s, v31.4s, v4.4s
172        FMIN    v24.4s, v24.4s, v5.4s
173        FMIN    v25.4s, v25.4s, v5.4s
174        FMIN    v26.4s, v26.4s, v5.4s
175        FMIN    v27.4s, v27.4s, v5.4s
176        FMIN    v28.4s, v28.4s, v5.4s
177        FMIN    v29.4s, v29.4s, v5.4s
178        FMIN    v30.4s, v30.4s, v5.4s
179        FMIN    v31.4s, v31.4s, v5.4s
180
181        # Store full 4 x 8
        # The flags written by this SUBS are used twice: by B.LO right after it
        # (fewer than 8 columns left -> partial store at label 6) and by
        # "B.HI 0b" after the stores.  The STP/ADD/SUB instructions in between
        # do not write flags, so none of them may be replaced by a flag-setting
        # form.
182        SUBS    x1, x1, 8
183        B.LO    6f
184
        # Each row stores 8 floats; its C pointer then advances by cn_stride (x10).
185        STP     q30, q31,  [x7]
186        ADD     x7,  x7, x10
187        STP     q28, q29, [x17]
188        ADD     x17, x17, x10
189        STP     q26, q27, [x16]
190        ADD     x16, x16, x10
191        STP     q24, q25,  [x6]
192        ADD     x6,  x6, x10
193
        # Rewind the A pointer array so the next group of 8 columns walks it again.
194        SUB     x4, x4, x3              // a -= ks
195
196        # nc loop
197        B.HI    0b                      // more columns remain (flags from SUBS x1 above)
198        RET
199
200        # Remainder- 2 floats of A (8 bytes)
2014:
202        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bit 3 of k is the 8-byte bit of kc & 15; if it is clear, skip to the
        # single-float step at label 5.
203        TBZ     x0, 3, 5f
204
205        # Remainder- 2 floats of A (8 bytes)
        # Reads 8 bytes from every A row and 2 x 32 bytes of w.
206        LDP     q20, q21, [x5], 32
207        LDR     d0,  [x8], 8
208        LDR     d1, [x13], 8
209        LDR     d2, [x14], 8
210        LDR     d3, [x15], 8
211        FMLA    v24.4s, v20.4s, v0.s[0]
212        FMLA    v25.4s, v21.4s, v0.s[0]
213        FMLA    v26.4s, v20.4s, v1.s[0]
214        FMLA    v27.4s, v21.4s, v1.s[0]
215        LDP     q22, q23, [x5], 32
216        FMLA    v28.4s, v20.4s, v2.s[0]
217        FMLA    v29.4s, v21.4s, v2.s[0]
218        FMLA    v30.4s, v20.4s, v3.s[0]
219        FMLA    v31.4s, v21.4s, v3.s[0]
220        FMLA    v24.4s, v22.4s, v0.s[1]
221        FMLA    v25.4s, v23.4s, v0.s[1]
222        FMLA    v26.4s, v22.4s, v1.s[1]
223        FMLA    v27.4s, v23.4s, v1.s[1]
224        FMLA    v28.4s, v22.4s, v2.s[1]
225        FMLA    v29.4s, v23.4s, v2.s[1]
226        FMLA    v30.4s, v22.4s, v3.s[1]
227        FMLA    v31.4s, v23.4s, v3.s[1]
228
229        # Is there a remainder?- 1 float of A (4 bytes)
        # Bit 2 of k is the 4-byte bit; if it is clear, return to the ks loop.
230        TBZ     x0, 2, 3b
231
232        # Remainder- 1 float of A
        # Reads 4 bytes from every A row and 32 bytes of w.
2335:
234        LDR     s0, [x8], 4
235        LDP     q20, q21, [x5], 32
236        LDR     s1, [x13], 4
237        LDR     s2, [x14], 4
238        LDR     s3, [x15], 4
239        FMLA    v24.4s, v20.4s, v0.s[0]
240        FMLA    v25.4s, v21.4s, v0.s[0]
241        FMLA    v26.4s, v20.4s, v1.s[0]
242        FMLA    v27.4s, v21.4s, v1.s[0]
243        FMLA    v28.4s, v20.4s, v2.s[0]
244        FMLA    v29.4s, v21.4s, v2.s[0]
245        FMLA    v30.4s, v20.4s, v3.s[0]
246        FMLA    v31.4s, v21.4s, v3.s[0]
247        B       3b
248
249        # Store odd width
        # Reached when fewer than 8 columns remain.  x1 holds (columns left - 8),
        # whose low 3 bits equal (columns left & 7), so bits 2, 1 and 0 select a
        # 4-, 2- and 1-column store.  After each partial store the not-yet-stored
        # lanes are moved down to the low end of the register.  This path falls
        # through to RET without looping back to the nc loop.
2506:
251        TBZ     x1, 2, 7f
252        STR     q30, [x7], 16
253        MOV     v30.16b, v31.16b
254        STR     q28, [x17], 16
255        MOV     v28.16b, v29.16b
256        STR     q26, [x16], 16
257        MOV     v26.16b, v27.16b
258        STR     q24, [x6], 16
259        MOV     v24.16b, v25.16b
260
2617:
262        TBZ     x1, 1, 8f
263        STR     d30, [x7], 8
264        STR     d28, [x17], 8
265        DUP     d30, v30.d[1]           // move the upper 2 floats down
266        DUP     d28, v28.d[1]
267        STR     d26, [x16], 8
268        STR     d24, [x6], 8
269        DUP     d26, v26.d[1]
270        DUP     d24, v24.d[1]
271
2728:
273        TBZ     x1, 0, 9f
274        STR     s30,  [x7]
275        STR     s28, [x17]
276        STR     s26, [x16]
277        STR     s24,  [x6]
2789:
279        RET
280
281END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128
282
// On ELF targets only, emit an empty .note.GNU-stack section (no flags,
// %progbits).  NOTE(review): by GNU toolchain convention this marks the object
// as not requiring an executable stack - confirm against the linker docs.
283#ifdef __ELF__
284.section ".note.GNU-stack","",%progbits
285#endif
286