xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27# x3  a0
28# x11 a1
29# x12 a2
30# x4  a3 / a_stride
31
32# C pointers
33# x6  c0
34# x9  c1
35# x10 c2
36# x7  c3 / cm_stride
37
38BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128
        // Overview: computes a tile of up to 4 rows by 8 columns of
        //   C = clamp(bias + A * W)
        // in single precision, and repeats over nc in steps of 8 columns.
        //
        // Accumulators (two q registers = 8 floats per row):
        //   row 0: v16, v17      row 1: v18, v19
        //   row 2: v28, v29      row 3: v30, v31
        // Other vector registers: v0-v3 hold A values for rows 0-3,
        // v20-v27 hold weights, v4 and v5 hold the broadcast clamp bounds.
        //
        // Registers used are limited to x0-x12, x14, v0-v5 and v16-v31, so
        // nothing from the callee-saved set (d8-d15, x19-x30) is touched and
        // no stack frame is created. kc is treated as a byte count: the A
        // pointers advance 16 bytes for every 4 floats consumed.
39
40        # Load cn_stride, params pointer
41        LDP     x14, x8, [sp]
42
43        # Load min/max values
        // LD2R reads two consecutive floats at [x8] and broadcasts each one
        // to all four lanes: v4 = first value, v5 = second value. v4 is the
        // FMAX operand below (lower bound) and v5 the FMIN operand (upper
        // bound).
44        LD2R    {v4.4s, v5.4s}, [x8]
45
46        # Clamp A and C pointers
        // A row whose index is >= mr gets the same A and C pointers as the
        // row before it, so the code below can always process four rows;
        // the surplus rows recompute and rewrite the last valid row.
        // ADD and CSEL do not modify the flags, so each CMP result stays
        // valid for every CSEL that follows it.
47        CMP     x0, 2                   // if mr < 2
48        ADD     x11, x3, x4             // a1 = a0 + a_stride
49        ADD     x9, x6, x7              // c1 = c0 + cm_stride
50        CSEL    x11, x3, x11, LO        //   a1 = a0
51        CSEL    x9, x6, x9, LO          //   c1 = c0
52
53        ADD     x12, x11, x4            // a2 = a1 + a_stride
54        ADD     x10, x9, x7             // c2 = c1 + cm_stride
55                                        // if mr <= 2
56        CSEL    x12, x11, x12, LS       //   a2 = a1
57        CSEL    x10, x9, x10, LS        //   c2 = c1
58
        // Last use of mr. After this group x4 holds a3 and x7 holds c3
        // (a_stride and cm_stride are overwritten), and x0 becomes free to
        // serve as the k counter.
59        CMP     x0, 4                   // if mr < 4
60        ADD     x4, x12, x4             // a3 = a2 + a_stride
61        ADD     x7, x10, x7             // c3 = c2 + cm_stride
62        CSEL    x4, x12, x4, LO         //   a3 = a2
63        CSEL    x7, x10, x7, LO         //   c3 = c2
64
        // Outer loop: one pass per block of 8 output columns.
650:
66        # Load initial bias from w into accumulators
        // 8 bias floats go into row 0 (v16, v17); the MOVs copy them into
        // the accumulators of rows 1-3.
67        LDP     q16, q17, [x5], 32
68        MOV     v18.16b, v16.16b
69        MOV     v19.16b, v17.16b
70        MOV     v28.16b, v16.16b
71        MOV     v29.16b, v17.16b
72        MOV     v30.16b, v16.16b
73        MOV     v31.16b, v17.16b
74
75        # Is there at least 4 floats (16 bytes)?
76        SUBS    x0, x2, 16              // k = kc - 16
77        B.LO    3f
78
79        # Main loop - 4 floats of A (16 bytes)
        // Each pass loads 4 floats from each of the 4 A rows (q0-q3) and
        // 4 groups of 8 weights (v20-v27, 128 bytes of w), then issues 32
        // FMLAs. Lane s[i] of an A register multiplies the i-th weight group.
801:
81        LDR     q0, [x3], 16
82        LDP     q20, q21, [x5], 32
83        LDR     q1, [x11], 16
84        LDR     q2, [x12], 16
85        LDR     q3, [x4], 16
86        FMLA    v16.4s, v20.4s, v0.s[0]
87        FMLA    v17.4s, v21.4s, v0.s[0]
88        FMLA    v18.4s, v20.4s, v1.s[0]
89        FMLA    v19.4s, v21.4s, v1.s[0]
90        LDP     q22, q23, [x5], 32
91        FMLA    v28.4s, v20.4s, v2.s[0]
92        FMLA    v29.4s, v21.4s, v2.s[0]
93        FMLA    v30.4s, v20.4s, v3.s[0]
94        FMLA    v31.4s, v21.4s, v3.s[0]
95        LDP     q24, q25, [x5], 32
96        FMLA    v16.4s, v22.4s, v0.s[1]
97        FMLA    v17.4s, v23.4s, v0.s[1]
98        FMLA    v18.4s, v22.4s, v1.s[1]
99        FMLA    v19.4s, v23.4s, v1.s[1]
100        LDP     q26, q27, [x5], 32
101        FMLA    v28.4s, v22.4s, v2.s[1]
102        FMLA    v29.4s, v23.4s, v2.s[1]
103        FMLA    v30.4s, v22.4s, v3.s[1]
104        FMLA    v31.4s, v23.4s, v3.s[1]
105        FMLA    v16.4s, v24.4s, v0.s[2]
106        FMLA    v17.4s, v25.4s, v0.s[2]
107        FMLA    v18.4s, v24.4s, v1.s[2]
108        FMLA    v19.4s, v25.4s, v1.s[2]
109        FMLA    v28.4s, v24.4s, v2.s[2]
110        FMLA    v29.4s, v25.4s, v2.s[2]
111        FMLA    v30.4s, v24.4s, v3.s[2]
112        FMLA    v31.4s, v25.4s, v3.s[2]
113        FMLA    v16.4s, v26.4s, v0.s[3]
114        FMLA    v17.4s, v27.4s, v0.s[3]
115        FMLA    v18.4s, v26.4s, v1.s[3]
116        FMLA    v19.4s, v27.4s, v1.s[3]
117        FMLA    v28.4s, v26.4s, v2.s[3]
118        FMLA    v29.4s, v27.4s, v2.s[3]
        // k -= 16. The two FMLAs that follow leave the flags alone, so
        // B.HS still sees this result: loop again while at least 16 more
        // bytes of K remained before the subtraction.
119        SUBS    x0, x0, 16
120        FMLA    v30.4s, v26.4s, v3.s[3]
121        FMLA    v31.4s, v27.4s, v3.s[3]
122        B.HS    1b
123
        // x0 has gone below zero here, but only multiples of 16 were
        // subtracted from kc, so its low 4 bits still equal kc mod 16.
        // Nonzero means leftover K bytes: handle them at label 3.
124        TST     x0, 15
125        B.NE    3f
126
1272:
128        # Clamp
        // FMAX against v4 applies the lower bound, FMIN against v5 the upper
        // bound. The SUBS (nc -= 8) is interleaved here; FMAX, FMIN, ST1 and
        // the plain SUBs below do not modify the flags, so its result is
        // what B.LO 5f and B.HI 0b test further down.
129        FMAX    v16.4s, v16.4s, v4.4s
130        SUBS    x1, x1, 8
131        FMAX    v17.4s, v17.4s, v4.4s
132        FMAX    v18.4s, v18.4s, v4.4s
133        FMAX    v19.4s, v19.4s, v4.4s
134        FMAX    v28.4s, v28.4s, v4.4s
135        FMAX    v29.4s, v29.4s, v4.4s
136        FMAX    v30.4s, v30.4s, v4.4s
137        FMAX    v31.4s, v31.4s, v4.4s
138        FMIN    v16.4s, v16.4s, v5.4s
139        FMIN    v17.4s, v17.4s, v5.4s
140        FMIN    v18.4s, v18.4s, v5.4s
141        FMIN    v19.4s, v19.4s, v5.4s
142        FMIN    v28.4s, v28.4s, v5.4s
143        FMIN    v29.4s, v29.4s, v5.4s
144        FMIN    v30.4s, v30.4s, v5.4s
145        FMIN    v31.4s, v31.4s, v5.4s
146
147        # Store full 4 x 8
        // Fewer than 8 columns were left: take the partial-width store path.
148        B.LO    5f
149
        // Post-indexing by x14 (cn_stride) moves each C pointer to the next
        // column block. Subtracting kc undoes the advance made while reading
        // K, so every A pointer is back at the start of its row.
150        ST1     {v16.16b, v17.16b},  [x6], x14
151        SUB     x3,  x3, x2             // a0 -= kc
152        ST1     {v18.16b, v19.16b},  [x9], x14
153        SUB     x11, x11, x2            // a1 -= kc
154        ST1     {v28.16b, v29.16b}, [x10], x14
155        SUB     x12, x12, x2            // a2 -= kc
156        ST1     {v30.16b, v31.16b},  [x7], x14
157        SUB     x4,  x4, x2             // a3 -= kc
158
        // Columns remain (nc > 0 after the subtraction): next column block.
159        B.HI    0b
160        RET
161
162        # Remainder- 2 floats of A (8 bytes)
        // On entry the low 4 bits of x0 equal the number of leftover K bytes.
        // NOTE(review): when bit 3 is clear, control goes to label 4, which
        // processes one float without testing bit 2. This presumes kc is a
        // nonzero multiple of 4 bytes -- confirm against the caller contract.
1633:
164        # Is there a remainder?- 2 floats of A (8 bytes)
165        TBZ     x0, 3, 4f
166
167        # Remainder- 2 floats of A (8 bytes)
        // d0-d3 carry 2 floats per row (lanes s[0] and s[1]); 64 bytes of
        // weights are consumed.
168        LDR     d0,  [x3], 8
169        LDP     q20, q21, [x5], 32
170        LDR     d1, [x11], 8
171        LDR     d2, [x12], 8
172        LDR     d3,  [x4], 8
173        FMLA    v16.4s, v20.4s, v0.s[0]
174        FMLA    v17.4s, v21.4s, v0.s[0]
175        FMLA    v18.4s, v20.4s, v1.s[0]
176        FMLA    v19.4s, v21.4s, v1.s[0]
177        LDP     q22, q23, [x5], 32
178        FMLA    v28.4s, v20.4s, v2.s[0]
179        FMLA    v29.4s, v21.4s, v2.s[0]
180        FMLA    v30.4s, v20.4s, v3.s[0]
181        FMLA    v31.4s, v21.4s, v3.s[0]
182        FMLA    v16.4s, v22.4s, v0.s[1]
183        FMLA    v17.4s, v23.4s, v0.s[1]
184        FMLA    v18.4s, v22.4s, v1.s[1]
185        FMLA    v19.4s, v23.4s, v1.s[1]
186        FMLA    v28.4s, v22.4s, v2.s[1]
187        FMLA    v29.4s, v23.4s, v2.s[1]
188        FMLA    v30.4s, v22.4s, v3.s[1]
189        FMLA    v31.4s, v23.4s, v3.s[1]
190
191        # Is there a remainder?- 1 float of A (4 bytes)
        // Bit 2 clear means no 4-byte leftover: go straight to the clamp.
192        TBZ     x0, 2, 2b
193
194        # Remainder- 1 float of A (4 bytes)
        // One float per row (lane s[0]) and 32 bytes of weights.
1954:
196        LDR     s0,  [x3], 4
197        LDP     q20, q21, [x5], 32
198        LDR     s1, [x11], 4
199        LDR     s2, [x12], 4
200        LDR     s3,  [x4], 4
201        FMLA    v16.4s, v20.4s, v0.s[0]
202        FMLA    v17.4s, v21.4s, v0.s[0]
203        FMLA    v18.4s, v20.4s, v1.s[0]
204        FMLA    v19.4s, v21.4s, v1.s[0]
205        FMLA    v28.4s, v20.4s, v2.s[0]
206        FMLA    v29.4s, v21.4s, v2.s[0]
207        FMLA    v30.4s, v20.4s, v3.s[0]
208        FMLA    v31.4s, v21.4s, v3.s[0]
209        B       2b
210
211
212        # Store odd width
        // x1 = nc - 8 is negative here, but subtracting 8 left its low 3
        // bits unchanged, so bits 2, 1 and 0 select the 4-, 2- and 1-column
        // stores. After each partial store the not-yet-written columns are
        // moved down so the next step always stores from v16/v18/v28/v30.
        // This path neither rewinds the A pointers nor applies cn_stride;
        // it ends in RET.
2135:
214        TBZ     x1, 2, 6f
215        STR     q16, [x6], 16
216        MOV     v16.16b, v17.16b
217        STR     q18, [x9], 16
218        MOV     v18.16b, v19.16b
219        STR     q28, [x10], 16
220        MOV     v28.16b, v29.16b
221        STR     q30, [x7], 16
222        MOV     v30.16b, v31.16b
223
2246:
225        TBZ     x1, 1, 7f
        // Store 2 floats per row, then move the upper 64 bits of each
        // register into its lower half for the final 1-column step.
226        STR     d16, [x6], 8
227        STR     d18, [x9], 8
228        DUP     d16, v16.d[1]
229        DUP     d18, v18.d[1]
230        STR     d28, [x10], 8
231        STR     d30, [x7], 8
232        DUP     d28, v28.d[1]
233        DUP     d30, v30.d[1]
234
2357:
236        TBZ     x1, 0, 8f
237        STR     s16,  [x6]
238        STR     s18,  [x9]
239        STR     s28, [x10]
240        STR     s30,  [x7]
241
2428:
243        RET
244
245END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128
246
247#ifdef __ELF__
// ELF targets only: emit an empty .note.GNU-stack section. By GNU toolchain
// convention this tells the linker the object does not need an executable
// stack.
248.section ".note.GNU-stack","",%progbits
249#endif
250