// xref: /aosp_15_r20/external/XNNPACK/src/f16-igemm/6x16-minmax-aarch64-neonfp16arith-ld32.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x14 v0
# A1 x15 v1
# A2 x20 v2
# A3 x21 v3
# A4 x22 v4
# A5 x23 v5

# B   x5 v16 v17 v18 v19

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x10  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

46BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32
47
48        # Load zero, params pointer
49        LDP     x12, x8, [sp, 16]
50
51        # Clamp C pointers
52        CMP     x0, 2                   // if mr < 2
53        ADD     x16, x6, x7             // c1 = c0 + cm_stride
54        CSEL    x16, x6, x16, LO        //   c1 = c0
55        ADD     x17, x16, x7            // c2 = c1 + cm_stride
56                                        // if mr <= 2
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58
59        # Load params
60        LDR     s6, [x8]
61
62        CMP     x0, 4                   // if mr < 4
63        ADD     x10, x17, x7            // c3 = c2 + cm_stride
64        CSEL    x10, x17, x10, LO       //   c3 = c2
65        ADD     x13, x10, x7            // c4 = c3 + cm_stride
66                                        // if mr <= 4
67        CSEL    x13, x10, x13, LS       //   c4 = c3
68        CMP     x0, 6                   // if mr < 6
69        ADD     x7, x13, x7             // c5 = c4 + cm_stride
70        CSEL    x7, x13, x7, LO         //   c5 = c4
71
72        LDP     x8, x11, [sp]           // load cn_stride, a_offset
73
74        # Save x20-x23 on stack
75        STP     x20, x21, [sp, -32]!
76        STP     x22, x23, [sp, 16]
77
780:
79        # Load initial bias from w into accumulators
80        LDP     q20, q21, [x5], 32
81        MOV     x9, x3                  // p = ks
82        MOV     v22.16b, v20.16b
83        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
84        MOV     v23.16b, v21.16b
85        PRFM    PLDL1KEEP, [x5, 64]
86        MOV     v24.16b, v20.16b
87        PRFM    PLDL1KEEP, [x5, 128]
88        MOV     v25.16b, v21.16b
89        PRFM    PLDL1KEEP, [x5, 192]
90        MOV     v26.16b, v20.16b
91        PRFM    PLDL1KEEP, [x5, 256]
92        MOV     v27.16b, v21.16b
93        PRFM    PLDL1KEEP, [x5, 320]
94        MOV     v28.16b, v20.16b
95        MOV     v29.16b, v21.16b
96        MOV     v30.16b, v20.16b
97        MOV     v31.16b, v21.16b
98
991:
100        # Load next 6 A pointers
101        LDP     x14, x15, [x4], 16
102        LDP     x20, x21, [x4], 16
103        LDP     x22, x23, [x4], 16
104
105        CMP     x14, x12                // if a0 == zero
106        ADD     x14, x14, x11           // a0 += a_offset
107        CSEL    x14, x12, x14, EQ       //   a0 = zero, else += a0 + a_offset
108        CMP     x15, x12                // if a1 == zero
109        ADD     x15, x15, x11           // a1 += a_offset
110        CSEL    x15, x12, x15, EQ       //   a1 = zero, else += a1 + a_offset
111        CMP     x20, x12                // if a2 == zero
112        ADD     x20, x20, x11           // a2 += a_offset
113        CSEL    x20, x12, x20, EQ       //   a2 = zero, else += a2 + a_offset
114        CMP     x21, x12                // if a3 == zero
115        ADD     x21, x21, x11           // a3 += a_offset
116        CSEL    x21, x12, x21, EQ       //   a3 = zero, else += a3 + a_offset
117        CMP     x22, x12                // if a4 == zero
118        ADD     x22, x22, x11           // a4 += a_offset
119        CSEL    x22, x12, x22, EQ       //   a4 = zero, else += a4 + a_offset
120        CMP     x23, x12                // if a5 == zero
121        ADD     x23, x23, x11           // a5 += a_offset
122        CSEL    x23, x12, x23, EQ       //   a5 = zero, else += a5 + a_offset
123
124        # Is there at least 2 halffloats (4 bytes)?
125        SUBS    x0, x2, 4               // k = kc - 4
126        B.LO    4f
127
128       .p2align 3
129        # Main loop - 2 halffloats of A (4 bytes)
130        # 24 FMA + 6 ld32 A + 4 LDR B
1312:
132        LDR     s0, [x14], 4              // A0
133        LDR     q16, [x5], 16             // B
134        LDR     q17, [x5], 16             // B
135        LDR     s1, [x15], 4              // A1
136        LDR     s2, [x20], 4              // A2
137        LDR     s3, [x21], 4              // A3
138        LDR     s4, [x22], 4              // A4
139        LDR     s5, [x23], 4              // A5
140        LDR     q18, [x5], 16             // B
141        LDR     q19, [x5], 16             // B
142        SUBS    x0, x0, 4
143        FMLA    v20.8h, v16.8h,  v0.h[0]
144        FMLA    v21.8h, v17.8h,  v0.h[0]
145        FMLA    v22.8h, v16.8h,  v1.h[0]
146        FMLA    v23.8h, v17.8h,  v1.h[0]
147        FMLA    v24.8h, v16.8h,  v2.h[0]
148        FMLA    v25.8h, v17.8h,  v2.h[0]
149        FMLA    v26.8h, v16.8h,  v3.h[0]
150        FMLA    v27.8h, v17.8h,  v3.h[0]
151        FMLA    v28.8h, v16.8h,  v4.h[0]
152        FMLA    v29.8h, v17.8h,  v4.h[0]
153        FMLA    v30.8h, v16.8h,  v5.h[0]
154        FMLA    v31.8h, v17.8h,  v5.h[0]
155
156        FMLA    v20.8h, v18.8h,  v0.h[1]
157        FMLA    v21.8h, v19.8h,  v0.h[1]
158        FMLA    v22.8h, v18.8h,  v1.h[1]
159        FMLA    v23.8h, v19.8h,  v1.h[1]
160        FMLA    v24.8h, v18.8h,  v2.h[1]
161        FMLA    v25.8h, v19.8h,  v2.h[1]
162        FMLA    v26.8h, v18.8h,  v3.h[1]
163        FMLA    v27.8h, v19.8h,  v3.h[1]
164        FMLA    v28.8h, v18.8h,  v4.h[1]
165        FMLA    v29.8h, v19.8h,  v4.h[1]
166        FMLA    v30.8h, v18.8h,  v5.h[1]
167        FMLA    v31.8h, v19.8h,  v5.h[1]
168        B.HS    2b
169
170        # Is there a remainder?- 1 halffloat of A (2 bytes)
171        TBNZ    x0, 1, 4f
172
1733:
174        # ks loop
175        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
176        B.HI    1b
177
178        # Clamp
179        DUP     v4.8h, v6.h[0]
180        DUP     v5.8h, v6.h[1]
181        FMAX    v20.8h, v20.8h, v4.8h
182        FMAX    v21.8h, v21.8h, v4.8h
183        FMAX    v22.8h, v22.8h, v4.8h
184        FMAX    v23.8h, v23.8h, v4.8h
185        FMAX    v24.8h, v24.8h, v4.8h
186        FMAX    v25.8h, v25.8h, v4.8h
187        FMAX    v26.8h, v26.8h, v4.8h
188        FMAX    v27.8h, v27.8h, v4.8h
189        FMAX    v28.8h, v28.8h, v4.8h
190        FMAX    v29.8h, v29.8h, v4.8h
191        FMAX    v30.8h, v30.8h, v4.8h
192        FMAX    v31.8h, v31.8h, v4.8h
193        SUBS    x1, x1, 16
194        FMIN    v20.8h, v20.8h, v5.8h
195        FMIN    v21.8h, v21.8h, v5.8h
196        FMIN    v22.8h, v22.8h, v5.8h
197        FMIN    v23.8h, v23.8h, v5.8h
198        FMIN    v24.8h, v24.8h, v5.8h
199        FMIN    v25.8h, v25.8h, v5.8h
200        FMIN    v26.8h, v26.8h, v5.8h
201        FMIN    v27.8h, v27.8h, v5.8h
202        FMIN    v28.8h, v28.8h, v5.8h
203        FMIN    v29.8h, v29.8h, v5.8h
204        FMIN    v30.8h, v30.8h, v5.8h
205        FMIN    v31.8h, v31.8h, v5.8h
206
207        # Store full 6 x 16
208        B.LO    5f
209
210        ST1     {v30.16b, v31.16b},  [x7], x8
211        ST1     {v28.16b, v29.16b}, [x13], x8
212        ST1     {v26.16b, v27.16b}, [x10], x8
213        ST1     {v24.16b, v25.16b}, [x17], x8
214        ST1     {v22.16b, v23.16b}, [x16], x8
215        ST1     {v20.16b, v21.16b},  [x6], x8
216
217        SUB     x4, x4, x3              // a -= ks
218
219        # nc loop
220        B.HI    0b
221
222        # Restore x20-x23 from stack
223        LDP     x22, x23, [sp, 16]
224        LDP     x20, x21, [sp], 32
225        RET
226
2274:
228        # Remainder- 1 halffloat of A (2 bytes)
229        LDR     h0, [x14], 2              // A0
230        LDR     q16, [x5], 16             // B
231        LDR     q17, [x5], 16             // B
232        LDR     h1, [x15], 2              // A1
233        LDR     h2, [x20], 2              // A2
234        LDR     h3, [x21], 2              // A3
235        LDR     h4,  [x22], 2             // A4
236        LDR     h5,  [x23], 2             // A5
237        FMLA    v20.8h, v16.8h,  v0.h[0]
238        FMLA    v21.8h, v17.8h,  v0.h[0]
239        FMLA    v22.8h, v16.8h,  v1.h[0]
240        FMLA    v23.8h, v17.8h,  v1.h[0]
241        FMLA    v24.8h, v16.8h,  v2.h[0]
242        FMLA    v25.8h, v17.8h,  v2.h[0]
243        FMLA    v26.8h, v16.8h,  v3.h[0]
244        FMLA    v27.8h, v17.8h,  v3.h[0]
245        FMLA    v28.8h, v16.8h,  v4.h[0]
246        FMLA    v29.8h, v17.8h,  v4.h[0]
247        FMLA    v30.8h, v16.8h,  v5.h[0]
248        FMLA    v31.8h, v17.8h,  v5.h[0]
249        B       3b
250
251        # Store odd width
2525:
253        TBZ     x1, 3, 6f
254        STR     q30,  [x7], 16
255        MOV     v30.16b, v31.16b
256        STR     q28, [x13], 16
257        MOV     v28.16b, v29.16b
258        STR     q26, [x10], 16
259        MOV     v26.16b, v27.16b
260        STR     q24, [x17], 16
261        MOV     v24.16b, v25.16b
262        STR     q22, [x16], 16
263        MOV     v22.16b, v23.16b
264        STR     q20,  [x6], 16
265        MOV     v20.16b, v21.16b
2666:
267        TBZ     x1, 2, 7f
268        STR     d30,  [x7], 8
269        STR     d28, [x13], 8
270        DUP     d30, v30.d[1]
271        DUP     d28, v28.d[1]
272        STR     d26, [x10], 8
273        STR     d24, [x17], 8
274        DUP     d26, v26.d[1]
275        DUP     d24, v24.d[1]
276        STR     d22, [x16], 8
277        STR     d20,  [x6], 8
278        DUP     d22, v22.d[1]
279        DUP     d20, v20.d[1]
280
2817:
282        TBZ     x1, 1, 8f
283        STR     s30,  [x7], 4
284        STR     s28, [x13], 4
285        DUP     s30, v30.s[1]
286        DUP     s28, v28.s[1]
287        STR     s26, [x10], 4
288        STR     s24, [x17], 4
289        DUP     s26, v26.s[1]
290        DUP     s24, v24.s[1]
291        STR     s22, [x16], 4
292        STR     s20,  [x6], 4
293        DUP     s22, v22.s[1]
294        DUP     s20, v20.s[1]
295
2968:
297        TBZ     x1, 0, 9f
298        STR     h30,  [x7]
299        STR     h28, [x13]
300        STR     h26, [x10]
301        STR     h24, [x17]
302        STR     h22, [x16]
303        STR     h20,  [x6]
3049:
305        # Restore x20-x23 from stack
306        LDP     x22, x23, [sp, 16]
307        LDP     x20, x21, [sp], 32
308        RET
309
310END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32
311
312#ifdef __ELF__
313.section ".note.GNU-stack","",%progbits
314#endif
315