// xref: /aosp_15_r20/external/XNNPACK/src/f16-igemm/6x16-minmax-aarch64-neonfp16arith-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x14 v0
# A1 x15 v1
# A2 x20 v2
# A3 x21 v3
# A4 x22 v4
# A5 x23 v5

# B   x5 v16 v17 v18 v19

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x10  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

// Half-precision indirect GEMM microkernel with min/max clamping.
//
// Each pass of the nc loop (label 0) produces one tile of up to 6 rows by
// 16 columns of C:
//   - the 12 accumulators v20-v31 are seeded from the first 32 bytes at w,
//   - the ks loop (label 1) loads 6 A row pointers per step from the
//     indirection buffer a, substituting `zero` or adding a_offset,
//   - the main loop (label 2) consumes 4 halffloats (8 bytes) of every A row
//     and 64 bytes of B per iteration; labels 4 and 5 handle 2 and 1 leftover
//     halffloats,
//   - the accumulators are clamped to the two halffloats loaded from params
//     and stored, either as a full 16-column tile or piecewise (label 6).
//
// Rows beyond mr alias the previous row's C pointer, so their stores land on
// the same addresses as a valid row. x20-x23 are callee-saved and are
// spilled to / restored from the stack around the body.
BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        # Load params: 4 bytes = 2 halffloats.
        # v6.h[0] is the FMAX (lower) bound, v6.h[1] the FMIN (upper) bound.
        LDR     s6, [x8]

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3
        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # x8 (params pointer) is no longer needed and is reused for cn_stride.
        # Read before sp is moved by the register spill below.
        LDP     x8, x11, [sp]           // load cn_stride, a_offset

        # Save x20-x23 on stack
        STP     x20, x21, [sp, -32]!
        STP     x22, x23, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to `zero` is used as-is; any other pointer gets
        # a_offset added. The ADD is done unconditionally and CSEL picks.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 += a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 += a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 += a_offset

        # Are there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 ld64 A + 8 LDR B
2:
        LDR     d0, [x14], 8              // A0
        LDR     q16, [x5], 16             // B
        LDR     q17, [x5], 16             // B
        LDR     d1, [x15], 8              // A1
        LDR     d2, [x20], 8              // A2
        LDR     d3, [x21], 8              // A3
        LDR     d4, [x22], 8              // A4
        LDR     d5, [x23], 8              // A5
        LDR     q18, [x5], 16             // B
        LDR     q19, [x5], 16             // B
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below

        FMLA    v20.8h, v16.8h,  v0.h[2]
        FMLA    v22.8h, v16.8h,  v1.h[2]
        FMLA    v24.8h, v16.8h,  v2.h[2]
        FMLA    v26.8h, v16.8h,  v3.h[2]
        FMLA    v28.8h, v16.8h,  v4.h[2]
        FMLA    v30.8h, v16.8h,  v5.h[2]
        FMLA    v21.8h, v17.8h,  v0.h[2]
        FMLA    v23.8h, v17.8h,  v1.h[2]
        FMLA    v25.8h, v17.8h,  v2.h[2]
        FMLA    v27.8h, v17.8h,  v3.h[2]
        FMLA    v29.8h, v17.8h,  v4.h[2]
        FMLA    v31.8h, v17.8h,  v5.h[2]

        FMLA    v20.8h, v18.8h,  v0.h[3]
        FMLA    v22.8h, v18.8h,  v1.h[3]
        FMLA    v24.8h, v18.8h,  v2.h[3]
        FMLA    v26.8h, v18.8h,  v3.h[3]
        FMLA    v28.8h, v18.8h,  v4.h[3]
        FMLA    v30.8h, v18.8h,  v5.h[3]
        FMLA    v21.8h, v19.8h,  v0.h[3]
        FMLA    v23.8h, v19.8h,  v1.h[3]
        FMLA    v25.8h, v19.8h,  v2.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        FMLA    v29.8h, v19.8h,  v4.h[3]
        FMLA    v31.8h, v19.8h,  v5.h[3]
        B.HS    2b                      // FMLA leaves the SUBS flags intact

        # Is there a remainder? - 1-3 halffloats of A (2-6 bytes)
        ADDS    x0, x0, 8               // undo the last subtract: x0 = bytes left
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO    6f                      // fewer than 16 columns left: odd width

        ST1     {v30.16b, v31.16b},  [x7], x8
        ST1     {v28.16b, v29.16b}, [x13], x8
        ST1     {v26.16b, v27.16b}, [x10], x8
        ST1     {v24.16b, v25.16b}, [x17], x8
        ST1     {v22.16b, v23.16b}, [x16], x8
        ST1     {v20.16b, v21.16b},  [x6], x8

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Neither ST1 nor SUB writes the flags, so this still tests the
        # result of SUBS x1 above: loop while columns remain.
        B.HI    0b

        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

        # Remainder - 1-3 halffloats of A (2-6 bytes)
        # x0 is either the byte remainder (after ADDS above) or kc - 8 when
        # entered directly from the kc < 8 check. Subtracting 8 does not
        # change bits 1 and 2, so the TBZ tests below work for both.
4:
        TBZ     x0, 2, 5f               // no 2-halffloat (4-byte) step needed
        LDR     s0, [x14], 4              // A0
        LDR     q16, [x5], 16             // B
        LDR     q17, [x5], 16             // B
        LDR     s1, [x15], 4              // A1
        LDR     s2, [x20], 4              // A2
        LDR     s3, [x21], 4              // A3
        LDR     s4, [x22], 4              // A4
        LDR     s5, [x23], 4              // A5
        LDR     q18, [x5], 16             // B
        LDR     q19, [x5], 16             // B
        SUBS    x0, x0, 4               // k -= 4
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

5:
        TBZ     x0, 1, 3b               // no single halffloat left: back to ks loop
        LDR     h0, [x14], 2              // A0
        LDR     q16, [x5], 16             // B
        LDR     q17, [x5], 16             // B
        LDR     h1, [x15], 2              // A1
        LDR     h2, [x20], 2              // A2
        LDR     h3, [x21], 2              // A3
        LDR     h4,  [x22], 2             // A4
        LDR     h5,  [x23], 2             // A5
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        # x1 holds nc - 16 here; its low 4 bits equal those of the remaining
        # column count, so each TBZ selects an 8/4/2/1-column partial store.
        # After each partial store the unstored upper part is moved down.
6:
        TBZ     x1, 3, 7f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
7:
        TBZ     x1, 2, 8f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

8:
        TBZ     x1, 1, 9f
        STR     s30,  [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x10], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20,  [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

9:
        TBZ     x1, 0, 10f
        STR     h30,  [x7]
        STR     h28, [x13]
        STR     h26, [x10]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20,  [x6]
10:
        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64

// On ELF targets, emit an empty .note.GNU-stack section (the GNU toolchain
// convention for declaring that this object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif