// Source: /aosp_15_r20/external/XNNPACK/src/f16-igemm/1x16-minmax-aarch64-neonfp16arith-ld32.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                         (x0) - unused, x0 is overwritten with k
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     void*restrict c,                   x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8), x8 is then reused for A0

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x8 v0

# B   x5 v20 v21 v22 v23

# C0  x6 v24 v25 (v26 v27 are a second accumulator set, added into v24 v25)

# Clamp v4 (lower bound, applied with FMAX), v5 (upper bound, applied with FMIN)

# 1x16 half-precision indirect GEMM microkernel with clamping.
# Loop structure (numeric local labels):
#   0: nc loop  - one pass per group of up to 16 output halffloats
#   1: ks loop  - one pass per A pointer read from the indirection buffer a
#   2: k loop   - 2 halffloats of A per iteration, 4: handles 1 left-over halffloat
#   5..9: partial-width store when fewer than 16 outputs remain
# NOTE(review): the k handling assumes kc is a non-zero multiple of 2 bytes;
# confirm against the caller.
BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load params values: the two consecutive halffloats at [x8] are
        # broadcast to every lane of v4 and v5 respectively
        LD2R    {v4.8h, v5.8h}, [x8]

0:
        # Load initial bias from w into accumulators
        LDR     q24, [x5], 16
        LDR     q25, [x5], 16
        MOVI    v26.8h, 0               // second set of C for pipelining FMLA
        MOVI    v27.8h, 0

        MOV     x9, x3                  // p = ks

1:
        # Load next A pointer
        LDR     x8, [x4], 8

        CMP     x8, x12                 // if a0 == zero
        ADD     x8, x8, x11             // a0 += a_offset (ADD does not touch the CMP flags)
        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 = a0 + a_offset

        # Are there at least 2 halffloats (4 bytes)?
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    4f                      // kc < 4: only the 1 halffloat remainder is left

       .p2align 3
        # Main loop - 2 halffloats of A (4 bytes), 64 bytes of B
2:
        LDR     s0,  [x8], 4
        LDR     q20, [x5, 0]
        LDR     q21, [x5, 16]
        LDR     q22, [x5, 32]
        LDR     q23, [x5, 48]
        SUBS    x0, x0, 4               // k -= 4; flags are consumed by B.HS below
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v22.8h, v0.h[1] // second A element goes to the second accumulator set
        FMLA    v27.8h, v23.8h, v0.h[1]
        ADD     x5, x5, 64              // w += 64 (ADD keeps the SUBS flags)
        B.HS    2b

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 4f               // bit 1 of the now negative k marks 2 left-over bytes

3:
        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(void*)
        B.HI    1b

        # Fold the second accumulator set into the first
        FADD    v24.8h, v24.8h, v26.8h
        FADD    v25.8h, v25.8h, v27.8h

        # Clamp
        FMAX    v24.8h, v24.8h, v4.8h   // lower bound
        FMAX    v25.8h, v25.8h, v4.8h
        FMIN    v24.8h, v24.8h, v5.8h   // upper bound
        FMIN    v25.8h, v25.8h, v5.8h

        # Store full 1 x 16
        SUBS    x1, x1, 16              // nc -= 16; flags stay live until B.HI below
        B.LO    5f

        STP     q24, q25,  [x6]
        ADD     x6,  x6, x10            // c += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // still using the flags from SUBS x1
        RET

        # Remainder- 1 halffloat of A
4:
        LDR     h0, [x8], 2
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        B       3b

        # Store odd width: bits 3..0 of x1 select stores of 8, 4, 2 and 1
        # halffloats; after each store the unwritten results move to the
        # low lanes of v24
5:
        TBZ     x1, 3, 6f
        STR     q24, [x6], 16           // 8 halffloats
        MOV     v24.16b, v25.16b
6:
        TBZ     x1, 2, 7f
        STR     d24, [x6], 8            // 4 halffloats
        DUP     d24, v24.d[1]
7:
        TBZ     x1, 1, 8f
        STR     s24,  [x6], 4           // 2 halffloats
        DUP     s24, v24.s[1]
8:
        TBZ     x1, 0, 9f
        STR     h24,  [x6]              // 1 halffloat
9:
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

// On ELF targets, emit an empty .note.GNU-stack section. GNU toolchains read
// its presence (with no "x" flag) as "this object does not need an
// executable stack".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif