xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3 v0
29# B   x5 v4  v5  v6  v7  v16  v17 v18 v19
30# C0  x6 v28 v29 v30 v31
31# unused v8 v9 v10 v11 v12 v13 v14 v15
32
# Overview (derived from the instructions below):
#   * Each pass through label 0 produces one row of up to 16 int8 outputs.
#   * v28-v31 (4 x int32 lanes each) start from 64 bytes read from w; SDOT then
#     accumulates 4-byte signed dot products of w against 4-byte groups of A.
#   * Label 2 requantizes: int32 -> float, multiply by 16 per-channel floats read
#     from w, FCVTNS back to int32, saturating narrow to int16, SQADD a 16-bit
#     value from params, saturating narrow to int8, clamp with SMAX/SMIN.
#   * mr (x0), a_stride (x4) and cm_stride (x7) are never read; x0 is reused as
#     the k counter.
33BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
34        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
35        BIC     x2, x2, 3               // i.e. round kc up to a multiple of 4
36
37        .p2align 3
380:
39        # Outer loop over nc: load initial bias (16 x int32, 64 bytes) from w into accumulators
40        LDP     q28, q29, [x5], 32
41        SUBS    x0, x2, 8               // k = kc - 8; flags are consumed by B.LO below (loads do not change them)
42        LDP     q30, q31, [x5], 32
43        LDR     x11, [sp, 8]            // params; reloaded on every pass because x11 is post-incremented at label 2
44
45        # Is there at least 8 bytes?
46        B.LO    3f                      // kc < 8: only the 4-byte remainder block runs
47
48        # Main loop - 8 bytes of A and 128 bytes of w per iteration
49        .p2align 3
501:
50        LDR     d0,  [x3], 8            // v0 = next 8 bytes of A (two 4-byte groups)
51        LDR     q16, [x5, 0]
52        LDR     q17, [x5, 16]
53        SDOT    v28.4s, v16.16b, v0.4b[0]       // output columns 0-3,   A bytes 0-3
54        LDR     q18, [x5, 32]
55        SDOT    v29.4s, v17.16b, v0.4b[0]       // output columns 4-7,   A bytes 0-3
56        LDR     q19, [x5, 48]
57        SDOT    v30.4s, v18.16b, v0.4b[0]       // output columns 8-11,  A bytes 0-3
58        LDR     q4, [x5, 64]
59        SDOT    v31.4s, v19.16b, v0.4b[0]       // output columns 12-15, A bytes 0-3
60        LDR     q5, [x5, 80]
61        SDOT    v28.4s, v4.16b,  v0.4b[1]       // output columns 0-3,   A bytes 4-7
62        LDR     q6, [x5, 96]
63        SDOT    v29.4s, v5.16b,  v0.4b[1]       // output columns 4-7,   A bytes 4-7
64        LDR     q7, [x5, 112]
65        SDOT    v30.4s, v6.16b,  v0.4b[1]       // output columns 8-11,  A bytes 4-7
66        ADD     x5, x5, 128             // advance w past the 8 vectors just consumed
67        SDOT    v31.4s, v7.16b,  v0.4b[1]       // output columns 12-15, A bytes 4-7
68        SUBS    x0, x0, 8               // k -= 8
69        B.HS    1b                      // no borrow: at least 8 more bytes of A remain
70
71
72        # Is there a remainder? x0 is now -8 (none left) or -4 (4 bytes of A left); bit 2 tells them apart
73        TBNZ    x0, 2, 3f
74
752:
76        # Load per channel scale values from weights
77        SCVTF   v28.4s, v28.4s          // int32 accumulators -> float
78        LDR     q4, [x5], 16            // scales for columns 0-3
79        SCVTF   v29.4s, v29.4s
80        LDR     q5, [x5], 16            // scales for columns 4-7
81        SCVTF   v30.4s, v30.4s
82        LDR     q6, [x5], 16            // scales for columns 8-11
83        SCVTF   v31.4s, v31.4s
84        FMUL    v28.4s, v28.4s, v4.4s
85        LDR     q4, [x5], 16            // scales for columns 12-15; v4 is free again after the FMUL above
86        FMUL    v29.4s, v29.4s, v5.4s
87        FMUL    v30.4s, v30.4s, v6.4s
88        FMUL    v31.4s, v31.4s, v4.4s
89
90        FCVTNS  v28.4s, v28.4s          // float -> int32, round to nearest with ties to even
91        FCVTNS  v29.4s, v29.4s
92        FCVTNS  v30.4s, v30.4s
93        FCVTNS  v31.4s, v31.4s
94
95        LD1R    {v6.8h}, [x11], 2       // v6 = 16-bit value from params in every lane, added by SQADD below (presumably the output zero point - confirm against the params layout)
96        SQXTN   v0.4h, v28.4s           // saturating narrow int32 -> int16: v0 = columns 0-7
97        SQXTN   v2.4h, v30.4s           //                                   v2 = columns 8-15
98        SQXTN2  v0.8h, v29.4s
99        SQXTN2  v2.8h, v31.4s
100
101        LD2R    {v4.16b, v5.16b}, [x11] // next two params bytes: v4 = lower bound (SMAX), v5 = upper bound (SMIN)
102        SQADD   v0.8h, v0.8h, v6.8h     // saturating add of the 16-bit params value
103        SQADD   v2.8h, v2.8h, v6.8h
104        LDR     x12, [sp]               // cn_stride
105        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
106        SQXTN2  v0.16b, v2.8h           // v0 = all 16 output bytes
107        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO and B.NE below
108        SMAX    v0.16b, v0.16b, v4.16b  // clamp to min
109        SMIN    v0.16b, v0.16b, v5.16b  // clamp to max
110        B.LO    4f                      // fewer than 16 columns were left: partial store
111
112        # Store full 1 x 16
113        ST1     {v0.16b}, [x6], x12     // store 16 bytes, then c += cn_stride
114        SUB     x3,  x3, x2             // a0 -= kc
115        B.NE    0b                      // nc != 0 (flags still from the SUBS above): next 16 columns
116
117        RET
118
119        # Remainder - 4 bytes of A
120        .p2align 3
1213:
122        LDR     s0,  [x3], 4            // v0 lane 0 = last 4 bytes of A
123        LDR     q16, [x5, 0]
124        LDR     q17, [x5, 16]
125        SDOT    v28.4s, v16.16b, v0.4b[0]
126        LDR     q18, [x5, 32]
127        SDOT    v29.4s, v17.16b, v0.4b[0]
128        LDR     q19, [x5, 48]
129        SDOT    v30.4s, v18.16b, v0.4b[0]
130        ADD     x5, x5, 64              // advance w past the 4 vectors just consumed
131        SDOT    v31.4s, v19.16b, v0.4b[0]
132        B       2b                      // continue with requantization
133
134        # Store odd width: x1 = nc - 16 here, so its low 4 bits equal those of the remaining nc
135        .p2align 3
1364:
137        TBZ     x1, 3, 5f
138        STR     d0, [x6], 8             // store 8 bytes
139        DUP     d0, v0.d[1]             // move the upper 8 bytes down for the following stores
1405:
141        TBZ     x1, 2, 6f
142        STR     s0, [x6], 4             // store 4 bytes
143        DUP     s0, v0.s[1]             // shift the next 4 bytes down
1446:
145        TBZ     x1, 1, 7f
146        STR     h0, [x6], 2             // store 2 bytes
147        DUP     h0, v0.h[1]             // shift the next 2 bytes down
1487:
149        TBZ     x1, 0, 8f
150        STR     b0, [x6]                // store the final byte
1518:
152        RET
153
154END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
155
156#ifdef __ELF__
// For ELF targets, emit an empty .note.GNU-stack section with no flags; by GNU
// toolchain convention this marks the object as not requiring an executable stack.
157.section ".note.GNU-stack","",%progbits
158#endif
159