// xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mull.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x4  v1
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9

# GEMM microkernel body. Each pass of the outer loop (label 0) produces up to
# 8 output columns for 2 rows. Label 1 is the K loop, and labels 2-5 handle a
# final group of fewer than 8 columns.
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull

        # Clamp A and C pointers
        # The d10-d15 spills are interleaved with the pointer setup because
        # v10-v15 serve as temporaries in the main loop; they occupy a
        # 48-byte stack frame.
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Every LDP fetches two 32-bit biases, one per column accumulator of
        # row 0. The row 1 accumulators start as copies of the row 0 ones.
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        # The stack arguments sit above the 48-byte frame. x11 is
        # post-incremented while params is read further down, so it is
        # reloaded on every pass.
        LDP     x10, x11, [sp, 48]       // cn_stride, params
        MOV     v31.16b, v30.16b

        # Main loop - 8 bytes of A
        # Per iteration: 8 bytes from each A row and 64 bytes of B (8 bytes
        # for each of the 8 columns). SMULL forms 16-bit products and SADALP
        # adds adjacent product pairs into the 32-bit accumulators.
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // w += 8 columns * 8 bytes
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SUBS    x0, x0, 8               // k -= 8
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    1b                      // repeat while more of K remains

        # Add columns
        # Two rounds of pairwise adds collapse the four partial sums held in
        # each accumulator. Afterwards v0/v1 hold row 0 columns 0-3/4-7 and
        # v2/v3 hold row 1 columns 0-3/4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        # The 8 scales are 32-bit floats that follow the B data. The sums are
        # converted to float, scaled, and converted back with FCVTNS
        # (round to nearest, ties to even).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Narrow to 16 bits with saturation, add the 16-bit value read from
        # params (presumably the output zero point - confirm against the
        # params layout), then narrow to 8 bits. v0 ends up with row 0 in its
        # low 8 bytes and row 1 in its high 8 bytes.
        LD1R    {v5.8h}, [x11], 2

        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound, applied by SMAX
        LD1R    {v2.16b}, [x11]         // upper clamp bound, applied by SMIN
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
        # x1 now holds nc - 8, whose low 3 bits equal those of the remaining
        # column count. Bits 2, 1 and 0 select 4-, 2- and 1-byte stores. EXT
        # rotates v0 so the next unstored bytes of row 0 and row 1 sit at
        # byte 0 and byte 8 again.
        .p2align 3
2:
        TBZ     x1, 2, 3f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
5:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif