xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mull.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10
#include <xnnpack/assembly.h>
12
# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
//
// Produces up to 2 rows x 8 columns of int8 output per pass of the outer
// loop (label 0) and repeats until nc columns have been written.
// Per pass:
//   1. seed one int32 accumulator per (row, column) from 8 int32 values
//      read from w,
//   2. accumulate signed 8-bit products of A and w over kc bytes,
//   3. convert the sums to float, multiply by a scale read from params and
//      round back to int32,
//   4. narrow with saturation to int16, add a 16-bit value read from
//      params, narrow with saturation to int8,
//   5. clamp between two int8 bounds read from params and store.
//
// Notes on the argument mapping above:
//   - x2 holds kc rounded up to a multiple of 8 for the whole call; x0 (mr)
//     is only needed for the compare at entry and is then reused as the
//     k-loop counter.
//   - [sp] / [sp + 8] are the stack offsets on entry. The prologue pushes a
//     48-byte save area first, so the code reads them at [sp, 48].
//   - A is read 8 bytes at a time, up to the rounded-up kc.
//     NOTE(review): presumably the caller guarantees both A rows are
//     readable up to that length and w is packed with matching padding;
//     confirm against the packing routine.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x4  v1
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull

        # Clamp A and C pointers
        // The flags set by CMP are consumed by the two CSELs below; the
        // STP/ADD instructions in between do not modify them. The d10-d15
        // saves are interleaved with the pointer setup (v10-v15 are used as
        // temporaries in the main loop).
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each scalar load puts one int32 in lane 0 of its accumulator and
        // clears the remaining lanes, so after the pairwise reduction below
        // the value is counted exactly once per column. Row 1 starts from a
        // copy of row 0's accumulators.
        // x10/x11 are reloaded on every pass because x11 is advanced while
        // the params are read further down.
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 48]       // cn_stride, params
        MOV     v31.16b, v30.16b

        # Main loop - 8 bytes of A
        // One iteration consumes 8 bytes from each A row and 64 bytes of w
        // (8 bytes for each of the 8 columns, two columns per LDP).
        // SMULL widens eight int8 x int8 products to int16; SADALP adds
        // adjacent int16 pairs into the four int32 lanes of that column's
        // accumulator (even registers v16..v30 = row 0, odd v17..v31 = row 1).
        // Loads, multiplies and accumulates are deliberately interleaved;
        // NOTE(review): presumably scheduled for throughput, so keep the
        // instruction order as is.
        // The flags from SUBS are consumed by B.HI; the SADALPs between them
        // do not modify flags.
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SUBS    x0, x0, 8
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    1b

        # Add columns
        // Two levels of pairwise adds collapse the four lanes of every
        // accumulator into a single int32 per output element:
        //   v0 = row 0, columns 0-3      v1 = row 0, columns 4-7
        //   v2 = row 1, columns 0-3      v3 = row 1, columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        // params is read sequentially through x11 with broadcast loads:
        //   +0  32-bit scale (multiplied in as float)
        //   +4  16-bit value added after narrowing to int16
        //       NOTE(review): presumably the output zero point - confirm
        //       against the params struct definition
        //   +6  int8 lower bound (applied with SMAX)
        //   +7  int8 upper bound (applied with SMIN)
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        // Float -> int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2

        // Saturating narrow int32 -> int16 (v0 = row 0, v2 = row 1), add
        // the 16-bit value with saturation, then saturating narrow to int8
        // so that v0 holds row 0 in bytes 0-7 and row 1 in bytes 8-15.
        // SUBS computes nc -= 8 here; its flags are used by B.LO below and
        // by B.HI after the full store. Everything between them either is a
        // SIMD/load/store instruction or uses SUB rather than SUBS, so the
        // flags stay intact.
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f

        # Store full 2 x 8
        // Each store post-increments its C pointer by cn_stride (x10). The
        // A pointers are rewound by the rounded kc so the next pass re-reads
        // the same two rows against the next block of w.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
        // Reached when fewer than 8 columns remained. x1 now holds nc - 8;
        // subtracting 8 leaves bits 0-2 unchanged, so they still give the
        // number of columns left to write. Bits 2, 1 and 0 select a 4-, 2-
        // and 1-byte store per row. Row 0 is stored from byte 0 of v0 and
        // row 1 from byte 8; after each partial store EXT rotates v0 so the
        // next unwritten columns of both rows are back at bytes 0 and 8.
        .p2align 3
2:
        TBZ     x1, 2, 3f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
5:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull
199
// On ELF targets, emit an empty .note.GNU-stack section (no flags, in
// particular no "x"), which by GNU toolchain convention records that this
// object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
203
204