// xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c16-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const int8_t* restrict a,  x3
//     size_t a_stride,           x4
//     const void* restrict w,    x5
//     int8_t* restrict c,        x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> x10
//     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
//
// Produces up to 2 rows x 8 columns of signed 8-bit output per pass of the
// outer loop (label 0). Signed 8-bit products are accumulated in 32 bits over
// kc rounded up to a multiple of 16, scaled per output column in fp32, rounded
// back to integer, offset, narrowed with saturation and clamped.
//
// Bytes consumed from w for each group of 8 columns, in the order read below:
//   8 x 32-bit initial accumulator values (the bias)
//   (rounded kc / 16) x 128 bytes: 16 consecutive k-values for each of 8 columns
//   8 x fp32 scale values, one per column
//
// Bytes read from params:
//   +0  16-bit value added with saturation after narrowing to 16 bits
//       (presumably the output zero point - confirm against the params struct)
//   +2  signed byte used as the lower clamp bound (SMAX)
//   +3  signed byte used as the upper clamp bound (SMIN)
//
// Each row of A is read in 16-byte steps up to the rounded kc, so up to 15
// bytes past the nominal kc bytes of a row may be loaded.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This function uses v10-v15 and therefore saves d10-d15 in a 48-byte frame.

// Register usage
// A0  x3  v0
// A1  x4  v1
// B   x5  v4  v5  v6  v7
// C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
// C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
// temp0   v2 v10 v12 v14
// temp1   v3 v11 v13 v15
// unused  v8 v9
// k (remaining kc) x0, cn_stride x10, params x11

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal

        // Clamp A and C pointers. The callee-saved d10-d15 spills are
        // interleaved with the pointer setup; none of them alter the flags
        // set by CMP, which the two CSELs consume.
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!    // open 48-byte frame
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 15

        .p2align 3
0:
        // Load initial bias from w into accumulators.
        // Each scalar load fills lane 0 and zeroes the other lanes; row 1
        // accumulators start as copies of the row 0 accumulators.
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8       // bias for columns 0, 1
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8       // bias for columns 2, 3
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8       // bias for columns 4, 5
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8       // bias for columns 6, 7
        MOV     v29.16b, v28.16b
        // Stack arguments sit above the 48-byte frame. x11 is reloaded on
        // every pass because the LD1R loads below post-increment it.
        LDP     x10, x11, [sp, 48]       // cn_stride, params
        MOV     v31.16b, v30.16b

        // Main loop - 16 bytes of A
        // SMULL multiplies the low 8 bytes, SMLAL2 adds the products of the
        // high 8 bytes into the same 16-bit lanes, and SADALP adds adjacent
        // 16-bit lanes into the 32-bit accumulators.
        // NOTE(review): a 16-bit lane holds the sum of two int8 x int8
        // products; that sum leaves the int16 range only when both products
        // are (-128) * (-128). Presumably packed weights never contain -128 -
        // confirm with the weight packing code.
        .p2align 3
1:
        LDR     q0, [x3], 16            // 16 k-values of row 0
        LDP     q4, q5, [x5]            // weights for columns 0, 1
        LDR     q1, [x4], 16            // 16 k-values of row 1
        LDP     q6, q7, [x5, 32]        // weights for columns 2, 3
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     q4, q5, [x5, 64]        // weights for columns 4, 5
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     q6, q7, [x5, 96]        // weights for columns 6, 7

        SMLAL2  v2.8h, v4.16b, v0.16b
        SMLAL2  v3.8h, v4.16b, v1.16b
        SMLAL2  v10.8h, v5.16b, v0.16b
        SMLAL2  v11.8h, v5.16b, v1.16b
        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags used by B.HI below
        SMLAL2  v12.8h, v6.16b, v0.16b
        SMLAL2  v13.8h, v6.16b, v1.16b
        SMLAL2  v14.8h, v7.16b, v0.16b
        SMLAL2  v15.8h, v7.16b, v1.16b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    1b                      // loop while k > 0

        // Add columns
        // Two rounds of pairwise adds collapse the four partial sums held in
        // each accumulator: v0/v1 end up with row 0 columns 0-3 / 4-7 and
        // v2/v3 with row 1 columns 0-3 / 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        // Load per channel scale values from weights
        // The 8 fp32 scales follow the int8 weights in w; the same scales
        // are applied to both rows.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // Back to signed 32-bit integers, rounding to nearest with ties to even
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // Narrow with saturation to 16 bits, add the 16-bit params value,
        // narrow with saturation to 8 bits, then clamp. Row 0 lands in the
        // low 8 bytes of v0 and row 1 in the high 8 bytes.
        LD1R    {v5.8h}, [x11], 2       // params +0: 16-bit addend
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // params +2: lower bound
        LD1R    {v2.16b}, [x11]         // params +3: upper bound
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f                      // fewer than 8 columns were left

        // Store full 2 x 8
        // None of these instructions alter the flags from SUBS x1 above.
        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        // Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        // Store odd width
        // x1 holds nc - 8 here, whose low three bits equal those of the
        // remaining column count. Bits 2, 1 and 0 select stores of 4, 2 and
        // 1 columns; EXT rotates v0 so the next unwritten columns of both
        // rows move down to the lanes used by the following stores.
        .p2align 3
2:
        TBZ     x1, 2, 3f
        STR     s0, [x6], 4             // row 0, 4 columns
        ST1     {v0.s}[2], [x7], 4      // row 1, 4 columns
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f
        STR     h0, [x6], 2             // row 0, 2 columns
        ST1     {v0.h}[4], [x7], 2      // row 1, 2 columns
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f
        STR     b0, [x6]                // row 0, 1 column
        ST1     {v0.b}[8], [x7]         // row 1, 1 column
5:
        // Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal
216
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
220
221