// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mull.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x4  v1
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# unused  v8 v9

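# Roughly, the kernel computes a 2x8 block of C, assuming the usual XNNPACK
# c8 weight layout (8 int32 biases followed by groups of 8 bytes per channel):
#
#   for (m = 0; m < 2; m++)
#     for (n = 0; n < 8; n++)
#       acc[m][n] = bias[n] + sum_k (int32_t) a[m][k] * (int32_t) w[n][k]
#
# with k consumed 8 bytes at a time, followed by rndnu requantization of each
# int32 accumulator down to int8 (see "Apply params" below).
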
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull

        # Clamp A and C pointers
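        # When mr == 1 the CSELs below alias a1/c1 to a0/c0, so the row-1
        # loads stay in bounds and the row-1 stores just rewrite row 0 with
        # identical values.  kc is rounded up to a multiple of 8 so the main
        # loop always consumes whole 8-byte groups, matching the c8 packing
        # of the weights.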
        CMP     x0, 2                   // if mr < 2
        STP     d10, d11, [sp, -48]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d12, d13, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
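        # Each LDP pulls two int32 biases into lane 0 of a row-0 accumulator
        # (upper lanes cleared); the MOVs copy them into the matching row-1
        # accumulators.  All four lanes then gather partial sums in the main
        # loop and are folded together by the ADDP reduction below.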
        MOV     x0, x2                  // k = kc
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 48]       // cn_stride, params
        MOV     v31.16b, v30.16b

        # Main loop - 8 bytes of A
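        # Each iteration consumes 8 bytes from each A row and 64 bytes of
        # packed weights (8 channels x 8 bytes).  SMULL produces 8 int16
        # products per (row, channel); SADALP pairwise-adds them and
        # accumulates into the 4 int32 lanes of that pair's accumulator.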
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SUBS    x0, x0, 8
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HI    1b

        # Add columns
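        # Two rounds of ADDP fold the 4 partial sums per (row, channel) into
        # one int32 each: v0 = row 0 channels 0-3, v1 = row 0 channels 4-7,
        # v2 = row 1 channels 0-3, v3 = row 1 channels 4-7 (bias included).
        # The interleaved LD1Rs prefetch the first two requantization params.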
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        LD1R    {v4.4s}, [x11], 4
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        LD1R    {v7.4s}, [x11], 4
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - preshift, scale, postshift, bias and clamp
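        # rndnu requantization, roughly (shift amounts come from the params
        # block; the post-shift is presumably stored negative so SRSHL acts
        # as a rounding right shift):
        #   acc = sat((acc << pre_shift) * multiplier >> 31)      // SQSHL + SQDMULH
        #   acc = rounding_shift(acc, post_shift)                 // SRSHL
        #   out = clamp(sat8(sat16(acc) + zero_point), min, max)  // SQXTN/SQADD/SQXTN/SMAX/SMIN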
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
        SQSHL   v1.4s, v1.4s, v4.4s
        SQSHL   v2.4s, v2.4s, v4.4s
        SQSHL   v3.4s, v3.4s, v4.4s
        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
        SQDMULH v1.4s, v1.4s, v7.4s
        SQDMULH v2.4s, v2.4s, v7.4s
        SQDMULH v3.4s, v3.4s, v7.4s
        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
        SRSHL   v1.4s, v1.4s, v5.4s
        SRSHL   v2.4s, v2.4s, v5.4s
        SRSHL   v3.4s, v3.4s, v5.4s

        LD1R    {v5.8h}, [x11], 2

        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    2f

        # Store full 2 x 8
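        # Row 0 sits in the low 8 bytes of v0 and row 1 in the high 8 bytes.
        # After storing, c0/c1 advance by cn_stride, a0/a1 rewind by the
        # rounded-up kc, and the loop restarts at 0: while more than 8
        # columns remain (flags are still live from the SUBS of nc above).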
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b

        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

        # Store odd width
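        # Fewer than 8 columns remain; the low bits of x1 (unchanged by the
        # SUBS of 8) select the tail stores: bit 2 -> 4 bytes, bit 1 -> 2
        # bytes, bit 0 -> 1 byte per row.  Each EXT rotates v0 down so the
        # next unstored elements move into position; row-1 stores address
        # the upper half of v0 via lane indices 2 (.s), 4 (.h) and 8 (.b).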
        .p2align 3
2:
        TBZ     x1, 2, 3f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

3:
        TBZ     x1, 1, 4f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
4:
        TBZ     x1, 0, 5f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
5:
        # Restore d10-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d10, d11, [sp], 48
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif