xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/2x8c16-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0
31# A1 x15  v1
32# B   x5  v4  v5  v6  v7
33# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
34# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
35# temp0   v2 v10 v12 v14
36# temp1   v3 v11 v13 v15
37# unused  v8 v9
38
39BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal
        // Signed 8-bit indirect GEMM micro-kernel. Each pass of the nc loop
        // (label 0) produces a tile of 2 rows x 8 columns of int8 outputs.
        //
        // Loop nest:
        //   0: per 8-column tile   - reload bias, reset p = ks
        //   1: per pair of A rows  - fetch 2 pointers from the indirection buffer a
        //   2: per 16 bytes of k   - int8 x int8 products accumulated into int32
        // After the loops the accumulators are reduced, requantized with the
        // values read from params, clamped, and stored to c0 (x6) and c1 (x7).
        //
        // kc is rounded up to a multiple of 16 below, so A and w are always read
        // in whole 16-byte steps.
        // NOTE(review): this presumably relies on the caller padding A rows and
        // packed weights accordingly - confirm against the packing code.
40
41        # Clamp C pointers
        // The stack arguments are read relative to the incoming sp, i.e. before
        // the STP below pre-decrements sp by 48 bytes.
42        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
43        CMP     x0, 2                   // if mr < 2
44        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
45        ADD     x7, x6, x7              // c1 = c0 + cm_stride
46        STP     d10, d11, [sp, -48]!    // Reserve 48 bytes; save d10-d15 (v10-v15 are used as temps)
47        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
48        STP     d12, d13, [sp, 16]
49        CSEL    x7, x6, x7, LO          //   c1 = c0
50        STP     d14, d15, [sp, 32]
51        BIC     x2, x2, 15              // Second half of the kc round-up: clear the low 4 bits
52
53        .p2align 3
540:
55        # Load initial bias from w into accumulators
        // Eight 32-bit bias values are read, one per output column. A scalar
        // S-register load fills lane 0 and zeroes the remaining lanes, so each
        // accumulator starts as {bias, 0, 0, 0}. The row-1 accumulators
        // (odd-numbered registers) are copies of the row-0 ones.
56        LDP     s16, s18, [x5], 8
57        MOV     v17.16b, v16.16b
58        MOV     v19.16b, v18.16b
59        LDP     s20, s22, [x5], 8
60        MOV     v21.16b, v20.16b
61        MOV     v23.16b, v22.16b
62        LDP     s24, s26, [x5], 8
63        MOV     v25.16b, v24.16b
64        MOV     v27.16b, v26.16b
65        LDP     s28, s30, [x5], 8
66        MOV     v29.16b, v28.16b
67        MOV     v31.16b, v30.16b
68        MOV     x9, x3                  // p = ks
69
70        .p2align 3
711:
72        # Load next 2 A pointers
73        LDP     x13, x15, [x4], 16      // a0, a1 = a[0], a[1]; a += 2 pointers
74
        // A pointer equal to `zero` is used as-is; a_offset is only applied to
        // the other pointers. The CMP tests the pointer value before the ADD
        // modifies it, and ADD does not change the flags CSEL consumes.
75        CMP     x13, x12                // if a0 == zero
76        ADD     x13, x13, x8            // a0 += a_offset
77        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 = a0 + a_offset
78        CMP     x15, x12                // if a1 == zero
79        ADD     x15, x15, x8            // a1 += a_offset
80        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 = a1 + a_offset
81
82        MOV     x0, x2                  // k = kc
83
84        # Main loop - 16 bytes of A
        // Per iteration: 16 bytes from each of the 2 A rows and 8 x 16 = 128
        // bytes of w (16 weights for each of the 8 output columns).
        // For every (column, row) pair:
        //   SMULL   - int16 products of the low 8 bytes
        //   SMLAL2  - adds the int16 products of the high 8 bytes
        //   SADALP  - adds adjacent int16 pairs into the 4 int32 accumulator lanes
        // NOTE(review): the int16 sum of two int8 products overflows only when
        // both factors are -128 in both halves; presumably the weight range
        // excludes -128 - verify against the quantization/packing code.
85        .p2align 3
862:
87        LDR     q0, [x13], 16           // v0 = 16 bytes of row 0; a0 += 16
88        LDP     q4, q5, [x5]            // weights for columns 0 (v4) and 1 (v5)
89        LDR     q1, [x15], 16           // v1 = 16 bytes of row 1; a1 += 16
90        LDP     q6, q7, [x5, 32]        // weights for columns 2 (v6) and 3 (v7)
91        SMULL   v2.8h, v4.8b, v0.8b
92        SMULL   v3.8h, v4.8b, v1.8b
93        SMULL   v10.8h, v5.8b, v0.8b
94        SMULL   v11.8h, v5.8b, v1.8b
95        SMLAL2  v2.8h, v4.16b, v0.16b
96        SMLAL2  v3.8h, v4.16b, v1.16b
97        SMLAL2  v10.8h, v5.16b, v0.16b
98        SMLAL2  v11.8h, v5.16b, v1.16b
99        SMULL   v12.8h, v6.8b, v0.8b
100        SADALP  v16.4s,  v2.8h         // column 0, row 0
101        SMULL   v13.8h, v6.8b, v1.8b
102        SADALP  v17.4s,  v3.8h         // column 0, row 1
103        SMULL   v14.8h, v7.8b, v0.8b
104        SADALP  v18.4s, v10.8h         // column 1, row 0
105        SMULL   v15.8h, v7.8b, v1.8b
106        SADALP  v19.4s, v11.8h         // column 1, row 1
107        LDP     q4, q5, [x5, 64]       // weights for columns 4 (v4) and 5 (v5)
108        SMLAL2  v12.8h, v6.16b, v0.16b
109        SMLAL2  v13.8h, v6.16b, v1.16b
110        SMLAL2  v14.8h, v7.16b, v0.16b
111        SMLAL2  v15.8h, v7.16b, v1.16b
112        SMULL   v2.8h, v4.8b, v0.8b
113        SADALP  v20.4s, v12.8h         // column 2, row 0
114        SMULL   v3.8h, v4.8b, v1.8b
115        SADALP  v21.4s, v13.8h         // column 2, row 1
116        SMULL   v10.8h, v5.8b, v0.8b
117        SADALP  v22.4s, v14.8h         // column 3, row 0
118        SMULL   v11.8h, v5.8b, v1.8b
119        SADALP  v23.4s, v15.8h         // column 3, row 1
120        LDP     q6, q7, [x5, 96]       // weights for columns 6 (v6) and 7 (v7)
121
122        SMLAL2  v2.8h, v4.16b, v0.16b
123        SMLAL2  v3.8h, v4.16b, v1.16b
124        SMLAL2  v10.8h, v5.16b, v0.16b
125        SMLAL2  v11.8h, v5.16b, v1.16b
126        ADD     x5, x5, 128            // w += 8 columns * 16 bytes
127        SMULL   v12.8h, v6.8b, v0.8b
128        SADALP  v24.4s,  v2.8h         // column 4, row 0
129        SMULL   v13.8h, v6.8b, v1.8b
130        SADALP  v25.4s,  v3.8h         // column 4, row 1
131        SMULL   v14.8h, v7.8b, v0.8b
132        SADALP  v26.4s, v10.8h         // column 5, row 0
133        SMULL   v15.8h, v7.8b, v1.8b
134        SADALP  v27.4s, v11.8h         // column 5, row 1
135        SUBS    x0, x0, 16             // k -= 16; flags are consumed by B.HI below
136        SMLAL2  v12.8h, v6.16b, v0.16b
137        SMLAL2  v13.8h, v6.16b, v1.16b
138        SMLAL2  v14.8h, v7.16b, v0.16b
139        SMLAL2  v15.8h, v7.16b, v1.16b
140        SADALP  v28.4s, v12.8h         // column 6, row 0
141        SADALP  v29.4s, v13.8h         // column 6, row 1
142        SADALP  v30.4s, v14.8h         // column 7, row 0
143        SADALP  v31.4s, v15.8h         // column 7, row 1
144        B.HI    2b                     // loop while k > 0 (unsigned)
145
146        # ks loop
147        SUBS    x9, x9, 16              // p -= MR * sizeof(int8_t*)  (2 pointers * 8 bytes)
148        B.HI    1b
149
150        # Add columns
        // Two rounds of pairwise adds collapse the 4 partial sums held in each
        // accumulator into one int32 per output:
        //   v0 = row 0, columns 0-3    v1 = row 0, columns 4-7
        //   v2 = row 1, columns 0-3    v3 = row 1, columns 4-7
        // The first two params loads are interleaved with the reduction.
151        ADDP    v16.4s, v16.4s, v18.4s
152        ADDP    v20.4s, v20.4s, v22.4s
153        LD1R    {v4.4s}, [x11], 4      // params + 0: 32-bit pre-shift, used by SQSHL
154        ADDP    v24.4s, v24.4s, v26.4s
155        ADDP    v28.4s, v28.4s, v30.4s
156        LD1R    {v7.4s}, [x11], 4      // params + 4: 32-bit multiplier, used by SQDMULH
157        ADDP    v17.4s, v17.4s, v19.4s
158        ADDP    v21.4s, v21.4s, v23.4s
159        ADDP    v25.4s, v25.4s, v27.4s
160        ADDP    v29.4s, v29.4s, v31.4s
161        ADDP    v0.4s, v16.4s, v20.4s
162        ADDP    v1.4s, v24.4s, v28.4s
163        ADDP    v2.4s, v17.4s, v21.4s
164        ADDP    v3.4s, v25.4s, v29.4s
165
166        # Apply params - preshift, scale, postshift, bias and clamp
167        LD1R    {v5.4s}, [x11], 4      // params + 8: 32-bit post-shift, used by SRSHL
168        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
169        SQSHL   v1.4s, v1.4s, v4.4s
170        SQSHL   v2.4s, v2.4s, v4.4s
171        SQSHL   v3.4s, v3.4s, v4.4s
172        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
173        SQDMULH v1.4s, v1.4s, v7.4s
174        SQDMULH v2.4s, v2.4s, v7.4s
175        SQDMULH v3.4s, v3.4s, v7.4s
176        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
177        SRSHL   v1.4s, v1.4s, v5.4s
178        SRSHL   v2.4s, v2.4s, v5.4s
179        SRSHL   v3.4s, v3.4s, v5.4s
180
181        LD1R    {v5.8h}, [x11], 2      // params + 12: 16-bit bias added after narrowing (presumably the output zero point)
182        SQXTN   v0.4h, v0.4s           // row 0, columns 0-3 -> int16 (saturating)
183        SQXTN   v2.4h, v2.4s           // row 1, columns 0-3
184        SQXTN2  v0.8h, v1.4s           // row 0, columns 4-7
185        SQXTN2  v2.8h, v3.4s           // row 1, columns 4-7
186        SUBS    x1, x1, 8              // nc -= 8; flags feed both B.LO and B.HI below
187        SQADD   v0.8h, v0.8h, v5.8h    // row 0 + bias
188        SQADD   v1.8h, v2.8h, v5.8h    // row 1 + bias
189        SQXTN   v0.8b, v0.8h           // v0 bytes 0-7  = row 0 as int8
190        SQXTN2  v0.16b, v1.8h          // v0 bytes 8-15 = row 1 as int8
191        LD1R    {v1.16b}, [x11], 1     // params + 14: lower clamp bound
192        LD1R    {v2.16b}, [x11]        // params + 15: upper clamp bound
193        SMAX    v0.16b, v0.16b, v1.16b
194        SUB     x11, x11, 15          // rewind params pointer
195        SMIN    v0.16b, v0.16b, v2.16b
196        B.LO    3f                     // fewer than 8 columns were left: partial store
197
198        # Store full 2 x 8
        // None of the instructions since SUBS x1 modify the condition flags, so
        // B.HI below still tests the updated nc.
199        ST1     {v0.d}[1], [x7], x10   // row 1 -> c1; c1 += cn_stride
200        SUB     x4, x4, x3              // a -= ks
201        ST1     {v0.8b}, [x6], x10     // row 0 -> c0; c0 += cn_stride
202
203        # nc loop
204        B.HI    0b
205
206        # Restore d10-d15 from stack
207        LDP     d14, d15, [sp, 32]
208        LDP     d12, d13, [sp, 16]
209        LDP     d10, d11, [sp], 48
210        RET
211
212        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        // give the number of remaining columns (1-7). Row 0 is stored from the
        // low half of v0 and row 1 from the high half; EXT rotates v0 so the
        // next unstored outputs of each row move to bytes 0 and 8.
        // This path does not loop back: it falls through to the epilogue.
213        .p2align 3
2143:
215        TBZ     x1, 2, 4f              // skip unless 4 columns are to be stored
216        ST1     {v0.s}[2], [x7], 4     // row 1: 4 bytes
217        STR     s0, [x6], 4            // row 0: 4 bytes
218        EXT     v0.16b, v0.16b, v0.16b, 4
219
2204:
221        TBZ     x1, 1, 5f              // skip unless 2 columns are to be stored
222        ST1     {v0.h}[4], [x7], 2     // row 1: 2 bytes
223        STR     h0, [x6], 2            // row 0: 2 bytes
224        EXT     v0.16b, v0.16b, v0.16b, 2
2255:
226        TBZ     x1, 0, 6f              // skip unless 1 column is to be stored
227        ST1     {v0.b}[8], [x7]        // row 1: 1 byte
228        STR     b0, [x6]               // row 0: 1 byte
2296:
230        # Restore d10-d15 from stack
231        LDP     d14, d15, [sp, 32]
232        LDP     d12, d13, [sp, 16]
233        LDP     d10, d11, [sp], 48
234        RET
235
236END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal
237
238#ifdef __ELF__
239.section ".note.GNU-stack","",%progbits
240#endif
241
242