1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# B   x5  v4  v5  v2  v3
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32# x16, x17, x7 temporary a53 gpr load data
33
// Produces 8 int8 outputs (one row, 8 columns) per pass through label 0 and
// repeats until nc is exhausted. Each pass:
//   - loads 8 32-bit values from w into lane 0 of v16..v30 (even registers),
//   - accumulates int8 x int8 products of A and w over kc bytes, where kc has
//     been rounded up to a multiple of 8,
//   - reduces the per-column partial sums, requantizes and clamps them with
//     the values read through the params pointer, and stores them to c.
//
// Registers written: x0-x3, x5-x7, x10, x11, x16, x17, v0-v7 and registers in
// v16-v30. None of these are callee-saved, so nothing is spilled and the stack
// is only read (for cn_stride and params). x7 (cm_stride) is overwritten as
// scratch; x4 (a_stride) is never read.
//
// Numeric labels:
//   0 = start of the next 8 columns     1 = main loop (16 bytes of A)
//   2 = epilogue (last 16 bytes of A)   3 = reduce / requantize / store
//   4 = remainder (8 bytes of A)        5-8 = partial-width store
34BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
35
36        LDP     x10, x11, [sp]          // cn_stride, params
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
38        BIC     x2, x2, 7
39
40        .p2align 3
410:
42        # Load initial bias from w into accumulators
        // A scalar S-register load clears the remaining lanes of the vector,
        // so every accumulator starts as {value from w, 0, 0, 0}.
43        LDP     s16, s18, [x5], 8
44        SUBS    x0, x2, 16              // k = kc - 16
45        LDP     s20, s22, [x5], 8
46        LDP     s24, s26, [x5], 8
47        LDP     s28, s30, [x5], 8
48        # Is there at least 16 bytes for epilogue?
        // The LDPs above do not alter flags, so B.LO still tests the SUBS:
        // taken when kc < 16, i.e. only a single 8-byte group of A exists.
49        B.LO    4f
50
51        # Prologue: load A0 and 4 B's
        // v0 / v6 = first / second 8 bytes of A. v4, v5 = w at offsets 0 and 8;
        // v2, v3 = w at offsets 64 and 72; x16 = w at offset 16, staged in a
        // general-purpose register until BLOCK 0 inserts it into v4.
52        LDP     d0, d6, [x3], 16        // Read A0
53        LDP     d4, d5, [x5]            // Read B
54        LDP     d2, d3, [x5, 64]        // Read B
55        LDR     x16, [x5, 16]           // Read B
56
57        # Is there at least 16 bytes for main loop?
58        SUBS    x0, x0, 16              // k = k - 16
59        B.LO    2f
60
61        # Main loop - 16 bytes of A
62        # 4 groups of 2 mul/mla/adap + 2 load = 10 cycles.
63        # 1 load for A0 = +1 cycle.  Total 41 cycles.
64
        // Each iteration consumes 16 bytes of A and 128 bytes of w. Every
        // block handles two output columns: SMULL multiplies v4/v5 (w from
        // offsets 0..63) by v0, SMLAL adds the product of v2/v3 (w from offsets
        // 64..127) and v6 to the same int16 lanes, and SADALP in the following
        // block adds adjacent int16 pairs into the int32 accumulators.
        // Loads for the next block are interleaved with the arithmetic; data
        // fetched into x16/x17 is moved into the low half of v4/v2 with INS
        // (the "a53 gpr load" scheme named in the register-usage notes).
65        .p2align 3
661:
67        # BLOCK 0 - 6 cycles
68        SMULL   v17.8h, v4.8b, v0.8b    // columns 0-1 -> v17, v19
69        LDR     x17, [x5, 80]
70        SMULL   v19.8h, v5.8b, v0.8b
71        LDR     d5, [x5, 24]
72        INS     v4.d[0], x16
73        SMLAL   v17.8h, v2.8b, v6.8b
74        LDR     x16, [x5, 32]
75        SMLAL   v19.8h, v3.8b, v6.8b
76        LDR     d3, [x5, 88]
77        INS     v2.d[0], x17
78
79        # BLOCK 1 - 10 cycles
80        SMULL   v21.8h, v4.8b, v0.8b    // columns 2-3 -> v21, v23
81        LDR     x17, [x5, 96]
82        SMULL   v23.8h, v5.8b, v0.8b
83        SADALP  v16.4s, v17.8h          // accumulate column 0
84        PRFM    PLDL1KEEP, [x5, 448]    // prefetch w ahead of the loads
85        SADALP  v18.4s, v19.8h          // accumulate column 1
86        PRFM    PLDL1KEEP, [x5, 512]
87        LDR     d5, [x5, 40]
88        INS     v4.d[0], x16
89        SMLAL   v21.8h, v2.8b, v6.8b
90        LDR     x16, [x5, 48]
91        SMLAL   v23.8h, v3.8b, v6.8b
92        LDR     d3, [x5, 104]
93        INS     v2.d[0], x17
94
95        # BLOCK 2 - 10 cycles
96        SMULL   v17.8h, v4.8b, v0.8b    // columns 4-5 -> v17, v19
97        LDR     x17, [x5, 112]
98        SMULL   v19.8h, v5.8b, v0.8b
99        SADALP  v20.4s, v21.8h          // accumulate column 2
100        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A ahead of the loads
101        SADALP  v22.4s, v23.8h          // accumulate column 3
102        LDR     d5, [x5, 56]
103        INS     v4.d[0], x16
104        SMLAL   v17.8h, v2.8b, v6.8b
105        LDR     x16, [x5, 128]          // offset 0 of the next 128-byte group of w
106        SMLAL   v19.8h, v3.8b, v6.8b
107        LDR     d3, [x5, 120]
108        INS     v2.d[0], x17
109
110        # BLOCK 3 - 15 cycles
        // Besides columns 6-7, this block reloads v0/v6 (A) and v4/v5/v2/v3/x16
        // (w at offsets 128, 136, 192, 200, 144) so the next iteration, or the
        // epilogue, starts in the same state the prologue established.
        // SUBS sets the flags consumed by B.HS; nothing in between alters them.
111        SMULL   v21.8h, v4.8b, v0.8b
112        LDR     x7, [x3], 8             // Read A0
113        SMULL   v23.8h, v5.8b, v0.8b
114        LDR     x17, [x5, 192]          // Read B
115        SADALP  v24.4s, v17.8h
116        SUBS    x0, x0, 16
117        SADALP  v26.4s, v19.8h
118        LDR     d5, [x5, 136]           // Read B
119        INS     v4.d[0], x16
120        SMLAL   v21.8h, v2.8b, v6.8b
121        LDR     x16, [x5, 144]
122        SMLAL   v23.8h, v3.8b, v6.8b
123        LDR     d6, [x3], 8             // Read A0
124        INS     v0.d[0], x7
125        LDR     d3, [x5, 200]           // Read B
126        INS     v2.d[0], x17
127        SADALP  v28.4s, v21.8h
128        ADD     x5, x5, 128
129        SADALP  v30.4s, v23.8h
130        B.HS    1b
131
132        # Epilogue
133        # Same as main loop except no loads at end of loop
134
        // Processes the 16 bytes of A already held in v0/v6. Compared with the
        // main loop there are no prefetches, no loads beyond offset 127 of w
        // and no further reads of A.
135        .p2align 3
1362:
137        # BLOCK 0 - 6 cycles
138        SMULL   v17.8h, v4.8b, v0.8b
139        LDR     x17, [x5, 80]
140        SMULL   v19.8h, v5.8b, v0.8b
141        LDR     d5, [x5, 24]
142        INS     v4.d[0], x16
143        SMLAL   v17.8h, v2.8b, v6.8b
144        LDR     x16, [x5, 32]
145        SMLAL   v19.8h, v3.8b, v6.8b
146        LDR     d3, [x5, 88]
147        INS     v2.d[0], x17
148
149        # BLOCK 1 - 10 cycles
150        SMULL   v21.8h, v4.8b, v0.8b
151        LDR     x17, [x5, 96]
152        SMULL   v23.8h, v5.8b, v0.8b
153        SADALP  v16.4s, v17.8h
154        SADALP  v18.4s, v19.8h
155        LDR     d5, [x5, 40]
156        INS     v4.d[0], x16
157        SMLAL   v21.8h, v2.8b, v6.8b
158        LDR     x16, [x5, 48]
159        SMLAL   v23.8h, v3.8b, v6.8b
160        LDR     d3, [x5, 104]
161        INS     v2.d[0], x17
162
163        # BLOCK 2 - 10 cycles
164        SMULL   v17.8h, v4.8b, v0.8b
165        LDR     x17, [x5, 112]
166        SMULL   v19.8h, v5.8b, v0.8b
167        SADALP  v20.4s, v21.8h
168        SADALP  v22.4s, v23.8h
169        LDR     d5, [x5, 56]
170        INS     v4.d[0], x16
171        SMLAL   v17.8h, v2.8b, v6.8b
172        SMLAL   v19.8h, v3.8b, v6.8b
173        LDR     d3, [x5, 120]
174        INS     v2.d[0], x17
175
176        # BLOCK 3 - 12 cycles
177        SMULL   v21.8h, v4.8b, v0.8b
178        SMULL   v23.8h, v5.8b, v0.8b
179        SADALP  v24.4s, v17.8h
180        SADALP  v26.4s, v19.8h
181        SMLAL   v21.8h, v2.8b, v6.8b
182        SMLAL   v23.8h, v3.8b, v6.8b
183        SADALP  v28.4s, v21.8h
184        ADD     x5, x5, 128             // step w past the 128 bytes just consumed
185        SADALP  v30.4s, v23.8h
186
187        # Is there a remainder?- 8 bytes of A
        // x0 is negative here and kc is a multiple of 8, so x0 is either -16
        // (nothing left) or -8 (one 8-byte group left); bit 3 tells them apart.
188        TBNZ    x0, 3, 4f
189
190        .p2align 3
1913:
192        # Add columns
        // Each accumulator holds four int32 partial sums for one column. Two
        // rounds of pairwise adds collapse them so that v0 = columns 0-3 and
        // v1 = columns 4-7. The params loads are interleaved with the adds.
193        ADDP    v16.4s, v16.4s, v18.4s
194        ADDP    v20.4s, v20.4s, v22.4s
195        LD1R    {v4.4s}, [x11], 4       // 1st 32-bit param: shift used by SQSHL
196        ADDP    v24.4s, v24.4s, v26.4s
197        ADDP    v28.4s, v28.4s, v30.4s
198        LD1R    {v7.4s}, [x11], 4       // 2nd 32-bit param: multiplier used by SQDMULH
199        ADDP    v0.4s, v16.4s, v20.4s
200        ADDP    v1.4s, v24.4s, v28.4s
201
202        # Apply params - preshift, scale, postshift, bias and clamp
203        LD1R    {v5.4s}, [x11], 4       // 3rd 32-bit param: shift used by SRSHL
204        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
205        SQSHL   v1.4s, v1.4s, v4.4s
206        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
207        SQDMULH v1.4s, v1.4s, v7.4s
208        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
209        SRSHL   v1.4s, v1.4s, v5.4s
210
211        LD1R    {v5.8h}, [x11], 2       // 16-bit param added after narrowing
212        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
213        SQXTN2  v0.8h, v1.4s
214        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
215        SQADD   v0.8h, v0.8h, v5.8h
216        LD1R    {v1.16b}, [x11], 1      // 8-bit param: lower clamp (SMAX)
217        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
218        LD1R    {v17.16b}, [x11]        // 8-bit param: upper clamp (SMIN)
219        SMAX    v0.8b, v0.8b, v1.8b
220        SUB     x11, x11, 15            // rewind params pointer
221        SMIN    v0.8b, v0.8b, v17.8b
222        B.LO    5f                      // fewer than 8 columns were left
223
224        # Store full 1 x 8
        // ST1 post-increments c by cn_stride. Neither ST1 nor SUB changes the
        // flags, so B.HI still refers to the SUBS on nc: loop while columns
        // remain, return when exactly 8 were left.
225        ST1     {v0.8b}, [x6], x10
226        SUB     x3, x3, x2              // a0 -= kc
227        B.HI    0b
228        RET
229
230        # Remainder - 8 bytes of A
        // Multiplies the final 8 bytes of A by eight 8-byte groups of w (64
        // bytes, one group per column), adds the products into the
        // accumulators, then rejoins the common reduction at label 3.
231        .p2align 3
2324:
233        LDR     d0, [x3], 8
234        LDP     d4, d5, [x5]
235        LDP     d6, d7, [x5, 16]
236        SMULL   v17.8h, v4.8b, v0.8b
237        SMULL   v19.8h, v5.8b, v0.8b
238        SMULL   v21.8h, v6.8b, v0.8b
239        SMULL   v23.8h, v7.8b, v0.8b
240        LDP     d4, d5, [x5, 32]
241        LDP     d6, d7, [x5, 48]
242        SADALP  v16.4s, v17.8h
243        SADALP  v18.4s, v19.8h
244        SADALP  v20.4s, v21.8h
245        SADALP  v22.4s, v23.8h
246        SMULL   v17.8h, v4.8b, v0.8b
247        SMULL   v19.8h, v5.8b, v0.8b
248        SMULL   v21.8h, v6.8b, v0.8b
249        SMULL   v23.8h, v7.8b, v0.8b
250        ADD     x5, x5, 64
251        SADALP  v24.4s, v17.8h
252        SADALP  v26.4s, v19.8h
253        SADALP  v28.4s, v21.8h
254        SADALP  v30.4s, v23.8h
255        B       3b
256
257        # Store odd width
        // x1 = nc - 8 here, so its low three bits equal those of the number of
        // columns still to write (1..7). Store 4, 2 and 1 bytes according to
        // those bits; EXT rotates v0 so the next unwritten byte is in lane 0.
258        .p2align 3
2595:
260        TBZ     x1, 2, 6f
261        STR     s0, [x6], 4
262        EXT     v0.16b, v0.16b, v0.16b, 4
263
2646:
265        TBZ     x1, 1, 7f
266        STR     h0, [x6], 2
267        EXT     v0.16b, v0.16b, v0.16b, 2
2687:
269        TBZ     x1, 0, 8f
270        STR     b0, [x6]
2718:
272        RET
273
274END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
275
// ELF targets only: emit an empty .note.GNU-stack section. By GNU toolchain
// convention this marks the object as not requiring an executable stack.
276#ifdef __ELF__
277.section ".note.GNU-stack","",%progbits
278#endif
279
280