1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# B   x5  v4  v5  v2  v3
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32# x16, x17, x7 temporary a53 gpr load data
33
34BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
35
        // Produces one row of up to 8 int8 outputs per pass of the outer loop (label 0).
        // Eight int32 accumulators (v16..v30, even registers) are seeded from w, updated
        // with SMULL/SMLAL (int8 x int8 -> int16) and SADALP (pairwise add into int32)
        // over kc bytes of A, then reduced, scaled in fp32, narrowed with saturation,
        // biased, clamped and stored to c.
        // K is consumed 16 bytes at a time (main loop 1, epilogue 2) plus an optional
        // 8-byte remainder (label 4).  x0 (mr) is overwritten and reused as the k counter;
        // x4 (a_stride) and x7 (cm_stride) are never read as arguments, and x7 is
        // reused as a scratch register.
36        LDP     x10, x11, [sp]          // x10 = cn_stride, x11 = params (stack arguments)
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
38        BIC     x2, x2, 7               // i.e. kc rounded up to a multiple of 8
39
40        .p2align 3
410:
        // Outer loop entry: re-entered from the full-width store while columns remain.
42        # Load initial bias from w into accumulators
        // Eight 32-bit scalar loads, one per accumulator; x5 advances by 32 bytes in total.
        // (A scalar S-register write clears the remaining lanes per the A64 ISA, so only
        // lane 0 starts non-zero.)
43        LDP     s16, s18, [x5], 8
44        SUBS    x0, x2, 16              // k = kc - 16; flags are consumed by B.LO below
45        LDP     s20, s22, [x5], 8
46        LDP     s24, s26, [x5], 8
47        LDP     s28, s30, [x5], 8
48        # Is there at least 16 bytes for epilogue?
49        B.LO    4f                      // kc < 16: only the 8-byte remainder path runs
50
51        # Prologue: load A0 and 4 B's
52        LDP     d0, d6, [x3], 16        // Read A0: v0 = first 8 bytes of K, v6 = next 8 bytes
53        LDP     d4, d5, [x5]            // Read B: v4 = [x5 + 0], v5 = [x5 + 8]
54        LDP     d2, d3, [x5, 64]        // Read B: v2 = [x5 + 64], v3 = [x5 + 72]
55        LDR     x16, [x5, 16]           // Read B into a GPR; inserted into v4 in BLOCK 0
56
57        # Is there at least 16 bytes for main loop?
58        SUBS    x0, x0, 16              // k = k - 16
59        B.LO    2f                      // kc < 32: go straight to the epilogue
60
61        # Main loop - 16 bytes of A
62        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
63        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        // Each BLOCK updates two accumulators.  v4/v5 hold B for the first 8 bytes of K
        // (multiplied by v0) and v2/v3 hold B for the second 8 bytes (multiplied by v6).
        // B for the next pair is loaded while the current pair is multiplied: values
        // destined for v4/v2 travel through x16/x17 and are inserted with INS, values for
        // v5/v3 are loaded directly.  Each register is overwritten only after the last
        // instruction that reads its old value, so the instruction order is significant.
        // NOTE(review): routing loads through GPRs is presumably a Cortex-A53 scheduling
        // choice - confirm against the generator template.
64
65        .p2align 3
661:
67        # BLOCK 0 - 6 cycles
68        SMULL   v17.8h, v4.8b, v0.8b    // int16 products for accumulator v16
69        LDR     x17, [x5, 80]
70        SMULL   v19.8h, v5.8b, v0.8b    // int16 products for accumulator v18
71        LDR     d5, [x5, 24]
72        INS     v4.d[0], x16            // v4 = B loaded earlier from [x5 + 16]
73        SMLAL   v17.8h, v2.8b, v6.8b    // add products of the second 8 bytes of K
74        LDR     x16, [x5, 32]
75        SMLAL   v19.8h, v3.8b, v6.8b
76        LDR     d3, [x5, 88]
77        INS     v2.d[0], x17            // v2 = B from [x5 + 80]
78
79        # BLOCK 1 - 10 cycles
80        SMULL   v21.8h, v4.8b, v0.8b    // int16 products for accumulator v20
81        LDR     x17, [x5, 96]
82        SMULL   v23.8h, v5.8b, v0.8b    // int16 products for accumulator v22
83        SADALP  v16.4s, v17.8h          // v16 += pairwise sums of BLOCK 0 products
84        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of the current position
85        SADALP  v18.4s, v19.8h
86        PRFM    PLDL1KEEP, [x5, 512]
87        LDR     d5, [x5, 40]
88        INS     v4.d[0], x16            // v4 = B from [x5 + 32]
89        SMLAL   v21.8h, v2.8b, v6.8b
90        LDR     x16, [x5, 48]
91        SMLAL   v23.8h, v3.8b, v6.8b
92        LDR     d3, [x5, 104]
93        INS     v2.d[0], x17            // v2 = B from [x5 + 96]
94
95        # BLOCK 2 - 10 cycles
96        SMULL   v17.8h, v4.8b, v0.8b    // int16 products for accumulator v24
97        LDR     x17, [x5, 112]
98        SMULL   v19.8h, v5.8b, v0.8b    // int16 products for accumulator v26
99        SADALP  v20.4s, v21.8h          // v20 += pairwise sums of BLOCK 1 products
100        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A ahead of the current position
101        SADALP  v22.4s, v23.8h
102        LDR     d5, [x5, 56]
103        INS     v4.d[0], x16            // v4 = B from [x5 + 48]
104        SMLAL   v17.8h, v2.8b, v6.8b
105        LDR     x16, [x5, 128]          // B for the next 16 bytes of K (offset 0 after x5 += 128)
106        SMLAL   v19.8h, v3.8b, v6.8b
107        LDR     d3, [x5, 120]
108        INS     v2.d[0], x17            // v2 = B from [x5 + 112]
109
110        # BLOCK 3 - 15 cycles
        // Besides the last pair of accumulators, this block reloads A (v0, v6) and the
        // first B registers (v4, v5, v2, v3, x16) for the next 16 bytes of K, mirroring
        // the prologue loads at offsets 0, 8, 64, 72 and 16 once x5 has advanced by 128.
111        SMULL   v21.8h, v4.8b, v0.8b    // int16 products for accumulator v28
112        LDR     x7, [x3], 8             // Read A0 (next first 8 bytes of K) into a GPR
113        SMULL   v23.8h, v5.8b, v0.8b    // int16 products for accumulator v30
114        LDR     x17, [x5, 192]          // Read B
115        SADALP  v24.4s, v17.8h          // v24 += pairwise sums of BLOCK 2 products
116        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS at the loop end
117        SADALP  v26.4s, v19.8h
118        LDR     d5, [x5, 136]           // Read B
119        INS     v4.d[0], x16            // v4 = B from [x5 + 128]
120        SMLAL   v21.8h, v2.8b, v6.8b
121        LDR     x16, [x5, 144]
122        SMLAL   v23.8h, v3.8b, v6.8b    // last read of the old v6 before it is reloaded
123        LDR     d6, [x3], 8             // Read A0 (next second 8 bytes of K)
124        INS     v0.d[0], x7             // v0 = A loaded above; the old v0 is no longer needed
125        LDR     d3, [x5, 200]           // Read B
126        INS     v2.d[0], x17            // v2 = B from [x5 + 192]
127        SADALP  v28.4s, v21.8h
128        ADD     x5, x5, 128             // advance B by 128 bytes (ADD leaves the SUBS flags intact)
129        SADALP  v30.4s, v23.8h
130        B.HS    1b                      // repeat while the SUBS above did not borrow
131
132        # Epilogue
133        # Same as main loop except no loads at end of loop
        // Processes the final preloaded 16 bytes of K without prefetches and without
        // loading A or B for a following iteration.
134
135        .p2align 3
1362:
137        # BLOCK 0 - 6 cycles
138        SMULL   v17.8h, v4.8b, v0.8b
139        LDR     x17, [x5, 80]
140        SMULL   v19.8h, v5.8b, v0.8b
141        LDR     d5, [x5, 24]
142        INS     v4.d[0], x16
143        SMLAL   v17.8h, v2.8b, v6.8b
144        LDR     x16, [x5, 32]
145        SMLAL   v19.8h, v3.8b, v6.8b
146        LDR     d3, [x5, 88]
147        INS     v2.d[0], x17
148
149        # BLOCK 1 - 10 cycles
150        SMULL   v21.8h, v4.8b, v0.8b
151        LDR     x17, [x5, 96]
152        SMULL   v23.8h, v5.8b, v0.8b
153        SADALP  v16.4s, v17.8h
154        SADALP  v18.4s, v19.8h
155        LDR     d5, [x5, 40]
156        INS     v4.d[0], x16
157        SMLAL   v21.8h, v2.8b, v6.8b
158        LDR     x16, [x5, 48]
159        SMLAL   v23.8h, v3.8b, v6.8b
160        LDR     d3, [x5, 104]
161        INS     v2.d[0], x17
162
163        # BLOCK 2 - 10 cycles
164        SMULL   v17.8h, v4.8b, v0.8b
165        LDR     x17, [x5, 112]
166        SMULL   v19.8h, v5.8b, v0.8b
167        SADALP  v20.4s, v21.8h
168        SADALP  v22.4s, v23.8h
169        LDR     d5, [x5, 56]
170        INS     v4.d[0], x16            // x16 still holds B from [x5 + 48]; no further GPR load here
171        SMLAL   v17.8h, v2.8b, v6.8b
172        SMLAL   v19.8h, v3.8b, v6.8b
173        LDR     d3, [x5, 120]
174        INS     v2.d[0], x17
175
176        # BLOCK 3 - 12 cycles
177        SMULL   v21.8h, v4.8b, v0.8b
178        SMULL   v23.8h, v5.8b, v0.8b
179        SADALP  v24.4s, v17.8h
180        SADALP  v26.4s, v19.8h
181        SMLAL   v21.8h, v2.8b, v6.8b
182        SMLAL   v23.8h, v3.8b, v6.8b
183        SADALP  v28.4s, v21.8h
184        ADD     x5, x5, 128             // advance B past the 128 bytes consumed
185        SADALP  v30.4s, v23.8h
186
187        # Is there a remainder?- 8 bytes of A
        // k has only had multiples of 16 subtracted from kc, so bit 3 of k equals bit 3
        // of kc: it is set exactly when 8 bytes of K are still unprocessed.
188        TBNZ    x0, 3, 4f
189
190        .p2align 3
1913:
192        # Add columns
        // Pairwise adds fold the four lanes of every accumulator into one int32 each:
        // v0 = sums of v16, v18, v20, v22 and v1 = sums of v24, v26, v28, v30.
193        ADDP    v16.4s, v16.4s, v18.4s
194        ADDP    v20.4s, v20.4s, v22.4s
195        ADDP    v24.4s, v24.4s, v26.4s
196        ADDP    v28.4s, v28.4s, v30.4s
197        ADDP    v0.4s, v16.4s, v20.4s
198        ADDP    v1.4s, v24.4s, v28.4s
199
200        # Apply params - scale, bias and clamp
        // params is read sequentially: a 32-bit scale, a 16-bit bias, then two 8-bit
        // clamp bounds (lower, upper).  x11 is post-incremented by 4 + 2 + 1 = 7 bytes
        // and rewound below so the next pass of the outer loop reads the same values.
201        SCVTF   v0.4s, v0.4s            // int32 -> fp32
202        LD1R    {v4.4s}, [x11], 4       // broadcast the 32-bit scale
203        SCVTF   v1.4s, v1.4s
204        FMUL    v0.4s, v0.4s, v4.4s
205        FMUL    v1.4s, v1.4s, v4.4s
206
207        FCVTNS  v0.4s, v0.4s            // fp32 -> int32
208        FCVTNS  v1.4s, v1.4s
209
210        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit bias
211        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16 (low half)
212        SQXTN2  v0.8h, v1.4s            // saturating narrow int32 -> int16 (high half)
213        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
214        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the bias
215        LD1R    {v1.16b}, [x11], 1      // broadcast the lower clamp bound
216        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
217        LD1R    {v17.16b}, [x11]        // broadcast the upper clamp bound
218        SMAX    v0.8b, v0.8b, v1.8b
219        SUB     x11, x11, 7            // rewind params pointer
220        SMIN    v0.8b, v0.8b, v17.8b
221        B.LO    5f                      // fewer than 8 columns were left: partial store
222
223        # Store full 1 x 8
224        ST1     {v0.8b}, [x6], x10      // store 8 bytes, then c += cn_stride
225        SUB     x3, x3, x2              // a0 -= kc
226        B.HI    0b                      // columns remain (nc > 0 after the subtraction)
227        RET
228
229        # Remainder - 8 bytes of A
        // Reached directly when kc < 16, or from the epilogue when 8 bytes of K remain.
        // One 8-byte slice of A (v0) is multiplied by 64 bytes of B, four B vectors at a
        // time in v4..v7; there is no SMLAL step because only a single slice is left.
230        .p2align 3
2314:
232        LDR     d0, [x3], 8
233        LDP     d4, d5, [x5]
234        LDP     d6, d7, [x5, 16]
235        SMULL   v17.8h, v4.8b, v0.8b
236        SMULL   v19.8h, v5.8b, v0.8b
237        SMULL   v21.8h, v6.8b, v0.8b
238        SMULL   v23.8h, v7.8b, v0.8b
239        LDP     d4, d5, [x5, 32]
240        LDP     d6, d7, [x5, 48]
241        SADALP  v16.4s, v17.8h
242        SADALP  v18.4s, v19.8h
243        SADALP  v20.4s, v21.8h
244        SADALP  v22.4s, v23.8h
245        SMULL   v17.8h, v4.8b, v0.8b
246        SMULL   v19.8h, v5.8b, v0.8b
247        SMULL   v21.8h, v6.8b, v0.8b
248        SMULL   v23.8h, v7.8b, v0.8b
249        ADD     x5, x5, 64              // advance B past the 64 bytes consumed
250        SADALP  v24.4s, v17.8h
251        SADALP  v26.4s, v19.8h
252        SADALP  v28.4s, v21.8h
253        SADALP  v30.4s, v23.8h
254        B       3b                      // join the common reduction / requantization path
255
256        # Store odd width
        // x1 holds nc - 8 here, so its low three bits equal those of the remaining column
        // count.  Stores 4, 2 and 1 bytes according to bits 2, 1 and 0; after each store
        // EXT rotates the stored bytes out so the next piece starts at byte 0 of v0.
257        .p2align 3
2585:
259        TBZ     x1, 2, 6f               // bit 2 clear: no 4-byte piece
260        STR     s0, [x6], 4
261        EXT     v0.16b, v0.16b, v0.16b, 4
262
2636:
264        TBZ     x1, 1, 7f               // bit 1 clear: no 2-byte piece
265        STR     h0, [x6], 2
266        EXT     v0.16b, v0.16b, v0.16b, 2
2677:
268        TBZ     x1, 0, 8f               // bit 0 clear: no final byte
269        STR     b0, [x6]
2708:
271        RET
272
273END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
274
// On ELF targets, emit an empty .note.GNU-stack section.  By GNU toolchain convention
// an empty flags string here marks this object as not requiring an executable stack.
275#ifdef __ELF__
276.section ".note.GNU-stack","",%progbits
277#endif
278
279