1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
34# x16, x17, x7 temporary a53 gpr load data
35
36
37BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
        // Overview (derived from the code below):
        //   label 0: per group of 8 output columns, load 8 x 32-bit bias values
        //            from w into lane 0 of v16, v18, ..., v30.
        //   label 1: per A pointer taken from the indirection buffer `a`
        //            (x9 counts down from ks in steps of 8 bytes), accumulate
        //            int8 x int8 products over kc bytes, kc rounded up to a
        //            multiple of 8:
        //              labels 2/3: 16 bytes of A and 128 bytes of B per step
        //              label 5:    one 8-byte step of A and 64 bytes of B
        //   label 4: reduce the accumulators, multiply by per-column scales read
        //            from w, convert to int8, clamp and store 8 bytes to c.
        //   label 6: partial store when fewer than 8 columns remain.
        // No stack frame is created; only x0-x17, v0-v7 and v16-v30 are written,
        // so nothing listed as "need to be preserved" above is touched.
38
39        # Clamp C pointers
        // NOTE(review): nothing is clamped in this 1-row kernel. The two LDPs
        // fetch the stack-passed arguments and ADD/BIC round kc up.
40        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
41        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
42        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
43        BIC     x2, x2, 7               // second half of the kc round-up
44
45        .p2align 3
460:
47        # Load initial bias from w into accumulators
        // Scalar S-register loads write lane 0 and zero the rest of each vector,
        // so lanes 1-3 of every accumulator start at 0.
48        LDP     s16, s18, [x5], 8       // bias for columns 0, 1
49        LDP     s20, s22, [x5], 8       // bias for columns 2, 3
50        LDP     s24, s26, [x5], 8       // bias for columns 4, 5
51        LDP     s28, s30, [x5], 8       // bias for columns 6, 7
52        MOV     x9, x3                  // p = ks
53
54        .p2align 3
551:
56        # Load next A pointer
57        LDR     x13, [x4], 8            // a0 = *a++
58        CMP     x13, x12                // if a0 == zero
59        ADD     x13, x13, x8            // a0 += a_offset
60        CSEL    x13, x12, x13, EQ       //   a0 = zero, else keep a0 + a_offset
61
62        # Is there at least 16 bytes for epilogue?
63        SUBS    x0, x2, 16              // k = kc - 16
64        B.LO    5f                      // kc < 16: only the 8-byte step at label 5 runs
65
66        # Prologue: load A0 and 4 B's
67        LDP     d0, d6, [x13], 16       // Read A0: first 8 bytes -> v0, next 8 -> v6
68        LDP     d4, d5, [x5]            // Read B: columns 0, 1 (multiplied with v0)
69        LDP     d2, d3, [x5, 64]        // Read B: columns 0, 1 (multiplied with v6)
70        LDR     x16, [x5, 16]           // Read B: column 2, kept in a GPR until INS
71
72        # Is there at least 16 bytes for main loop?
73        SUBS    x0, x0, 16              // k = k - 16
74        B.LO    3f                      // kc < 32: go straight to the epilogue
75
76        # Main loop - 16 bytes of A
77        # 4 groups of 2 mul/mla/adap + 2 load = 10 cycles.
78        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        // Each pass multiplies the A/B data loaded by the previous pass (or by
        // the prologue) while loading the data for the next one. For every
        // SMULL/SMLAL pair, v4 and v2 are refilled through x16/x17 + INS and
        // v5 and v3 through direct LDR d loads.
        // B offsets relative to x5: 8*c holds column c for the first 8 bytes of
        // A (v0), 64 + 8*c holds column c for the second 8 bytes (v6).
79
80        .p2align 3
812:
82        # BLOCK 0 - 6 cycles
        // Columns 0 and 1: 16-bit products into v17 / v19.
83        SMULL   v17.8h, v4.8b, v0.8b
84        LDR     x17, [x5, 80]
85        SMULL   v19.8h, v5.8b, v0.8b
86        LDR     d5, [x5, 24]
87        INS     v4.d[0], x16
88        SMLAL   v17.8h, v2.8b, v6.8b
89        LDR     x16, [x5, 32]
90        SMLAL   v19.8h, v3.8b, v6.8b
91        LDR     d3, [x5, 88]
92        INS     v2.d[0], x17
93
94        # BLOCK 1 - 10 cycles
        // Columns 2 and 3 into v21 / v23; fold columns 0 and 1 into v16 / v18.
95        SMULL   v21.8h, v4.8b, v0.8b
96        LDR     x17, [x5, 96]
97        SMULL   v23.8h, v5.8b, v0.8b
98        SADALP  v16.4s, v17.8h          // column 0 += pairwise sums of v17
99        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of the current position
100        SADALP  v18.4s, v19.8h          // column 1 += pairwise sums of v19
101        PRFM    PLDL1KEEP, [x5, 512]    // prefetch B ahead of the current position
102        LDR     d5, [x5, 40]
103        INS     v4.d[0], x16
104        SMLAL   v21.8h, v2.8b, v6.8b
105        LDR     x16, [x5, 48]
106        SMLAL   v23.8h, v3.8b, v6.8b
107        LDR     d3, [x5, 104]
108        INS     v2.d[0], x17
109
110        # BLOCK 2 - 10 cycles
        // Columns 4 and 5 into v17 / v19; fold columns 2 and 3 into v20 / v22.
111        SMULL   v17.8h, v4.8b, v0.8b
112        LDR     x17, [x5, 112]
113        SMULL   v19.8h, v5.8b, v0.8b
114        SADALP  v20.4s, v21.8h
115        PRFM    PLDL1KEEP, [x13, 128]   // prefetch A ahead of the current position
116        SADALP  v22.4s, v23.8h
117        LDR     d5, [x5, 56]
118        INS     v4.d[0], x16
119        SMLAL   v17.8h, v2.8b, v6.8b
120        LDR     x16, [x5, 128]          // column 0 of the next 128-byte B group
121        SMLAL   v19.8h, v3.8b, v6.8b
122        LDR     d3, [x5, 120]
123        INS     v2.d[0], x17
124
125        # BLOCK 3 - 15 cycles
        // Columns 6 and 7 into v21 / v23; fold columns 4-7; reload A (v0, v6)
        // and the first B registers for the next pass; advance w.
126        SMULL   v21.8h, v4.8b, v0.8b
127        LDR     x7, [x13], 8            // Read A0
128        SMULL   v23.8h, v5.8b, v0.8b
129        LDR     x17, [x5, 192]          // Read B
130        SADALP  v24.4s, v17.8h
131        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
132        SADALP  v26.4s, v19.8h
133        LDR     d5, [x5, 136]           // Read B
134        INS     v4.d[0], x16
135        SMLAL   v21.8h, v2.8b, v6.8b
136        LDR     x16, [x5, 144]
137        SMLAL   v23.8h, v3.8b, v6.8b
138        LDR     d6, [x13], 8            // Read A0
139        INS     v0.d[0], x7
140        LDR     d3, [x5, 200]           // Read B
141        INS     v2.d[0], x17
142        SADALP  v28.4s, v21.8h
143        ADD     x5, x5, 128             // w += 128 (16 bytes of A x 8 columns)
144        SADALP  v30.4s, v23.8h
145        B.HS    2b                      // repeat while the SUBS above did not borrow
146
147        # Epilogue
148        # Same as main loop except no loads at end of loop
        // Consumes the A/B registers left loaded by the prologue or by the last
        // main-loop pass; B loads beyond offset 120 and the A reload are omitted.
149
150        .p2align 3
1513:
152        # BLOCK 0 - 6 cycles
        // Columns 0 and 1.
153        SMULL   v17.8h, v4.8b, v0.8b
154        LDR     x17, [x5, 80]
155        SMULL   v19.8h, v5.8b, v0.8b
156        LDR     d5, [x5, 24]
157        INS     v4.d[0], x16
158        SMLAL   v17.8h, v2.8b, v6.8b
159        LDR     x16, [x5, 32]
160        SMLAL   v19.8h, v3.8b, v6.8b
161        LDR     d3, [x5, 88]
162        INS     v2.d[0], x17
163
164        # BLOCK 1 - 10 cycles
        // Columns 2 and 3; fold columns 0 and 1.
165        SMULL   v21.8h, v4.8b, v0.8b
166        LDR     x17, [x5, 96]
167        SMULL   v23.8h, v5.8b, v0.8b
168        SADALP  v16.4s, v17.8h
169        SADALP  v18.4s, v19.8h
170        LDR     d5, [x5, 40]
171        INS     v4.d[0], x16
172        SMLAL   v21.8h, v2.8b, v6.8b
173        LDR     x16, [x5, 48]
174        SMLAL   v23.8h, v3.8b, v6.8b
175        LDR     d3, [x5, 104]
176        INS     v2.d[0], x17
177
178        # BLOCK 2 - 10 cycles
        // Columns 4 and 5; fold columns 2 and 3.
179        SMULL   v17.8h, v4.8b, v0.8b
180        LDR     x17, [x5, 112]
181        SMULL   v19.8h, v5.8b, v0.8b
182        SADALP  v20.4s, v21.8h
183        SADALP  v22.4s, v23.8h
184        LDR     d5, [x5, 56]
185        INS     v4.d[0], x16
186        SMLAL   v17.8h, v2.8b, v6.8b
187        SMLAL   v19.8h, v3.8b, v6.8b
188        LDR     d3, [x5, 120]
189        INS     v2.d[0], x17
190
191        # BLOCK 3 - 12 cycles
        // Columns 6 and 7; fold columns 4-7; advance w past this B group.
192        SMULL   v21.8h, v4.8b, v0.8b
193        SMULL   v23.8h, v5.8b, v0.8b
194        SADALP  v24.4s, v17.8h
195        SADALP  v26.4s, v19.8h
196        SMLAL   v21.8h, v2.8b, v6.8b
197        SMLAL   v23.8h, v3.8b, v6.8b
198        SADALP  v28.4s, v21.8h
199        ADD     x5, x5, 128             // w += 128
200        SADALP  v30.4s, v23.8h
201
202        # Is there a remainder?- 8 bytes of A
        // kc is a multiple of 8 and x0 has been reduced in steps of 16 until it
        // went negative, so x0 is -16 (nothing left, bit 3 clear) or -8 (one
        // 8-byte group left, bit 3 set).
203        TBNZ    x0, 3, 5f
204
205        # ks loop
206        SUBS    x9, x9, 8               // p -= MR * sizeof(int8_t*)
207        B.HI    1b                      // more A pointers to process
208
2094:
210        # Add columns
        // Each accumulator holds 4 partial sums for one column; three rounds of
        // pairwise adds collapse them to one 32-bit sum per column.
211        ADDP    v16.4s, v16.4s, v18.4s
212        ADDP    v20.4s, v20.4s, v22.4s
213        ADDP    v24.4s, v24.4s, v26.4s
214        ADDP    v28.4s, v28.4s, v30.4s
215        ADDP    v0.4s, v16.4s, v20.4s   // v0 = sums for columns 0-3
216        ADDP    v1.4s, v24.4s, v28.4s   // v1 = sums for columns 4-7
217
218        # Load per channel scale values from weights
219        SCVTF   v0.4s, v0.4s            // int32 -> float
220        LDR     q4, [x5], 16            // 4 scales for columns 0-3, w += 16
221        SCVTF   v1.4s, v1.4s
222        LDR     q5, [x5], 16            // 4 scales for columns 4-7, w += 16
223        FMUL    v0.4s, v0.4s, v4.4s
224        FMUL    v1.4s, v1.4s, v5.4s
225
226        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest (ties to even)
227        FCVTNS  v1.4s, v1.4s
228
        // params is read as: 16-bit value at +0 (added after narrowing to
        // int16 -- presumably the output zero point, confirm against the params
        // struct), byte at +2 (lower clamp via SMAX), byte at +3 (upper clamp
        // via SMIN).
229        LD1R    {v5.8h}, [x11], 2
230        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
231        SQXTN2  v0.8h, v1.4s
232        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO / B.HI below
233        SQADD   v0.8h, v0.8h, v5.8h
234        LD1R    {v1.16b}, [x11], 1
235        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
236        LD1R    {v17.16b}, [x11]
237        SMAX    v0.8b, v0.8b, v1.8b
238        SUB     x11, x11, 3          // rewind params pointer
239
240        SMIN    v0.8b, v0.8b, v17.8b
241        B.LO    6f                      // fewer than 8 columns were left: partial store
242
243        # Store full 1 x 8
244        ST1     {v0.8b}, [x6], x10      // store 8 bytes, c += cn_stride
245        SUB     x4, x4, x3              // a -= ks
246        B.HI    0b                      // nc was > 8 before the SUBS: more columns remain
247        RET
248
249        # Remainder - 8 bytes of A
        // One 8-byte group of A (v0) against 64 bytes of B: columns 0-3 first
        // (v4-v7), then columns 4-7 (v4-v7 reloaded). No SMLAL pairing here;
        // each product vector is folded directly with SADALP.
250        .p2align 3
2515:
252        LDR     d0, [x13], 8
253        LDP     d4, d5, [x5]
254        LDP     d6, d7, [x5, 16]
255        SMULL   v17.8h, v4.8b, v0.8b
256        SMULL   v19.8h, v5.8b, v0.8b
257        SMULL   v21.8h, v6.8b, v0.8b
258        SMULL   v23.8h, v7.8b, v0.8b
259        LDP     d4, d5, [x5, 32]
260        LDP     d6, d7, [x5, 48]
261        SADALP  v16.4s, v17.8h
262        SADALP  v18.4s, v19.8h
263        SADALP  v20.4s, v21.8h
264        SADALP  v22.4s, v23.8h
265        SMULL   v17.8h, v4.8b, v0.8b
266        SMULL   v19.8h, v5.8b, v0.8b
267        SMULL   v21.8h, v6.8b, v0.8b
268        SMULL   v23.8h, v7.8b, v0.8b
269        ADD     x5, x5, 64              // w += 64 (8 bytes of A x 8 columns)
270        SADALP  v24.4s, v17.8h
271        SADALP  v26.4s, v19.8h
272        SADALP  v28.4s, v21.8h
273        SADALP  v30.4s, v23.8h
274
275        # ks loop
276        SUBS    x9, x9, 8               // p -= MR * sizeof(int8_t*)
277        B.HI    1b                      // more A pointers to process
278        B       4b                      // otherwise go requantize and store
279
280        # Store odd width
        // Reached with x1 = nc - 8 (negative); bits 2, 1 and 0 of x1 select a
        // 4-, 2- and 1-byte store. After each store EXT rotates the vector so
        // the next unstored byte sits in lane 0.
281        .p2align 3
2826:
283        TBZ     x1, 2, 7f
284        STR     s0, [x6], 4             // store 4 bytes, c += 4
285        EXT     v0.16b, v0.16b, v0.16b, 4
286
2877:
288        TBZ     x1, 1, 8f
289        STR     h0, [x6], 2             // store 2 bytes, c += 2
290        EXT     v0.16b, v0.16b, v0.16b, 2
2918:
292        TBZ     x1, 0, 9f
293        STR     b0, [x6]                // store the final byte
2949:
295        RET
296
297END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
298
299#ifdef __ELF__
// ELF targets only: emit an empty .note.GNU-stack section (no flags).
// NOTE(review): by GNU toolchain convention this marks the object as not
// requiring an executable stack -- confirm against the linker documentation.
300.section ".note.GNU-stack","",%progbits
301#endif
302