// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Overview (derived from the code below)
#   Each pass of the outer loop (label 0) produces one tile of up to
#   2 rows x 8 columns of int8 output and then advances to the next 8 columns.
#   kc is rounded up to a multiple of 8 on entry.
#   Per pass, w is consumed as: 8 x 32-bit initial accumulator values, then
#   64 bytes (8 columns x 8 bytes) for every 8 bytes of A.
#   Products are formed in 16 bits (SMULL, plus SMLAL for the second 8 bytes
#   of A in the 16-byte paths) and pairwise-accumulated to 32 bits (SADALP).
#   params is read sequentially through x11: a 32-bit float scale, a 16-bit
#   bias added after rounding, one byte lower clamp, one byte upper clamp.
#   x11 is reloaded from the stack at the top of every pass because the
#   reads post-increment it.

# Stack frame (80 bytes, pushed on entry)
#   [sp,  0] d8,  d9
#   [sp, 16] d10, d11
#   [sp, 32] d12, d13
#   [sp, 48] d14, d15
#   [sp, 64] x20, x21
#   [sp, 80] cn_stride   [sp, 88] params   (stack arguments of the caller)

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp A and C pointers
        # The callee-saved register spills are interleaved with the pointer
        # setup; none of the stores or ADD/BIC below modify the flags set by
        # CMP, so both CSEL instructions still see the mr < 2 result.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -80]!      // push 80-byte frame, save d8,d9
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of one row-0 accumulator; the MOVs copy
        # it to the matching row-1 accumulator. The flags produced by SUBS are
        # consumed by B.LO at the end of this group (no instruction in between
        # writes flags).
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 80]       // cn_stride, params
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f                      // kc < 16: only the 8-byte remainder runs

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x3], 16        // Read A0
        LDR     x17, [x5, 64]           // Read B
        LDP     d1, d7, [x4], 16        // Read A1
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        #
        # Each block handles two output columns: v4/v5 hold B for the first
        # 8 bytes of A (v0/v1), v8/v9 hold B for the second 8 bytes (v6/v7).
        # B for the following block is fetched while the current one multiplies;
        # half of those loads go through x16/x17 and are moved into the vector
        # registers with INS.

        .p2align 3
1:
        # BLOCK 0 - 18 cycles - includes prfm
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of the read pointer
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A0
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x4, 128]    // prefetch A1
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // first B of the next iteration
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x3], 8            // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x4], 8            // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // flags consumed by B.HS at end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x4], 8             // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // advance B past the 128 bytes consumed
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8, so k in x0 is -16 (nothing left) or -8
        # (8 bytes left) here; bit 3 tells the two apart.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Every accumulator holds 4 partial sums of one output element. Two
        # rounds of pairwise adds leave row 0 columns 0-3 in v0 and 4-7 in v1,
        # row 1 columns 0-3 in v2 and 4-7 in v3.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # int32 -> float, multiply by the scale, convert back to int32 with
        # round-to-nearest (FCVTNS).
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4       // scale (32-bit float), replicated
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Saturating narrow to 16 bits, add the 16-bit bias with saturation,
        # saturating narrow to 8 bits. Afterwards v0 bytes 0-7 are row 0 and
        # bytes 8-15 are row 1.
        LD1R    {v5.8h}, [x11], 2       // bias (16-bit), replicated
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp (byte), replicated
        LD1R    {v2.16b}, [x11]         // upper clamp (byte), replicated
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.8b}, [x6], x10      // row 0, then c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1, then c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // pop the 80-byte frame
        RET

        # Remainder - 8 bytes of A
        # Consumes 8 bytes of each A row and 64 bytes of B, using SMULL only,
        # then joins the common reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here, whose low three bits equal those of the
        # remaining column count. Bits 2, 1 and 0 select a 4-, 2- and 1-column
        # store per row; EXT rotates v0 so the next unwritten columns of row 0
        # sit at byte 0 and those of row 1 at byte 8.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // pop the 80-byte frame
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain convention for an object that does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
