xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
34
35
36BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
        # Overview of the code below:
        #   Label 0 starts one group of 8 output columns. Eight 32-bit
        #   accumulators (v16 v18 v20 v22 v24 v26 v28 v30) are seeded from w.
        #   Label 1 runs once per A pointer (ks is consumed 8 bytes at a time)
        #   and walks kc bytes of that A row, with kc rounded up to 8.
        #   Labels 2 and 3 consume 16 bytes of A per pass, label 5 consumes 8.
        #   Label 4 reduces the accumulators, scales them in float, converts
        #   back to int8, clamps and stores. Labels 6 to 9 store fewer than 8.
        # Register footprint: sp is only read and nothing is called. The only
        # registers written are x0-x2, x4-x6, x8-x13, v0-v7 and registers in
        # v16-v30, so none of the callee-saved registers named in the file
        # header are touched. x0 (mr) is overwritten and reused as the k counter.
37
38        # Clamp C pointers
        # NOTE(review): nothing is clamped in this routine and cm_stride (x7)
        # is never read. The four instructions below only fetch the stack
        # arguments and round kc up to a multiple of 8.
39        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
40        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
41        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
42        BIC     x2, x2, 7
43
        # Outer loop: one pass per group of 8 output columns.
44        .p2align 3
450:
46        # Load initial bias from w into accumulators
        # Each LDP places two consecutive 32-bit values from w into lane 0 of
        # two accumulators. A scalar load into sN clears the rest of vN, so
        # the other three lanes of every accumulator start at zero.
        # x5 advances by 32 bytes in total.
47        LDP     s16, s18, [x5], 8
48        LDP     s20, s22, [x5], 8
49        LDP     s24, s26, [x5], 8
50        LDP     s28, s30, [x5], 8
51        MOV     x9, x3                  // p = ks
52
        # ks loop: one pass per entry of the A pointer array.
53        .p2align 3
541:
55        # Load next A pointer
        # ADD does not write the flags, so CSEL still acts on the CMP result:
        # a pointer equal to zero is used as is, any other pointer gets
        # a_offset added.
56        LDR     x13, [x4], 8
57        CMP     x13, x12                // if a0 == zero
58        ADD     x13, x13, x8            // a0 += a_offset
59        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
60
61        # Is there at least 16 bytes for epilogue?
        # B.LO is taken when the subtraction borrows, that is when kc is below
        # 16. Control then goes straight to the 8-byte remainder at label 5.
62        SUBS    x0, x2, 16              // k = kc - 16
63        B.LO    5f
64
65        # Prologue: load A0 and 4 B's
        # d0 and d6 hold the first and second 8 bytes of A. d4 and d5 hold the
        # B bytes multiplied with d0, d2 and d3 the B bytes multiplied with d6.
        # Within each 128-byte slab of w the second-half B data sits 64 bytes
        # after the first-half B data.
66        LDP     d0, d6, [x13], 16       // Read A0
67        LDP     d4, d5, [x5]            // Read B
68        LDP     d2, d3, [x5, 64]        // Read B
69
70        # Is there at least 16 bytes for main loop?
        # A borrow here means only the 16 bytes already loaded remain to be
        # multiplied (plus possibly an 8-byte tail), so skip to the epilogue.
71        SUBS    x0, x0, 16              // k = k - 16
72        B.LO    3f
73
74        # Main loop - 16 bytes of A
75        # 4 groups of 2 mul/mla/adap = 6 cycles.
76        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
        # NOTE(review): adap above presumably means SADALP, and only A0 is
        # loaded in this one-row kernel. The cycle counts are not verifiable
        # from this file.
        # Each BLOCK n produces two columns. SMULL forms 16-bit products of the
        # first 8 A bytes with B, SMLAL adds the products of the second 8 A
        # bytes, and SADALP pairwise-adds the 16-bit sums into the 32-bit
        # accumulators. The B loads for a block are issued during the
        # previous block.
77
78        .p2align 3
792:
80        # BLOCK 0 - 4 cycles
        # Accumulates into v16 and v18. Those SADALPs are issued in BLOCK 1.
81        SMULL   v17.8h, v4.8b, v0.8b
82        SMULL   v19.8h, v5.8b, v0.8b
83        LDP     d4, d5, [x5, 16]
84        SMLAL   v17.8h, v2.8b, v6.8b
85        SMLAL   v19.8h, v3.8b, v6.8b
86        LDP     d2, d3, [x5, 80]
87
88        # BLOCK 1 - 6 cycles
        # Accumulates into v20 and v22. Also prefetches w at x5 + 448 and
        # x5 + 512.
89        SMULL   v21.8h, v4.8b, v0.8b
90        SMULL   v23.8h, v5.8b, v0.8b
91        PRFM    PLDL1KEEP, [x5, 448]
92        SADALP  v16.4s, v17.8h
93        PRFM    PLDL1KEEP, [x5, 512]
94        SADALP  v18.4s, v19.8h
95        LDP     d4, d5, [x5, 32]
96        SMLAL   v21.8h, v2.8b, v6.8b
97        SMLAL   v23.8h, v3.8b, v6.8b
98        LDP     d2, d3, [x5, 96]
99
100        # BLOCK 2 - 6 cycles
        # Accumulates into v24 and v26. Also prefetches A at x13 + 128.
101        SMULL   v17.8h, v4.8b, v0.8b
102        SMULL   v19.8h, v5.8b, v0.8b
103        PRFM    PLDL1KEEP, [x13, 128]
104        SADALP  v20.4s, v21.8h
105        SADALP  v22.4s, v23.8h
106        LDP     d4, d5, [x5, 48]
107        SMLAL   v17.8h, v2.8b, v6.8b
108        SMLAL   v19.8h, v3.8b, v6.8b
109        LDP     d2, d3, [x5, 112]
110
111        # BLOCK 3 - 14 cycles
        # Accumulates into v28 and v30, advances w by 128 bytes and reloads
        # A and B for the next pass. The flags set by SUBS are consumed by
        # B.HS at the end of the block. Nothing in between writes the flags.
        # The loop repeats while the subtraction does not borrow.
112        SMULL   v21.8h, v4.8b, v0.8b
113        ADD     x5, x5, 128
114        SMULL   v23.8h, v5.8b, v0.8b
115        SADALP  v24.4s, v17.8h
116        SUBS    x0, x0, 16
117        SADALP  v26.4s, v19.8h
118        LDP     d4, d5, [x5]            // Read B
119        SMLAL   v21.8h, v2.8b, v6.8b
120        SMLAL   v23.8h, v3.8b, v6.8b
121        LDP     d0, d6, [x13], 16       // Read A0
122        SADALP  v28.4s, v21.8h
123        LDP     d2, d3, [x5, 64]        // Read B
124        SADALP  v30.4s, v23.8h
125        B.HS    2b
126
127        # Epilogue
128        # Same as main loop except no loads at end of loop
        # Multiplies the 16 bytes of A that are already in d0 and d6.
129
130        .p2align 3
1313:
132       # BLOCK 0 - 4 cycles
133        SMULL   v17.8h, v4.8b, v0.8b
134        SMULL   v19.8h, v5.8b, v0.8b
135        LDP     d4, d5, [x5, 16]
136        SMLAL   v17.8h, v2.8b, v6.8b
137        SMLAL   v19.8h, v3.8b, v6.8b
138        LDP     d2, d3, [x5, 80]
139
140        # BLOCK 1 - 6 cycles
141        SMULL   v21.8h, v4.8b, v0.8b
142        SMULL   v23.8h, v5.8b, v0.8b
143        PRFM    PLDL1KEEP, [x5, 448]
144        SADALP  v16.4s, v17.8h
145        PRFM    PLDL1KEEP, [x5, 512]
146        SADALP  v18.4s, v19.8h
147        LDP     d4, d5, [x5, 32]
148        SMLAL   v21.8h, v2.8b, v6.8b
149        SMLAL   v23.8h, v3.8b, v6.8b
150        LDP     d2, d3, [x5, 96]
151
152        # BLOCK 2 - 6 cycles
153        SMULL   v17.8h, v4.8b, v0.8b
154        SMULL   v19.8h, v5.8b, v0.8b
155        PRFM    PLDL1KEEP, [x13, 128]
156        SADALP  v20.4s, v21.8h
157        SADALP  v22.4s, v23.8h
158        LDP     d4, d5, [x5, 48]
159        SMLAL   v17.8h, v2.8b, v6.8b
160        SMLAL   v19.8h, v3.8b, v6.8b
161        LDP     d2, d3, [x5, 112]
162
163        # BLOCK 3 - 8 cycles
        # The flags written by the SUBS in this block are not consumed: the
        # next conditional branch follows another SUBS. Subtracting 16 leaves
        # bit 3 of x0 unchanged, so the TBNZ test below is unaffected.
164        SMULL   v21.8h, v4.8b, v0.8b
165        ADD     x5, x5, 128
166        SMULL   v23.8h, v5.8b, v0.8b
167        SADALP  v24.4s, v17.8h
168        SUBS    x0, x0, 16
169        SADALP  v26.4s, v19.8h
170        SMLAL   v21.8h, v2.8b, v6.8b
171        SMLAL   v23.8h, v3.8b, v6.8b
172        SADALP  v28.4s, v21.8h
173        SADALP  v30.4s, v23.8h
174
175        # Is there a remainder?- 8 bytes of A
        # x0 is always a multiple of 8 here (kc was rounded to 8 and only 16
        # is ever subtracted), so bit 3 is set exactly when kc is an odd
        # multiple of 8 and one more 8-byte step is needed.
176        TBNZ    x0, 3, 5f
177
178        # ks loop
        # Continue with the next A pointer while the remaining ks is nonzero.
179        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
180        B.HI    1b
181
1824:
183        # Add columns
        # Three levels of pairwise adds collapse each accumulator to a single
        # 32-bit sum. v0 ends up holding the sums of v16 v18 v20 v22 and v1
        # the sums of v24 v26 v28 v30, in that lane order.
184        ADDP    v16.4s, v16.4s, v18.4s
185        ADDP    v20.4s, v20.4s, v22.4s
186        ADDP    v24.4s, v24.4s, v26.4s
187        ADDP    v28.4s, v28.4s, v30.4s
188        ADDP    v0.4s, v16.4s, v20.4s
189        ADDP    v1.4s, v24.4s, v28.4s
190
191        # Apply params - scale, bias and clamp
        # x11 walks the params block in four reads:
        #   4 bytes broadcast to v4.4s, used as the float multiplier,
        #   2 bytes broadcast to v5.8h, added with saturation after narrowing
        #   to 16 bit (presumably the output zero point - confirm against the
        #   params struct),
        #   1 byte broadcast to v1, used as the SMAX lower bound,
        #   1 byte broadcast to v17, used as the SMIN upper bound.
        # The three post-increments add up to 4 + 2 + 1 = 7 bytes and are
        # undone by the SUB below, so the next column group reads the same
        # params again.
        # SUBS x1 sets the flags used by both B.LO 6f and B.HI 0b. None of the
        # instructions in between writes the condition flags.
192        SCVTF   v0.4s, v0.4s
193        LD1R    {v4.4s}, [x11], 4
194        SCVTF   v1.4s, v1.4s
195        FMUL    v0.4s, v0.4s, v4.4s
196        FMUL    v1.4s, v1.4s, v4.4s
197
198        FCVTNS  v0.4s, v0.4s
199        FCVTNS  v1.4s, v1.4s
200
201        LD1R    {v5.8h}, [x11], 2
202        SQXTN   v0.4h, v0.4s
203        SQXTN2  v0.8h, v1.4s
204        SUBS    x1, x1, 8
205        SQADD   v0.8h, v0.8h, v5.8h
206        LD1R    {v1.16b}, [x11], 1
207        SQXTN   v0.8b, v0.8h
208        LD1R    {v17.16b}, [x11]
209        SMAX    v0.8b, v0.8b, v1.8b
210        SUB     x11, x11, 7          // rewind params pointer
211        SMIN    v0.8b, v0.8b, v17.8b
212        B.LO    6f
213
214        # Store full 1 x 8
        # Writes 8 bytes and advances c by cn_stride. The A pointer array is
        # rewound by ks so the next column group walks the same pointers.
        # B.HI loops while columns remain after this store. Otherwise the
        # routine returns.
215        ST1     {v0.8b}, [x6], x10
216        SUB     x4, x4, x3              // a -= ks
217        B.HI    0b
218        RET
219
220        # Remainder - 8 bytes of A
        # Reached either directly from label 1 (kc below 16) or from the
        # epilogue. One 8-byte slice of A in d0 is multiplied with 64 bytes of
        # B, loaded in two batches of four 8-byte vectors into d4 to d7, and
        # added to all eight accumulators. There is no SMLAL step because
        # there is no second half of A.
221        .p2align 3
2225:
223        LDR     d0, [x13], 8
224        LDP     d4, d5, [x5]
225        LDP     d6, d7, [x5, 16]
226        SMULL   v17.8h, v4.8b, v0.8b
227        SMULL   v19.8h, v5.8b, v0.8b
228        SMULL   v21.8h, v6.8b, v0.8b
229        SMULL   v23.8h, v7.8b, v0.8b
230        LDP     d4, d5, [x5, 32]
231        LDP     d6, d7, [x5, 48]
232        SADALP  v16.4s, v17.8h
233        SADALP  v18.4s, v19.8h
234        SADALP  v20.4s, v21.8h
235        SADALP  v22.4s, v23.8h
236        SMULL   v17.8h, v4.8b, v0.8b
237        SMULL   v19.8h, v5.8b, v0.8b
238        SMULL   v21.8h, v6.8b, v0.8b
239        SMULL   v23.8h, v7.8b, v0.8b
240        ADD     x5, x5, 64
241        SADALP  v24.4s, v17.8h
242        SADALP  v26.4s, v19.8h
243        SADALP  v28.4s, v21.8h
244        SADALP  v30.4s, v23.8h
245
246        # ks loop
        # Same loop test as after the epilogue. When ks is exhausted, jump
        # back to the reduction at label 4.
247        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
248        B.HI    1b
249        B       4b
250
251        # Store odd width
        # Reached when fewer than 8 columns remained before SUBS x1 at label
        # 4. Subtracting 8 does not change bits 0 to 2 of x1, so those bits
        # still hold the remaining column count. Bit 2 stores 4 bytes, bit 1
        # stores 2 bytes and bit 0 stores 1 byte. After each partial store EXT
        # rotates v0 so the next unstored result sits in lane 0.
252        .p2align 3
2536:
254        TBZ     x1, 2, 7f
255        STR     s0, [x6], 4
256        EXT     v0.16b, v0.16b, v0.16b, 4
257
2587:
259        TBZ     x1, 1, 8f
260        STR     h0, [x6], 2
261        EXT     v0.16b, v0.16b, v0.16b, 2
2628:
263        TBZ     x1, 0, 9f
264        STR     b0, [x6]
2659:
266        RET
267
268END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
269
270#ifdef __ELF__
271.section ".note.GNU-stack","",%progbits
272#endif
273