xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
# Remainder path (8 bytes of A): A0 uses v0 only; B is loaded into v4 v5 v6 v7
34
35
// Kernel body overview (one row of A, up to 8 output columns per outer pass):
//   label 0: seed eight 32-bit accumulators from w (x5); p (x9) = ks
//   label 1: fetch the next A pointer from a (x4), substituting `zero` (x12)
//            or adding a_offset (x8)
//   label 2: software-pipelined main loop, 16 bytes of A and 128 bytes of B
//            per pass
//   label 3: epilogue for the last preloaded 16 bytes of A
//   label 5: remainder for a trailing 8 bytes of A (64 bytes of B)
//   label 4: reduce, scale by per-channel floats read from x5, convert back,
//            offset, narrow and clamp with three values read from params (x11)
//   labels 6-9: partial store when fewer than 8 columns remain
// sp is only read (never adjusted) and only x0-x13, v0-v7 and v16-v30 are
// written, so no callee-saved state has to be spilled.
36BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
37
38        # Clamp C pointers
          // NOTE: nothing is clamped in this block. The four instructions below
          // only load the stack arguments and round kc up to a multiple of 8.
39        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
40        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
41        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
42        BIC     x2, x2, 7
43
44        .p2align 3
          // Outer loop over nc: one pass per block of up to 8 output columns.
450:
46        # Load initial bias from w into accumulators
          // Scalar S-register loads clear the upper lanes, so every accumulator
          // starts as {bias, 0, 0, 0}; the ADDP reduction at label 4 relies on it.
47        LDP     s16, s18, [x5], 8
48        LDP     s20, s22, [x5], 8
49        LDP     s24, s26, [x5], 8
50        LDP     s28, s30, [x5], 8
51        MOV     x9, x3                  // p = ks
52
53        .p2align 3
          // ks loop: one A pointer is consumed from the indirection buffer per pass.
541:
55        # Load next A pointer
56        LDR     x13, [x4], 8
57        CMP     x13, x12                // if a0 == zero
58        ADD     x13, x13, x8            // a0 += a_offset
59        CSEL    x13, x12, x13, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset
60
61        # Is there at least 16 bytes for epilogue?
          // If the rounded kc is below 16, only the 8-byte remainder path (5) runs.
62        SUBS    x0, x2, 16              // k = kc - 16
63        B.LO    5f
64
65        # Prologue: load A0 and 4 B's
          // B layout per 16 bytes of A: bytes 0..63 at x5 are multiplied with v0
          // (A bytes 0-7), bytes 64..127 with v6 (A bytes 8-15). Within each half,
          // consecutive 8-byte groups belong to output columns 0..7.
66        LDP     d0, d6, [x13], 16       // Read A0
67        LDP     d4, d5, [x5]            // Read B
68        LDP     d2, d3, [x5, 64]        // Read B
69
70        # Is there at least 16 bytes for main loop?
71        SUBS    x0, x0, 16              // k = k - 16
72        B.LO    3f
73
74        # Main loop - 16 bytes of A
75        # 4 groups of 2 mul/mla/adap = 6 cycles.
76        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
          // Each pass multiplies the already-loaded A (v0, v6) with 128 bytes of B,
          // advances x5 by 128 and preloads A and the first B operands for the next
          // pass (or for the epilogue at label 3).
          // Per column pair: SMULL forms int8*int8 products in 16 bits, SMLAL adds
          // the products of the second A half, SADALP pairwise-adds into 32 bits.
          // NOTE(review): the 16-bit SMULL+SMLAL sum can only wrap when both
          // products are (-128)*(-128); presumably the packed weights exclude
          // -128 -- confirm against the weight-packing code.
77
78        .p2align 3
792:
80        # BLOCK 0 - 4 cycles
81        SMULL   v17.8h, v4.8b, v0.8b
82        SMULL   v19.8h, v5.8b, v0.8b
83        LDP     d4, d5, [x5, 16]
84        SMLAL   v17.8h, v2.8b, v6.8b
85        SMLAL   v19.8h, v3.8b, v6.8b
86        LDP     d2, d3, [x5, 80]
87
88        # BLOCK 1 - 6 cycles
89        SMULL   v21.8h, v4.8b, v0.8b
90        SMULL   v23.8h, v5.8b, v0.8b
91        PRFM    PLDL1KEEP, [x5, 448]
92        SADALP  v16.4s, v17.8h
93        PRFM    PLDL1KEEP, [x5, 512]
94        SADALP  v18.4s, v19.8h
95        LDP     d4, d5, [x5, 32]
96        SMLAL   v21.8h, v2.8b, v6.8b
97        SMLAL   v23.8h, v3.8b, v6.8b
98        LDP     d2, d3, [x5, 96]
99
100        # BLOCK 2 - 6 cycles
101        SMULL   v17.8h, v4.8b, v0.8b
102        SMULL   v19.8h, v5.8b, v0.8b
103        PRFM    PLDL1KEEP, [x13, 128]
104        SADALP  v20.4s, v21.8h
105        SADALP  v22.4s, v23.8h
106        LDP     d4, d5, [x5, 48]
107        SMLAL   v17.8h, v2.8b, v6.8b
108        SMLAL   v19.8h, v3.8b, v6.8b
109        LDP     d2, d3, [x5, 112]
110
111        # BLOCK 3 - 14 cycles
112        SMULL   v21.8h, v4.8b, v0.8b
113        ADD     x5, x5, 128
114        SMULL   v23.8h, v5.8b, v0.8b
115        SADALP  v24.4s, v17.8h
116        SUBS    x0, x0, 16              // k -= 16; flags stay live until B.HS below
117        SADALP  v26.4s, v19.8h
118        LDP     d4, d5, [x5]            // Read B
119        SMLAL   v21.8h, v2.8b, v6.8b
120        SMLAL   v23.8h, v3.8b, v6.8b
121        LDP     d0, d6, [x13], 16       // Read A0
122        SADALP  v28.4s, v21.8h
123        LDP     d2, d3, [x5, 64]        // Read B
124        SADALP  v30.4s, v23.8h
125        B.HS    2b                      // repeat while the SUBS above did not borrow
126
127        # Epilogue
128        # Same as main loop except no loads at end of loop
          // Entered with v0/v6 (A) and v4/v5/v2/v3 (first B operands) already loaded.
129
130        .p2align 3
1313:
132       # BLOCK 0 - 4 cycles
133        SMULL   v17.8h, v4.8b, v0.8b
134        SMULL   v19.8h, v5.8b, v0.8b
135        LDP     d4, d5, [x5, 16]
136        SMLAL   v17.8h, v2.8b, v6.8b
137        SMLAL   v19.8h, v3.8b, v6.8b
138        LDP     d2, d3, [x5, 80]
139
140        # BLOCK 1 - 6 cycles
141        SMULL   v21.8h, v4.8b, v0.8b
142        SMULL   v23.8h, v5.8b, v0.8b
143        PRFM    PLDL1KEEP, [x5, 448]
144        SADALP  v16.4s, v17.8h
145        PRFM    PLDL1KEEP, [x5, 512]
146        SADALP  v18.4s, v19.8h
147        LDP     d4, d5, [x5, 32]
148        SMLAL   v21.8h, v2.8b, v6.8b
149        SMLAL   v23.8h, v3.8b, v6.8b
150        LDP     d2, d3, [x5, 96]
151
152        # BLOCK 2 - 6 cycles
153        SMULL   v17.8h, v4.8b, v0.8b
154        SMULL   v19.8h, v5.8b, v0.8b
155        PRFM    PLDL1KEEP, [x13, 128]
156        SADALP  v20.4s, v21.8h
157        SADALP  v22.4s, v23.8h
158        LDP     d4, d5, [x5, 48]
159        SMLAL   v17.8h, v2.8b, v6.8b
160        SMLAL   v19.8h, v3.8b, v6.8b
161        LDP     d2, d3, [x5, 112]
162
163        # BLOCK 3 - 8 cycles
164        SMULL   v21.8h, v4.8b, v0.8b
165        ADD     x5, x5, 128
166        SMULL   v23.8h, v5.8b, v0.8b
167        SADALP  v24.4s, v17.8h
          // The flags of the SUBS below are not consumed (the next flag reader
          // follows a later SUBS x9); subtracting 16 also leaves bit 3 of x0
          // unchanged, so the TBNZ test further down is unaffected by it.
168        SUBS    x0, x0, 16
169        SADALP  v26.4s, v19.8h
170        SMLAL   v21.8h, v2.8b, v6.8b
171        SMLAL   v23.8h, v3.8b, v6.8b
172        SADALP  v28.4s, v21.8h
173        SADALP  v30.4s, v23.8h
174
175        # Is there a remainder?- 8 bytes of A
          // kc was rounded to a multiple of 8, so bit 3 of k set means exactly
          // 8 bytes of A are still unprocessed.
176        TBNZ    x0, 3, 5f
177
178        # ks loop
179        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
180        B.HI    1b
181
          // Requantization of the 8 column sums.
1824:
183        # Add columns
          // Each accumulator holds 4 partial sums of one column. Two levels of
          // ADDP collapse them into v0 = columns 0-3 and v1 = columns 4-7.
184        ADDP    v16.4s, v16.4s, v18.4s
185        ADDP    v20.4s, v20.4s, v22.4s
186        ADDP    v24.4s, v24.4s, v26.4s
187        ADDP    v28.4s, v28.4s, v30.4s
188        ADDP    v0.4s, v16.4s, v20.4s
189        ADDP    v1.4s, v24.4s, v28.4s
190
191        # Load per channel scale values from weights
          // int32 -> float, multiply by the 8 floats read from x5, then FCVTNS
          // converts back to int32 rounding to nearest (ties to even).
192        SCVTF   v0.4s, v0.4s
193        LDR     q4, [x5], 16
194        SCVTF   v1.4s, v1.4s
195        LDR     q5, [x5], 16
196        FMUL    v0.4s, v0.4s, v4.4s
197        FMUL    v1.4s, v1.4s, v5.4s
198
199        FCVTNS  v0.4s, v0.4s
200        FCVTNS  v1.4s, v1.4s
201
          // params is read as: one 16-bit value (saturating-added after the
          // narrow to int16), then one byte used as lower bound (SMAX) and one
          // byte used as upper bound (SMIN).
202        LD1R    {v5.8h}, [x11], 2
203        SQXTN   v0.4h, v0.4s
204        SQXTN2  v0.8h, v1.4s
          // No instruction between this SUBS and B.LO / B.HI below writes flags.
205        SUBS    x1, x1, 8               // nc -= 8
206        SQADD   v0.8h, v0.8h, v5.8h
207        LD1R    {v1.16b}, [x11], 1
208        SQXTN   v0.8b, v0.8h
209        LD1R    {v17.16b}, [x11]
210        SMAX    v0.8b, v0.8b, v1.8b
211        SUB     x11, x11, 3          // rewind params pointer
212        SMIN    v0.8b, v0.8b, v17.8b
213        B.LO    6f                      // fewer than 8 columns were left
214
215        # Store full 1 x 8
216        ST1     {v0.8b}, [x6], x10      // c += cn_stride
217        SUB     x4, x4, x3              // a -= ks
218        B.HI    0b                      // more columns remain
219        RET
220
221        # Remainder - 8 bytes of A
          // Reached from label 1 when the rounded kc is below 16, or from the
          // epilogue when 8 bytes of A are left. Consumes 8 bytes of A and
          // 64 bytes of B. Here v6/v7 hold B data (v6 holds A in the main loop).
222        .p2align 3
2235:
224        LDR     d0, [x13], 8
225        LDP     d4, d5, [x5]
226        LDP     d6, d7, [x5, 16]
227        SMULL   v17.8h, v4.8b, v0.8b
228        SMULL   v19.8h, v5.8b, v0.8b
229        SMULL   v21.8h, v6.8b, v0.8b
230        SMULL   v23.8h, v7.8b, v0.8b
231        LDP     d4, d5, [x5, 32]
232        LDP     d6, d7, [x5, 48]
233        SADALP  v16.4s, v17.8h
234        SADALP  v18.4s, v19.8h
235        SADALP  v20.4s, v21.8h
236        SADALP  v22.4s, v23.8h
237        SMULL   v17.8h, v4.8b, v0.8b
238        SMULL   v19.8h, v5.8b, v0.8b
239        SMULL   v21.8h, v6.8b, v0.8b
240        SMULL   v23.8h, v7.8b, v0.8b
241        ADD     x5, x5, 64
242        SADALP  v24.4s, v17.8h
243        SADALP  v26.4s, v19.8h
244        SADALP  v28.4s, v21.8h
245        SADALP  v30.4s, v23.8h
246
247        # ks loop
248        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
249        B.HI    1b
250        B       4b
251
252        # Store odd width
          // x1 already had 8 subtracted, which leaves its low 3 bits equal to
          // those of the remaining column count. Store 4, 2 and 1 bytes as
          // those bits dictate; EXT rotates the next bytes down to lane 0.
253        .p2align 3
2546:
255        TBZ     x1, 2, 7f
256        STR     s0, [x6], 4
257        EXT     v0.16b, v0.16b, v0.16b, 4
258
2597:
260        TBZ     x1, 1, 8f
261        STR     h0, [x6], 2
262        EXT     v0.16b, v0.16b, v0.16b, 2
2638:
264        TBZ     x1, 0, 9f
265        STR     b0, [x6]
2669:
267        RET
268
269END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm
270
271#ifdef __ELF__
272.section ".note.GNU-stack","",%progbits
273#endif
274