// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

// Overview
//   Each pass of the outer loop (label 0) produces up to 4 rows x 16 columns
//   of int8 output:
//     1. v16-v31 are seeded with 16 x 32-bit values read from w (x5); the
//        same 16 values are copied to all 4 rows.
//     2. A and B bytes are sign-extended to 16 bits and accumulated into
//        32-bit lanes with SMLAL/SMLAL2 (by lane), 8 bytes of A per row per
//        step (labels 1 and 2).  A tail of 1 to 7 bytes is handled at label 4.
//     3. Label 3 converts the accumulators to float, multiplies them by 16
//        per-column scales read from w, converts back to int32 (FCVTNS),
//        narrows with saturation to int16, adds the 16-bit value read from
//        params, narrows with saturation to int8 and clamps to the min and
//        max bytes read from params.
//     4. A full 16-column tile is stored and every C pointer advances by
//        cn_stride.  A partial tile (fewer than 16 columns left) is stored at
//        label 5, after which the function returns.
//
//   Layout of w as consumed per 16-column group:
//     64 bytes of initial accumulator values, then 16 bytes of B per byte of
//     kc, then 64 bytes of scales.
//
//   Only x0-x13, x15, x17, v0-v6 and v16-v31 are written, and the stack is
//   only read (cn_stride, params), so nothing needs to be saved or restored.
//
//   The instruction interleaving in labels 1 and 2 is deliberate scheduling
//   for Cortex-A53 (see the function name and the temp register note above);
//   do not reorder it.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

        # Clamp A and C pointers
        // Rows beyond mr alias the previous row, so they read valid A data
        // and rewrite the same C bytes instead of touching memory out of
        // range.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        // Outer loop: one iteration per group of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // v16/v20/v24/v28 receive columns 0-3/4-7/8-11/12-15 for row 0 and
        // are then copied to rows 1-3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Are there at least 8 bytes for the epilogue?
        B.LO    4f                      // kc < 8: only the 1 to 7 byte remainder path runs

        # Prologue
        // Load the first 8 bytes of each A row and the first 16 bytes of B
        // (v4 = columns 0-7, v6 = columns 8-15), and stage the next 8 bytes
        // of B in x17.  Everything is sign-extended to 16-bit lanes.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Are there at least 8 bytes for the main loop?
        B.LO    2f                      // fewer than 16 bytes left: go straight to the epilogue

        # Main loop - 8 bytes of A
        // Consumes 128 bytes of B (8 k-steps x 16 columns) per iteration.
        // v4, v5 and v6 rotate as B registers: while one pair is being
        // multiplied, the next 8-byte halves are fetched, one with LDR d and
        // one through x17 followed by INS, then widened with SXTL.  The tail
        // of the loop also loads and widens the next 8 bytes of every A row
        // (rows 0 and 2 through x10/x17 + INS) and the first 16 bytes of the
        // next B group, re-establishing the state set up by the prologue.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x3, 128]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        PRFM    PLDL1KEEP, [x4, 128]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x5, 448]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x5, 512]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w += 8 k-steps * 16 columns

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]               // next group: B columns 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x10, [x3], 8            // next 8 bytes of A0 (moved to v0 below)

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next group: B columns 8-15
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8           // next 8 bytes of A2 (moved to v2 below)
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x15], 8            // next 8 bytes of A1; v1 is no longer read this iteration
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8             // next 8 bytes of A3
        INS     v2.d[0], x17

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
        // Consumes the 8 bytes of A and first 16 bytes of B that were loaded
        // by the prologue or by the last main loop iteration, plus the
        // remaining 112 bytes of this B group.  No A data and no B data of a
        // following group is loaded here.

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w += 8 k-steps * 16 columns
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

        // Requantize the 4x16 int32 accumulators to int8 and store them.
        // On entry x5 points at the 16 per-column scales that follow the B
        // data of this column group.
3:
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 reused; its last reader is above)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Float -> int32, rounding to nearest.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16.  Afterwards v16-v19 hold columns
        // 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8; v0-v3 end up holding the 16
        // output bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.NE below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left: partial store

        # Store full 4 x 16
        // C pointers advance by cn_stride; A pointers are rewound by kc so the
        // next column group rereads the same rows of A.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b                      // columns remain (flags from SUBS x1; ST1/SUB leave them intact)
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached from label 0 when kc < 8 (x0 then holds kc - 8, hence the
        // AND below) or from the epilogue when kc is not a multiple of 8.
        // A full 8 bytes are loaded from every A row even though each pointer
        // advances by only x0; lanes beyond the remainder are never used.
        // Each CMP below feeds two branches: B.LO right after it and B.EQ
        // after the following k-step (nothing in between writes the flags).
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b},  [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b},  [x4], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 holds nc - 16 after wrap-around, so its low 4 bits still equal
        // the number of columns left (1 to 15).  Bits 3..0 select an 8, 4, 2
        // and 1 byte store per row; after each store the not yet written
        // bytes are moved down to lane 0 with DUP.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
797