1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t** restrict a, x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x20  v3
34# B    x5  v4  v5  v6
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
40# x11, x21 temp for Cortex-A53 loads
41
42BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
        // Entry sequence: build the four output row pointers c0..c3
        // (x6, x16, x17, x7), fetch the four stack arguments and save x20/x21,
        // the only callee-saved registers this kernel writes.
        // When mr < 4 a missing row reuses the pointer of the row before it,
        // so the stores for rows >= mr land on the last real row.  The store
        // code writes row 3 first and row 0 last, so the real row is written
        // after its aliases.
43
44        # Clamp C pointers
45        CMP     x0, 2                   // if mr < 2
46        LDP     x10, x8, [sp]           // Load cn_stride, a_offset (SP not yet moved)
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49
50        ADD     x17, x16, x7            // c2 = c1 + cm_stride
51        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
52                                        // if mr <= 2 (flags still from CMP x0, 2;
                                        // ADD and LDP do not write flags)
53        CSEL    x17, x16, x17, LS       //   c2 = c1
54
55        CMP     x0, 4                   // if mr < 4
56        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack; SP -= 16, so the
                                        // stack arguments are now 16 bytes further up
57        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
58        CSEL    x7,  x17, x7, LO        //   c3 = c2
59
60
61        .p2align 3
620:
        // Label 0 - start of one block of 16 output columns.  Re-entered by the
        // nc loop branch that follows the full-width store.
        // The next 64 bytes at w seed the accumulators: q16 / q20 / q24 / q28
        // receive columns 0-3 / 4-7 / 8-11 / 12-15 as 32-bit lanes for row 0,
        // and the MOVs copy the same values into the registers of rows 1-3
        // (see the C0..C3 lines of the register usage table).
63        # Load initial bias from w into accumulators
64        LDP     q16, q20, [x5], 32
65        MOV     v17.16b, v16.16b
66        MOV     v18.16b, v16.16b
67        LDP     q24, q28, [x5], 32
68        MOV     v19.16b, v16.16b
69        MOV     v21.16b, v20.16b
70        MOV     v22.16b, v20.16b
71        MOV     v23.16b, v20.16b
72        MOV     v25.16b, v24.16b
73        MOV     v26.16b, v24.16b
74        MOV     v27.16b, v24.16b
75        MOV     v29.16b, v28.16b
76        MOV     v30.16b, v28.16b
77        MOV     v31.16b, v28.16b
78        MOV     x9, x3                  // p = ks (counted down by 32 per ks iteration)
79
80        .p2align 3
811:
        // Label 1 - top of the ks loop.  Each iteration takes four row pointers
        // (32 bytes) from the indirection buffer at x4 and processes kc bytes
        // from each of them.
        // A pointer that equals `zero` (x12) is used unchanged; any other
        // pointer gets a_offset (x8) added.
82        # Load next 4 A pointers
83        LDP     x13, x14, [x4], 16
84        LDP     x15, x20, [x4], 16
85
86        CMP     x13, x12                // if a0 == zero
87        ADD     x13, x13, x8            // a0 += a_offset
88        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
89        CMP     x14, x12                // if a1 == zero
90        ADD     x14, x14, x8            // a1 += a_offset
91        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
92        CMP     x15, x12                // if a2 == zero
93        ADD     x15, x15, x8            // a2 += a_offset
94        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
95        CMP     x20, x12                // if a3 == zero
96        ADD     x20, x20, x8            // a3 += a_offset
97        CSEL    x20, x12, x20, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
98
99        # Is there at least 8 bytes for epilogue?
100        SUBS    x0, x2, 8               // k = kc - 8
101        B.LO    5f                      // kc < 8: only the remainder code runs
102
        // The prologue fills the software pipeline: the first 8 bytes of every
        // A row go to v0-v3, the first 16 bytes of B go to v4 (columns 0-7) and
        // v6 (columns 8-15), and x11 receives the raw bytes for v5.
        // All vector data is sign-extended from 8 to 16 bits before use.
        // w (x5) is not advanced here; the main loop and epilogue add 128.
103        # Prologue
104        LDR     d0, [x13], 8
105        LDP     d4, d6, [x5]
106        LDR     d1, [x14], 8
107        LDR     d2, [x15], 8
108        LDR     d3, [x20], 8
109        SXTL    v0.8h, v0.8b
110        LDR     x11, [x5, 16]           // B k=1, columns 0-7; x11 stops being the
                                        // params pointer here (reloaded in epilogue)
111        SXTL    v4.8h, v4.8b
112        SXTL    v1.8h, v1.8b
113        SXTL    v2.8h, v2.8b
114        SXTL    v3.8h, v3.8b
115        SXTL    v6.8h, v6.8b
116
117        SUBS    x0, x0, 8               // k = k - 8
118        # Is there at least 8 bytes for main loop?
119        B.LO    3f                      // kc < 16: skip the main loop, run the epilogue once
120
121        # Main loop - 8 bytes of A
122        .p2align 3
1232:
        // Label 2 - one iteration multiplies the 8 A values per row that are
        // already widened in v0-v3 (lane h[k] is byte k) with 128 bytes of B:
        // 8 k-steps of 16 columns, 16 bytes per step.
        //   SMLAL  with the low  half of a B register updates columns 0-3 or 8-11,
        //   SMLAL2 with the high half of a B register updates columns 4-7 or 12-15.
        // B rotates through v4/v5/v6 in 8-byte pieces.  One piece of each k-step
        // is loaded with LDR d, the other with a 64-bit LDR into x11 followed by
        // INS (the register usage notes list x11/x21 as temporaries for
        // Cortex-A53 loads).  Each load overwrites a register only after its
        // last use, and SXTL widens the new bytes before their first use.
        // The tail of the loop preloads the next 8 bytes of every A row and the
        // first 24 bytes of the next B group, which either the next iteration
        // or the epilogue consumes.
        // NOTE(review): the interleaving looks tuned for Cortex-A53 issue rules;
        // do not reorder without measuring.
124        SMLAL   v16.4s, v4.4h, v0.h[0]
125        SMLAL2  v20.4s, v4.8h, v0.h[0]
126        PRFM    PLDL1KEEP, [x13, 128]   // prefetch A0, 128 bytes ahead
127        SMLAL   v17.4s, v4.4h, v1.h[0]
128        SMLAL2  v21.4s, v4.8h, v1.h[0]
129        PRFM    PLDL1KEEP, [x14, 128]   // prefetch A1, 128 bytes ahead
130        SMLAL   v18.4s, v4.4h, v2.h[0]
131        SMLAL2  v22.4s, v4.8h, v2.h[0]
132        PRFM    PLDL1KEEP, [x15, 128]   // prefetch A2, 128 bytes ahead
133        SMLAL   v19.4s, v4.4h, v3.h[0]
134        SMLAL2  v23.4s, v4.8h, v3.h[0]
135        PRFM    PLDL1KEEP, [x20, 128]   // prefetch A3, 128 bytes ahead
136        LDR     d4, [x5, 24]            // v4 = B k=1, columns 8-15
137        INS     v5.d[0], x11            // v5 = B k=1, columns 0-7
138        SMLAL   v24.4s, v6.4h, v0.h[0]
139        SMLAL2  v28.4s, v6.8h, v0.h[0]
140        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B, 448 bytes ahead
141        SMLAL   v25.4s, v6.4h, v1.h[0]
142        SMLAL2  v29.4s, v6.8h, v1.h[0]
143        PRFM    PLDL1KEEP, [x5, 512]    // prefetch B, 512 bytes ahead
144        SXTL    v5.8h, v5.8b
145        SMLAL   v26.4s, v6.4h, v2.h[0]
146        SMLAL2  v30.4s, v6.8h, v2.h[0]
147        SMLAL   v27.4s, v6.4h, v3.h[0]
148        SMLAL2  v31.4s, v6.8h, v3.h[0]
149        LDR     x11, [x5, 32]           // B k=2, columns 0-7 (for v6)
150        SMLAL   v16.4s, v5.4h, v0.h[1]
151        SMLAL2  v20.4s, v5.8h, v0.h[1]
152        SMLAL   v17.4s, v5.4h, v1.h[1]
153        SMLAL2  v21.4s, v5.8h, v1.h[1]
154        SXTL    v4.8h, v4.8b
155        SMLAL   v18.4s, v5.4h, v2.h[1]
156        SMLAL2  v22.4s, v5.8h, v2.h[1]
157        SMLAL   v19.4s, v5.4h, v3.h[1]
158        SMLAL2  v23.4s, v5.8h, v3.h[1]
159        LDR     d5, [x5, 40]            // v5 = B k=2, columns 8-15
160        INS     v6.d[0], x11            // v6 = B k=2, columns 0-7
161        SMLAL   v24.4s, v4.4h, v0.h[1]
162        SMLAL2  v28.4s, v4.8h, v0.h[1]
163        SMLAL   v25.4s, v4.4h, v1.h[1]
164        SMLAL2  v29.4s, v4.8h, v1.h[1]
165        SXTL    v6.8h, v6.8b
166        SMLAL   v26.4s, v4.4h, v2.h[1]
167        SMLAL2  v30.4s, v4.8h, v2.h[1]
168        SMLAL   v27.4s, v4.4h, v3.h[1]
169        SMLAL2  v31.4s, v4.8h, v3.h[1]
170        LDR     x11, [x5, 48]           // B k=3, columns 0-7 (for v4)
171        SMLAL   v16.4s, v6.4h, v0.h[2]
172        SMLAL2  v20.4s, v6.8h, v0.h[2]
173        SMLAL   v17.4s, v6.4h, v1.h[2]
174        SXTL    v5.8h, v5.8b
175        SMLAL2  v21.4s, v6.8h, v1.h[2]
176        SMLAL   v18.4s, v6.4h, v2.h[2]
177        SMLAL2  v22.4s, v6.8h, v2.h[2]
178        SMLAL   v19.4s, v6.4h, v3.h[2]
179        SMLAL2  v23.4s, v6.8h, v3.h[2]
180        LDR     d6, [x5, 56]            // v6 = B k=3, columns 8-15
181        INS     v4.d[0], x11            // v4 = B k=3, columns 0-7
182        SMLAL   v24.4s, v5.4h, v0.h[2]
183        SMLAL2  v28.4s, v5.8h, v0.h[2]
184        SMLAL   v25.4s, v5.4h, v1.h[2]
185        SMLAL2  v29.4s, v5.8h, v1.h[2]
186        SXTL    v4.8h, v4.8b
187        SMLAL   v26.4s, v5.4h, v2.h[2]
188        SMLAL2  v30.4s, v5.8h, v2.h[2]
189        SMLAL   v27.4s, v5.4h, v3.h[2]
190        SMLAL2  v31.4s, v5.8h, v3.h[2]
191        LDR     x11, [x5, 64]           // B k=4, columns 0-7 (for v5)
192        SMLAL   v16.4s, v4.4h, v0.h[3]
193        SMLAL2  v20.4s, v4.8h, v0.h[3]
194        SMLAL   v17.4s, v4.4h, v1.h[3]
195        SMLAL2  v21.4s, v4.8h, v1.h[3]
196        SXTL    v6.8h, v6.8b
197        SMLAL   v18.4s, v4.4h, v2.h[3]
198        SMLAL2  v22.4s, v4.8h, v2.h[3]
199        SMLAL   v19.4s, v4.4h, v3.h[3]
200        SMLAL2  v23.4s, v4.8h, v3.h[3]
201        LDR     d4, [x5, 72]            // v4 = B k=4, columns 8-15
202        INS     v5.d[0], x11            // v5 = B k=4, columns 0-7
203        SMLAL   v24.4s, v6.4h, v0.h[3]
204        SMLAL2  v28.4s, v6.8h, v0.h[3]
205        SXTL    v5.8h, v5.8b
206        SMLAL   v25.4s, v6.4h, v1.h[3]
207        SMLAL2  v29.4s, v6.8h, v1.h[3]
208        SMLAL   v26.4s, v6.4h, v2.h[3]
209        SMLAL2  v30.4s, v6.8h, v2.h[3]
210        SMLAL   v27.4s, v6.4h, v3.h[3]
211        SMLAL2  v31.4s, v6.8h, v3.h[3]
212        LDR     x11, [x5, 80]           // B k=5, columns 0-7 (for v6)
213        SMLAL   v16.4s, v5.4h, v0.h[4]
214        SMLAL2  v20.4s, v5.8h, v0.h[4]
215        SMLAL   v17.4s, v5.4h, v1.h[4]
216        SMLAL2  v21.4s, v5.8h, v1.h[4]
217        SXTL    v4.8h, v4.8b
218        SMLAL   v18.4s, v5.4h, v2.h[4]
219        SMLAL2  v22.4s, v5.8h, v2.h[4]
220        SMLAL   v19.4s, v5.4h, v3.h[4]
221        SMLAL2  v23.4s, v5.8h, v3.h[4]
222        LDR     d5, [x5, 88]            // v5 = B k=5, columns 8-15
223        INS     v6.d[0], x11            // v6 = B k=5, columns 0-7
224        SMLAL   v24.4s, v4.4h, v0.h[4]
225        SMLAL2  v28.4s, v4.8h, v0.h[4]
226        SMLAL   v25.4s, v4.4h, v1.h[4]
227        SMLAL2  v29.4s, v4.8h, v1.h[4]
228        SXTL    v6.8h, v6.8b
229        SMLAL   v26.4s, v4.4h, v2.h[4]
230        SMLAL2  v30.4s, v4.8h, v2.h[4]
231        SMLAL   v27.4s, v4.4h, v3.h[4]
232        SMLAL2  v31.4s, v4.8h, v3.h[4]
233        LDR     x11, [x5, 96]           // B k=6, columns 0-7 (for v4)
234        SMLAL   v16.4s, v6.4h, v0.h[5]
235        SMLAL2  v20.4s, v6.8h, v0.h[5]
236        SMLAL   v17.4s, v6.4h, v1.h[5]
237        SMLAL2  v21.4s, v6.8h, v1.h[5]
238        SXTL    v5.8h, v5.8b
239        SMLAL   v18.4s, v6.4h, v2.h[5]
240        SMLAL2  v22.4s, v6.8h, v2.h[5]
241        SMLAL   v19.4s, v6.4h, v3.h[5]
242        SMLAL2  v23.4s, v6.8h, v3.h[5]
243        LDR     d6, [x5, 104]           // v6 = B k=6, columns 8-15
244        INS     v4.d[0], x11            // v4 = B k=6, columns 0-7
245        SMLAL   v24.4s, v5.4h, v0.h[5]
246        SMLAL2  v28.4s, v5.8h, v0.h[5]
247        SMLAL   v25.4s, v5.4h, v1.h[5]
248        SMLAL2  v29.4s, v5.8h, v1.h[5]
249        SXTL    v4.8h, v4.8b
250        SMLAL   v26.4s, v5.4h, v2.h[5]
251        SMLAL2  v30.4s, v5.8h, v2.h[5]
252        SMLAL   v27.4s, v5.4h, v3.h[5]
253        SMLAL2  v31.4s, v5.8h, v3.h[5]
254        SXTL    v6.8h, v6.8b
255        LDR     x11, [x5, 112]          // B k=7, columns 0-7 (for v4 again)
256        SMLAL   v16.4s, v4.4h, v0.h[6]
257        SMLAL2  v20.4s, v4.8h, v0.h[6]
258        SMLAL   v17.4s, v4.4h, v1.h[6]
259        SMLAL2  v21.4s, v4.8h, v1.h[6]
260        SMLAL   v18.4s, v4.4h, v2.h[6]
261        SMLAL2  v22.4s, v4.8h, v2.h[6]
262        SMLAL   v19.4s, v4.4h, v3.h[6]
263        SMLAL2  v23.4s, v4.8h, v3.h[6]
264        LDR     d5, [x5, 120]           // v5 = B k=7, columns 8-15
265        INS     v4.d[0], x11            // v4 = B k=7, columns 0-7
266        SMLAL   v24.4s, v6.4h, v0.h[6]
267        SMLAL2  v28.4s, v6.8h, v0.h[6]
268        SMLAL   v25.4s, v6.4h, v1.h[6]
269        SMLAL2  v29.4s, v6.8h, v1.h[6]
270        SXTL    v4.8h, v4.8b
271        ADD     x5, x5, 128             // w += 128; offsets below address the next group
272
273        SMLAL   v26.4s, v6.4h, v2.h[6]
274        SMLAL2  v30.4s, v6.8h, v2.h[6]
275        LDR     x11, [x5]               // next group: B k=0, columns 0-7 (for v4)
276        SMLAL   v27.4s, v6.4h, v3.h[6]
277        SMLAL2  v31.4s, v6.8h, v3.h[6]
278        SXTL    v5.8h, v5.8b
279        LDR     x21, [x13], 8           // next 8 bytes of A0 (for v0)
280
281        SMLAL   v16.4s, v4.4h, v0.h[7]
282        SMLAL2  v20.4s, v4.8h, v0.h[7]
283        SMLAL   v17.4s, v4.4h, v1.h[7]
284        SMLAL2  v21.4s, v4.8h, v1.h[7]
285        SMLAL   v18.4s, v4.4h, v2.h[7]
286        SMLAL2  v22.4s, v4.8h, v2.h[7]
287        SMLAL   v19.4s, v4.4h, v3.h[7]
288        SMLAL2  v23.4s, v4.8h, v3.h[7]
289        LDR     d6, [x5, 8]             // next group: v6 = B k=0, columns 8-15
290        INS     v4.d[0], x11            // next group: v4 = B k=0, columns 0-7
291        SMLAL   v24.4s, v5.4h, v0.h[7]
292        SMLAL2  v28.4s, v5.8h, v0.h[7]
293        LDR     x11, [x15], 8           // next 8 bytes of A2 (for v2)
294        SMLAL   v25.4s, v5.4h, v1.h[7]
295        SMLAL2  v29.4s, v5.8h, v1.h[7]
296        LDR     d1, [x14], 8            // next 8 bytes of A1; v1 has no further use above
297        INS     v0.d[0], x21            // v0 = next A0 bytes; v0.h[7] was last read above
298        SMLAL   v26.4s, v5.4h, v2.h[7]
299        SMLAL2  v30.4s, v5.8h, v2.h[7]
300        SMLAL   v27.4s, v5.4h, v3.h[7]
301        SMLAL2  v31.4s, v5.8h, v3.h[7]
302        LDR     d3, [x20], 8            // next 8 bytes of A3
303        INS     v2.d[0], x11            // v2 = next A2 bytes
304
305        SXTL    v0.8h, v0.8b
306        SXTL    v1.8h, v1.8b
307        LDR     x11, [x5, 16]           // next group: B k=1, columns 0-7 (for v5)
308        SXTL    v4.8h, v4.8b
309        SXTL    v2.8h, v2.8b
310        SUBS    x0, x0, 8               // k -= 8
311        SXTL    v3.8h, v3.8b
312        SXTL    v6.8h, v6.8b
313        B.HS    2b                      // repeat while the subtraction did not borrow
314
315        # Epilogue.  Same as main loop but no preloads in final group
316
317        .p2align 3
3183:
        // Label 3 - drains the software pipeline.  On entry v0-v3 hold the last
        // full 8 widened A values per row, v4/v6 hold widened B for k=0 and x11
        // holds the raw B bytes for k=1, columns 0-7.  This state comes from
        // either the prologue or the tail of the main loop.
        // The B register rotation is the same as in the main loop, but nothing
        // is loaded beyond this group of 128 B bytes and no A data is fetched.
        // Near the end x0 is set to kc & 7 and x11 becomes the params pointer
        // again.
319        SMLAL   v16.4s, v4.4h, v0.h[0]
320        SMLAL2  v20.4s, v4.8h, v0.h[0]
321        SMLAL   v17.4s, v4.4h, v1.h[0]
322        SMLAL2  v21.4s, v4.8h, v1.h[0]
323        SMLAL   v18.4s, v4.4h, v2.h[0]
324        SMLAL2  v22.4s, v4.8h, v2.h[0]
325        SMLAL   v19.4s, v4.4h, v3.h[0]
326        SMLAL2  v23.4s, v4.8h, v3.h[0]
327        LDR     d4, [x5, 24]            // v4 = B k=1, columns 8-15
328        INS     v5.d[0], x11            // v5 = B k=1, columns 0-7
329        SMLAL   v24.4s, v6.4h, v0.h[0]
330        SMLAL2  v28.4s, v6.8h, v0.h[0]
331        SMLAL   v25.4s, v6.4h, v1.h[0]
332        SMLAL2  v29.4s, v6.8h, v1.h[0]
333        SXTL    v5.8h, v5.8b
334        SMLAL   v26.4s, v6.4h, v2.h[0]
335        SMLAL2  v30.4s, v6.8h, v2.h[0]
336        SMLAL   v27.4s, v6.4h, v3.h[0]
337        SMLAL2  v31.4s, v6.8h, v3.h[0]
338        LDR     x11, [x5, 32]           // B k=2, columns 0-7 (for v6)
339        SMLAL   v16.4s, v5.4h, v0.h[1]
340        SMLAL2  v20.4s, v5.8h, v0.h[1]
341        SMLAL   v17.4s, v5.4h, v1.h[1]
342        SMLAL2  v21.4s, v5.8h, v1.h[1]
343        SXTL    v4.8h, v4.8b
344        SMLAL   v18.4s, v5.4h, v2.h[1]
345        SMLAL2  v22.4s, v5.8h, v2.h[1]
346        SMLAL   v19.4s, v5.4h, v3.h[1]
347        SMLAL2  v23.4s, v5.8h, v3.h[1]
348        LDR     d5, [x5, 40]            // v5 = B k=2, columns 8-15
349        INS     v6.d[0], x11            // v6 = B k=2, columns 0-7
350        SMLAL   v24.4s, v4.4h, v0.h[1]
351        SMLAL2  v28.4s, v4.8h, v0.h[1]
352        SMLAL   v25.4s, v4.4h, v1.h[1]
353        SMLAL2  v29.4s, v4.8h, v1.h[1]
354        SXTL    v6.8h, v6.8b
355        SMLAL   v26.4s, v4.4h, v2.h[1]
356        SMLAL2  v30.4s, v4.8h, v2.h[1]
357        SMLAL   v27.4s, v4.4h, v3.h[1]
358        SMLAL2  v31.4s, v4.8h, v3.h[1]
359        LDR     x11, [x5, 48]           // B k=3, columns 0-7 (for v4)
360        SMLAL   v16.4s, v6.4h, v0.h[2]
361        SMLAL2  v20.4s, v6.8h, v0.h[2]
362        SMLAL   v17.4s, v6.4h, v1.h[2]
363        SXTL    v5.8h, v5.8b
364        SMLAL2  v21.4s, v6.8h, v1.h[2]
365        SMLAL   v18.4s, v6.4h, v2.h[2]
366        SMLAL2  v22.4s, v6.8h, v2.h[2]
367        SMLAL   v19.4s, v6.4h, v3.h[2]
368        SMLAL2  v23.4s, v6.8h, v3.h[2]
369        LDR     d6, [x5, 56]            // v6 = B k=3, columns 8-15
370        INS     v4.d[0], x11            // v4 = B k=3, columns 0-7
371        SMLAL   v24.4s, v5.4h, v0.h[2]
372        SMLAL2  v28.4s, v5.8h, v0.h[2]
373        SMLAL   v25.4s, v5.4h, v1.h[2]
374        SMLAL2  v29.4s, v5.8h, v1.h[2]
375        SXTL    v4.8h, v4.8b
376        SMLAL   v26.4s, v5.4h, v2.h[2]
377        SMLAL2  v30.4s, v5.8h, v2.h[2]
378        SMLAL   v27.4s, v5.4h, v3.h[2]
379        SMLAL2  v31.4s, v5.8h, v3.h[2]
380        LDR     x11, [x5, 64]           // B k=4, columns 0-7 (for v5)
381        SMLAL   v16.4s, v4.4h, v0.h[3]
382        SMLAL2  v20.4s, v4.8h, v0.h[3]
383        SMLAL   v17.4s, v4.4h, v1.h[3]
384        SMLAL2  v21.4s, v4.8h, v1.h[3]
385        SXTL    v6.8h, v6.8b
386        SMLAL   v18.4s, v4.4h, v2.h[3]
387        SMLAL2  v22.4s, v4.8h, v2.h[3]
388        SMLAL   v19.4s, v4.4h, v3.h[3]
389        SMLAL2  v23.4s, v4.8h, v3.h[3]
390        LDR     d4, [x5, 72]            // v4 = B k=4, columns 8-15
391        INS     v5.d[0], x11            // v5 = B k=4, columns 0-7
392        SMLAL   v24.4s, v6.4h, v0.h[3]
393        SMLAL2  v28.4s, v6.8h, v0.h[3]
394        SXTL    v5.8h, v5.8b
395        SMLAL   v25.4s, v6.4h, v1.h[3]
396        SMLAL2  v29.4s, v6.8h, v1.h[3]
397        SMLAL   v26.4s, v6.4h, v2.h[3]
398        SMLAL2  v30.4s, v6.8h, v2.h[3]
399        SMLAL   v27.4s, v6.4h, v3.h[3]
400        SMLAL2  v31.4s, v6.8h, v3.h[3]
401        LDR     x11, [x5, 80]           // B k=5, columns 0-7 (for v6)
402        SMLAL   v16.4s, v5.4h, v0.h[4]
403        SMLAL2  v20.4s, v5.8h, v0.h[4]
404        SMLAL   v17.4s, v5.4h, v1.h[4]
405        SMLAL2  v21.4s, v5.8h, v1.h[4]
406        SXTL    v4.8h, v4.8b
407        SMLAL   v18.4s, v5.4h, v2.h[4]
408        SMLAL2  v22.4s, v5.8h, v2.h[4]
409        SMLAL   v19.4s, v5.4h, v3.h[4]
410        SMLAL2  v23.4s, v5.8h, v3.h[4]
411        LDR     d5, [x5, 88]            // v5 = B k=5, columns 8-15
412        INS     v6.d[0], x11            // v6 = B k=5, columns 0-7
413        SMLAL   v24.4s, v4.4h, v0.h[4]
414        SMLAL2  v28.4s, v4.8h, v0.h[4]
415        SMLAL   v25.4s, v4.4h, v1.h[4]
416        SMLAL2  v29.4s, v4.8h, v1.h[4]
417        SXTL    v6.8h, v6.8b
418        SMLAL   v26.4s, v4.4h, v2.h[4]
419        SMLAL2  v30.4s, v4.8h, v2.h[4]
420        SMLAL   v27.4s, v4.4h, v3.h[4]
421        SMLAL2  v31.4s, v4.8h, v3.h[4]
422        LDR     x11, [x5, 96]           // B k=6, columns 0-7 (for v4)
423        SMLAL   v16.4s, v6.4h, v0.h[5]
424        SMLAL2  v20.4s, v6.8h, v0.h[5]
425        SMLAL   v17.4s, v6.4h, v1.h[5]
426        SMLAL2  v21.4s, v6.8h, v1.h[5]
427        SXTL    v5.8h, v5.8b
428        SMLAL   v18.4s, v6.4h, v2.h[5]
429        SMLAL2  v22.4s, v6.8h, v2.h[5]
430        SMLAL   v19.4s, v6.4h, v3.h[5]
431        SMLAL2  v23.4s, v6.8h, v3.h[5]
432        LDR     d6, [x5, 104]           // v6 = B k=6, columns 8-15
433        INS     v4.d[0], x11            // v4 = B k=6, columns 0-7
434        SMLAL   v24.4s, v5.4h, v0.h[5]
435        SMLAL2  v28.4s, v5.8h, v0.h[5]
436        SMLAL   v25.4s, v5.4h, v1.h[5]
437        SMLAL2  v29.4s, v5.8h, v1.h[5]
438        SXTL    v4.8h, v4.8b
439        SMLAL   v26.4s, v5.4h, v2.h[5]
440        SMLAL2  v30.4s, v5.8h, v2.h[5]
441        SMLAL   v27.4s, v5.4h, v3.h[5]
442        SMLAL2  v31.4s, v5.8h, v3.h[5]
443        SXTL    v6.8h, v6.8b
444        SMLAL   v16.4s, v4.4h, v0.h[6]
445        SMLAL2  v20.4s, v4.8h, v0.h[6]
446        SMLAL   v17.4s, v4.4h, v1.h[6]
447        SMLAL2  v21.4s, v4.8h, v1.h[6]
448        SMLAL   v18.4s, v4.4h, v2.h[6]
449        SMLAL2  v22.4s, v4.8h, v2.h[6]
450        SMLAL   v19.4s, v4.4h, v3.h[6]
451        SMLAL2  v23.4s, v4.8h, v3.h[6]
452        LDR     x11, [x5, 112]          // B k=7, columns 0-7 (for v4 again)
453        SMLAL   v24.4s, v6.4h, v0.h[6]
454        SMLAL2  v28.4s, v6.8h, v0.h[6]
455        SMLAL   v25.4s, v6.4h, v1.h[6]
456        SMLAL2  v29.4s, v6.8h, v1.h[6]
457        LDR     d5, [x5, 120]           // v5 = B k=7, columns 8-15
458        INS     v4.d[0], x11            // v4 = B k=7, columns 0-7
459        SXTL    v4.8h, v4.8b
460        SMLAL   v26.4s, v6.4h, v2.h[6]
461        SMLAL2  v30.4s, v6.8h, v2.h[6]
462        SMLAL   v27.4s, v6.4h, v3.h[6]
463        SMLAL2  v31.4s, v6.8h, v3.h[6]
464        SMLAL   v16.4s, v4.4h, v0.h[7]
465        SMLAL2  v20.4s, v4.8h, v0.h[7]
466        SMLAL   v17.4s, v4.4h, v1.h[7]
467        SMLAL2  v21.4s, v4.8h, v1.h[7]
468        SXTL    v5.8h, v5.8b
469        SMLAL   v18.4s, v4.4h, v2.h[7]
470        SMLAL2  v22.4s, v4.8h, v2.h[7]
471        SMLAL   v19.4s, v4.4h, v3.h[7]
472        SMLAL2  v23.4s, v4.8h, v3.h[7]
473        ADD     x5, x5, 128             // w += 128 (the group just consumed)
474        SMLAL   v24.4s, v5.4h, v0.h[7]
475        SMLAL2  v28.4s, v5.8h, v0.h[7]
476        SMLAL   v25.4s, v5.4h, v1.h[7]
477        SMLAL2  v29.4s, v5.8h, v1.h[7]
478        AND     x0, x2, 7               // kc remainder 0 to 7
479        SMLAL   v26.4s, v5.4h, v2.h[7]
480        SMLAL2  v30.4s, v5.8h, v2.h[7]
481        LDR     x11, [sp, 40]            // reload params pointer: entry [sp + 24] plus the 16 bytes pushed
482        SMLAL   v27.4s, v5.4h, v3.h[7]
483        SMLAL2  v31.4s, v5.8h, v3.h[7]
484
485        # Is there a remainder?- 1 to 7 bytes of A
486        CBNZ    x0, 5f                  // kc & 7 != 0: process the leftover bytes
487
4884:
        // Label 4 - end of one ks iteration.  Reached by fall-through from the
        // epilogue or by the branches out of the remainder code.
        // While p (x9) stays above zero after subtracting the 32 bytes of
        // pointers just consumed, control returns to label 1.  Otherwise the
        // 4x16 accumulators are converted to int8:
        //   int32 -> float, multiply by four vectors of per-channel scales read
        //   from w, float -> int32 (FCVTNS), saturating narrow to 16 bits,
        //   saturating add of a 16-bit params value, saturating narrow to
        //   8 bits, clamp to [min, max] from params.
        // NOTE(review): the 16-bit value labelled "add bias" is presumably the
        // output zero point - confirm against the params structure.
489        # ks loop
490        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
491        B.HI    1b
492
493        SCVTF   v16.4s, v16.4s
494        SCVTF   v17.4s, v17.4s
495        # Load per channel scale values from weights
496        LDR     q4, [x5], 16            // scales for columns 0-3
497        SCVTF   v18.4s, v18.4s
498        SCVTF   v19.4s, v19.4s
499        LDR     q5, [x5], 16            // scales for columns 4-7
500        SCVTF   v20.4s, v20.4s
501        SCVTF   v21.4s, v21.4s
502        SCVTF   v22.4s, v22.4s
503        SCVTF   v23.4s, v23.4s
504        SCVTF   v24.4s, v24.4s
505        SCVTF   v25.4s, v25.4s
506        SCVTF   v26.4s, v26.4s
507        SCVTF   v27.4s, v27.4s
508        SCVTF   v28.4s, v28.4s
509        SCVTF   v29.4s, v29.4s
510        SCVTF   v30.4s, v30.4s
511        SCVTF   v31.4s, v31.4s
512
513        LDR     q6, [x5], 16            // scales for columns 8-11
514        FMUL    v16.4s, v16.4s, v4.4s
515        FMUL    v17.4s, v17.4s, v4.4s
516        FMUL    v18.4s, v18.4s, v4.4s
517        FMUL    v19.4s, v19.4s, v4.4s
518        FMUL    v20.4s, v20.4s, v5.4s
519        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 is free again)
520        FMUL    v21.4s, v21.4s, v5.4s
521        FMUL    v22.4s, v22.4s, v5.4s
522        FMUL    v23.4s, v23.4s, v5.4s
523        FMUL    v24.4s, v24.4s, v6.4s
524        FMUL    v25.4s, v25.4s, v6.4s
525        FMUL    v26.4s, v26.4s, v6.4s
526        FMUL    v27.4s, v27.4s, v6.4s
527        FMUL    v28.4s, v28.4s, v4.4s
528        FMUL    v29.4s, v29.4s, v4.4s
529        FMUL    v30.4s, v30.4s, v4.4s
530        FMUL    v31.4s, v31.4s, v4.4s
531
532        FCVTNS  v16.4s, v16.4s
533        FCVTNS  v17.4s, v17.4s
534        FCVTNS  v18.4s, v18.4s
535        FCVTNS  v19.4s, v19.4s
536        FCVTNS  v20.4s, v20.4s
537        FCVTNS  v21.4s, v21.4s
538        FCVTNS  v22.4s, v22.4s
539        FCVTNS  v23.4s, v23.4s
540        FCVTNS  v24.4s, v24.4s
541        FCVTNS  v25.4s, v25.4s
542        FCVTNS  v26.4s, v26.4s
543        FCVTNS  v27.4s, v27.4s
544        FCVTNS  v28.4s, v28.4s
545        FCVTNS  v29.4s, v29.4s
546        FCVTNS  v30.4s, v30.4s
547        FCVTNS  v31.4s, v31.4s
548
        // Narrow to 16 bits: v16-v19 end up with columns 0-7 of rows 0-3,
        // v24-v27 with columns 8-15 of rows 0-3.
549        SQXTN   v16.4h, v16.4s
550        SQXTN   v17.4h, v17.4s
551        SQXTN   v18.4h, v18.4s
552        SQXTN   v19.4h, v19.4s
553        SQXTN   v24.4h, v24.4s
554        SQXTN   v25.4h, v25.4s
555        SQXTN   v26.4h, v26.4s
556        SQXTN   v27.4h, v27.4s
557        LD1R    {v6.8h}, [x11], 2        // add bias: 16-bit params value broadcast to all lanes
558
559        SQXTN2  v16.8h, v20.4s
560        SQXTN2  v17.8h, v21.4s
561        SQXTN2  v18.8h, v22.4s
562        SQXTN2  v19.8h, v23.4s
563        SQXTN2  v24.8h, v28.4s
564        SQXTN2  v25.8h, v29.4s
565        SQXTN2  v26.8h, v30.4s
566        SQXTN2  v27.8h, v31.4s
567
568        SQADD   v16.8h, v16.8h, v6.8h
569        SQADD   v17.8h, v17.8h, v6.8h
570        SQADD   v18.8h, v18.8h, v6.8h
571        SQADD   v19.8h, v19.8h, v6.8h
572        SQADD   v24.8h, v24.8h, v6.8h
573        SQADD   v25.8h, v25.8h, v6.8h
574        SQADD   v26.8h, v26.8h, v6.8h
575        SQADD   v27.8h, v27.8h, v6.8h
576        LD1R    {v4.16b}, [x11], 1       // clamp min value
577
        // Narrow to 8 bits: v0-v3 hold the 16 output bytes of rows 0-3.
578        SQXTN   v0.8b, v16.8h
579        SQXTN   v1.8b, v17.8h
580        SQXTN   v2.8b, v18.8h
581        SQXTN   v3.8b, v19.8h
582        LD1R    {v5.16b}, [x11]          // clamp max value
583        SQXTN2  v0.16b, v24.8h
584        SQXTN2  v1.16b, v25.8h
585        SQXTN2  v2.16b, v26.8h
586        SQXTN2  v3.16b, v27.8h
587        SUB     x11, x11, 3             // rewind params pointer (undo the +2 and +1 above)
588
589        SMAX    v0.16b, v0.16b, v4.16b
590        SMAX    v1.16b, v1.16b, v4.16b
591        SMAX    v2.16b, v2.16b, v4.16b
592        SMAX    v3.16b, v3.16b, v4.16b
593        SUBS    x1, x1, 16              // nc -= 16; these flags drive B.LO and B.HI below
594        SMIN    v0.16b, v0.16b, v5.16b
595        SMIN    v1.16b, v1.16b, v5.16b
596        SMIN    v2.16b, v2.16b, v5.16b
597        SMIN    v3.16b, v3.16b, v5.16b
598        B.LO    6f                      // fewer than 16 columns were left: partial store
599
        // Rows are written from row 3 down to row 0 and every pointer advances
        // by cn_stride (x10).  Pointers aliased by the clamp at function entry
        // therefore receive the lowest-numbered row last.
600        # Store full 4 x 16
601        ST1     {v3.16b},  [x7], x10
602        ST1     {v2.16b}, [x17], x10
603        ST1     {v1.16b}, [x16], x10
604        ST1     {v0.16b},  [x6], x10
605
606        SUB     x4, x4, x3              // a -= ks
607
608        # nc loop
609        B.HI    0b                      // flags from SUBS x1 above (ST1 and SUB leave them intact)
610
611        # Restore x20-x21 from stack
612        LDP     x20, x21, [sp], 16
613        RET
614
615        # Remainder- 1 to 7 bytes of A
616        .p2align 3
6175:
        // Label 5 - handles the last kc & 7 bytes of each A row.  Entered
        // directly from label 1 when kc < 8, or from the end of the epilogue
        // when kc is not a multiple of 8.  x11 is not written here, so it still
        // holds the params pointer on both paths.
        // Each LD1 reads 8 bytes of A and advances the pointer by the remainder
        // (x0).  Lanes at or beyond the remainder are loaded but never used as
        // a multiplier, because the code branches to label 4 after the last
        // valid k-step.
        // Every k-step consumes 16 bytes of B: v4 = columns 0-7, v5 = columns 8-15.
        // Each CMP serves two branches: B.LO right after it and the B.EQ after
        // the following k-step (SMLAL, SXTL and LDP do not write flags).
618        AND     x0, x2, 7               // kc remainder 1 to 7
619
620        LD1     {v0.8b}, [x13], x0
621        LDP     d4, d5, [x5], 16
622        LD1     {v1.8b}, [x14], x0
623        LD1     {v2.8b}, [x15], x0
624        LD1     {v3.8b}, [x20], x0
625        SXTL    v0.8h, v0.8b
626        SXTL    v4.8h, v4.8b
627        SXTL    v5.8h, v5.8b
628        SXTL    v1.8h, v1.8b
629        SXTL    v2.8h, v2.8b
630        SXTL    v3.8h, v3.8b
631        SMLAL   v16.4s, v4.4h, v0.h[0]
632        SMLAL2  v20.4s, v4.8h, v0.h[0]
633        SMLAL   v24.4s, v5.4h, v0.h[0]
634        SMLAL2  v28.4s, v5.8h, v0.h[0]
635        SMLAL   v17.4s, v4.4h, v1.h[0]
636        SMLAL2  v21.4s, v4.8h, v1.h[0]
637        SMLAL   v25.4s, v5.4h, v1.h[0]
638        SMLAL2  v29.4s, v5.8h, v1.h[0]
639        SMLAL   v18.4s, v4.4h, v2.h[0]
640        SMLAL2  v22.4s, v4.8h, v2.h[0]
641        SMLAL   v26.4s, v5.4h, v2.h[0]
642        SMLAL2  v30.4s, v5.8h, v2.h[0]
643        SMLAL   v19.4s, v4.4h, v3.h[0]
644        SMLAL2  v23.4s, v4.8h, v3.h[0]
645        SMLAL   v27.4s, v5.4h, v3.h[0]
646        SMLAL2  v31.4s, v5.8h, v3.h[0]
647        CMP     x0, 2
648        B.LO    4b                      // remainder == 1
649
650        LDP     d4, d5, [x5], 16
651        SXTL    v4.8h, v4.8b
652        SXTL    v5.8h, v5.8b
653        SMLAL   v16.4s, v4.4h, v0.h[1]
654        SMLAL2  v20.4s, v4.8h, v0.h[1]
655        SMLAL   v24.4s, v5.4h, v0.h[1]
656        SMLAL2  v28.4s, v5.8h, v0.h[1]
657        SMLAL   v17.4s, v4.4h, v1.h[1]
658        SMLAL2  v21.4s, v4.8h, v1.h[1]
659        SMLAL   v25.4s, v5.4h, v1.h[1]
660        SMLAL2  v29.4s, v5.8h, v1.h[1]
661        SMLAL   v18.4s, v4.4h, v2.h[1]
662        SMLAL2  v22.4s, v4.8h, v2.h[1]
663        SMLAL   v26.4s, v5.4h, v2.h[1]
664        SMLAL2  v30.4s, v5.8h, v2.h[1]
665        SMLAL   v19.4s, v4.4h, v3.h[1]
666        SMLAL2  v23.4s, v4.8h, v3.h[1]
667        SMLAL   v27.4s, v5.4h, v3.h[1]
668        SMLAL2  v31.4s, v5.8h, v3.h[1]
669        B.EQ    4b                      // remainder == 2 (flags from CMP x0, 2)
670
671        LDP     d4, d5, [x5], 16
672        SXTL    v4.8h, v4.8b
673        SXTL    v5.8h, v5.8b
674        SMLAL   v16.4s, v4.4h, v0.h[2]
675        SMLAL2  v20.4s, v4.8h, v0.h[2]
676        SMLAL   v24.4s, v5.4h, v0.h[2]
677        SMLAL2  v28.4s, v5.8h, v0.h[2]
678        SMLAL   v17.4s, v4.4h, v1.h[2]
679        SMLAL2  v21.4s, v4.8h, v1.h[2]
680        SMLAL   v25.4s, v5.4h, v1.h[2]
681        SMLAL2  v29.4s, v5.8h, v1.h[2]
682        SMLAL   v18.4s, v4.4h, v2.h[2]
683        SMLAL2  v22.4s, v4.8h, v2.h[2]
684        SMLAL   v26.4s, v5.4h, v2.h[2]
685        SMLAL2  v30.4s, v5.8h, v2.h[2]
686        SMLAL   v19.4s, v4.4h, v3.h[2]
687        SMLAL2  v23.4s, v4.8h, v3.h[2]
688        SMLAL   v27.4s, v5.4h, v3.h[2]
689        SMLAL2  v31.4s, v5.8h, v3.h[2]
690        CMP     x0, 4
691        B.LO    4b                      // remainder == 3
692
693        LDP     d4, d5, [x5], 16
694        SXTL    v4.8h, v4.8b
695        SXTL    v5.8h, v5.8b
696        SMLAL   v16.4s, v4.4h, v0.h[3]
697        SMLAL2  v20.4s, v4.8h, v0.h[3]
698        SMLAL   v24.4s, v5.4h, v0.h[3]
699        SMLAL2  v28.4s, v5.8h, v0.h[3]
700        SMLAL   v17.4s, v4.4h, v1.h[3]
701        SMLAL2  v21.4s, v4.8h, v1.h[3]
702        SMLAL   v25.4s, v5.4h, v1.h[3]
703        SMLAL2  v29.4s, v5.8h, v1.h[3]
704        SMLAL   v18.4s, v4.4h, v2.h[3]
705        SMLAL2  v22.4s, v4.8h, v2.h[3]
706        SMLAL   v26.4s, v5.4h, v2.h[3]
707        SMLAL2  v30.4s, v5.8h, v2.h[3]
708        SMLAL   v19.4s, v4.4h, v3.h[3]
709        SMLAL2  v23.4s, v4.8h, v3.h[3]
710        SMLAL   v27.4s, v5.4h, v3.h[3]
711        SMLAL2  v31.4s, v5.8h, v3.h[3]
712        B.EQ    4b                      // remainder == 4 (flags from CMP x0, 4)
713
714        LDP     d4, d5, [x5], 16
715        SXTL    v4.8h, v4.8b
716        SXTL    v5.8h, v5.8b
717        SMLAL   v16.4s, v4.4h, v0.h[4]
718        SMLAL2  v20.4s, v4.8h, v0.h[4]
719        SMLAL   v24.4s, v5.4h, v0.h[4]
720        SMLAL2  v28.4s, v5.8h, v0.h[4]
721        SMLAL   v17.4s, v4.4h, v1.h[4]
722        SMLAL2  v21.4s, v4.8h, v1.h[4]
723        SMLAL   v25.4s, v5.4h, v1.h[4]
724        SMLAL2  v29.4s, v5.8h, v1.h[4]
725        SMLAL   v18.4s, v4.4h, v2.h[4]
726        SMLAL2  v22.4s, v4.8h, v2.h[4]
727        SMLAL   v26.4s, v5.4h, v2.h[4]
728        SMLAL2  v30.4s, v5.8h, v2.h[4]
729        SMLAL   v19.4s, v4.4h, v3.h[4]
730        SMLAL2  v23.4s, v4.8h, v3.h[4]
731        SMLAL   v27.4s, v5.4h, v3.h[4]
732        SMLAL2  v31.4s, v5.8h, v3.h[4]
733        CMP     x0, 6
734        B.LO    4b                      // remainder == 5
735
736        LDP     d4, d5, [x5], 16
737        SXTL    v4.8h, v4.8b
738        SXTL    v5.8h, v5.8b
739        SMLAL   v16.4s, v4.4h, v0.h[5]
740        SMLAL2  v20.4s, v4.8h, v0.h[5]
741        SMLAL   v24.4s, v5.4h, v0.h[5]
742        SMLAL2  v28.4s, v5.8h, v0.h[5]
743        SMLAL   v17.4s, v4.4h, v1.h[5]
744        SMLAL2  v21.4s, v4.8h, v1.h[5]
745        SMLAL   v25.4s, v5.4h, v1.h[5]
746        SMLAL2  v29.4s, v5.8h, v1.h[5]
747        SMLAL   v18.4s, v4.4h, v2.h[5]
748        SMLAL2  v22.4s, v4.8h, v2.h[5]
749        SMLAL   v26.4s, v5.4h, v2.h[5]
750        SMLAL2  v30.4s, v5.8h, v2.h[5]
751        SMLAL   v19.4s, v4.4h, v3.h[5]
752        SMLAL2  v23.4s, v4.8h, v3.h[5]
753        SMLAL   v27.4s, v5.4h, v3.h[5]
754        SMLAL2  v31.4s, v5.8h, v3.h[5]
755        B.EQ    4b                      // remainder == 6 (flags from CMP x0, 6)
756
757        LDP     d4, d5, [x5], 16
758        SXTL    v4.8h, v4.8b
759        SXTL    v5.8h, v5.8b
760        SMLAL   v16.4s, v4.4h, v0.h[6]
761        SMLAL2  v20.4s, v4.8h, v0.h[6]
762        SMLAL   v24.4s, v5.4h, v0.h[6]
763        SMLAL2  v28.4s, v5.8h, v0.h[6]
764        SMLAL   v17.4s, v4.4h, v1.h[6]
765        SMLAL2  v21.4s, v4.8h, v1.h[6]
766        SMLAL   v25.4s, v5.4h, v1.h[6]
767        SMLAL2  v29.4s, v5.8h, v1.h[6]
768        SMLAL   v18.4s, v4.4h, v2.h[6]
769        SMLAL2  v22.4s, v4.8h, v2.h[6]
770        SMLAL   v26.4s, v5.4h, v2.h[6]
771        SMLAL2  v30.4s, v5.8h, v2.h[6]
772        SMLAL   v19.4s, v4.4h, v3.h[6]
773        SMLAL2  v23.4s, v4.8h, v3.h[6]
774        SMLAL   v27.4s, v5.4h, v3.h[6]
775        SMLAL2  v31.4s, v5.8h, v3.h[6]
776        B       4b                      // remainder == 7
777
778        # Store odd width
779        .p2align 3
7806:
        // Label 6 - partial store for a final block of fewer than 16 columns.
        // x1 holds (remaining columns - 16) here.  Subtracting 16 does not
        // change the low four bits, so bits 3..0 of x1 are those of the
        // remaining column count.
        // For each set bit, 8 / 4 / 2 / 1 bytes are written per row from the
        // low end of v0-v3.  After the 8-, 4- and 2-byte stores the pointers
        // advance and DUP moves the next unwritten bytes down to lane 0.
        // Rows are again written from row 3 to row 0.  This path does not loop
        // back: it ends with the register restore and RET.
781        TBZ     x1, 3, 7f               // bit 3 clear: no 8-byte piece
782        STR     d3, [x7], 8
783        STR     d2, [x17], 8
784        DUP     d3, v3.d[1]
785        DUP     d2, v2.d[1]
786        STR     d1, [x16], 8
787        STR     d0, [x6], 8
788        DUP     d1, v1.d[1]
789        DUP     d0, v0.d[1]
7907:
791        TBZ     x1, 2, 8f               // bit 2 clear: no 4-byte piece
792        STR     s3, [x7], 4
793        STR     s2, [x17], 4
794        DUP     s3, v3.s[1]
795        DUP     s2, v2.s[1]
796        STR     s1, [x16], 4
797        STR     s0, [x6], 4
798        DUP     s1, v1.s[1]
799        DUP     s0, v0.s[1]
8008:
801        TBZ     x1, 1, 9f               // bit 1 clear: no 2-byte piece
802        STR     h3, [x7], 2
803        STR     h2, [x17], 2
804        DUP     h3, v3.h[1]
805        DUP     h2, v2.h[1]
806        STR     h1, [x16], 2
807        STR     h0, [x6], 2
808        DUP     h1, v1.h[1]
809        DUP     h0, v0.h[1]
8109:
811        TBZ     x1, 0, 10f              // bit 0 clear: no final single byte
812        STR     b3, [x7]
813        STR     b2, [x17]
814        STR     b1, [x16]
815        STR     b0, [x6]
81610:
817        # Restore x20-x21 from stack
818        LDP     x20, x21, [sp], 16
819        RET
820
821END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
822
823#ifdef __ELF__
824.section ".note.GNU-stack","",%progbits
825#endif
826