// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

# Overview of the code below
#   Label 0 (nc loop): seeds the 4x16 int32 accumulators v16-v31 from 16
#     int32 values read from w, then sets p = ks.
#   Label 1 (ks loop): loads 4 A row pointers; a pointer equal to zero is
#     used as is, any other pointer has a_offset added.
#   Labels 2 and 3: consume 8 bytes of each A row per pass. int8 A and B
#     bytes are sign-extended with SXTL and accumulated with SMLAL/SMLAL2
#     using one A lane per k step. Each k step uses 16 bytes of w.
#   Label 5: handles the last 1 to 7 bytes of kc one k step at a time.
#   After label 4: accumulators are converted to float, multiplied by 16
#     float values read from w (one per output column), rounded back to
#     int32, narrowed with saturation to 16 bits, offset by the 16-bit value
#     at params + 0, narrowed to 8 bits and clamped to the bytes at
#     params + 2 (min) and params + 3 (max).
#   Label 6: stores a partial tile of 1 to 15 columns and returns.
# x20 and x21 are callee-saved and are spilled to the stack; the 16 byte
# push moves the stack arguments up by 16 relative to sp.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL    x7,  x17, x7, LO        //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The same 16 values seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        # x0 (mr) is no longer needed and is reused as the k counter.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue
        # Preloads 8 bytes of each A row and the first 24 bytes of B.
        # B bytes 16-23 stay in x11 until the INS at the top of the loop.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        SXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        # B for the next k step is fetched while the current one is consumed.
        # Columns 0-7 arrive through LDR x11 followed by INS, columns 8-15
        # through LDR d, and both are widened with SXTL before use.
        # The last group also preloads the next 8 bytes of each A row:
        # a0 and a2 through x21/x11 followed by INS, a1 and a3 through LDR d.
        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x11, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w += 8 k steps * 16 bytes

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x21, [x13], 8           // next 8 bytes of a0, moved to v0 below

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // next 8 bytes of a2, moved to v2 below
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8
        INS     v0.d[0], x21
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8
        INS     v2.d[0], x11

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group

        .p2align 3
3:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w += 8 k steps * 16 bytes
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        // x11 was used as a load temp above. params is at [sp + 24] on entry,
        // which is [sp + 40] after the 16 byte push of x20-x21.
        LDR     x11, [sp, 40]            // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Requantize: int32 -> float, scale per column, round to int32
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15; v4 is free again
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Narrow int32 -> int16 with saturation.
        # v16-v19 end up holding columns 0-7 and v24-v27 columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2        // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1       // clamp min value

        # Narrow int16 -> int8 with saturation: v0-v3 = rows 0-3, 16 columns
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]          // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are used by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        # Each C pointer advances by cn_stride.
        ST1     {v3.16b},  [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b},  [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        # Each LD1 reads 8 bytes of an A row but post-increments by the
        # remainder only. Lanes at or beyond the remainder are never used.
        # The flags from each CMP are consumed by both the B.LO that follows
        # it and the B.EQ after the next k step.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7

        # Store odd width
        # Bits 3..0 of x1 select stores of 8, 4, 2 and 1 columns per row.
        # After each partial store the not yet stored columns are moved down
        # to element 0 with DUP.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
