1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t** restrict a, x4
19#     const uint8_t* restrict w,  x5
20#     uint8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const uint8_t* zero,                [sp + 16] -> x12
25#     const xnn_qu8_conv_minmax_params params [sp + 24] -> (x11)
26
27# params structure is 20 bytes
28#  struct {
29#    uint8_t kernel_zero_point[4];
30#    int32_t right_pre_shift;
31#    int32_t multiplier;
32#    int32_t right_post_shift;
33#    int16_t output_zero_point;
34#    uint8_t output_min;
35#    uint8_t output_max;
36#  } rndnu_neon;
37#
38# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
39
40# Register usage
41# A0  x13  v0
42# A1  x14  v1
43# A2  x15  v2
44# A3  x20  v3
45# B    x5  v4  v5  v6
46# C0   x6 v16 v20 v24 v28
47# C1  x16 v17 v21 v25 v29
48# C2  x17 v18 v22 v26 v30
49# C3   x7 v19 v23 v27 v31
50# zero_point v7
51# unused  v8 v9 v10 v11 v12 v13 v14 v15
52# x11, x21 temp for Cortex-A53 loads
53
54BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
55
        # Control flow overview (numeric local labels):
        #   label 0  - nc loop: start a block of up to 16 output columns and
        #              seed all accumulators from the 64 bytes at w.
        #   label 1  - ks loop: fetch the next 4 A row pointers.
        #   label 2  - main loop: consume 8 bytes of every A row per pass while
        #              preloading the A and B data of the following pass.
        #   label 3  - epilogue: the last full group of 8 bytes, no A preloads.
        #   label 4  - ks loop tail, then requantize, clamp and store.
        #   label 5  - remainder: the final 1 to 7 bytes of every A row.
        #   labels 6 to 10 - partial store when fewer than 16 columns remain.
56        # Clamp C pointers
        # When mr is below 4, the pointer of each missing row is set equal to
        # the pointer of the previous row. The stack arguments are loaded in
        # between the compares and selects.
57        CMP     x0, 2                   // if mr < 2
58        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
59        ADD     x16, x6, x7             // c1 = c0 + cm_stride
60        CSEL    x16, x6,  x16, LO       //   c1 = c0
61
62        ADD     x17, x16, x7            // c2 = c1 + cm_stride
63        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
64                                        // if mr <= 2
65        CSEL    x17, x16, x17, LS       //   c2 = c1
66
        # The push below moves sp down by 16, so the params pointer that was
        # at [sp + 24] on entry is found at [sp + 40] from here on.
67        CMP     x0, 4                   // if mr < 4
68        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
69        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
70        CSEL    x7,  x17, x7, LO        //   c3 = c2
71
        # Replicate the first 32-bit word of params (kernel_zero_point[4]) into
        # every lane of v7. x11 is not advanced here; label 0 adds 4 instead.
72        LD1R    {v7.4s}, [x11]          // kernel_zero_point
73
74        .p2align 3
750:
76        # Load initial bias from w into accumulators
        # The four loaded vectors are copied so that all 4 rows start from the
        # same values. The copies are interleaved with the second load.
77        LDP     q16, q20, [x5], 32
78        MOV     v17.16b, v16.16b
79        MOV     v18.16b, v16.16b
80        LDP     q24, q28, [x5], 32
81        MOV     v19.16b, v16.16b
82        MOV     v21.16b, v20.16b
        # x11 = params + 4, the first field after kernel_zero_point. This runs
        # once per nc iteration and is undone by the rewind before the stores.
83        ADD     x11, x11, 4              // adjust params pointer
84        MOV     v22.16b, v20.16b
85        MOV     v23.16b, v20.16b
86        MOV     v25.16b, v24.16b
87        MOV     v26.16b, v24.16b
88        MOV     v27.16b, v24.16b
89        MOV     v29.16b, v28.16b
90        MOV     v30.16b, v28.16b
91        MOV     v31.16b, v28.16b
92        MOV     x9, x3                  // p = ks
93
94        .p2align 3
951:
96        # Load next 4 A pointers
97        LDP     x13, x14, [x4], 16
98        LDP     x15, x20, [x4], 16
99
        # A pointer equal to the zero buffer (x12) is used unchanged; every
        # other pointer gets a_offset (x8) added.
100        CMP     x13, x12                // if a0 == zero
101        ADD     x13, x13, x8            // a0 += a_offset
102        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
103        CMP     x14, x12                // if a1 == zero
104        ADD     x14, x14, x8            // a1 += a_offset
105        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
106        CMP     x15, x12                // if a2 == zero
107        ADD     x15, x15, x8            // a2 += a_offset
108        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
109        CMP     x20, x12                // if a3 == zero
110        ADD     x20, x20, x8            // a3 += a_offset
111        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
112
113        # Is there at least 8 bytes for epilogue?
        # With kc < 8 the unrolled code is skipped and label 5 handles all of kc.
114        SUBS    x0, x2, 8               // k = kc - 8
115        B.LO    5f
116
117        # Prologue
        # Load the first 8 bytes of each A row and the first 24 bytes of B.
        # x11 is reused as a scratch register for B from here on; the params
        # pointer it held is reloaded from the stack at the end of the epilogue.
        # UXTL widens A to 16 bits, USUBL widens B after subtracting v7.
118        LDR     d0, [x13], 8
119        LDP     d4, d6, [x5]
120        LDR     d1, [x14], 8
121        LDR     d2, [x15], 8
122        LDR     d3, [x20], 8
123        UXTL    v0.8h, v0.8b
124        LDR     x11, [x5, 16]
125        USUBL   v4.8h, v4.8b, v7.8b
126        UXTL    v1.8h, v1.8b
127        UXTL    v2.8h, v2.8b
128        UXTL    v3.8h, v3.8b
129        USUBL   v6.8h, v6.8b, v7.8b
130
131        SUBS    x0, x0, 8               // k = k - 8
132        # Is there at least 8 bytes for main loop?
133        B.LO    3f
134
135        # Main loop - 8 bytes of A
        # Each of the 8 steps uses 16 bytes of B. The first 8 bytes feed
        # v16-v19 (SMLAL) and v20-v23 (SMLAL2), the second 8 bytes feed
        # v24-v27 and v28-v31, all multiplied by lane k of v0-v3 (rows 0-3).
        # Inside the loop the first 8 bytes of a step are loaded into x11 and
        # moved over with INS, the second 8 bytes are loaded with LDR d. Both
        # are widened by USUBL before use. v4, v5 and v6 rotate as B holders.
        # NOTE(review): the split LDR x / INS form is presumably a Cortex-A53
        # scheduling choice (see the register usage note above) - confirm.
136        .p2align 3
1372:
138        SMLAL   v16.4s, v4.4h, v0.h[0]
139        SMLAL2  v20.4s, v4.8h, v0.h[0]
140        SMLAL   v17.4s, v4.4h, v1.h[0]
141        SMLAL2  v21.4s, v4.8h, v1.h[0]
142        SMLAL   v18.4s, v4.4h, v2.h[0]
143        SMLAL2  v22.4s, v4.8h, v2.h[0]
144        SMLAL   v19.4s, v4.4h, v3.h[0]
145        SMLAL2  v23.4s, v4.8h, v3.h[0]
146        LDR     d4, [x5, 24]
147        INS     v5.d[0], x11
148        SMLAL   v24.4s, v6.4h, v0.h[0]
149        SMLAL2  v28.4s, v6.8h, v0.h[0]
150        SMLAL   v25.4s, v6.4h, v1.h[0]
151        SMLAL2  v29.4s, v6.8h, v1.h[0]
152        USUBL   v5.8h, v5.8b, v7.8b
153        SMLAL   v26.4s, v6.4h, v2.h[0]
154        SMLAL2  v30.4s, v6.8h, v2.h[0]
155        SMLAL   v27.4s, v6.4h, v3.h[0]
156        SMLAL2  v31.4s, v6.8h, v3.h[0]
157        LDR     x11, [x5, 32]
158        SMLAL   v16.4s, v5.4h, v0.h[1]
159        SMLAL2  v20.4s, v5.8h, v0.h[1]
160        SMLAL   v17.4s, v5.4h, v1.h[1]
161        SMLAL2  v21.4s, v5.8h, v1.h[1]
162        USUBL   v4.8h, v4.8b, v7.8b
163        SMLAL   v18.4s, v5.4h, v2.h[1]
164        SMLAL2  v22.4s, v5.8h, v2.h[1]
165        SMLAL   v19.4s, v5.4h, v3.h[1]
166        SMLAL2  v23.4s, v5.8h, v3.h[1]
167        LDR     d5, [x5, 40]
168        INS     v6.d[0], x11
169        SMLAL   v24.4s, v4.4h, v0.h[1]
170        SMLAL2  v28.4s, v4.8h, v0.h[1]
171        SMLAL   v25.4s, v4.4h, v1.h[1]
172        SMLAL2  v29.4s, v4.8h, v1.h[1]
173        USUBL   v6.8h, v6.8b, v7.8b
174        SMLAL   v26.4s, v4.4h, v2.h[1]
175        SMLAL2  v30.4s, v4.8h, v2.h[1]
176        SMLAL   v27.4s, v4.4h, v3.h[1]
177        SMLAL2  v31.4s, v4.8h, v3.h[1]
178        LDR     x11, [x5, 48]
179        SMLAL   v16.4s, v6.4h, v0.h[2]
180        SMLAL2  v20.4s, v6.8h, v0.h[2]
181        SMLAL   v17.4s, v6.4h, v1.h[2]
182        USUBL   v5.8h, v5.8b, v7.8b
183        SMLAL2  v21.4s, v6.8h, v1.h[2]
184        SMLAL   v18.4s, v6.4h, v2.h[2]
185        SMLAL2  v22.4s, v6.8h, v2.h[2]
186        SMLAL   v19.4s, v6.4h, v3.h[2]
187        SMLAL2  v23.4s, v6.8h, v3.h[2]
188        LDR     d6, [x5, 56]
189        INS     v4.d[0], x11
190        SMLAL   v24.4s, v5.4h, v0.h[2]
191        SMLAL2  v28.4s, v5.8h, v0.h[2]
192        SMLAL   v25.4s, v5.4h, v1.h[2]
193        SMLAL2  v29.4s, v5.8h, v1.h[2]
194        USUBL   v4.8h, v4.8b, v7.8b
195        SMLAL   v26.4s, v5.4h, v2.h[2]
196        SMLAL2  v30.4s, v5.8h, v2.h[2]
197        SMLAL   v27.4s, v5.4h, v3.h[2]
198        SMLAL2  v31.4s, v5.8h, v3.h[2]
199        LDR     x11, [x5, 64]
200        SMLAL   v16.4s, v4.4h, v0.h[3]
201        SMLAL2  v20.4s, v4.8h, v0.h[3]
202        SMLAL   v17.4s, v4.4h, v1.h[3]
203        SMLAL2  v21.4s, v4.8h, v1.h[3]
204        USUBL   v6.8h, v6.8b, v7.8b
205        SMLAL   v18.4s, v4.4h, v2.h[3]
206        SMLAL2  v22.4s, v4.8h, v2.h[3]
207        SMLAL   v19.4s, v4.4h, v3.h[3]
208        SMLAL2  v23.4s, v4.8h, v3.h[3]
209        LDR     d4, [x5, 72]
210        INS     v5.d[0], x11
211        SMLAL   v24.4s, v6.4h, v0.h[3]
212        SMLAL2  v28.4s, v6.8h, v0.h[3]
213        USUBL   v5.8h, v5.8b, v7.8b
214        SMLAL   v25.4s, v6.4h, v1.h[3]
215        SMLAL2  v29.4s, v6.8h, v1.h[3]
216        SMLAL   v26.4s, v6.4h, v2.h[3]
217        SMLAL2  v30.4s, v6.8h, v2.h[3]
218        SMLAL   v27.4s, v6.4h, v3.h[3]
219        SMLAL2  v31.4s, v6.8h, v3.h[3]
220        LDR     x11, [x5, 80]
221        SMLAL   v16.4s, v5.4h, v0.h[4]
222        SMLAL2  v20.4s, v5.8h, v0.h[4]
223        SMLAL   v17.4s, v5.4h, v1.h[4]
224        SMLAL2  v21.4s, v5.8h, v1.h[4]
225        USUBL   v4.8h, v4.8b, v7.8b
226        SMLAL   v18.4s, v5.4h, v2.h[4]
227        SMLAL2  v22.4s, v5.8h, v2.h[4]
228        SMLAL   v19.4s, v5.4h, v3.h[4]
229        SMLAL2  v23.4s, v5.8h, v3.h[4]
230        LDR     d5, [x5, 88]
231        INS     v6.d[0], x11
232        SMLAL   v24.4s, v4.4h, v0.h[4]
233        SMLAL2  v28.4s, v4.8h, v0.h[4]
234        SMLAL   v25.4s, v4.4h, v1.h[4]
235        SMLAL2  v29.4s, v4.8h, v1.h[4]
236        USUBL   v6.8h, v6.8b, v7.8b
237        SMLAL   v26.4s, v4.4h, v2.h[4]
238        SMLAL2  v30.4s, v4.8h, v2.h[4]
239        SMLAL   v27.4s, v4.4h, v3.h[4]
240        SMLAL2  v31.4s, v4.8h, v3.h[4]
241        LDR     x11, [x5, 96]
242        SMLAL   v16.4s, v6.4h, v0.h[5]
243        SMLAL2  v20.4s, v6.8h, v0.h[5]
244        SMLAL   v17.4s, v6.4h, v1.h[5]
245        SMLAL2  v21.4s, v6.8h, v1.h[5]
246        USUBL   v5.8h, v5.8b, v7.8b
247        SMLAL   v18.4s, v6.4h, v2.h[5]
248        SMLAL2  v22.4s, v6.8h, v2.h[5]
249        SMLAL   v19.4s, v6.4h, v3.h[5]
250        SMLAL2  v23.4s, v6.8h, v3.h[5]
251        LDR     d6, [x5, 104]
252        INS     v4.d[0], x11
253        SMLAL   v24.4s, v5.4h, v0.h[5]
254        SMLAL2  v28.4s, v5.8h, v0.h[5]
255        SMLAL   v25.4s, v5.4h, v1.h[5]
256        SMLAL2  v29.4s, v5.8h, v1.h[5]
257        USUBL   v4.8h, v4.8b, v7.8b
258        SMLAL   v26.4s, v5.4h, v2.h[5]
259        SMLAL2  v30.4s, v5.8h, v2.h[5]
260        SMLAL   v27.4s, v5.4h, v3.h[5]
261        SMLAL2  v31.4s, v5.8h, v3.h[5]
262        USUBL   v6.8h, v6.8b, v7.8b
263        LDR     x11, [x5, 112]
264        SMLAL   v16.4s, v4.4h, v0.h[6]
265        SMLAL2  v20.4s, v4.8h, v0.h[6]
266        SMLAL   v17.4s, v4.4h, v1.h[6]
267        SMLAL2  v21.4s, v4.8h, v1.h[6]
268        SMLAL   v18.4s, v4.4h, v2.h[6]
269        SMLAL2  v22.4s, v4.8h, v2.h[6]
270        SMLAL   v19.4s, v4.4h, v3.h[6]
271        SMLAL2  v23.4s, v4.8h, v3.h[6]
272        LDR     d5, [x5, 120]
273        INS     v4.d[0], x11
274        SMLAL   v24.4s, v6.4h, v0.h[6]
275        SMLAL2  v28.4s, v6.8h, v0.h[6]
276        SMLAL   v25.4s, v6.4h, v1.h[6]
277        SMLAL2  v29.4s, v6.8h, v1.h[6]
278        USUBL   v4.8h, v4.8b, v7.8b
        # B advances by 8 steps of 16 bytes. The offsets used after this point
        # (0, 8 and 16) address the B data of the next group of 8 bytes.
279        ADD     x5, x5, 128
280
281        SMLAL   v26.4s, v6.4h, v2.h[6]
282        SMLAL2  v30.4s, v6.8h, v2.h[6]
283        LDR     x11, [x5]
284        SMLAL   v27.4s, v6.4h, v3.h[6]
285        SMLAL2  v31.4s, v6.8h, v3.h[6]
286        USUBL   v5.8h, v5.8b, v7.8b
        # Preload the next 8 bytes of A: rows 0 and 2 travel through x21 and
        # x11 and are inserted once the last use of v0 / v2 has been issued,
        # rows 1 and 3 are loaded straight into d1 / d3 after their last use.
287        LDR     x21, [x13], 8
288
289        SMLAL   v16.4s, v4.4h, v0.h[7]
290        SMLAL2  v20.4s, v4.8h, v0.h[7]
291        SMLAL   v17.4s, v4.4h, v1.h[7]
292        SMLAL2  v21.4s, v4.8h, v1.h[7]
293        SMLAL   v18.4s, v4.4h, v2.h[7]
294        SMLAL2  v22.4s, v4.8h, v2.h[7]
295        SMLAL   v19.4s, v4.4h, v3.h[7]
296        SMLAL2  v23.4s, v4.8h, v3.h[7]
297        LDR     d6, [x5, 8]
298        INS     v4.d[0], x11
299        SMLAL   v24.4s, v5.4h, v0.h[7]
300        SMLAL2  v28.4s, v5.8h, v0.h[7]
301        LDR     x11, [x15], 8
302        SMLAL   v25.4s, v5.4h, v1.h[7]
303        SMLAL2  v29.4s, v5.8h, v1.h[7]
304        LDR     d1, [x14], 8
305        INS     v0.d[0], x21
306        SMLAL   v26.4s, v5.4h, v2.h[7]
307        SMLAL2  v30.4s, v5.8h, v2.h[7]
308        SMLAL   v27.4s, v5.4h, v3.h[7]
309        SMLAL2  v31.4s, v5.8h, v3.h[7]
310        LDR     d3, [x20], 8
311        INS     v2.d[0], x11
312
        # Widen the preloaded data exactly as the prologue does, then repeat
        # while the subtraction below does not borrow.
313        UXTL    v0.8h, v0.8b
314        UXTL    v1.8h, v1.8b
315        LDR     x11, [x5, 16]
316        USUBL   v4.8h, v4.8b, v7.8b
317        UXTL    v2.8h, v2.8b
318        SUBS    x0, x0, 8
319        UXTL    v3.8h, v3.8b
320        USUBL   v6.8h, v6.8b, v7.8b
321        B.HS    2b
322
323        # Epilogue.  Same as main loop but no preloads in final group
324
325        .p2align 3
3263:
327        SMLAL   v16.4s, v4.4h, v0.h[0]
328        SMLAL2  v20.4s, v4.8h, v0.h[0]
329        SMLAL   v17.4s, v4.4h, v1.h[0]
330        SMLAL2  v21.4s, v4.8h, v1.h[0]
331        SMLAL   v18.4s, v4.4h, v2.h[0]
332        SMLAL2  v22.4s, v4.8h, v2.h[0]
333        SMLAL   v19.4s, v4.4h, v3.h[0]
334        SMLAL2  v23.4s, v4.8h, v3.h[0]
335        LDR     d4, [x5, 24]
336        INS     v5.d[0], x11
337        SMLAL   v24.4s, v6.4h, v0.h[0]
338        SMLAL2  v28.4s, v6.8h, v0.h[0]
339        SMLAL   v25.4s, v6.4h, v1.h[0]
340        SMLAL2  v29.4s, v6.8h, v1.h[0]
341        USUBL   v5.8h, v5.8b, v7.8b
342        SMLAL   v26.4s, v6.4h, v2.h[0]
343        SMLAL2  v30.4s, v6.8h, v2.h[0]
344        SMLAL   v27.4s, v6.4h, v3.h[0]
345        SMLAL2  v31.4s, v6.8h, v3.h[0]
346        LDR     x11, [x5, 32]
347        SMLAL   v16.4s, v5.4h, v0.h[1]
348        SMLAL2  v20.4s, v5.8h, v0.h[1]
349        SMLAL   v17.4s, v5.4h, v1.h[1]
350        SMLAL2  v21.4s, v5.8h, v1.h[1]
351        USUBL   v4.8h, v4.8b, v7.8b
352        SMLAL   v18.4s, v5.4h, v2.h[1]
353        SMLAL2  v22.4s, v5.8h, v2.h[1]
354        SMLAL   v19.4s, v5.4h, v3.h[1]
355        SMLAL2  v23.4s, v5.8h, v3.h[1]
356        LDR     d5, [x5, 40]
357        INS     v6.d[0], x11
358        SMLAL   v24.4s, v4.4h, v0.h[1]
359        SMLAL2  v28.4s, v4.8h, v0.h[1]
360        SMLAL   v25.4s, v4.4h, v1.h[1]
361        SMLAL2  v29.4s, v4.8h, v1.h[1]
362        USUBL   v6.8h, v6.8b, v7.8b
363        SMLAL   v26.4s, v4.4h, v2.h[1]
364        SMLAL2  v30.4s, v4.8h, v2.h[1]
365        SMLAL   v27.4s, v4.4h, v3.h[1]
366        SMLAL2  v31.4s, v4.8h, v3.h[1]
367        LDR     x11, [x5, 48]
368        SMLAL   v16.4s, v6.4h, v0.h[2]
369        SMLAL2  v20.4s, v6.8h, v0.h[2]
370        SMLAL   v17.4s, v6.4h, v1.h[2]
371        USUBL   v5.8h, v5.8b, v7.8b
372        SMLAL2  v21.4s, v6.8h, v1.h[2]
373        SMLAL   v18.4s, v6.4h, v2.h[2]
374        SMLAL2  v22.4s, v6.8h, v2.h[2]
375        SMLAL   v19.4s, v6.4h, v3.h[2]
376        SMLAL2  v23.4s, v6.8h, v3.h[2]
377        LDR     d6, [x5, 56]
378        INS     v4.d[0], x11
379        SMLAL   v24.4s, v5.4h, v0.h[2]
380        SMLAL2  v28.4s, v5.8h, v0.h[2]
381        SMLAL   v25.4s, v5.4h, v1.h[2]
382        SMLAL2  v29.4s, v5.8h, v1.h[2]
383        USUBL   v4.8h, v4.8b, v7.8b
384        SMLAL   v26.4s, v5.4h, v2.h[2]
385        SMLAL2  v30.4s, v5.8h, v2.h[2]
386        SMLAL   v27.4s, v5.4h, v3.h[2]
387        SMLAL2  v31.4s, v5.8h, v3.h[2]
388        LDR     x11, [x5, 64]
389        SMLAL   v16.4s, v4.4h, v0.h[3]
390        SMLAL2  v20.4s, v4.8h, v0.h[3]
391        SMLAL   v17.4s, v4.4h, v1.h[3]
392        SMLAL2  v21.4s, v4.8h, v1.h[3]
393        USUBL   v6.8h, v6.8b, v7.8b
394        SMLAL   v18.4s, v4.4h, v2.h[3]
395        SMLAL2  v22.4s, v4.8h, v2.h[3]
396        SMLAL   v19.4s, v4.4h, v3.h[3]
397        SMLAL2  v23.4s, v4.8h, v3.h[3]
398        LDR     d4, [x5, 72]
399        INS     v5.d[0], x11
400        SMLAL   v24.4s, v6.4h, v0.h[3]
401        SMLAL2  v28.4s, v6.8h, v0.h[3]
402        USUBL   v5.8h, v5.8b, v7.8b
403        SMLAL   v25.4s, v6.4h, v1.h[3]
404        SMLAL2  v29.4s, v6.8h, v1.h[3]
405        SMLAL   v26.4s, v6.4h, v2.h[3]
406        SMLAL2  v30.4s, v6.8h, v2.h[3]
407        SMLAL   v27.4s, v6.4h, v3.h[3]
408        SMLAL2  v31.4s, v6.8h, v3.h[3]
409        LDR     x11, [x5, 80]
410        SMLAL   v16.4s, v5.4h, v0.h[4]
411        SMLAL2  v20.4s, v5.8h, v0.h[4]
412        SMLAL   v17.4s, v5.4h, v1.h[4]
413        SMLAL2  v21.4s, v5.8h, v1.h[4]
414        USUBL   v4.8h, v4.8b, v7.8b
415        SMLAL   v18.4s, v5.4h, v2.h[4]
416        SMLAL2  v22.4s, v5.8h, v2.h[4]
417        SMLAL   v19.4s, v5.4h, v3.h[4]
418        SMLAL2  v23.4s, v5.8h, v3.h[4]
419        LDR     d5, [x5, 88]
420        INS     v6.d[0], x11
421        SMLAL   v24.4s, v4.4h, v0.h[4]
422        SMLAL2  v28.4s, v4.8h, v0.h[4]
423        SMLAL   v25.4s, v4.4h, v1.h[4]
424        SMLAL2  v29.4s, v4.8h, v1.h[4]
425        USUBL   v6.8h, v6.8b, v7.8b
426        SMLAL   v26.4s, v4.4h, v2.h[4]
427        SMLAL2  v30.4s, v4.8h, v2.h[4]
428        SMLAL   v27.4s, v4.4h, v3.h[4]
429        SMLAL2  v31.4s, v4.8h, v3.h[4]
430        LDR     x11, [x5, 96]
431        SMLAL   v16.4s, v6.4h, v0.h[5]
432        SMLAL2  v20.4s, v6.8h, v0.h[5]
433        SMLAL   v17.4s, v6.4h, v1.h[5]
434        SMLAL2  v21.4s, v6.8h, v1.h[5]
435        USUBL   v5.8h, v5.8b, v7.8b
436        SMLAL   v18.4s, v6.4h, v2.h[5]
437        SMLAL2  v22.4s, v6.8h, v2.h[5]
438        SMLAL   v19.4s, v6.4h, v3.h[5]
439        SMLAL2  v23.4s, v6.8h, v3.h[5]
440        LDR     d6, [x5, 104]
441        INS     v4.d[0], x11
442        SMLAL   v24.4s, v5.4h, v0.h[5]
443        SMLAL2  v28.4s, v5.8h, v0.h[5]
444        SMLAL   v25.4s, v5.4h, v1.h[5]
445        SMLAL2  v29.4s, v5.8h, v1.h[5]
446        USUBL   v4.8h, v4.8b, v7.8b
447        SMLAL   v26.4s, v5.4h, v2.h[5]
448        SMLAL2  v30.4s, v5.8h, v2.h[5]
449        SMLAL   v27.4s, v5.4h, v3.h[5]
450        SMLAL2  v31.4s, v5.8h, v3.h[5]
451        USUBL   v6.8h, v6.8b, v7.8b
452        SMLAL   v16.4s, v4.4h, v0.h[6]
453        SMLAL2  v20.4s, v4.8h, v0.h[6]
454        SMLAL   v17.4s, v4.4h, v1.h[6]
455        SMLAL2  v21.4s, v4.8h, v1.h[6]
456        SMLAL   v18.4s, v4.4h, v2.h[6]
457        SMLAL2  v22.4s, v4.8h, v2.h[6]
458        SMLAL   v19.4s, v4.4h, v3.h[6]
459        SMLAL2  v23.4s, v4.8h, v3.h[6]
460        LDR     x11, [x5, 112]
461        SMLAL   v24.4s, v6.4h, v0.h[6]
462        SMLAL2  v28.4s, v6.8h, v0.h[6]
463        SMLAL   v25.4s, v6.4h, v1.h[6]
464        SMLAL2  v29.4s, v6.8h, v1.h[6]
465        LDR     d5, [x5, 120]
466        INS     v4.d[0], x11
467        USUBL   v4.8h, v4.8b, v7.8b
468        SMLAL   v26.4s, v6.4h, v2.h[6]
469        SMLAL2  v30.4s, v6.8h, v2.h[6]
470        SMLAL   v27.4s, v6.4h, v3.h[6]
471        SMLAL2  v31.4s, v6.8h, v3.h[6]
472        SMLAL   v16.4s, v4.4h, v0.h[7]
473        SMLAL2  v20.4s, v4.8h, v0.h[7]
474        SMLAL   v17.4s, v4.4h, v1.h[7]
475        SMLAL2  v21.4s, v4.8h, v1.h[7]
476        USUBL   v5.8h, v5.8b, v7.8b
477        SMLAL   v18.4s, v4.4h, v2.h[7]
478        SMLAL2  v22.4s, v4.8h, v2.h[7]
479        SMLAL   v19.4s, v4.4h, v3.h[7]
480        SMLAL2  v23.4s, v4.8h, v3.h[7]
481        ADD     x5, x5, 128
482        SMLAL   v24.4s, v5.4h, v0.h[7]
483        SMLAL2  v28.4s, v5.8h, v0.h[7]
484        SMLAL   v25.4s, v5.4h, v1.h[7]
485        SMLAL2  v29.4s, v5.8h, v1.h[7]
486        AND     x0, x2, 7               // kc remainder 0 to 7
487        SMLAL   v26.4s, v5.4h, v2.h[7]
488        SMLAL2  v30.4s, v5.8h, v2.h[7]
        # x11 was used for B data above, so the params pointer is fetched again
        # from its stack slot (entry [sp + 24], now [sp + 40] after the push).
489        LDR     x11, [sp, 40]            // reload params pointer
490        SMLAL   v27.4s, v5.4h, v3.h[7]
491        SMLAL2  v31.4s, v5.8h, v3.h[7]
        # Skip kernel_zero_point again, giving the same x11 = params + 4 that
        # label 0 set up before x11 was overwritten.
492        ADD     x11, x11, 4
493
494        # Is there a remainder?- 1 to 7 bytes of A
495        CBNZ    x0, 5f
496
4974:
498        # ks loop
499        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
500        B.HI    1b
501
502        # Apply params - preshift, scale, postshift, bias and clamp
        # x11 = params + 4 here. The fields are consumed in struct order with
        # post-incrementing LD1R loads: right_pre_shift (v4), multiplier (v5),
        # right_post_shift (v6), output_zero_point (v6 again), output_min (v4)
        # and output_max (v5). The loads are interleaved with the arithmetic.
503        LD1R    {v4.4s}, [x11], 4
504        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
505        SQSHL   v17.4s, v17.4s, v4.4s
506        SQSHL   v18.4s, v18.4s, v4.4s
507        SQSHL   v19.4s, v19.4s, v4.4s
508        SQSHL   v20.4s, v20.4s, v4.4s
509        SQSHL   v21.4s, v21.4s, v4.4s
510        SQSHL   v22.4s, v22.4s, v4.4s
511        SQSHL   v23.4s, v23.4s, v4.4s
512        LD1R    {v5.4s}, [x11], 4
513        SQSHL   v24.4s, v24.4s, v4.4s
514        SQSHL   v25.4s, v25.4s, v4.4s
515        SQSHL   v26.4s, v26.4s, v4.4s
516        SQSHL   v27.4s, v27.4s, v4.4s
517        SQSHL   v28.4s, v28.4s, v4.4s
518        SQSHL   v29.4s, v29.4s, v4.4s
519        SQSHL   v30.4s, v30.4s, v4.4s
520        SQSHL   v31.4s, v31.4s, v4.4s
521        LD1R    {v6.4s}, [x11], 4
522        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
523        SQDMULH v17.4s, v17.4s, v5.4s
524        SQDMULH v18.4s, v18.4s, v5.4s
525        SQDMULH v19.4s, v19.4s, v5.4s
526        SQDMULH v20.4s, v20.4s, v5.4s
527        SQDMULH v21.4s, v21.4s, v5.4s
528        SQDMULH v22.4s, v22.4s, v5.4s
529        SQDMULH v23.4s, v23.4s, v5.4s
530        SQDMULH v24.4s, v24.4s, v5.4s
531        SQDMULH v25.4s, v25.4s, v5.4s
532        SQDMULH v26.4s, v26.4s, v5.4s
533        SQDMULH v27.4s, v27.4s, v5.4s
534        SQDMULH v28.4s, v28.4s, v5.4s
535        SQDMULH v29.4s, v29.4s, v5.4s
536        SQDMULH v30.4s, v30.4s, v5.4s
537        SQDMULH v31.4s, v31.4s, v5.4s
538        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
539        SRSHL   v17.4s, v17.4s, v6.4s
540        SRSHL   v18.4s, v18.4s, v6.4s
541        SRSHL   v19.4s, v19.4s, v6.4s
542        SRSHL   v20.4s, v20.4s, v6.4s
543        SRSHL   v21.4s, v21.4s, v6.4s
544        SRSHL   v22.4s, v22.4s, v6.4s
545        SRSHL   v23.4s, v23.4s, v6.4s
546        SRSHL   v24.4s, v24.4s, v6.4s
547        SRSHL   v25.4s, v25.4s, v6.4s
548        SRSHL   v26.4s, v26.4s, v6.4s
549        SRSHL   v27.4s, v27.4s, v6.4s
550        SRSHL   v28.4s, v28.4s, v6.4s
551        SRSHL   v29.4s, v29.4s, v6.4s
552        SRSHL   v30.4s, v30.4s, v6.4s
553        SRSHL   v31.4s, v31.4s, v6.4s
554
        # Saturating narrow from 32 to 16 bits. Rows 0-3 end up in v16-v19
        # (columns 0-7) and v24-v27 (columns 8-15).
555        SQXTN   v16.4h, v16.4s
556        SQXTN   v17.4h, v17.4s
557        SQXTN   v18.4h, v18.4s
558        SQXTN   v19.4h, v19.4s
559        SQXTN   v24.4h, v24.4s
560        SQXTN   v25.4h, v25.4s
561        SQXTN   v26.4h, v26.4s
562        SQXTN   v27.4h, v27.4s
563        LD1R    {v6.8h}, [x11], 2        // output_zero_point (added below)
564
565        SQXTN2  v16.8h, v20.4s
566        SQXTN2  v17.8h, v21.4s
567        SQXTN2  v18.8h, v22.4s
568        SQXTN2  v19.8h, v23.4s
569        SQXTN2  v24.8h, v28.4s
570        SQXTN2  v25.8h, v29.4s
571        SQXTN2  v26.8h, v30.4s
572        SQXTN2  v27.8h, v31.4s
573
574        SQADD   v16.8h, v16.8h, v6.8h
575        SQADD   v17.8h, v17.8h, v6.8h
576        SQADD   v18.8h, v18.8h, v6.8h
577        SQADD   v19.8h, v19.8h, v6.8h
578        SQADD   v24.8h, v24.8h, v6.8h
579        SQADD   v25.8h, v25.8h, v6.8h
580        SQADD   v26.8h, v26.8h, v6.8h
581        SQADD   v27.8h, v27.8h, v6.8h
582        LD1R    {v4.16b}, [x11], 1       // clamp min value
583
        # Saturating narrow from signed 16 bits to unsigned 8 bits. v0-v3 hold
        # rows 0-3 with 16 columns each.
584        SQXTUN   v0.8b, v16.8h
585        SQXTUN   v1.8b, v17.8h
586        SQXTUN   v2.8b, v18.8h
587        SQXTUN   v3.8b, v19.8h
588        LD1R    {v5.16b}, [x11]          // clamp max value
589        SQXTUN2  v0.16b, v24.8h
590        SQXTUN2  v1.16b, v25.8h
591        SQXTUN2  v2.16b, v26.8h
592        SQXTUN2  v3.16b, v27.8h
        # x11 is params + 19 after the loads above (4 + 4 + 4 + 4 + 2 + 1), so
        # this restores the original params pointer; label 0 adds 4 again.
593        SUB     x11, x11, 19             // rewind params pointer
594
        # SUBS computes nc -= 16. Its flags drive both the B.LO below and the
        # B.HI that closes the nc loop; nothing in between writes the flags.
595        UMAX    v0.16b, v0.16b, v4.16b
596        UMAX    v1.16b, v1.16b, v4.16b
597        UMAX    v2.16b, v2.16b, v4.16b
598        UMAX    v3.16b, v3.16b, v4.16b
599        SUBS    x1, x1, 16
600        UMIN    v0.16b, v0.16b, v5.16b
601        UMIN    v1.16b, v1.16b, v5.16b
602        UMIN    v2.16b, v2.16b, v5.16b
603        UMIN    v3.16b, v3.16b, v5.16b
604        B.LO    6f
605
606        # Store full 4 x 16
        # Every C pointer advances by cn_stride (x10) for the next nc block.
607        ST1     {v3.16b},  [x7], x10
608        ST1     {v2.16b}, [x17], x10
609        ST1     {v1.16b}, [x16], x10
610        ST1     {v0.16b},  [x6], x10
611
612        SUB     x4, x4, x3              // a -= ks
613
614        # nc loop
615        B.HI    0b
616
617        # Restore x20-x21 from stack
618        LDP     x20, x21, [sp], 16
619        RET
620
621        # Remainder- 1 to 7 bytes of A
        # Reached from label 1 when kc < 8, or after the epilogue when kc is
        # not a multiple of 8. x11 holds params + 4 on both paths and is not
        # touched here.
        # Each LD1 reads a full 8 bytes per row but advances the pointer only
        # by the remainder; lanes at or beyond the remainder are never used.
        # NOTE(review): this reads past the remaining bytes of A - presumably
        # callers guarantee that the over-read is safe; confirm.
        # B is consumed 16 bytes per step. One CMP serves two steps: B.LO exits
        # when the remainder is below the compared value and the later B.EQ
        # when it equals it, since the instructions in between leave the flags
        # untouched.
622        .p2align 3
6235:
624        AND     x0, x2, 7               // kc remainder 1 to 7
625
626        LD1     {v0.8b}, [x13], x0
627        LDP     d4, d5, [x5], 16
628        LD1     {v1.8b}, [x14], x0
629        LD1     {v2.8b}, [x15], x0
630        LD1     {v3.8b}, [x20], x0
631        UXTL    v0.8h, v0.8b
632        USUBL   v4.8h, v4.8b, v7.8b
633        USUBL   v5.8h, v5.8b, v7.8b
634        UXTL    v1.8h, v1.8b
635        UXTL    v2.8h, v2.8b
636        UXTL    v3.8h, v3.8b
637        SMLAL   v16.4s, v4.4h, v0.h[0]
638        SMLAL2  v20.4s, v4.8h, v0.h[0]
639        SMLAL   v24.4s, v5.4h, v0.h[0]
640        SMLAL2  v28.4s, v5.8h, v0.h[0]
641        SMLAL   v17.4s, v4.4h, v1.h[0]
642        SMLAL2  v21.4s, v4.8h, v1.h[0]
643        SMLAL   v25.4s, v5.4h, v1.h[0]
644        SMLAL2  v29.4s, v5.8h, v1.h[0]
645        SMLAL   v18.4s, v4.4h, v2.h[0]
646        SMLAL2  v22.4s, v4.8h, v2.h[0]
647        SMLAL   v26.4s, v5.4h, v2.h[0]
648        SMLAL2  v30.4s, v5.8h, v2.h[0]
649        SMLAL   v19.4s, v4.4h, v3.h[0]
650        SMLAL2  v23.4s, v4.8h, v3.h[0]
651        SMLAL   v27.4s, v5.4h, v3.h[0]
652        SMLAL2  v31.4s, v5.8h, v3.h[0]
653        CMP     x0, 2
654        B.LO    4b
655
656        LDP     d4, d5, [x5], 16
657        USUBL   v4.8h, v4.8b, v7.8b
658        USUBL   v5.8h, v5.8b, v7.8b
659        SMLAL   v16.4s, v4.4h, v0.h[1]
660        SMLAL2  v20.4s, v4.8h, v0.h[1]
661        SMLAL   v24.4s, v5.4h, v0.h[1]
662        SMLAL2  v28.4s, v5.8h, v0.h[1]
663        SMLAL   v17.4s, v4.4h, v1.h[1]
664        SMLAL2  v21.4s, v4.8h, v1.h[1]
665        SMLAL   v25.4s, v5.4h, v1.h[1]
666        SMLAL2  v29.4s, v5.8h, v1.h[1]
667        SMLAL   v18.4s, v4.4h, v2.h[1]
668        SMLAL2  v22.4s, v4.8h, v2.h[1]
669        SMLAL   v26.4s, v5.4h, v2.h[1]
670        SMLAL2  v30.4s, v5.8h, v2.h[1]
671        SMLAL   v19.4s, v4.4h, v3.h[1]
672        SMLAL2  v23.4s, v4.8h, v3.h[1]
673        SMLAL   v27.4s, v5.4h, v3.h[1]
674        SMLAL2  v31.4s, v5.8h, v3.h[1]
675        B.EQ    4b
676
677        LDP     d4, d5, [x5], 16
678        USUBL   v4.8h, v4.8b, v7.8b
679        USUBL   v5.8h, v5.8b, v7.8b
680        SMLAL   v16.4s, v4.4h, v0.h[2]
681        SMLAL2  v20.4s, v4.8h, v0.h[2]
682        SMLAL   v24.4s, v5.4h, v0.h[2]
683        SMLAL2  v28.4s, v5.8h, v0.h[2]
684        SMLAL   v17.4s, v4.4h, v1.h[2]
685        SMLAL2  v21.4s, v4.8h, v1.h[2]
686        SMLAL   v25.4s, v5.4h, v1.h[2]
687        SMLAL2  v29.4s, v5.8h, v1.h[2]
688        SMLAL   v18.4s, v4.4h, v2.h[2]
689        SMLAL2  v22.4s, v4.8h, v2.h[2]
690        SMLAL   v26.4s, v5.4h, v2.h[2]
691        SMLAL2  v30.4s, v5.8h, v2.h[2]
692        SMLAL   v19.4s, v4.4h, v3.h[2]
693        SMLAL2  v23.4s, v4.8h, v3.h[2]
694        SMLAL   v27.4s, v5.4h, v3.h[2]
695        SMLAL2  v31.4s, v5.8h, v3.h[2]
696        CMP     x0, 4
697        B.LO    4b
698
699        LDP     d4, d5, [x5], 16
700        USUBL   v4.8h, v4.8b, v7.8b
701        USUBL   v5.8h, v5.8b, v7.8b
702        SMLAL   v16.4s, v4.4h, v0.h[3]
703        SMLAL2  v20.4s, v4.8h, v0.h[3]
704        SMLAL   v24.4s, v5.4h, v0.h[3]
705        SMLAL2  v28.4s, v5.8h, v0.h[3]
706        SMLAL   v17.4s, v4.4h, v1.h[3]
707        SMLAL2  v21.4s, v4.8h, v1.h[3]
708        SMLAL   v25.4s, v5.4h, v1.h[3]
709        SMLAL2  v29.4s, v5.8h, v1.h[3]
710        SMLAL   v18.4s, v4.4h, v2.h[3]
711        SMLAL2  v22.4s, v4.8h, v2.h[3]
712        SMLAL   v26.4s, v5.4h, v2.h[3]
713        SMLAL2  v30.4s, v5.8h, v2.h[3]
714        SMLAL   v19.4s, v4.4h, v3.h[3]
715        SMLAL2  v23.4s, v4.8h, v3.h[3]
716        SMLAL   v27.4s, v5.4h, v3.h[3]
717        SMLAL2  v31.4s, v5.8h, v3.h[3]
718        B.EQ    4b
719
720        LDP     d4, d5, [x5], 16
721        USUBL   v4.8h, v4.8b, v7.8b
722        USUBL   v5.8h, v5.8b, v7.8b
723        SMLAL   v16.4s, v4.4h, v0.h[4]
724        SMLAL2  v20.4s, v4.8h, v0.h[4]
725        SMLAL   v24.4s, v5.4h, v0.h[4]
726        SMLAL2  v28.4s, v5.8h, v0.h[4]
727        SMLAL   v17.4s, v4.4h, v1.h[4]
728        SMLAL2  v21.4s, v4.8h, v1.h[4]
729        SMLAL   v25.4s, v5.4h, v1.h[4]
730        SMLAL2  v29.4s, v5.8h, v1.h[4]
731        SMLAL   v18.4s, v4.4h, v2.h[4]
732        SMLAL2  v22.4s, v4.8h, v2.h[4]
733        SMLAL   v26.4s, v5.4h, v2.h[4]
734        SMLAL2  v30.4s, v5.8h, v2.h[4]
735        SMLAL   v19.4s, v4.4h, v3.h[4]
736        SMLAL2  v23.4s, v4.8h, v3.h[4]
737        SMLAL   v27.4s, v5.4h, v3.h[4]
738        SMLAL2  v31.4s, v5.8h, v3.h[4]
739        CMP     x0, 6
740        B.LO    4b
741
742        LDP     d4, d5, [x5], 16
743        USUBL   v4.8h, v4.8b, v7.8b
744        USUBL   v5.8h, v5.8b, v7.8b
745        SMLAL   v16.4s, v4.4h, v0.h[5]
746        SMLAL2  v20.4s, v4.8h, v0.h[5]
747        SMLAL   v24.4s, v5.4h, v0.h[5]
748        SMLAL2  v28.4s, v5.8h, v0.h[5]
749        SMLAL   v17.4s, v4.4h, v1.h[5]
750        SMLAL2  v21.4s, v4.8h, v1.h[5]
751        SMLAL   v25.4s, v5.4h, v1.h[5]
752        SMLAL2  v29.4s, v5.8h, v1.h[5]
753        SMLAL   v18.4s, v4.4h, v2.h[5]
754        SMLAL2  v22.4s, v4.8h, v2.h[5]
755        SMLAL   v26.4s, v5.4h, v2.h[5]
756        SMLAL2  v30.4s, v5.8h, v2.h[5]
757        SMLAL   v19.4s, v4.4h, v3.h[5]
758        SMLAL2  v23.4s, v4.8h, v3.h[5]
759        SMLAL   v27.4s, v5.4h, v3.h[5]
760        SMLAL2  v31.4s, v5.8h, v3.h[5]
761        B.EQ    4b
762
763        LDP     d4, d5, [x5], 16
764        USUBL   v4.8h, v4.8b, v7.8b
765        USUBL   v5.8h, v5.8b, v7.8b
766        SMLAL   v16.4s, v4.4h, v0.h[6]
767        SMLAL2  v20.4s, v4.8h, v0.h[6]
768        SMLAL   v24.4s, v5.4h, v0.h[6]
769        SMLAL2  v28.4s, v5.8h, v0.h[6]
770        SMLAL   v17.4s, v4.4h, v1.h[6]
771        SMLAL2  v21.4s, v4.8h, v1.h[6]
772        SMLAL   v25.4s, v5.4h, v1.h[6]
773        SMLAL2  v29.4s, v5.8h, v1.h[6]
774        SMLAL   v18.4s, v4.4h, v2.h[6]
775        SMLAL2  v22.4s, v4.8h, v2.h[6]
776        SMLAL   v26.4s, v5.4h, v2.h[6]
777        SMLAL2  v30.4s, v5.8h, v2.h[6]
778        SMLAL   v19.4s, v4.4h, v3.h[6]
779        SMLAL2  v23.4s, v4.8h, v3.h[6]
780        SMLAL   v27.4s, v5.4h, v3.h[6]
781        SMLAL2  v31.4s, v5.8h, v3.h[6]
782        B       4b
783
784        # Store odd width
        # x1 has wrapped below zero here (nc - 16), but its low 4 bits still
        # equal the number of columns left, so each TBZ tests one
        # power-of-two chunk (8, 4, 2, then 1 byte per row).
        # After each 8, 4 or 2 byte store, DUP moves the following bytes of
        # the row down to lane 0 ready for the next smaller store.
785        .p2align 3
7866:
787        TBZ     x1, 3, 7f
788        STR     d3, [x7], 8
789        STR     d2, [x17], 8
790        DUP     d3, v3.d[1]
791        DUP     d2, v2.d[1]
792        STR     d1, [x16], 8
793        STR     d0, [x6], 8
794        DUP     d1, v1.d[1]
795        DUP     d0, v0.d[1]
7967:
797        TBZ     x1, 2, 8f
798        STR     s3, [x7], 4
799        STR     s2, [x17], 4
800        DUP     s3, v3.s[1]
801        DUP     s2, v2.s[1]
802        STR     s1, [x16], 4
803        STR     s0, [x6], 4
804        DUP     s1, v1.s[1]
805        DUP     s0, v0.s[1]
8068:
807        TBZ     x1, 1, 9f
808        STR     h3, [x7], 2
809        STR     h2, [x17], 2
810        DUP     h3, v3.h[1]
811        DUP     h2, v2.h[1]
812        STR     h1, [x16], 2
813        STR     h0, [x6], 2
814        DUP     h1, v1.h[1]
815        DUP     h0, v0.h[1]
8169:
817        TBZ     x1, 0, 10f
818        STR     b3, [x7]
819        STR     b2, [x17]
820        STR     b1, [x16]
821        STR     b0, [x6]
82210:
823        # Restore x20-x21 from stack
824        LDP     x20, x21, [sp], 16
825        RET
826
827END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
828
// When the preprocessor defines __ELF__, emit an empty .note.GNU-stack
// section with no flags. NOTE(review): by GNU toolchain convention this marks
// the object as not needing an executable stack - confirm for your linker.
829#ifdef __ELF__
830.section ".note.GNU-stack","",%progbits
831#endif
832