1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t** restrict a, x4
19#     const uint8_t* restrict w,  x5
20#     uint8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const uint8_t* zero,                [sp + 16] -> x12
25#     const xnn_qu8_conv_minmax_params params [sp + 24] -> x11
26
27# params structure is 20 bytes
28#  struct {
29#    uint8_t kernel_zero_point[4];  (zero point in all 4 bytes: loaded whole
30#                                    by LD1R .4s and used byte-wise by USUBL)
31#    int32_t right_pre_shift;
32#    int32_t multiplier;
33#    int32_t right_post_shift;
34#    int16_t output_zero_point;
35#    uint8_t output_min;
36#    uint8_t output_max;
37#  } rndnu_neon;
38#
39# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
40
41# Register usage
42# A0  x13  v0
43# A1  x14  v1
44# A2  x15  v2
45# A3  x20  v3
46# B    x5  v4  v5  v6
47# C0   x6 v16 v20 v24 v28
48# C1  x16 v17 v21 v25 v29
49# C2  x17 v18 v22 v26 v30
50# C3   x7 v19 v23 v27 v31
51# zero_point v7
52# unused  v8 v9 v10 v11 v12 v13 v14 v15
53
54BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75
        // Overview (derived from the code below):
        //  * Each pass of the nc loop (label 0) produces a tile of up to
        //    4 rows x 16 columns of uint8 output.
        //  * Accumulators are int32. Row r uses v(16+r), v(20+r), v(24+r) and
        //    v(28+r) for columns 0-3, 4-7, 8-11 and 12-15.
        //  * A bytes are zero-extended to 16 bits (UXTL). B bytes have the
        //    kernel zero point held in v7 subtracted while widening (USUBL).
        //    Products are accumulated with by-lane SMLAL/SMLAL2.
        //  * Labels: 0 = nc loop, 1 = ks loop (4 A pointers per pass),
        //    2 = main loop (8 bytes of K per row), 3 = epilogue (last full
        //    8 bytes), 5 = remainder (1-7 bytes of K), 4 = ks loop tail and
        //    requantization, 6-10 = partial-width store.
        //  * x0 holds mr only while the C pointers are clamped; afterwards it
        //    is reused as the K byte counter / remainder.
        //  * x20 is the only callee-saved register used; it is spilled to the
        //    stack and restored on both return paths.
55
56        # Clamp C pointers
57        CMP     x0, 2                   // if mr < 2
58        LDP     x10, x8, [sp]           // Load cn_stride, a_offset (read before sp is moved)
59        ADD     x16, x6, x7             // c1 = c0 + cm_stride
60        CSEL    x16, x6,  x16, LO       //   c1 = c0
61
62        ADD     x17, x16, x7            // c2 = c1 + cm_stride
63        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
64                                        // if mr <= 2 (flags still from CMP x0, 2)
65        CSEL    x17, x16, x17, LS       //   c2 = c1
66
67        CMP     x0, 4                   // if mr < 4
68        STR     x20, [sp, -16]!         // Save x20 on stack (sp -= 16)
69        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
70        CSEL    x7,  x17, x7, LO        //   c3 = c2
71        LD1R    {v7.4s}, [x11], 4        // v7 = first 4 params bytes (kernel_zero_point) replicated; x11 += 4
72
73        .p2align 3
740:
75        # Load initial bias from w into accumulators
        // 16 int32 values are read from w (x5 advances by 64 bytes) into row 0,
        // then copied to rows 1-3 so every row starts from the same bias.
76        LDP     q16, q20, [x5], 32      // row 0, columns 0-7
77        MOV     v17.16b, v16.16b
78        MOV     v18.16b, v16.16b
79        LDP     q24, q28, [x5], 32      // row 0, columns 8-15
80        MOV     v19.16b, v16.16b
81        MOV     v21.16b, v20.16b
82        MOV     v22.16b, v20.16b
83        MOV     v23.16b, v20.16b
84        MOV     v25.16b, v24.16b
85        MOV     v26.16b, v24.16b
86        MOV     v27.16b, v24.16b
87        MOV     v29.16b, v28.16b
88        MOV     v30.16b, v28.16b
89        MOV     v31.16b, v28.16b
90        MOV     x9, x3                  // p = ks
91
92        .p2align 3
931:
94        # Load next 4 A pointers
95        LDP     x13, x14, [x4], 16
96        LDP     x15, x20, [x4], 16
97
        // Each pointer is compared against `zero` before a_offset is added
        // (ADD does not touch the flags), so a pointer equal to `zero` is
        // used as-is and every other pointer gets a_offset applied.
98        CMP     x13, x12                // if a0 == zero
99        ADD     x13, x13, x8            // a0 += a_offset
100        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
101        CMP     x14, x12                // if a1 == zero
102        ADD     x14, x14, x8            // a1 += a_offset
103        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
104        CMP     x15, x12                // if a2 == zero
105        ADD     x15, x15, x8            // a2 += a_offset
106        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
107        CMP     x20, x12                // if a3 == zero
108        ADD     x20, x20, x8            // a3 += a_offset
109        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
110
111        # Is there at least 8 bytes for epilogue?
112        SUBS    x0, x2, 8               // k = kc - 8
113        B.LO    5f                      // kc < 8: only the 1-7 byte remainder path runs
114
115        # Prologue
        // Preload the first 8 bytes of each A row and the first 16 bytes of B.
        // x5 is not advanced here; the main loop/epilogue address B relative
        // to x5 and add 128 once per 8 bytes of K.
116        LDR     d0, [x13], 8
117        LDP     d4, d6, [x5]            // B for A lane 0: d4 = columns 0-7, d6 = columns 8-15
118        LDR     d1, [x14], 8
119        LDR     d2, [x15], 8
120        LDR     d3, [x20], 8
121        UXTL    v0.8h, v0.8b
122        USUBL   v4.8h, v4.8b, v7.8b
123        UXTL    v1.8h, v1.8b
124        UXTL    v2.8h, v2.8b
125        UXTL    v3.8h, v3.8b
126        USUBL   v6.8h, v6.8b, v7.8b
127
128        SUBS    x0, x0, 8               // k = k - 8
129        # Is there at least 8 bytes for main loop?
130        B.LO    3f
131
132        # Main loop - 8 bytes of A
        // One iteration consumes A lanes 0-7 of every row and 128 bytes of B.
        // B halves rotate through v4/v5/v6: each LDR targets a register whose
        // previous contents have already been consumed, and the matching
        // USUBL is placed a few instructions before the first use.
133        .p2align 3
1342:
135        SMLAL   v16.4s, v4.4h, v0.h[0]
136        SMLAL2  v20.4s, v4.8h, v0.h[0]
137        SMLAL   v17.4s, v4.4h, v1.h[0]
138        SMLAL2  v21.4s, v4.8h, v1.h[0]
139        SMLAL   v18.4s, v4.4h, v2.h[0]
140        SMLAL2  v22.4s, v4.8h, v2.h[0]
141        SMLAL   v19.4s, v4.4h, v3.h[0]
142        SMLAL2  v23.4s, v4.8h, v3.h[0]
143        LDR     d5, [x5, 16]            // B for lane 1, columns 0-7
144        SMLAL   v24.4s, v6.4h, v0.h[0]
145        LDR     d4, [x5, 24]            // B for lane 1, columns 8-15
146        SMLAL2  v28.4s, v6.8h, v0.h[0]
147        SMLAL   v25.4s, v6.4h, v1.h[0]
148        SMLAL2  v29.4s, v6.8h, v1.h[0]
149        USUBL   v5.8h, v5.8b, v7.8b
150        SMLAL   v26.4s, v6.4h, v2.h[0]
151        SMLAL2  v30.4s, v6.8h, v2.h[0]
152        SMLAL   v27.4s, v6.4h, v3.h[0]
153        SMLAL2  v31.4s, v6.8h, v3.h[0]
154        SMLAL   v16.4s, v5.4h, v0.h[1]
155        SMLAL2  v20.4s, v5.8h, v0.h[1]
156        SMLAL   v17.4s, v5.4h, v1.h[1]
157        SMLAL2  v21.4s, v5.8h, v1.h[1]
158        USUBL   v4.8h, v4.8b, v7.8b
159        SMLAL   v18.4s, v5.4h, v2.h[1]
160        SMLAL2  v22.4s, v5.8h, v2.h[1]
161        SMLAL   v19.4s, v5.4h, v3.h[1]
162        SMLAL2  v23.4s, v5.8h, v3.h[1]
163        LDR     d6, [x5, 32]            // B for lane 2, columns 0-7
164        SMLAL   v24.4s, v4.4h, v0.h[1]
165        LDR     d5, [x5, 40]            // B for lane 2, columns 8-15
166        SMLAL2  v28.4s, v4.8h, v0.h[1]
167        SMLAL   v25.4s, v4.4h, v1.h[1]
168        SMLAL2  v29.4s, v4.8h, v1.h[1]
169        USUBL   v6.8h, v6.8b, v7.8b
170        SMLAL   v26.4s, v4.4h, v2.h[1]
171        SMLAL2  v30.4s, v4.8h, v2.h[1]
172        SMLAL   v27.4s, v4.4h, v3.h[1]
173        SMLAL2  v31.4s, v4.8h, v3.h[1]
174        SMLAL   v16.4s, v6.4h, v0.h[2]
175        SMLAL2  v20.4s, v6.8h, v0.h[2]
176        SMLAL   v17.4s, v6.4h, v1.h[2]
177        USUBL   v5.8h, v5.8b, v7.8b
178        SMLAL2  v21.4s, v6.8h, v1.h[2]
179        SMLAL   v18.4s, v6.4h, v2.h[2]
180        SMLAL2  v22.4s, v6.8h, v2.h[2]
181        SMLAL   v19.4s, v6.4h, v3.h[2]
182        SMLAL2  v23.4s, v6.8h, v3.h[2]
183        LDR     d4, [x5, 48]            // B for lane 3, columns 0-7
184        SMLAL   v24.4s, v5.4h, v0.h[2]
185        LDR     d6, [x5, 56]            // B for lane 3, columns 8-15
186        SMLAL2  v28.4s, v5.8h, v0.h[2]
187        SMLAL   v25.4s, v5.4h, v1.h[2]
188        SMLAL2  v29.4s, v5.8h, v1.h[2]
189        USUBL   v4.8h, v4.8b, v7.8b
190        SMLAL   v26.4s, v5.4h, v2.h[2]
191        SMLAL2  v30.4s, v5.8h, v2.h[2]
192        SMLAL   v27.4s, v5.4h, v3.h[2]
193        SMLAL2  v31.4s, v5.8h, v3.h[2]
194        SMLAL   v16.4s, v4.4h, v0.h[3]
195        SMLAL2  v20.4s, v4.8h, v0.h[3]
196        SMLAL   v17.4s, v4.4h, v1.h[3]
197        SMLAL2  v21.4s, v4.8h, v1.h[3]
198        USUBL   v6.8h, v6.8b, v7.8b
199        SMLAL   v18.4s, v4.4h, v2.h[3]
200        SMLAL2  v22.4s, v4.8h, v2.h[3]
201        SMLAL   v19.4s, v4.4h, v3.h[3]
202        SMLAL2  v23.4s, v4.8h, v3.h[3]
203        LDR     d5, [x5, 64]            // B for lane 4, columns 0-7
204        SMLAL   v24.4s, v6.4h, v0.h[3]
205        LDR     d4, [x5, 72]            // B for lane 4, columns 8-15
206        SMLAL2  v28.4s, v6.8h, v0.h[3]
207        USUBL   v5.8h, v5.8b, v7.8b
208        SMLAL   v25.4s, v6.4h, v1.h[3]
209        SMLAL2  v29.4s, v6.8h, v1.h[3]
210        SMLAL   v26.4s, v6.4h, v2.h[3]
211        SMLAL2  v30.4s, v6.8h, v2.h[3]
212        SMLAL   v27.4s, v6.4h, v3.h[3]
213        SMLAL2  v31.4s, v6.8h, v3.h[3]
214        SMLAL   v16.4s, v5.4h, v0.h[4]
215        SMLAL2  v20.4s, v5.8h, v0.h[4]
216        SMLAL   v17.4s, v5.4h, v1.h[4]
217        SMLAL2  v21.4s, v5.8h, v1.h[4]
218        USUBL   v4.8h, v4.8b, v7.8b
219        SMLAL   v18.4s, v5.4h, v2.h[4]
220        SMLAL2  v22.4s, v5.8h, v2.h[4]
221        SMLAL   v19.4s, v5.4h, v3.h[4]
222        SMLAL2  v23.4s, v5.8h, v3.h[4]
223        LDR     d6, [x5, 80]            // B for lane 5, columns 0-7
224        SMLAL   v24.4s, v4.4h, v0.h[4]
225        LDR     d5, [x5, 88]            // B for lane 5, columns 8-15
226        SMLAL2  v28.4s, v4.8h, v0.h[4]
227        SMLAL   v25.4s, v4.4h, v1.h[4]
228        SMLAL2  v29.4s, v4.8h, v1.h[4]
229        USUBL   v6.8h, v6.8b, v7.8b
230        SMLAL   v26.4s, v4.4h, v2.h[4]
231        SMLAL2  v30.4s, v4.8h, v2.h[4]
232        SMLAL   v27.4s, v4.4h, v3.h[4]
233        SMLAL2  v31.4s, v4.8h, v3.h[4]
234        SMLAL   v16.4s, v6.4h, v0.h[5]
235        SMLAL2  v20.4s, v6.8h, v0.h[5]
236        SMLAL   v17.4s, v6.4h, v1.h[5]
237        SMLAL2  v21.4s, v6.8h, v1.h[5]
238        USUBL   v5.8h, v5.8b, v7.8b
239        SMLAL   v18.4s, v6.4h, v2.h[5]
240        SMLAL2  v22.4s, v6.8h, v2.h[5]
241        SMLAL   v19.4s, v6.4h, v3.h[5]
242        SMLAL2  v23.4s, v6.8h, v3.h[5]
243        LDR     d4, [x5, 96]            // B for lane 6, columns 0-7
244        SMLAL   v24.4s, v5.4h, v0.h[5]
245        LDR     d6, [x5, 104]           // B for lane 6, columns 8-15
246        SMLAL2  v28.4s, v5.8h, v0.h[5]
247        SMLAL   v25.4s, v5.4h, v1.h[5]
248        SMLAL2  v29.4s, v5.8h, v1.h[5]
249        USUBL   v4.8h, v4.8b, v7.8b
250        SMLAL   v26.4s, v5.4h, v2.h[5]
251        SMLAL2  v30.4s, v5.8h, v2.h[5]
252        SMLAL   v27.4s, v5.4h, v3.h[5]
253        SMLAL2  v31.4s, v5.8h, v3.h[5]
254        USUBL   v6.8h, v6.8b, v7.8b
255        SMLAL   v16.4s, v4.4h, v0.h[6]
256        SMLAL2  v20.4s, v4.8h, v0.h[6]
257        SMLAL   v17.4s, v4.4h, v1.h[6]
258        SMLAL2  v21.4s, v4.8h, v1.h[6]
259        SMLAL   v18.4s, v4.4h, v2.h[6]
260        SMLAL2  v22.4s, v4.8h, v2.h[6]
261        SMLAL   v19.4s, v4.4h, v3.h[6]
262        SMLAL2  v23.4s, v4.8h, v3.h[6]
263        LDR     d4, [x5, 112]           // B for lane 7, columns 0-7
264        SMLAL   v24.4s, v6.4h, v0.h[6]
265        LDR     d5, [x5, 120]           // B for lane 7, columns 8-15
266        SMLAL2  v28.4s, v6.8h, v0.h[6]
267        SMLAL   v25.4s, v6.4h, v1.h[6]
268        SMLAL2  v29.4s, v6.8h, v1.h[6]
269        USUBL   v4.8h, v4.8b, v7.8b
270        ADD     x5, x5, 128             // advance w past the 8 x 16 bytes of B used by this iteration
271
272        SMLAL   v26.4s, v6.4h, v2.h[6]
273        SMLAL2  v30.4s, v6.8h, v2.h[6]
274        SMLAL   v27.4s, v6.4h, v3.h[6]
275        SMLAL2  v31.4s, v6.8h, v3.h[6]
276        USUBL   v5.8h, v5.8b, v7.8b
277
278        SMLAL   v16.4s, v4.4h, v0.h[7]
279        SMLAL2  v20.4s, v4.8h, v0.h[7]
280        SMLAL   v17.4s, v4.4h, v1.h[7]
281        SMLAL2  v21.4s, v4.8h, v1.h[7]
282        SMLAL   v18.4s, v4.4h, v2.h[7]
283        SMLAL2  v22.4s, v4.8h, v2.h[7]
284        SMLAL   v19.4s, v4.4h, v3.h[7]
285        SMLAL2  v23.4s, v4.8h, v3.h[7]
286        LDR     d4, [x5]                // preload B for lane 0 of the next 8 bytes, columns 0-7
287        SMLAL   v24.4s, v5.4h, v0.h[7]
288        LDR     d6, [x5, 8]             // preload B for lane 0 of the next 8 bytes, columns 8-15
289        SMLAL2  v28.4s, v5.8h, v0.h[7]
290        SMLAL   v25.4s, v5.4h, v1.h[7]
291        SMLAL2  v29.4s, v5.8h, v1.h[7]
292        LDR     d0, [x13], 8            // next 8 bytes of A0 (v0 no longer needed)
293        SMLAL   v26.4s, v5.4h, v2.h[7]
294        LDR     d1, [x14], 8            // next 8 bytes of A1
295        SMLAL2  v30.4s, v5.8h, v2.h[7]
296        SMLAL   v27.4s, v5.4h, v3.h[7]
297        SMLAL2  v31.4s, v5.8h, v3.h[7]
298        LDR     d2, [x15], 8            // next 8 bytes of A2
299
300        UXTL    v0.8h, v0.8b
301        LDR     d3, [x20], 8            // next 8 bytes of A3
302        UXTL    v1.8h, v1.8b
303        USUBL   v4.8h, v4.8b, v7.8b
304        UXTL    v2.8h, v2.8b
305        SUBS    x0, x0, 8               // k -= 8; loop while another full 8 bytes remain after the preloaded ones
306        UXTL    v3.8h, v3.8b
307        USUBL   v6.8h, v6.8b, v7.8b
308        B.HS    2b
309
310        # Epilogue.  Same as main loop but no preloads in final group
        // Consumes the 8 A bytes and the first 16 B bytes that are already
        // widened in v0-v3 / v4,v6, loading the remaining 112 bytes of B as
        // it goes. No A data is loaded here.
311
312        .p2align 3
3133:
314        SMLAL   v16.4s, v4.4h, v0.h[0]
315        SMLAL2  v20.4s, v4.8h, v0.h[0]
316        SMLAL   v17.4s, v4.4h, v1.h[0]
317        SMLAL2  v21.4s, v4.8h, v1.h[0]
318        SMLAL   v18.4s, v4.4h, v2.h[0]
319        SMLAL2  v22.4s, v4.8h, v2.h[0]
320        SMLAL   v19.4s, v4.4h, v3.h[0]
321        SMLAL2  v23.4s, v4.8h, v3.h[0]
322        LDR     d5, [x5, 16]
323        SMLAL   v24.4s, v6.4h, v0.h[0]
324        LDR     d4, [x5, 24]
325        SMLAL2  v28.4s, v6.8h, v0.h[0]
326        SMLAL   v25.4s, v6.4h, v1.h[0]
327        SMLAL2  v29.4s, v6.8h, v1.h[0]
328        USUBL   v5.8h, v5.8b, v7.8b
329        SMLAL   v26.4s, v6.4h, v2.h[0]
330        SMLAL2  v30.4s, v6.8h, v2.h[0]
331        SMLAL   v27.4s, v6.4h, v3.h[0]
332        SMLAL2  v31.4s, v6.8h, v3.h[0]
333        SMLAL   v16.4s, v5.4h, v0.h[1]
334        SMLAL2  v20.4s, v5.8h, v0.h[1]
335        SMLAL   v17.4s, v5.4h, v1.h[1]
336        SMLAL2  v21.4s, v5.8h, v1.h[1]
337        USUBL   v4.8h, v4.8b, v7.8b
338        SMLAL   v18.4s, v5.4h, v2.h[1]
339        SMLAL2  v22.4s, v5.8h, v2.h[1]
340        SMLAL   v19.4s, v5.4h, v3.h[1]
341        SMLAL2  v23.4s, v5.8h, v3.h[1]
342        LDR     d6, [x5, 32]
343        SMLAL   v24.4s, v4.4h, v0.h[1]
344        LDR     d5, [x5, 40]
345        SMLAL2  v28.4s, v4.8h, v0.h[1]
346        SMLAL   v25.4s, v4.4h, v1.h[1]
347        SMLAL2  v29.4s, v4.8h, v1.h[1]
348        USUBL   v6.8h, v6.8b, v7.8b
349        SMLAL   v26.4s, v4.4h, v2.h[1]
350        SMLAL2  v30.4s, v4.8h, v2.h[1]
351        SMLAL   v27.4s, v4.4h, v3.h[1]
352        SMLAL2  v31.4s, v4.8h, v3.h[1]
353        SMLAL   v16.4s, v6.4h, v0.h[2]
354        SMLAL2  v20.4s, v6.8h, v0.h[2]
355        SMLAL   v17.4s, v6.4h, v1.h[2]
356        USUBL   v5.8h, v5.8b, v7.8b
357        SMLAL2  v21.4s, v6.8h, v1.h[2]
358        SMLAL   v18.4s, v6.4h, v2.h[2]
359        SMLAL2  v22.4s, v6.8h, v2.h[2]
360        SMLAL   v19.4s, v6.4h, v3.h[2]
361        SMLAL2  v23.4s, v6.8h, v3.h[2]
362        LDR     d4, [x5, 48]
363        SMLAL   v24.4s, v5.4h, v0.h[2]
364        LDR     d6, [x5, 56]
365        SMLAL2  v28.4s, v5.8h, v0.h[2]
366        SMLAL   v25.4s, v5.4h, v1.h[2]
367        SMLAL2  v29.4s, v5.8h, v1.h[2]
368        USUBL   v4.8h, v4.8b, v7.8b
369        SMLAL   v26.4s, v5.4h, v2.h[2]
370        SMLAL2  v30.4s, v5.8h, v2.h[2]
371        SMLAL   v27.4s, v5.4h, v3.h[2]
372        SMLAL2  v31.4s, v5.8h, v3.h[2]
373        SMLAL   v16.4s, v4.4h, v0.h[3]
374        SMLAL2  v20.4s, v4.8h, v0.h[3]
375        SMLAL   v17.4s, v4.4h, v1.h[3]
376        SMLAL2  v21.4s, v4.8h, v1.h[3]
377        USUBL   v6.8h, v6.8b, v7.8b
378        SMLAL   v18.4s, v4.4h, v2.h[3]
379        SMLAL2  v22.4s, v4.8h, v2.h[3]
380        SMLAL   v19.4s, v4.4h, v3.h[3]
381        SMLAL2  v23.4s, v4.8h, v3.h[3]
382        LDR     d5, [x5, 64]
383        SMLAL   v24.4s, v6.4h, v0.h[3]
384        LDR     d4, [x5, 72]
385        SMLAL2  v28.4s, v6.8h, v0.h[3]
386        USUBL   v5.8h, v5.8b, v7.8b
387        SMLAL   v25.4s, v6.4h, v1.h[3]
388        SMLAL2  v29.4s, v6.8h, v1.h[3]
389        SMLAL   v26.4s, v6.4h, v2.h[3]
390        SMLAL2  v30.4s, v6.8h, v2.h[3]
391        SMLAL   v27.4s, v6.4h, v3.h[3]
392        SMLAL2  v31.4s, v6.8h, v3.h[3]
393        SMLAL   v16.4s, v5.4h, v0.h[4]
394        SMLAL2  v20.4s, v5.8h, v0.h[4]
395        SMLAL   v17.4s, v5.4h, v1.h[4]
396        SMLAL2  v21.4s, v5.8h, v1.h[4]
397        USUBL   v4.8h, v4.8b, v7.8b
398        SMLAL   v18.4s, v5.4h, v2.h[4]
399        SMLAL2  v22.4s, v5.8h, v2.h[4]
400        SMLAL   v19.4s, v5.4h, v3.h[4]
401        SMLAL2  v23.4s, v5.8h, v3.h[4]
402        LDR     d6, [x5, 80]
403        SMLAL   v24.4s, v4.4h, v0.h[4]
404        LDR     d5, [x5, 88]
405        SMLAL2  v28.4s, v4.8h, v0.h[4]
406        SMLAL   v25.4s, v4.4h, v1.h[4]
407        SMLAL2  v29.4s, v4.8h, v1.h[4]
408        USUBL   v6.8h, v6.8b, v7.8b
409        SMLAL   v26.4s, v4.4h, v2.h[4]
410        SMLAL2  v30.4s, v4.8h, v2.h[4]
411        SMLAL   v27.4s, v4.4h, v3.h[4]
412        SMLAL2  v31.4s, v4.8h, v3.h[4]
413        SMLAL   v16.4s, v6.4h, v0.h[5]
414        SMLAL2  v20.4s, v6.8h, v0.h[5]
415        SMLAL   v17.4s, v6.4h, v1.h[5]
416        SMLAL2  v21.4s, v6.8h, v1.h[5]
417        USUBL   v5.8h, v5.8b, v7.8b
418        SMLAL   v18.4s, v6.4h, v2.h[5]
419        SMLAL2  v22.4s, v6.8h, v2.h[5]
420        SMLAL   v19.4s, v6.4h, v3.h[5]
421        SMLAL2  v23.4s, v6.8h, v3.h[5]
422        LDR     d4, [x5, 96]
423        SMLAL   v24.4s, v5.4h, v0.h[5]
424        LDR     d6, [x5, 104]
425        SMLAL2  v28.4s, v5.8h, v0.h[5]
426        SMLAL   v25.4s, v5.4h, v1.h[5]
427        SMLAL2  v29.4s, v5.8h, v1.h[5]
428        USUBL   v4.8h, v4.8b, v7.8b
429        SMLAL   v26.4s, v5.4h, v2.h[5]
430        SMLAL2  v30.4s, v5.8h, v2.h[5]
431        SMLAL   v27.4s, v5.4h, v3.h[5]
432        SMLAL2  v31.4s, v5.8h, v3.h[5]
433        USUBL   v6.8h, v6.8b, v7.8b
434        SMLAL   v16.4s, v4.4h, v0.h[6]
435        SMLAL2  v20.4s, v4.8h, v0.h[6]
436        SMLAL   v17.4s, v4.4h, v1.h[6]
437        SMLAL2  v21.4s, v4.8h, v1.h[6]
438        SMLAL   v18.4s, v4.4h, v2.h[6]
439        SMLAL2  v22.4s, v4.8h, v2.h[6]
440        SMLAL   v19.4s, v4.4h, v3.h[6]
441        SMLAL2  v23.4s, v4.8h, v3.h[6]
442        SMLAL   v24.4s, v6.4h, v0.h[6]
443        SMLAL2  v28.4s, v6.8h, v0.h[6]
444        SMLAL   v25.4s, v6.4h, v1.h[6]
445        SMLAL2  v29.4s, v6.8h, v1.h[6]
446        LDR     d4, [x5, 112]
447        USUBL   v4.8h, v4.8b, v7.8b
448        LDR     d5, [x5, 120]
449        SMLAL   v26.4s, v6.4h, v2.h[6]
450        SMLAL2  v30.4s, v6.8h, v2.h[6]
451        SMLAL   v27.4s, v6.4h, v3.h[6]
452        SMLAL2  v31.4s, v6.8h, v3.h[6]
453        SMLAL   v16.4s, v4.4h, v0.h[7]
454        SMLAL2  v20.4s, v4.8h, v0.h[7]
455        SMLAL   v17.4s, v4.4h, v1.h[7]
456        SMLAL2  v21.4s, v4.8h, v1.h[7]
457        USUBL   v5.8h, v5.8b, v7.8b
458        SMLAL   v18.4s, v4.4h, v2.h[7]
459        SMLAL2  v22.4s, v4.8h, v2.h[7]
460        SMLAL   v19.4s, v4.4h, v3.h[7]
461        SMLAL2  v23.4s, v4.8h, v3.h[7]
462        ADD     x5, x5, 128             // advance w past the B bytes used by the epilogue
463        SMLAL   v24.4s, v5.4h, v0.h[7]
464        SMLAL2  v28.4s, v5.8h, v0.h[7]
465        SMLAL   v25.4s, v5.4h, v1.h[7]
466        SMLAL2  v29.4s, v5.8h, v1.h[7]
467        AND     x0, x2, 7               // kc remainder 0 to 7
468        SMLAL   v26.4s, v5.4h, v2.h[7]
469        SMLAL2  v30.4s, v5.8h, v2.h[7]
470        SMLAL   v27.4s, v5.4h, v3.h[7]
471        SMLAL2  v31.4s, v5.8h, v3.h[7]
472
473        # Is there a remainder?- 1 to 7 bytes of A
474        CBNZ    x0, 5f
475
4764:
477        # ks loop
478        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
479        B.HI    1b                      // more A pointer groups remain
480
481        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks the params fields that follow kernel_zero_point; the field
        // names used below come from the struct layout documented at the top
        // of the file. x11 is rewound afterwards so the next nc pass can
        // re-read the same fields.
482        LD1R    {v4.4s}, [x11], 4       // v4 = right_pre_shift
483        SQSHL   v16.4s, v16.4s, v4.4s   // saturating shift by the signed per-lane amount in v4 (negative = right shift)
484        SQSHL   v17.4s, v17.4s, v4.4s
485        SQSHL   v18.4s, v18.4s, v4.4s
486        SQSHL   v19.4s, v19.4s, v4.4s
487        SQSHL   v20.4s, v20.4s, v4.4s
488        SQSHL   v21.4s, v21.4s, v4.4s
489        SQSHL   v22.4s, v22.4s, v4.4s
490        SQSHL   v23.4s, v23.4s, v4.4s
491        LD1R    {v5.4s}, [x11], 4       // v5 = multiplier
492        SQSHL   v24.4s, v24.4s, v4.4s
493        SQSHL   v25.4s, v25.4s, v4.4s
494        SQSHL   v26.4s, v26.4s, v4.4s
495        SQSHL   v27.4s, v27.4s, v4.4s
496        SQSHL   v28.4s, v28.4s, v4.4s
497        SQSHL   v29.4s, v29.4s, v4.4s
498        SQSHL   v30.4s, v30.4s, v4.4s
499        SQSHL   v31.4s, v31.4s, v4.4s
500        LD1R    {v6.4s}, [x11], 4       // v6 = right_post_shift
501        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding (saturating doubling multiply, high half)
502        SQDMULH v17.4s, v17.4s, v5.4s
503        SQDMULH v18.4s, v18.4s, v5.4s
504        SQDMULH v19.4s, v19.4s, v5.4s
505        SQDMULH v20.4s, v20.4s, v5.4s
506        SQDMULH v21.4s, v21.4s, v5.4s
507        SQDMULH v22.4s, v22.4s, v5.4s
508        SQDMULH v23.4s, v23.4s, v5.4s
509        SQDMULH v24.4s, v24.4s, v5.4s
510        SQDMULH v25.4s, v25.4s, v5.4s
511        SQDMULH v26.4s, v26.4s, v5.4s
512        SQDMULH v27.4s, v27.4s, v5.4s
513        SQDMULH v28.4s, v28.4s, v5.4s
514        SQDMULH v29.4s, v29.4s, v5.4s
515        SQDMULH v30.4s, v30.4s, v5.4s
516        SQDMULH v31.4s, v31.4s, v5.4s
517        SRSHL   v16.4s, v16.4s, v6.4s   // rounding shift by the signed per-lane amount in v6 (negative = right shift)
518        SRSHL   v17.4s, v17.4s, v6.4s
519        SRSHL   v18.4s, v18.4s, v6.4s
520        SRSHL   v19.4s, v19.4s, v6.4s
521        SRSHL   v20.4s, v20.4s, v6.4s
522        SRSHL   v21.4s, v21.4s, v6.4s
523        SRSHL   v22.4s, v22.4s, v6.4s
524        SRSHL   v23.4s, v23.4s, v6.4s
525        SRSHL   v24.4s, v24.4s, v6.4s
526        SRSHL   v25.4s, v25.4s, v6.4s
527        SRSHL   v26.4s, v26.4s, v6.4s
528        SRSHL   v27.4s, v27.4s, v6.4s
529        SRSHL   v28.4s, v28.4s, v6.4s
530        SRSHL   v29.4s, v29.4s, v6.4s
531        SRSHL   v30.4s, v30.4s, v6.4s
532        SRSHL   v31.4s, v31.4s, v6.4s
533
        // Narrow int32 -> int16 with saturation. Columns 0-3/4-7 of each row
        // end up in v16-v19, columns 8-11/12-15 in v24-v27.
534        SQXTN   v16.4h, v16.4s
535        SQXTN   v17.4h, v17.4s
536        SQXTN   v18.4h, v18.4s
537        SQXTN   v19.4h, v19.4s
538        SQXTN   v24.4h, v24.4s
539        SQXTN   v25.4h, v25.4s
540        SQXTN   v26.4h, v26.4s
541        SQXTN   v27.4h, v27.4s
542        LD1R    {v6.8h}, [x11], 2        // v6 = output_zero_point, added below
543
544        SQXTN2  v16.8h, v20.4s
545        SQXTN2  v17.8h, v21.4s
546        SQXTN2  v18.8h, v22.4s
547        SQXTN2  v19.8h, v23.4s
548        SQXTN2  v24.8h, v28.4s
549        SQXTN2  v25.8h, v29.4s
550        SQXTN2  v26.8h, v30.4s
551        SQXTN2  v27.8h, v31.4s
552
553        SQADD   v16.8h, v16.8h, v6.8h
554        SQADD   v17.8h, v17.8h, v6.8h
555        SQADD   v18.8h, v18.8h, v6.8h
556        SQADD   v19.8h, v19.8h, v6.8h
557        SQADD   v24.8h, v24.8h, v6.8h
558        SQADD   v25.8h, v25.8h, v6.8h
559        SQADD   v26.8h, v26.8h, v6.8h
560        SQADD   v27.8h, v27.8h, v6.8h
561        LD1R    {v4.16b}, [x11], 1       // clamp min value (output_min)
562
        // Narrow int16 -> uint8 with unsigned saturation; v0-v3 hold the 16
        // output bytes of rows 0-3.
563        SQXTUN   v0.8b, v16.8h
564        SQXTUN   v1.8b, v17.8h
565        SQXTUN   v2.8b, v18.8h
566        SQXTUN   v3.8b, v19.8h
567        LD1R    {v5.16b}, [x11]          // clamp max value (output_max)
568        SQXTUN2  v0.16b, v24.8h
569        SQXTUN2  v1.16b, v25.8h
570        SQXTUN2  v2.16b, v26.8h
571        SQXTUN2  v3.16b, v27.8h
572        SUB     x11, x11, 15             // rewind params pointer by 4+4+4+2+1 bytes, back to right_pre_shift
573
574        UMAX    v0.16b, v0.16b, v4.16b
575        UMAX    v1.16b, v1.16b, v4.16b
576        UMAX    v2.16b, v2.16b, v4.16b
577        UMAX    v3.16b, v3.16b, v4.16b
578        SUBS    x1, x1, 16               // nc -= 16; these flags drive both B.LO 6f and B.HI 0b below
579        UMIN    v0.16b, v0.16b, v5.16b
580        UMIN    v1.16b, v1.16b, v5.16b
581        UMIN    v2.16b, v2.16b, v5.16b
582        UMIN    v3.16b, v3.16b, v5.16b
583        B.LO    6f                       // fewer than 16 columns were left: partial store
584
585        # Store full 4 x 16
        // Rows are written c3 first, c0 last; each C pointer then advances by
        // cn_stride (x10).
586        ST1     {v3.16b},  [x7], x10
587        ST1     {v2.16b}, [x17], x10
588        ST1     {v1.16b}, [x16], x10
589        ST1     {v0.16b},  [x6], x10
590
591        SUB     x4, x4, x3              // a -= ks
592
593        # nc loop
594        B.HI    0b                      // columns remain (nc was > 16 before the SUBS)
595
596        # Restore x20 from stack
597        LDR     x20, [sp], 16
598        RET
599
600        # Remainder- 1 to 7 bytes of A
        // Each LD1 reads a full 8 bytes of A but advances the pointer by the
        // remainder only; just the first x0 lanes are multiplied below. B is
        // consumed 16 bytes per lane with post-increment. The CMP/B.LO/B.EQ
        // ladder exits to label 4 once x0 lanes have been processed; each
        // B.EQ reuses the flags of the preceding CMP, which none of the
        // intervening instructions modify.
601        .p2align 3
6025:
603        AND     x0, x2, 7               // kc remainder 1 to 7
604
605        LD1     {v0.8b}, [x13], x0
606        LDP     d4, d5, [x5], 16
607        LD1     {v1.8b}, [x14], x0
608        LD1     {v2.8b}, [x15], x0
609        LD1     {v3.8b}, [x20], x0
610        UXTL    v0.8h, v0.8b
611        USUBL   v4.8h, v4.8b, v7.8b
612        USUBL   v5.8h, v5.8b, v7.8b
613        UXTL    v1.8h, v1.8b
614        UXTL    v2.8h, v2.8b
615        UXTL    v3.8h, v3.8b
616        SMLAL   v16.4s, v4.4h, v0.h[0]
617        SMLAL2  v20.4s, v4.8h, v0.h[0]
618        SMLAL   v24.4s, v5.4h, v0.h[0]
619        SMLAL2  v28.4s, v5.8h, v0.h[0]
620        SMLAL   v17.4s, v4.4h, v1.h[0]
621        SMLAL2  v21.4s, v4.8h, v1.h[0]
622        SMLAL   v25.4s, v5.4h, v1.h[0]
623        SMLAL2  v29.4s, v5.8h, v1.h[0]
624        SMLAL   v18.4s, v4.4h, v2.h[0]
625        SMLAL2  v22.4s, v4.8h, v2.h[0]
626        SMLAL   v26.4s, v5.4h, v2.h[0]
627        SMLAL2  v30.4s, v5.8h, v2.h[0]
628        SMLAL   v19.4s, v4.4h, v3.h[0]
629        SMLAL2  v23.4s, v4.8h, v3.h[0]
630        SMLAL   v27.4s, v5.4h, v3.h[0]
631        SMLAL2  v31.4s, v5.8h, v3.h[0]
632        CMP     x0, 2
633        B.LO    4b                      // remainder == 1
634
635        LDP     d4, d5, [x5], 16
636        USUBL   v4.8h, v4.8b, v7.8b
637        USUBL   v5.8h, v5.8b, v7.8b
638        SMLAL   v16.4s, v4.4h, v0.h[1]
639        SMLAL2  v20.4s, v4.8h, v0.h[1]
640        SMLAL   v24.4s, v5.4h, v0.h[1]
641        SMLAL2  v28.4s, v5.8h, v0.h[1]
642        SMLAL   v17.4s, v4.4h, v1.h[1]
643        SMLAL2  v21.4s, v4.8h, v1.h[1]
644        SMLAL   v25.4s, v5.4h, v1.h[1]
645        SMLAL2  v29.4s, v5.8h, v1.h[1]
646        SMLAL   v18.4s, v4.4h, v2.h[1]
647        SMLAL2  v22.4s, v4.8h, v2.h[1]
648        SMLAL   v26.4s, v5.4h, v2.h[1]
649        SMLAL2  v30.4s, v5.8h, v2.h[1]
650        SMLAL   v19.4s, v4.4h, v3.h[1]
651        SMLAL2  v23.4s, v4.8h, v3.h[1]
652        SMLAL   v27.4s, v5.4h, v3.h[1]
653        SMLAL2  v31.4s, v5.8h, v3.h[1]
654        B.EQ    4b                      // remainder == 2
655
656        LDP     d4, d5, [x5], 16
657        USUBL   v4.8h, v4.8b, v7.8b
658        USUBL   v5.8h, v5.8b, v7.8b
659        SMLAL   v16.4s, v4.4h, v0.h[2]
660        SMLAL2  v20.4s, v4.8h, v0.h[2]
661        SMLAL   v24.4s, v5.4h, v0.h[2]
662        SMLAL2  v28.4s, v5.8h, v0.h[2]
663        SMLAL   v17.4s, v4.4h, v1.h[2]
664        SMLAL2  v21.4s, v4.8h, v1.h[2]
665        SMLAL   v25.4s, v5.4h, v1.h[2]
666        SMLAL2  v29.4s, v5.8h, v1.h[2]
667        SMLAL   v18.4s, v4.4h, v2.h[2]
668        SMLAL2  v22.4s, v4.8h, v2.h[2]
669        SMLAL   v26.4s, v5.4h, v2.h[2]
670        SMLAL2  v30.4s, v5.8h, v2.h[2]
671        SMLAL   v19.4s, v4.4h, v3.h[2]
672        SMLAL2  v23.4s, v4.8h, v3.h[2]
673        SMLAL   v27.4s, v5.4h, v3.h[2]
674        SMLAL2  v31.4s, v5.8h, v3.h[2]
675        CMP     x0, 4
676        B.LO    4b                      // remainder == 3
677
678        LDP     d4, d5, [x5], 16
679        USUBL   v4.8h, v4.8b, v7.8b
680        USUBL   v5.8h, v5.8b, v7.8b
681        SMLAL   v16.4s, v4.4h, v0.h[3]
682        SMLAL2  v20.4s, v4.8h, v0.h[3]
683        SMLAL   v24.4s, v5.4h, v0.h[3]
684        SMLAL2  v28.4s, v5.8h, v0.h[3]
685        SMLAL   v17.4s, v4.4h, v1.h[3]
686        SMLAL2  v21.4s, v4.8h, v1.h[3]
687        SMLAL   v25.4s, v5.4h, v1.h[3]
688        SMLAL2  v29.4s, v5.8h, v1.h[3]
689        SMLAL   v18.4s, v4.4h, v2.h[3]
690        SMLAL2  v22.4s, v4.8h, v2.h[3]
691        SMLAL   v26.4s, v5.4h, v2.h[3]
692        SMLAL2  v30.4s, v5.8h, v2.h[3]
693        SMLAL   v19.4s, v4.4h, v3.h[3]
694        SMLAL2  v23.4s, v4.8h, v3.h[3]
695        SMLAL   v27.4s, v5.4h, v3.h[3]
696        SMLAL2  v31.4s, v5.8h, v3.h[3]
697        B.EQ    4b                      // remainder == 4
698
699        LDP     d4, d5, [x5], 16
700        USUBL   v4.8h, v4.8b, v7.8b
701        USUBL   v5.8h, v5.8b, v7.8b
702        SMLAL   v16.4s, v4.4h, v0.h[4]
703        SMLAL2  v20.4s, v4.8h, v0.h[4]
704        SMLAL   v24.4s, v5.4h, v0.h[4]
705        SMLAL2  v28.4s, v5.8h, v0.h[4]
706        SMLAL   v17.4s, v4.4h, v1.h[4]
707        SMLAL2  v21.4s, v4.8h, v1.h[4]
708        SMLAL   v25.4s, v5.4h, v1.h[4]
709        SMLAL2  v29.4s, v5.8h, v1.h[4]
710        SMLAL   v18.4s, v4.4h, v2.h[4]
711        SMLAL2  v22.4s, v4.8h, v2.h[4]
712        SMLAL   v26.4s, v5.4h, v2.h[4]
713        SMLAL2  v30.4s, v5.8h, v2.h[4]
714        SMLAL   v19.4s, v4.4h, v3.h[4]
715        SMLAL2  v23.4s, v4.8h, v3.h[4]
716        SMLAL   v27.4s, v5.4h, v3.h[4]
717        SMLAL2  v31.4s, v5.8h, v3.h[4]
718        CMP     x0, 6
719        B.LO    4b                      // remainder == 5
720
721        LDP     d4, d5, [x5], 16
722        USUBL   v4.8h, v4.8b, v7.8b
723        USUBL   v5.8h, v5.8b, v7.8b
724        SMLAL   v16.4s, v4.4h, v0.h[5]
725        SMLAL2  v20.4s, v4.8h, v0.h[5]
726        SMLAL   v24.4s, v5.4h, v0.h[5]
727        SMLAL2  v28.4s, v5.8h, v0.h[5]
728        SMLAL   v17.4s, v4.4h, v1.h[5]
729        SMLAL2  v21.4s, v4.8h, v1.h[5]
730        SMLAL   v25.4s, v5.4h, v1.h[5]
731        SMLAL2  v29.4s, v5.8h, v1.h[5]
732        SMLAL   v18.4s, v4.4h, v2.h[5]
733        SMLAL2  v22.4s, v4.8h, v2.h[5]
734        SMLAL   v26.4s, v5.4h, v2.h[5]
735        SMLAL2  v30.4s, v5.8h, v2.h[5]
736        SMLAL   v19.4s, v4.4h, v3.h[5]
737        SMLAL2  v23.4s, v4.8h, v3.h[5]
738        SMLAL   v27.4s, v5.4h, v3.h[5]
739        SMLAL2  v31.4s, v5.8h, v3.h[5]
740        B.EQ    4b                      // remainder == 6
741
742        LDP     d4, d5, [x5], 16
743        USUBL   v4.8h, v4.8b, v7.8b
744        USUBL   v5.8h, v5.8b, v7.8b
745        SMLAL   v16.4s, v4.4h, v0.h[6]
746        SMLAL2  v20.4s, v4.8h, v0.h[6]
747        SMLAL   v24.4s, v5.4h, v0.h[6]
748        SMLAL2  v28.4s, v5.8h, v0.h[6]
749        SMLAL   v17.4s, v4.4h, v1.h[6]
750        SMLAL2  v21.4s, v4.8h, v1.h[6]
751        SMLAL   v25.4s, v5.4h, v1.h[6]
752        SMLAL2  v29.4s, v5.8h, v1.h[6]
753        SMLAL   v18.4s, v4.4h, v2.h[6]
754        SMLAL2  v22.4s, v4.8h, v2.h[6]
755        SMLAL   v26.4s, v5.4h, v2.h[6]
756        SMLAL2  v30.4s, v5.8h, v2.h[6]
757        SMLAL   v19.4s, v4.4h, v3.h[6]
758        SMLAL2  v23.4s, v4.8h, v3.h[6]
759        SMLAL   v27.4s, v5.4h, v3.h[6]
760        SMLAL2  v31.4s, v5.8h, v3.h[6]
761        B       4b                      // remainder == 7
762
763        # Store odd width
        // x1 holds nc - 16 here, whose low 4 bits equal those of the remaining
        // column count. Bits 3, 2, 1, 0 select an 8-, 4-, 2- and 1-byte store
        // per row; after each store the unwritten upper part of v0-v3 is
        // moved down with DUP so the next store always writes from lane 0.
764        .p2align 3
7656:
766        TBZ     x1, 3, 7f
767        STR     d3, [x7], 8
768        STR     d2, [x17], 8
769        DUP     d3, v3.d[1]
770        DUP     d2, v2.d[1]
771        STR     d1, [x16], 8
772        STR     d0, [x6], 8
773        DUP     d1, v1.d[1]
774        DUP     d0, v0.d[1]
7757:
776        TBZ     x1, 2, 8f
777        STR     s3, [x7], 4
778        STR     s2, [x17], 4
779        DUP     s3, v3.s[1]
780        DUP     s2, v2.s[1]
781        STR     s1, [x16], 4
782        STR     s0, [x6], 4
783        DUP     s1, v1.s[1]
784        DUP     s0, v0.s[1]
7858:
786        TBZ     x1, 1, 9f
787        STR     h3, [x7], 2
788        STR     h2, [x17], 2
789        DUP     h3, v3.h[1]
790        DUP     h2, v2.h[1]
791        STR     h1, [x16], 2
792        STR     h0, [x6], 2
793        DUP     h1, v1.h[1]
794        DUP     h0, v0.h[1]
7959:
796        TBZ     x1, 0, 10f
797        STR     b3, [x7]
798        STR     b2, [x17]
799        STR     b1, [x16]
800        STR     b0, [x6]
80110:
802        # Restore x20 from stack
803        LDR     x20, [sp], 16
804        RET
805
806END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75
807
808#ifdef __ELF__
// Emit an empty .note.GNU-stack section on ELF targets: this tells the linker
// that the object does not require an executable stack.
809.section ".note.GNU-stack","",%progbits
810#endif
811