// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Indirect GEMM microkernel producing a tile of up to 4 rows x 16 columns of
// int8 output per pass of the nc loop.
//
// Per pass:
//   1. 16 int32 accumulators per row are initialised from the first 64 bytes
//      at w (the same initial values are copied to all 4 rows).
//   2. For every group of 4 A pointers taken from the indirection buffer a
//      (ks loop), kc bytes of each row are consumed; every A byte is
//      sign-extended to 16 bits and multiplied against 16 sign-extended
//      weight bytes, accumulating into 32 bits (SMLAL / SMLAL2 by lane).
//   3. Accumulators are converted to float, multiplied by a scale, converted
//      back to int32 (FCVTNS: round to nearest, ties to even), narrowed with
//      saturation to int16, offset by a 16-bit value, narrowed with
//      saturation to int8 and clamped to [min, max].
//
// Fields read through the params pointer (x11), in order of increasing
// address: 4-byte float scale, 2-byte value added after the int16 narrowing,
// 1-byte clamp minimum, 1-byte clamp maximum.

# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
//
// Additional uses visible in the code below:
//   x0   holds mr only during the C pointer clamp, then becomes the k counter
//   x9   ks loop counter (copy of x3)
//   v4   also holds the float scale, and later the clamp minimum
//   v5   also holds the clamp maximum
//   v6   16-bit value added after narrowing to int16
//   x20  is callee-saved, so it is spilled to the stack and restored on exit

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64

        # Clamp C pointers
        // Rows at or beyond mr alias the previous row so their stores are harmless.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        // Both stack argument pairs were loaded above, before sp is moved here.
        STR     x20, [sp, -16]!         // Save x20 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes = 16 int32 values; each row starts from the same 16 values.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // A pointer equal to zero is used as is; any other pointer gets
        // a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for main loop?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes from every A row and 8 x 16 bytes of
        // weights: one 16-byte LDP of weights per A lane (h[0] .. h[7]).
        .p2align 3
2:
        LD1     {v0.8b}, [x13], 8
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], 8
        LD1     {v2.8b}, [x15], 8
        LD1     {v3.8b}, [x20], 8
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        SUBS    x0, x0, 8
        B.HS    2b

        AND     x0, x2, 7               // kc remainder 0 to 7
        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # ks loop
        // x4 advanced by 32 bytes (4 pointers) at label 1 for this iteration.
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantization: int32 -> float, multiply by scale, float -> int32.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // broadcast float scale; x11 += 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // FCVTNS rounds to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. Afterwards v16-v19 hold columns
        // 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8; v0-v3 end up holding one full
        // 16-column row each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // The three post-incremented loads above advanced x11 by 4 + 2 + 1.
        SUB     x11, x11, 7             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        // Rows are stored from c3 down to c0; each C pointer advances by cn_stride.
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        // Rewind the indirection pointer for the next 16 columns.
        // NOTE(review): this exactly undoes the ks loop only if ks is a
        // multiple of 32 - confirm against the caller.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Flags are still those of SUBS x1, x1, 16 (ST1 and SUB leave NZCV alone).
        B.HI    0b

        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly when kc < 8 or after the main loop when
        // kc is not a multiple of 8.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // The A pointers advance by the remainder only.
        // NOTE(review): each LD1 still reads 8 bytes while only the first x0
        // lanes are consumed; presumably the caller guarantees the extra
        // bytes are readable - confirm.
        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // Each CMP below serves two exits: B.LO right after it and B.EQ after
        // the next lane. The loads, SXTL and SMLAL in between leave the flags
        // untouched.
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 is now (remaining nc - 16). Subtracting 16 does not change bits
        // 0-3, so they still encode the 1..15 columns left to store. Each
        // stage stores 8/4/2/1 bytes per row and then moves the not yet
        // stored bytes down to the bottom of the vector.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
9:
        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif