// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# Indirect GEMM microkernel producing a tile of up to 4 rows x 16 columns of
# int8 output. Accumulation is done in int32 with SDOT (4 bytes of K per lane),
# then the accumulators are converted to float, multiplied by per channel
# scales read from the weights stream, converted back to integer, offset,
# narrowed to int8 and clamped.
#
# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> (x11)
#
# The [sp + N] offsets above are relative to sp on entry. The function pushes
# 32 bytes (d8-d11), so inside the body cn_stride is read from [sp, 32] and
# params from [sp, 56].

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x11 temp for Cortex-A55 loads
# x11 holds the params pointer outside of the prologue, main loop and epilogue.
# Inside them it carries the upper 64 bits of each B vector and is reloaded
# from the stack at the end of the epilogue.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        # The flags set by CMP x0, 2 are consumed by both the LO and the LS
        # CSEL below. None of the interleaved ADD/LDR/LDP/STP/BIC instructions
        # modify the flags.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        STP     d8, d9, [sp, -32]!      // Save d8-d11 on stack

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d10, d11, [sp, 16]
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // second half of the kc round up: clear low 2 bits

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # nc loop entry: one pass per group of 16 output columns.
        # Load initial bias from w into accumulators
        # The 16 int32 bias values are loaded once and copied to all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # ks loop entry: each pass consumes 4 pointers from the indirection
        # buffer a, one per row.
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to zero is used as is. Any other pointer is advanced
        # by a_offset. The ADD is done unconditionally and CSEL picks the result.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole. v9 is loaded as two 64 bit halves: the low half
        # into d9 and the high half into x11, inserted later with INS.
        LDR     d0, [x13], 8
        LDR     q8, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x11, [x5], 8            // upper half of v9; clobbers the params pointer
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # Each BLOCK n updates one column of 4 accumulators (one per row) from
        # one B vector. The B vectors rotate through v8-v11: while a block
        # consumes one vector, the low half of a later vector is loaded with
        # LDR d, its high half with LDR x11, and INS completes the previously
        # started vector.
        # NOTE(review): the split into 64 bit loads is presumably a Cortex-A55
        # scheduling choice - confirm against the core optimization guide.

        .p2align 3
2:
        # Group 1 of 4: A bytes 0-3 (lane 0 of v0-v3). Loads v4-v7 for groups 3 and 4.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # Group 2 of 4: A bytes 4-7 (lane 1 of v0-v3). No A loads.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: A bytes 8-11 (lane 0 of v4-v7). Reloads v0-v3 for the
        # next iteration (or for the epilogue).
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]
        LDR     d0, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3, [x10], 8

        # Group 4 of 4: A bytes 12-15 (lane 1 of v4-v7). Decrements k and loops.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
        # Processes the last full 16 bytes of A. Groups 3 and 4 do not reload
        # v0-v3, and the last two blocks of group 4 do not load B.
3:
        # Group 1 of 4: lane 0 of v0-v3. Loads v4-v7.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # Group 2 of 4: lane 1 of v0-v3.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: lane 0 of v4-v7. No A preloads.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # Group 4 of 4: lane 1 of v4-v7. B loads stop after block 1.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11           // last use of x11 as a B temp
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     x11, [sp, 56]           // reload params pointer
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 6f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Requantization, step 1: convert the int32 accumulators to float.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales applied to v16-v19
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales applied to v20-v23
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        # Step 2: multiply by the per channel scales. The same scale vector is
        # applied to all 4 rows of a column group.
        LDR     q6, [x5], 16            // scales applied to v24-v27
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // v4 is reused: scales applied to v28-v31
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        # Step 3: convert back to int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Step 4: saturating narrow int32 -> int16. Each row ends up in two
        # 8h registers: v16-v19 (columns 0-7) and v24-v27 (columns 8-15).
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // NOTE(review): the 16 bit value at params + 0 is presumably the output
        // zero point - confirm against the params struct definition.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        # Step 5: saturating add of the 16 bit value broadcast from params.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Step 6: saturating narrow int16 -> int8. v0-v3 hold rows 0-3,
        # 16 columns each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp, 32]            // cn_stride
        # Step 7: clamp to [min, max].
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 3             // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags drive B.LO here and B.HI after the stores
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f                      // fewer than 16 columns left: partial store

        # Store full 4 x 16
        # Each C pointer is advanced by cn_stride (x0) after its store.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # ST1 and SUB do not modify flags, so this still tests the result of
        # SUBS x1, x1, 16 above.
        B.HI    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # Entered at 5 when kc is below 16 (x11 still holds params), or at 6
        # from the epilogue with x0 already set to the remainder. A is read
        # without post increment. Up to three steps of 4 bytes are executed,
        # one SDOT lane each, and every step consumes 64 bytes of B.
        .p2align 3
5:
        AND     x0, x2, 15              // kc remainder 4 to 12
6:
        LDR     q0, [x13]
        LDP     q8, q9, [x5], 32
        LDR     q1, [x14]
        LDR     q2, [x15]
        LDR     q3, [x10]
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[0]
        SDOT    v17.4s, v8.16b, v1.4b[0]
        SDOT    v18.4s, v8.16b, v2.4b[0]
        SDOT    v19.4s, v8.16b, v3.4b[0]
        SDOT    v20.4s, v9.16b, v0.4b[0]
        SDOT    v21.4s, v9.16b, v1.4b[0]
        SDOT    v22.4s, v9.16b, v2.4b[0]
        SDOT    v23.4s, v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4
        B.LS    4b                      // remainder <= 4: done
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[1]
        SDOT    v17.4s, v8.16b, v1.4b[1]
        SDOT    v18.4s, v8.16b, v2.4b[1]
        SDOT    v19.4s, v8.16b, v3.4b[1]
        SDOT    v20.4s, v9.16b, v0.4b[1]
        SDOT    v21.4s, v9.16b, v1.4b[1]
        SDOT    v22.4s, v9.16b, v2.4b[1]
        SDOT    v23.4s, v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8
        B.LS    4b                      // remainder <= 8: done
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[2]
        SDOT    v17.4s, v8.16b, v1.4b[2]
        SDOT    v18.4s, v8.16b, v2.4b[2]
        SDOT    v19.4s, v8.16b, v3.4b[2]
        SDOT    v20.4s, v9.16b, v0.4b[2]
        SDOT    v21.4s, v9.16b, v1.4b[2]
        SDOT    v22.4s, v9.16b, v2.4b[2]
        SDOT    v23.4s, v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       4b

        # Store odd width
        # x1 holds nc - 16 here (negative); its low 4 bits equal the number of
        # remaining columns. Bits 3, 2, 1 and 0 select stores of 8, 4, 2 and 1
        # bytes per row. After each partial store the next unstored bytes are
        # moved down to the bottom of v0-v3 with DUP.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif