// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

// Review notes (comments only; the instruction stream is unchanged).
//
// Purpose: signed 8-bit GEMM micro-kernel. Each pass of the outer loop
// (label 0) produces one tile of up to 4 rows by 16 columns of int8 output,
// then repeats while nc has columns left.
//
// Data flow per pass:
//   - 16 x 32-bit accumulators are loaded from w and replicated to all 4 rows.
//   - SDOT accumulates 4-byte dot products of A against the B vectors read
//     from w (x5). x5 is only ever post-incremented, never rewound.
//   - The accumulators are converted to float, multiplied by a scale read from
//     params, converted back to integer, narrowed with saturation to 16 bit,
//     offset by a 16-bit value from params, narrowed to 8 bit and clamped.
//
// Rows beyond mr are not skipped: their A and C pointers are clamped onto the
// previous row, so all 4 rows are always computed and the surplus rows store
// to the same addresses as a valid row.
//
// x0 holds mr only during pointer clamping; afterwards it is reused as the
// K counter / K remainder.
//
// Label map:
//   0      start of a 16-column pass, loads the accumulators
//   1      main loop, 16 bytes of K per row per iteration
//   2      epilogue, drains the 16 bytes already in flight
//   3      requantize, clamp and store
//   4, 5   K remainder of 4, 8 or 12 bytes
//   6..10  partial-width store when fewer than 16 columns remain
//
// Stack frame: 32 bytes holding d8, d9 at [sp] and d10, d11 at [sp, 16].
//
// NOTE(review): the header says this file is generated from the template named
// above, so durable comment changes presumably belong in that template.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2

        // Stack arguments are read here, before sp is moved by the STP below.
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        // Allocate 32 bytes and save d8, d9 (v8-v11 are used for B).
        STP     d8, d9, [sp, -32]!

        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        // Flags are still those of CMP x0, 2: ADD, STP and CSEL do not set them.
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4

        STP     d10, d11, [sp, 16]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // v16, v20, v24, v28 hold row 0; the MOVs copy them to rows 1-3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        // v9 is loaded in two 64-bit halves: the low half into d9 now, the
        // high half into x14 to be inserted by the first INS of the loop.
        LDR     d0, [x3], 8
        LDR     q8, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        //
        // Each BLOCK multiplies one B vector against the same 4-byte lane of
        // all four A rows. While a block runs, the B vector for two blocks
        // ahead is assembled: LDR d fills the low half, LDR x14 + INS fills
        // the high half. The instruction order is deliberate; do not reorder.
        // NOTE(review): splitting 128-bit loads into 64-bit pieces is
        // presumably a Cortex-A55 scheduling choice - confirm against the
        // core optimization guide.

        .p2align 3
1:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x3], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x4], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        // Second half of the iteration: A comes from v4-v7 (loaded above),
        // and v0-v3 are refilled for the next iteration.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]
        LDR     d0, [x3], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3, [x4], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        // Sets the flags consumed by B.HS below; nothing in between alters them.
        SUBS    x0, x0, 16

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
        // A is still loaded into v4-v7 here, but v0-v3 are not refilled, and
        // the last two blocks issue no B loads.
2:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x3], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x4], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 5f

        .p2align 3
3:
        // Requantization: int32 -> float, multiply by scale, round back to
        // int32, then narrow with saturation and clamp. v4-v6 are free for
        // parameters here because the A values are no longer needed.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        // Broadcast the 32-bit scale; params pointer advances by 4.
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // FCVTNS rounds to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow 32 -> 16 bit. Columns 0-3 and 4-7 of each row are
        // packed into v16-v19; columns 8-11 and 12-15 into v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // NOTE(review): the 16-bit value called bias here is presumably the
        // output zero point - confirm against the params layout.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow 16 -> 8 bit; v0-v3 end up holding one full
        // 16-column row each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // 7 = 4 + 2 + 1, the sum of the post-increments applied above.
        SUB     x11, x11, 7             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. These flags feed both B.LO below and B.NE after the
        // stores; none of the instructions in between modify them.
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x12); each A pointer is moved
        // back by the rounded kc so the next pass rereads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, its safe to read 16 bytes.
        // Each LD1 reads a full 16 bytes of A but advances the pointer only
        // by the remainder in x0, so the total advance per pass equals kc.
        // NOTE(review): this relies on the caller permitting reads past the
        // end of A - confirm the kernel contract.
        .p2align 3
4:
        AND     x0, x2, 15              // kc remainder 4 to 12
5:
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        LD1     {v0.16b}, [x3], x0
        LD1     {v1.16b}, [x15], x0
        LD1     {v2.16b}, [x13], x0
        LD1     {v3.16b}, [x4], x0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        SDOT    v17.4s, v8.16b, v1.4b[0]
        SDOT    v18.4s, v8.16b, v2.4b[0]
        SDOT    v19.4s, v8.16b, v3.4b[0]
        SDOT    v20.4s, v9.16b, v0.4b[0]
        SDOT    v21.4s, v9.16b, v1.4b[0]
        SDOT    v22.4s, v9.16b, v2.4b[0]
        SDOT    v23.4s, v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        // Remainder of 4 bytes: done.
        CMP     x0, 4
        B.LS    3b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[1]
        SDOT    v17.4s, v8.16b, v1.4b[1]
        SDOT    v18.4s, v8.16b, v2.4b[1]
        SDOT    v19.4s, v8.16b, v3.4b[1]
        SDOT    v20.4s, v9.16b, v0.4b[1]
        SDOT    v21.4s, v9.16b, v1.4b[1]
        SDOT    v22.4s, v9.16b, v2.4b[1]
        SDOT    v23.4s, v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        // Remainder of 8 bytes: done. Otherwise process the third group.
        CMP     x0, 8
        B.LS    3b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[2]
        SDOT    v17.4s, v8.16b, v1.4b[2]
        SDOT    v18.4s, v8.16b, v2.4b[2]
        SDOT    v19.4s, v8.16b, v3.4b[2]
        SDOT    v20.4s, v9.16b, v0.4b[2]
        SDOT    v21.4s, v9.16b, v1.4b[2]
        SDOT    v22.4s, v9.16b, v2.4b[2]
        SDOT    v23.4s, v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       3b

        # Store odd width
        // x1 now holds nc - 16. Subtracting 16 leaves the low four bits
        // unchanged, so bits 3..0 still select 8, 4, 2 and 1 column stores.
        // After each partial store the DUPs move the next unstored elements
        // down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif