// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# params structure is 12 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    float scale;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } fp32_neonv8;

# Overview (derived from the code below)
#   Each pass of the outer loop (label 0) produces a 4 row x 16 column tile
#   of 8-bit outputs and consumes 16 columns of nc.
#   Row pointers for rows at or beyond mr are replaced by the previous row
#   pointers, so all four rows are always computed and stored.
#   kc is rounded up to a multiple of 4; A is consumed 4 bytes per UDOT.
#   The 32-bit accumulators start from 16 values read from w, then UDOT
#   accumulates unsigned byte products of packed B data and A data.
#   v12-v15 accumulate, per row, the dot product of the kernel zero point
#   bytes (v7) with the A bytes; that sum is subtracted from every
#   accumulator of the row before requantization.
#   Requantization: int32 to float, multiply by scale, convert back with
#   round to nearest (FCVTNS), saturating narrow to int16, add the output
#   zero point, saturating narrow to uint8, clamp to [output_min, output_max].

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  (v0)
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads
# x0 holds mr on entry and is reused as the k counter once the row pointers
# have been clamped.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # The stack arguments are read before sp is moved by the STP below.
        # The register saves are interleaved with the pointer setup.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8-d15 to stack
        STP     d8, d9, [sp, -64]!
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round up of kc to a multiple of 4
        STP     d10, d11, [sp, 16]

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
        STP     d12, d13, [sp, 32]
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1
        STP     d14, d15, [sp, 48]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride

        # Broadcast the 4 kernel zero point bytes to every 32-bit lane of v7.
        # x11 then points at the scale field for the rest of the function.
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2


        .p2align 3
0:
        # Outer loop over nc, 16 columns per pass.
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # Clear the per-row zero point sums
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        # Rows 1-3 start from the same initial values as row 0
        LDP     q24, q28, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes for prologue/epilogue?
        # When kc < 16 the remainder code at label 4 handles everything.
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole; v9 gets its low half here and its high half
        # (held in x14) is inserted by the first INS of the loop or epilogue.
        LDR     d0, [x3], 8
        LDR     q8, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        # Every 128-bit B vector is built from LDR d (low half), LDR x14
        # (high half) and INS v.d[1], x14, placed between the UDOTs.
        # Within a group, BLOCK n updates output columns 4n to 4n+3 of all rows.

        .p2align 3
1:
        # Group 1 of 4: A bytes 0-3 of each row (lane 0 of v0-v3)
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for A bytes 0-7 of rows 0-3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # Group 2 of 4: A bytes 4-7 of each row (lane 1 of v0-v3).
        # A bytes 8-15 are loaded into v4, v5, v6 and, for row 3, back into v0.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // row 3 bytes 8-15 reuse v0

        # Group 3 of 4: A bytes 8-11 (lane 0 of v4, v5, v6 and v0 for row 3)
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # Group 4 of 4: A bytes 12-15 (lane 1 of v4, v5, v6 and v0 for row 3).
        # A and B values for the next iteration are preloaded here.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]
        LDR     d1, [x15], 8            // next A1 bytes 0-7

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]
        LDR     d2, [x13], 8            // next A2 bytes 0-7

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x4], 8             // next A3 bytes 0-7

        # BLOCK 3 special
        # The row 3 UDOTs that read v0 are issued first so that v0 can be
        # reloaded with the next A0 bytes before the block ends.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x14
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x14, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x3], 8             // next A0 bytes 0-7
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16              // k -= 16

        # Zero point sums for A bytes 8-15 of rows 0-2 (row 3 done above)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group
2:
        # Group 1 of 4: A bytes 0-3 of each row
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for A bytes 0-7 of rows 0-3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # Group 2 of 4: A bytes 4-7 of each row; loads A bytes 8-15
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // row 3 bytes 8-15 reuse v0

        # Group 3 of 4: A bytes 8-11 (v4, v5, v6 and v0 for row 3)
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # Group 4 of 4: A bytes 12-15. Only the B halves still needed for
        # v10 and v11 are loaded; nothing is preloaded past this group.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # Zero point sums for A bytes 8-15 of rows 0-3
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

3:
        # Reduce the two 32-bit partial zero point sums of each row to one
        # value and broadcast it: v12 = row 0, v13 = row 1, v14 = row 2,
        # v15 = row 3.
        ADDP    v0.2s, v12.2s, v13.2s
        ADDP    v1.2s, v14.2s, v15.2s
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        # Convert the int32 accumulators to float
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // scale, broadcast to all lanes
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        # Multiply by scale
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        # Convert back to int32, rounding to nearest
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Saturating narrow to int16. After the SQXTN2 below, v16-v19 hold
        # columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point, added below

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        # Saturating add of the output zero point
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Saturating narrow to unsigned 8-bit: v0-v3 = rows 0-3, 16 columns
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        # Undo the 4 + 2 + 1 byte post-increments above so that x11 points
        # at scale again for the next pass.
        SUB     x11, x11, 7             // rewind params pointer

        # Unsigned clamp to [min, max]
        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Store full 4 x 16
        # Each C pointer advances by cn_stride and each A pointer is rewound
        # by the rounded kc. The flags from SUBS x1 are still live for B.NE.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns remain

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # x0 is kc - 16 when reached from the kc < 16 check and kc & 15 when
        # reached from the epilogue; both have the same bits 2 and 3.
        # Bit 3 selects an 8 byte step and bit 2 a 4 byte step.
        .p2align 3
4:
        TBZ     x0, 3, 5f

        # 8 bytes of A per row against 128 bytes of B
        LDR     d0, [x3], 8
        LDP     q8, q9, [x5], 32
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s, v8.16b, v0.4b[1]
        UDOT    v17.4s, v8.16b, v1.4b[1]
        UDOT    v18.4s, v8.16b, v2.4b[1]
        UDOT    v19.4s, v8.16b, v3.4b[1]
        UDOT    v20.4s, v9.16b, v0.4b[1]
        UDOT    v21.4s, v9.16b, v1.4b[1]
        UDOT    v22.4s, v9.16b, v2.4b[1]
        UDOT    v23.4s, v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 3b
5:
        # 4 bytes of A per row against 64 bytes of B. LDR s clears the rest
        # of the vector, so the 8 byte zero point UDOTs only see these 4 bytes.
        LDR     s0, [x3], 4
        LDP     q8, q9, [x5], 32
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       3b

        # Store odd width
        # x1 is nc - 16 here; its low 4 bits give the remaining column count.
        # Bits 3, 2, 1 and 0 select stores of 8, 4, 2 and 1 bytes per row.
        # After each partial store the unstored bytes are moved down to the
        # bottom of v0-v3.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif