// Auto-generated file. Do not edit!
//   Template: src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 24] -> (x11)

// NOTE(review): the prototype comment above spells the element type as
// int8_t, but the code accumulates with unsigned dot products (UDOT) and
// narrows/clamps the result as unsigned bytes (SQXTUN, UMAX, UMIN).
// The element type is presumably uint8_t - confirm against the C declaration.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  (v0)
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15

# x11 temp for Cortex-A55 loads

// Control-flow overview (derived from the labels below):
//   0:    start of one 16-column output tile: load the initial accumulator
//         values from w and clear the zero-point sums v12-v15.
//   1:    load the next 4 A row pointers (one per output row) and redirect
//         them to the zero buffer or add a_offset.
//   2:    main loop, 16 bytes of each A row per iteration.
//   3:    epilogue, the final 16 bytes without preloading the next group.
//   5/6:  remainder of 8 and/or 4 bytes per A row.
//   4:    ks loop test, then zero-point correction, scaling, narrowing,
//         clamping and the full-width store.
//   7-11: partial-width store (8/4/2/1 columns) and return.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        // Rows at index >= mr reuse the previous row pointer, so the kernel
        // always computes 4 rows and the surplus stores land on a valid row.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (applies to the CSEL x17 below;
                                        // flags are still those of CMP x0, 2)
        # Save d8-d15 to stack
        // Pre-decrement by 64 bytes: from here on the incoming stack
        // arguments are found at their original offset + 64.
        STP     d8, d9, [sp, -64]!

        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        STP     d10, d11, [sp, 16]
        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x7, x17, x7, LO         //   c3 = c2
        // Broadcast the first 4 bytes of params into every 32-bit lane of v7.
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point
        STP     d14, d15, [sp, 48]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        // v12-v15 accumulate dot(v7, A bytes) for rows 0-3; the totals are
        // subtracted from the accumulators after the ks loop.
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        // Replicate the row 0 initial values into rows 1-3.
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A row pointer equal to the zero pointer is used as is; any other
        // pointer is advanced by a_offset.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        LDR     q8, [x5], 16
        LDR     d0, [x13], 8
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8             // low half of the next B vector
        LDR     x11, [x5], 8            // high half, inserted into v9 by the first INS
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.

        // Each 128-bit B vector is assembled from LDR d (low half), LDR x11
        // (high half) and INS, interleaved between the UDOTs; presumably this
        // suits Cortex-A55 issue constraints - the order is deliberate.
        // A bytes 0-7 of rows 0-3 are in v0-v3; bytes 8-15 are loaded into
        // v4, v5, v6 and v0 (row 3 reuses v0) while the first half is consumed.
        .p2align 3
2:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // Zero-point sums for A bytes 0-7 of rows 0-3.
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x13], 8            // a0 bytes 8-15

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x14], 8            // a1 bytes 8-15

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8            // a2 bytes 8-15

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // a3 bytes 8-15 (v0 is free, row 0 is done with it)

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        // Last group of this iteration: the A loads below fetch bytes 0-7
        // of the next iteration into v1, v2, v3 and finally v0.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]
        LDR     d1, [x14], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]
        LDR     d2, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x10], 8

        # BLOCK 3 special
        // Row 3 (v31) and its zero-point sum (v15) go first so that v0 can
        // be reloaded with the next a0 bytes before the block ends.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x11
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x11, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x13], 8
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below

        // Zero-point sums for A bytes 8-15 of rows 0-2 (row 3 was done above).
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
3:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // Zero-point sums for A bytes 0-7 of rows 0-3.
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x13], 8            // a0 bytes 8-15

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x14], 8            // a1 bytes 8-15

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8            // a2 bytes 8-15

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // a3 bytes 8-15 (reuses v0)

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        // Final group: only the B halves still needed for v10/v11 are
        // loaded; no A values and no further B vectors are preloaded.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]

        // Zero-point sums for A bytes 8-15 of rows 0-3 (row 3 is in v0).
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 is negative here; its low 4 bits hold the leftover byte count.
        TST     x0, 15
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Reduce the two partial zero-point sums of each row:
        //   v0.s[0] = row 0, v0.s[1] = row 1, v1.s[0] = row 2, v1.s[1] = row 3
        // and broadcast each row total across a full vector.
        ADDP    v0.2s, v12.2s, v13.2s
        ADDP    v1.2s, v14.2s, v15.2s
        LDR     x11, [sp, 88]           // Reload params (incoming [sp + 24], plus the 64-byte save area)
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]
        ADD     x11, x11, 4             // skip the 4 bytes already loaded into v7

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        // int32 -> float
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // 32-bit value broadcast; used as the FMUL multiplier (scale)
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // float -> int32, rounding to nearest with ties to even
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. Per row the four accumulators
        // are packed pairwise, e.g. row 0: v16|v20 -> v16.8h, v24|v28 -> v24.8h.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (16-bit value; presumably the output zero point - confirm)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> unsigned 8-bit; v0-v3 = 16 output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        LDR     x0, [sp, 64]            // Load cn_stride (incoming [sp], plus the 64-byte save area)

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are used by B.LO and by B.HI further down
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f

        # Store full 4 x 16
        // Post-increment each C pointer by cn_stride for the next tile.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Neither ST1 nor SUB alters the flags, so this still tests SUBS x1 above.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        // Bit 3 of k selects an 8-byte step and bit 2 a 4-byte step.
        .p2align 3
5:
        TBZ     x0, 3, 6f

        LDR     d0, [x13], 8
        LDP     q8, q9, [x5], 32
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s, v8.16b, v0.4b[1]
        UDOT    v17.4s, v8.16b, v1.4b[1]
        UDOT    v18.4s, v8.16b, v2.4b[1]
        UDOT    v19.4s, v8.16b, v3.4b[1]
        UDOT    v20.4s, v9.16b, v0.4b[1]
        UDOT    v21.4s, v9.16b, v1.4b[1]
        UDOT    v22.4s, v9.16b, v2.4b[1]
        UDOT    v23.4s, v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 4b
6:
        // LDR s zeroes the rest of the register, so the 8-byte zero-point
        // UDOTs below add nothing for the upper 4 bytes.
        LDR     s0, [x13], 4
        LDP     q8, q9, [x5], 32
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       4b

        # Store odd width
        // Bits 3..0 of x1 select stores of 8, 4, 2 and 1 columns per row.
        // After each partial store the remaining bytes are shifted down to
        // lane 0 with DUP so the next store can again use the low lanes.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif