// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x8c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params)  [sp + 8] -> x11
#
# NOTE(review): a and c are declared int8_t in this comment, but the code
# treats them as unsigned bytes (UDOT, SQXTUN, UMAX/UMIN). The int8_t looks
# stale - confirm against the C prototype.
#
# x0 carries mr only while the row pointers are being clamped; afterwards it
# is reused as the remaining-k counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only d8 is used here; it is saved on entry and restored before each RET.

# Register usage
# A0  x3 v0  v4
# A1 x15 v1  v5
# A2 x13 v2  v6
# A3  x4 v3  v7
# B   x5 v28 v29 v30 v31
# C0  x6 v16 v20
# C1  x8 v17 v21
# C2  x9 v18 v22
# C3  x7 v19 v23
# zero_point v24 v25 v26 v27 v8
# unused v9 v10 v11 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

# Overview
# Each pass of the outer loop (label 0) produces one tile of up to 4 rows by
# 8 columns of 8-bit outputs.
#   v16-v19 hold columns 0-3 of rows 0-3, v20-v23 hold columns 4-7.
#   The accumulators start from 32 bytes loaded from w, then receive UDOT
#   products of A bytes with packed w bytes over kc rounded up to a multiple
#   of 4.
#   v24-v27 collect, per row, the UDOT of the same A bytes with the broadcast
#   kernel_zero_point word in v8; that per-row sum is subtracted from every
#   accumulator of the row at label 3.
#   Label 3 then applies preshift, multiply, rounding postshift, narrowing,
#   a 16-bit add and the min/max clamp before the stores.
# Labels
#   0 outer loop over nc      1 main k loop (16 bytes of A per row)
#   2 epilogue of the k loop  3 zero point correction, requantize, store
#   4 k remainder of 8 bytes  5 k remainder of 4 bytes
#   6-9 store of a partial tile (nc < 8)

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so they read valid
        # input and rewrite the same output bytes.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // second half of the kc round-up

        // Stack arguments are read before sp is moved by the STR below
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
        STR     d8, [sp, -16]!          // Save d8 on stack
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        // First 32-bit params word, broadcast to all lanes. x11 stays 4 bytes
        // past the start of params for the rest of the function.
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The same 8 values seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOVI    v24.16b, 0              // clear the per-row zero point sums
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f                      // kc < 16: remainder code only

        # prologue - read A and B values for block 0 and 1
        # A: first 8 bytes of each row. B: v28 in full, low half of v29, and
        # the high half of v29 parked in x14 until the first INS.
        LDR     d0, [x3], 8
        LDR     q28, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d29, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f                      // kc < 32: epilogue only

        # Main loop - 16 bytes of A in 4 groups of 2 blocks
        # 4 row of 2 vectors wide = 8 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # BLOCK 0 updates columns 0-3 (v16-v19), BLOCK 1 updates columns 4-7
        # (v20-v23). Each B vector is assembled from LDR dN (low half),
        # LDR x14 and INS vN.d[1], x14 (high half), and v28-v31 are refilled
        # in rotation while the other vectors are being consumed.
        # v0-v3 and v4-v7 alternate as the A registers: one set is consumed
        # while the other is reloaded.
        # NOTE(review): the interleaving of loads between UDOTs is presumably
        # tuned for Cortex-A55 issue rules - do not reorder without measuring.

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x4], 8

        // Zero point sums for the 8 A bytes of each row consumed above
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]
        LDR     d0, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v30.16b, v7.4b[1]
        LDR     d2, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v5.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v31.16b, v7.4b[1]
        LDR     d3, [x4], 8

        // Zero point sums for the second 8 A bytes; the loop counter update
        // is slotted in between them.
        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        SUBS    x0, x0, 16
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        B.HS    1b                      // another full 16 bytes of k remain

        # Epilogue. Same as main loop but no preloads in final group
        # The first half still loads v4-v7 and the B vectors needed by the
        # second half; the second half loads only the remaining halves of
        # v30 and v31 and no further A data.
2:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x4], 8

        // Zero point sums for the first 8 A bytes of the epilogue
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        UDOT    v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        UDOT    v21.4s, v31.16b, v5.4b[1]
        UDOT    v22.4s, v31.16b, v6.4b[1]
        UDOT    v23.4s, v31.16b, v7.4b[1]

        AND     x0, x2, 15              // kc remainder 0 to 12

        // Zero point sums for the last 8 A bytes of the epilogue
        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

        .p2align 3
3:
        # Reduce the zero point sums: each of v24-v27 holds two partial sums
        # for one row in its low two lanes. Add the pair and broadcast the
        # row total to all four lanes.
        ADDP    v0.2s, v24.2s, v25.2s   // v0 = row 0 total, row 1 total
        ADDP    v1.2s, v26.2s, v27.2s   // v1 = row 2 total, row 3 total
        DUP     v24.4s, v0.s[0]
        DUP     v25.4s, v0.s[1]
        DUP     v26.4s, v1.s[0]
        DUP     v27.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        # params is read in order through x11: three 32-bit words, one 16-bit
        # value and two bytes. The post-increments add up to 15 bytes, which
        # the SUB below undoes for the next tile.
        LD1R    {v4.4s}, [x11], 4
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s

        // Saturating narrow to 16 bits: columns 0-3 into the low half
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        // NOTE(review): presumably the output zero point - confirm against
        // the params struct.
        LD1R    {v6.8h}, [x11], 2       // add bias

        // Columns 4-7 into the high half: v16-v19 are now one full row each
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow to unsigned bytes, two rows per vector:
        // v0 = row 0 | row 1, v1 = row 2 | row 3
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    6f                      // fewer than 8 columns left

        # Store full 4 x 8
        # Each C pointer advances by cn_stride and each A pointer is rewound
        # by the rounded kc so the next tile rereads the same rows. None of
        # these instructions change the flags set by SUBS above.
        ST1     {v0.8b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v1.8b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v1.d}[1], [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns: next tile

        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # x0 is kc - 16 when entered from the short kc branch and kc & 15
        # when entered after the epilogue. Only bits 3 and 2 are tested, and
        # in both cases they equal bits 3 and 2 of kc.
        .p2align 3
4:
        TBZ     x0, 3, 5f               // no 8 byte chunk: try the 4 byte one

        // 8 bytes of A per row against 4 B vectors (64 bytes of w)
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDR     q6, [x5], 16
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        LDR     q4, [x5], 16
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 3b               // no 4 byte chunk: requantize
5:
        // 4 bytes of A per row against 2 B vectors (32 bytes of w).
        // LDR s clears the rest of the register, so the second lane of the
        // zero point UDOT below adds nothing.
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       3b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of nc, so
        # bits 2, 1 and 0 select stores of 4, 2 and 1 columns. After each
        # store EXT rotates both vectors so the next unstored columns of the
        # two packed rows sit at byte 0 and byte 8 again.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x8], 4
        STR     s1, [x9], 4
        ST1     {v1.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x8], 2
        STR     h1, [x9], 2
        ST1     {v1.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x8]
        STR     b1, [x9]
        ST1     {v1.b}[8], [x7]
9:
        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif