// Auto-generated file. Do not edit!
//   Template: src/qu8-igemm/4x8c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x10)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params [sp + 24] -> (x11)

// NOTE(review): the int8_t types in the prototype comment above look like they
// were carried over from a signed template; the code below uses unsigned
// operations (UDOT, SQXTUN, UMAX, UMIN). Confirm against the C declaration.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5 v28 v29 v30 v31
# C0   x6 v16 v20
# C1  x16 v17 v21
# C2  x17 v18 v22
# C3   x7 v19 v23
# zero_point v8 v24 v25 v26 v27
# unused v9 v10 v11 v12 v13 v14 v15

# x11 temp for Cortex-A55 loads

// Overview of the code below:
//   0: per-nc-block setup: load 8 bias values from w into the 4x8 accumulators
//      (v16-v23) and clear the zero-point accumulators (v24-v27).
//   1: per-ks step: load 4 A row pointers from the indirection buffer, replacing
//      any pointer equal to `zero` by itself and offsetting the others by a_offset.
//   2: main loop, 16 bytes of each A row per iteration.
//   3: epilogue, final 16 bytes without preloading the next iteration.
//   5/6: remainder of 8 and/or 4 bytes of each A row.
//   4: ks loop back-edge, then requantization, clamp and store.
//   7-10: partial-width store (nc remainder of 4, 2 and 1 columns).
// x10 is shared: it holds the A3 pointer inside the ks loop and cn_stride
// (reloaded from the stack) when storing.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        // The three stack loads below execute before d8 is pushed, so they use
        // the caller-relative offsets listed in the prototype comment.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STR     d8, [sp, -16]!          // Save d8 on stack; sp-relative argument offsets grow by 16 from here
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        CMP     x0, 4                   // if mr < 4
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point (first 4 bytes of params, replicated)
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        // v24-v27 accumulate UDOT(v8, A bytes) for rows 0-3; they are
        // subtracted from the accumulators after the ks loop.
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        LDR     d0, [x13], 8
        LDR     q28, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d29, [x5], 8            // low half of v29
        LDR     x11, [x5], 8            // high half of v29, inserted by the first INS below
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups of 2 blocks
        # 4 row of 2 vectors wide = 8 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.

        .p2align 3
2:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x10], 8

        // Zero-point accumulation for the 8 A bytes held in v0-v3
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]
        LDR     d0, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v7.4b[1]
        LDR     d2, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v5.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v7.4b[1]
        LDR     d3, [x10], 8

        // Zero-point accumulation for the 8 A bytes held in v4-v7
        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        SUBS    x0, x0, 16
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x10], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v6.4b[1]
        UDOT    v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        UDOT    v21.4s, v31.16b, v5.4b[1]
        UDOT    v22.4s, v31.16b, v6.4b[1]
        UDOT    v23.4s, v31.16b, v7.4b[1]

        AND     x0, x2, 15              // kc remainder 0 to 12

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 5f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Fold each row's two zero-point partial sums into one value and
        // broadcast it across all four lanes.
        ADDP    v0.2s, v24.2s, v25.2s
        ADDP    v1.2s, v26.2s, v27.2s
        LDR     x11, [sp, 40]           // reload params pointer ([sp + 24] at entry, +16 for the d8 push)
        DUP     v24.4s, v0.s[0]
        DUP     v25.4s, v0.s[1]
        DUP     v26.4s, v1.s[0]
        DUP     v27.4s, v1.s[1]
        ADD     x11, x11, 4             // skip the 4 bytes already read into v8 at entry

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        // params are consumed sequentially through x11: three 4-byte values
        // (v4, v5, v6), one 2-byte value, then two 1-byte values (min, max).
        LD1R    {v4.4s}, [x11], 4
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s

        // Narrow 32-bit to 16-bit with signed saturation: columns 0-3 into the
        // low half, columns 4-7 into the high half of v16-v19.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        LDR     x10, [sp, 16]           // Load cn_stride ([sp] at entry, +16 for the d8 push)
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow to unsigned 8-bit: v0 = row 0 (low) | row 1 (high),
        //                           v1 = row 2 (low) | row 3 (high).
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8               // nc -= 8; flags feed both B.LO below and B.HI after the stores
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    7f

        # Store full 4 x 8
        ST1     {v1.d}[1], [x7], x10    // row 3
        ST1     {v1.8b}, [x17], x10     // row 2
        ST1     {v0.d}[1], [x16], x10   // row 1
        ST1     {v0.8b}, [x6], x10      // row 0
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

        # Remainder- 4 to 12 bytes of A
        // Only bits 3 and 2 of x0 are tested, so this works both for
        // x0 = kc & 15 (from the epilogue) and x0 = kc - 16 (from the early
        // branch), which share the same low four bits.
        .p2align 3
5:
        TBZ     x0, 3, 6f

        // 8 bytes of each A row
        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDR     q6, [x5], 16
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        LDR     q4, [x5], 16
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 4b
6:
        // 4 bytes of each A row
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       4b

        # Store odd width
        // Bits 2, 1, 0 of x1 select 4-, 2- and 1-column stores. After each
        // partial store, EXT rotates v0/v1 so the next unwritten columns of
        // both the low-half row and the high-half row move into position.
        .p2align 3
7:
        TBZ     x1, 2, 8f
        ST1     {v1.s}[2], [x7], 4
        STR     s1, [x17], 4
        ST1     {v0.s}[2], [x16], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
8:
        TBZ     x1, 1, 9f
        ST1     {v1.h}[4], [x7], 2
        STR     h1, [x17], 2
        ST1     {v0.h}[4], [x16], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
9:
        TBZ     x1, 0, 10f
        ST1     {v1.b}[8], [x7]
        STR     b1, [x17]
        ST1     {v0.b}[8], [x16]
        STR     b0, [x6]
10:
        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif