// Auto-generated file. Do not edit!
//   Template: src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// NOTE(review): the prototype comment below spells the element type as
// int8_t, but the code only uses unsigned operations on the 8-bit data
// (UDOT, SQXTUN, UMAX, UMIN), so the data is presumably uint8_t. Confirm
// against the C declaration of this kernel.

# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero_point v8 v12 v13 v14 v15
# unused v9 v10 v11

// Overview of the control flow (numeric local labels):
//   prologue  Clamp the four output row pointers by mr, round kc up to a
//             multiple of 4, save d8 and d12-d15 in a 48-byte stack frame and
//             broadcast the first 32-bit word of params into v8.
//   0:        Per 16-column output tile: load 16 x 32-bit initial values from
//             w into the row-0 accumulators, copy them to rows 1-3, and clear
//             the per-row zero-point accumulators v12-v15.
//   1:        Per group of 4 pointers read from a: a pointer equal to `zero`
//             is used as is, any other pointer gets a_offset added.
//   2:        Main loop, 16 bytes of each A row and 256 bytes of w per pass.
//   4: / 5:   Remainders of 8 and 4 bytes of each A row.
//   3:        End of one pointer group; once ks is exhausted, requantize the
//             accumulators to 8 bits, clamp and store.
//   6: - 10:  Store of a tile narrower than 16 columns, then return.
//
// x0 holds mr only for the two CMPs in the prologue; afterwards it is reused
// as the k counter and, just before the store, for cn_stride.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp C pointers
        // The stack loads below use the incoming sp, i.e. they are issued
        // before the pre-indexed STR that allocates the 48-byte frame.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        # Save d8,d12-d15 on stack
        STR     d8, [sp, -48]!          // sp -= 48; incoming stack args are now at [sp, 48 + n]
        CSEL    x17, x16, x17, LS       //   c2 = c1 (flags still from CMP x0, 2)
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        STP     d12, d13, [sp, 16]
        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x7, x17, x7, LO         //   c3 = c2
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point; x11 now points 4 bytes into params

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        // Clear the per-row accumulators of (kernel_zero_point * A) products.
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        // Rows 1-3 start from the same initial values as row 0.
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // For each row: compare the raw pointer with `zero` first, then add
        // a_offset unconditionally and let CSEL discard the sum when the
        // pointer was `zero`. ADD does not alter the flags set by CMP.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop - 16 bytes of A
        // Each pass reads 16 bytes from each of the 4 A rows and 256 bytes
        // of w (2 x 16 bytes via LDR plus 7 x 32 bytes via LDP). The w loads
        // are interleaved with the dot products that use the previous load.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        // A bytes 0-3 of each row (element [0]) against 64 bytes of w.
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]

        // A bytes 4-7 of each row (element [1]).
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]

        // A bytes 8-11 of each row (element [2]).
        UDOT    v16.4s, v4.16b, v0.4b[2]
        UDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[2]
        UDOT    v19.4s, v4.16b, v3.4b[2]
        UDOT    v20.4s, v5.16b, v0.4b[2]
        UDOT    v21.4s, v5.16b, v1.4b[2]
        UDOT    v22.4s, v5.16b, v2.4b[2]
        UDOT    v23.4s, v5.16b, v3.4b[2]
        UDOT    v24.4s, v6.16b, v0.4b[2]
        UDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[2]
        UDOT    v27.4s, v6.16b, v3.4b[2]
        UDOT    v28.4s, v7.16b, v0.4b[2]
        UDOT    v29.4s, v7.16b, v1.4b[2]
        UDOT    v30.4s, v7.16b, v2.4b[2]
        UDOT    v31.4s, v7.16b, v3.4b[2]

        // A bytes 12-15 of each row (element [3]).
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        UDOT    v24.4s, v6.16b, v0.4b[3]
        UDOT    v25.4s, v6.16b, v1.4b[3]
        UDOT    v26.4s, v6.16b, v2.4b[3]
        UDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; sets the flags consumed by B.HS below
        UDOT    v28.4s, v7.16b, v0.4b[3]
        UDOT    v29.4s, v7.16b, v1.4b[3]
        UDOT    v30.4s, v7.16b, v2.4b[3]
        UDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    2b

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 has gone below zero here, but its low 4 bits still equal kc & 15.
        TST     x0, 15
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Horizontal reduction: after the two pairwise-add steps every lane
        // of v12-v15 holds the sum of all four lanes of that register.
        ADDP    v0.4s, v12.4s, v12.4s
        ADDP    v1.4s, v13.4s, v13.4s
        ADDP    v2.4s, v14.4s, v14.4s
        ADDP    v3.4s, v15.4s, v15.4s
        ADDP    v12.4s, v0.4s, v0.4s
        ADDP    v13.4s, v1.4s, v1.4s
        ADDP    v14.4s, v2.4s, v2.4s
        ADDP    v15.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        // int32 -> float
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // 32-bit multiplier (scale), broadcast to all lanes
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        // Multiply every accumulator by the broadcast scale.
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // float -> int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. Columns 0-3 (v16-v19) and 8-11
        // (v24-v27) go to the low halves here; columns 4-7 and 12-15 are
        // packed into the high halves by SQXTN2 below.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (16-bit value; presumably the output zero point)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the 16-bit bias.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Signed int16 -> unsigned 8-bit with saturation; v0-v3 end up
        // holding the 16 output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        LDR     x0, [sp, 48]            // Load cn_stride (incoming [sp], above the 48-byte frame)

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 7             // rewind params pointer (undoes the 4 + 2 + 1 byte post-increments above)
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO and B.HI below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        // Each row pointer is post-incremented by cn_stride (x0).
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // ST1 and SUB leave the flags untouched, so this still tests the
        // result of SUBS x1, x1, 16: loop while columns remain.
        B.HI    0b

        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

        # Remainder- 8 bytes of A
        // Entered with 4, 8 or 12 bytes of each A row left (kc was rounded
        // up to a multiple of 4); bits 3 and 2 of x0 select the pieces.
        .p2align 3
4:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 5f

        // LDR d<n> writes the low 8 bytes and zeroes the upper 8, so the
        // full-width zero-point UDOTs below only pick up the loaded bytes.
        // This step consumes 128 bytes of w.
        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 3b

        # Remainder- 4 bytes of A
        // LDR s<n> zeroes everything above the low 4 bytes. This step
        // consumes 64 bytes of w.
5:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Store odd width
        // Reached when fewer than 16 columns remained. x1 has already had 16
        // subtracted, but its low 4 bits still equal those of the remaining
        // column count, so bits 3..0 select stores of 8, 4, 2 and 1 bytes.
        // After each partial store the DUPs move the next unstored bytes
        // down to the bottom of v0-v3.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif