// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                  x0
#     size_t nc,                  x1
#     size_t kc,                  x2 / x0
#     const uint8_t* restrict a,  x3
#     size_t a_stride,            x4
#     const void* restrict w,     x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,           x7
#     size_t cn_stride,           [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# params structure is 12 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    float scale;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } fp32_neonv8;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x15 v1
# A2  x13 v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6  v16 v20 v24 v28
# C1  x8  v17 v21 v25 v29
# C2  x9  v18 v22 v26 v30
# C3  x7  v19 v23 v27 v31
# zero_point v8 v12 v13 v14 v15
# unused v9 v10 v11

# Summary of the code below:
#   - mr selects how many of the 4 rows are distinct. For smaller mr the
#     A and C pointers of the extra rows alias the previous row.
#   - kc is rounded up to a multiple of 4 bytes. K is consumed 16 bytes per
#     main loop iteration, followed by optional 8 byte and 4 byte remainders.
#     After mr has been tested, x0 is reused as the K counter.
#   - Every pass through label 0 produces 16 output columns. nc is decremented
#     by 16 per pass, and the final 1 to 15 columns use the partial store
#     path at label 5.
#   - v12-v15 accumulate the dot product of each A row with the replicated
#     kernel_zero_point bytes. The row totals are subtracted from the int32
#     accumulators before requantization.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8,d12-d15 on stack
        STR     d8, [sp, -48]!
        CSEL    x15, x3, x15, LO        // a1 = a0
        CSEL    x8, x6, x8, LO          // c1 = c0
        BIC     x2, x2, 3

        STP     d12, d13, [sp, 16]
        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       // a2 = a1
        CSEL    x9, x8, x9, LS          // c2 = c1

        STP     d14, d15, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride

        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point, x11 now points at scale

        CSEL    x4, x13, x4, LO         // a3 = a2
        CSEL    x7, x9, x7, LO          // c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # Clear the per-row zero point accumulators
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes?
        B.LO    3f

        # Main loop - 16 bytes of A
        .p2align 3
1:
        LDR     q0, [x3], 16
        LDR     q4, [x5], 16
        LDR     q1, [x15], 16
        LDR     q2, [x13], 16
        LDR     q3, [x4], 16
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        // K bytes 0-3 of each A row
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]

        // K bytes 4-7 of each A row
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]

        // K bytes 8-11 of each A row
        UDOT    v16.4s, v4.16b, v0.4b[2]
        UDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[2]
        UDOT    v19.4s, v4.16b, v3.4b[2]
        UDOT    v20.4s, v5.16b, v0.4b[2]
        UDOT    v21.4s, v5.16b, v1.4b[2]
        UDOT    v22.4s, v5.16b, v2.4b[2]
        UDOT    v23.4s, v5.16b, v3.4b[2]
        UDOT    v24.4s, v6.16b, v0.4b[2]
        UDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[2]
        UDOT    v27.4s, v6.16b, v3.4b[2]
        UDOT    v28.4s, v7.16b, v0.4b[2]
        UDOT    v29.4s, v7.16b, v1.4b[2]
        UDOT    v30.4s, v7.16b, v2.4b[2]
        UDOT    v31.4s, v7.16b, v3.4b[2]

        // K bytes 12-15 of each A row
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        UDOT    v24.4s, v6.16b, v0.4b[3]
        UDOT    v25.4s, v6.16b, v1.4b[3]
        UDOT    v26.4s, v6.16b, v2.4b[3]
        UDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16
        UDOT    v28.4s, v7.16b, v0.4b[3]
        UDOT    v29.4s, v7.16b, v1.4b[3]
        UDOT    v30.4s, v7.16b, v2.4b[3]
        UDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    1b

        # Is there a remainder?- 4 to 12 bytes of A
        TST     x0, 15
        B.NE    3f

2:
        # Two pairwise adds leave the total of all 4 lanes in every lane
        ADDP    v0.4s, v12.4s, v12.4s
        ADDP    v1.4s, v13.4s, v13.4s
        ADDP    v2.4s, v14.4s, v14.4s
        ADDP    v3.4s, v15.4s, v15.4s
        ADDP    v12.4s, v0.4s, v0.4s
        ADDP    v13.4s, v1.4s, v1.4s
        ADDP    v14.4s, v2.4s, v2.4s
        ADDP    v15.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, output zero point and clamp
        LD1R    {v4.4s}, [x11], 4       // scale
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Convert back to int32, rounding to nearest with ties to even
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow to int16: columns 0-3 and 8-11 in the low halves
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point

        // Columns 4-7 and 12-15 in the high halves
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Add output zero point with saturation
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow from int16 to uint8, one vector of 16 columns per row
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1 bytes) back to scale

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1: loop while nc != 0

        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

        # Remainder- 8 bytes of A
        .p2align 3
3:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 4f

        // The d register loads clear the upper 64 bits of v0-v3, so the
        // zero point UDOT over all 16 bytes only sums the 8 loaded bytes.
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 2b

        # Remainder- 4 bytes of A
4:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        // Bits 3..0 of x1 select 8, 4, 2 and 1 column stores. After each
        // store the next unwritten columns are moved down to element 0.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif