// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# Quantized GEMM micro-kernel producing up to 4 rows x 16 columns of output
# per pass, consuming K in groups of 4 bytes with UDOT.  A and C bytes are
# handled as unsigned values throughout (UDOT, SQXTUN, UMAX/UMIN).
#
# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# The fields are read in this order through x11 with post-incrementing
# LD1R loads; x11 is rewound after each pass so the next column block
# starts again at right_pre_shift.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x15 v1
# A2  x13 v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6  v16 v20 v24 v28
# C1  x8  v17 v21 v25 v29
# C2  x9  v18 v22 v26 v30
# C3  x7  v19 v23 v27 v31
# zero_point v8 v12 v13 v14 v15
# unused v9 v10 v11
#
# v8 holds kernel_zero_point replicated into every 32-bit lane.
# v12-v15 accumulate, per row, the dot product of kernel_zero_point with the
# A bytes; the reduced sum is subtracted from that row of accumulators.
# x0 carries mr only until the pointer clamping is done and is then reused
# as the K loop counter.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        # Rows beyond mr alias the previous row, so loads and stores stay
        # inside the caller buffers.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params (read before sp is moved below)
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8,d12-d15 on stack
        STR     d8, [sp, -48]!          // pre-decrement sp by 48, d8 at [sp]
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4

        STP     d12, d13, [sp, 16]
        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        STP     d14, d15, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride

        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point (4 bytes replicated to all lanes)

        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        # Outer loop: one pass per block of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # Clear the per-row zero point accumulators
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        # Copy the row 0 bias into rows 1-3
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes?
        B.LO    3f

        # Main loop - 16 bytes of A
        # Each iteration consumes 16 bytes from every A row and 256 bytes
        # of B (4 groups of K x 16 columns x 4 bytes).
        .p2align 3
1:
        LDR     q0, [x3], 16
        LDR     q4, [x5], 16
        LDR     q1, [x15], 16
        LDR     q2, [x13], 16
        LDR     q3, [x4], 16
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        # K group 0 (A bytes 0-3)
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]

        # K group 1 (A bytes 4-7)
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]

        # K group 2 (A bytes 8-11)
        UDOT    v16.4s, v4.16b, v0.4b[2]
        UDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[2]
        UDOT    v19.4s, v4.16b, v3.4b[2]
        UDOT    v20.4s, v5.16b, v0.4b[2]
        UDOT    v21.4s, v5.16b, v1.4b[2]
        UDOT    v22.4s, v5.16b, v2.4b[2]
        UDOT    v23.4s, v5.16b, v3.4b[2]
        UDOT    v24.4s, v6.16b, v0.4b[2]
        UDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[2]
        UDOT    v27.4s, v6.16b, v3.4b[2]
        UDOT    v28.4s, v7.16b, v0.4b[2]
        UDOT    v29.4s, v7.16b, v1.4b[2]
        UDOT    v30.4s, v7.16b, v2.4b[2]
        UDOT    v31.4s, v7.16b, v3.4b[2]

        # K group 3 (A bytes 12-15)
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        UDOT    v24.4s, v6.16b, v0.4b[3]
        UDOT    v25.4s, v6.16b, v1.4b[3]
        UDOT    v26.4s, v6.16b, v2.4b[3]
        UDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        UDOT    v28.4s, v7.16b, v0.4b[3]
        UDOT    v29.4s, v7.16b, v1.4b[3]
        UDOT    v30.4s, v7.16b, v2.4b[3]
        UDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    1b

        # Is there a remainder?- 4 to 12 bytes of A
        # The low 4 bits of x0 hold the leftover K byte count here.
        TST     x0, 15
        B.NE    3f

2:
        # Reduce each zero point accumulator so that every lane holds the
        # sum of all four lanes (two pairwise adds).
        ADDP    v0.4s, v12.4s, v12.4s
        ADDP    v1.4s, v13.4s, v13.4s
        ADDP    v2.4s, v14.4s, v14.4s
        ADDP    v3.4s, v15.4s, v15.4s
        ADDP    v12.4s, v0.4s, v0.4s
        ADDP    v13.4s, v1.4s, v1.4s
        ADDP    v14.4s, v2.4s, v2.4s
        ADDP    v15.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SSHL    v24.4s, v24.4s, v4.4s
        SSHL    v25.4s, v25.4s, v4.4s
        SSHL    v26.4s, v26.4s, v4.4s
        SSHL    v27.4s, v27.4s, v4.4s
        SSHL    v28.4s, v28.4s, v4.4s
        SSHL    v29.4s, v29.4s, v4.4s
        SSHL    v30.4s, v30.4s, v4.4s
        SSHL    v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        # Narrow 32-bit results to 16 bits with signed saturation.
        # v16-v19 receive columns 0-7, v24-v27 receive columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point, added below

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow signed 16-bit values to unsigned bytes with saturation;
        # v0-v3 end up holding the 16 output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        SUB     x11, x11, 15            // rewind params pointer (4 + 4 + 4 + 2 + 1 bytes)

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f

        # Store full 4 x 16
        # C pointers advance by cn_stride; A pointers are rewound by the
        # rounded kc so the next column block rereads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b

        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

        # Remainder- 8 bytes of A
        .p2align 3
3:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 4f

        # The d-register loads clear the upper halves of v0-v3, so the
        # full-width zero point UDOTs below only see the 8 loaded bytes.
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 2b

        # Remainder- 4 bytes of A
4:
        # The s-register loads clear the upper 12 bytes of v0-v3.
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b  // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        # Fewer than 16 columns remain.  Bits 3..0 of x1 select stores of
        # 8, 4, 2 and 1 bytes per row; after each partial store the
        # remaining bytes are moved down to the bottom of v0-v3.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif