// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused   v8 v9 v10 v11 v12 v13 v14 v15

// Overview (derived from the code below):
//   Computes a tile of up to 4 rows x 16 columns of int8 output per pass of
//   the nc loop. Accumulation is done in int32 with SDOT, 4 bytes of K per
//   lane. Only x0-x17, v0-v7 and v16-v31 are written, so nothing is saved
//   or restored on the stack.
//
//   Loop nest:
//     0: nc loop - one 16-column tile per pass, until nc is exhausted
//     1: ks loop - 4 A row pointers are fetched per pass, ks drops by 32
//     2: kc loop - 16 bytes of each A row per pass (256 bytes of w)
//     4: / 5: kc remainder of 8 and/or 4 bytes
//
//   Layout of w as consumed per 16-column tile:
//     64 bytes  initial accumulator values (16 x int32)
//     for every 4 bytes of K in every ks step: 64 bytes of int8 weights
//     64 bytes  per-channel scales (16 x float)
//
//   Layout of params as consumed: one 16-bit value added after scaling,
//   then one byte clamp minimum, then one byte clamp maximum.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp C pointers
        // Rows beyond mr alias the previous row, so the stores below stay
        // unconditional. The stack argument loads and the kc rounding are
        // interleaved with the pointer arithmetic.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // second half of the kc round-up

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // The same 16 int32 values seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A pointer equal to the zero buffer is used unchanged, any other
        // pointer gets a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop - 16 bytes of A
        // Four groups, one per 4-byte lane of v0-v3. Each group multiplies
        // that lane against 64 bytes of w (v4-v7) into all 16 accumulators.
        // Loads of the next w vectors are interleaved with the SDOTs.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]

        SDOT    v16.4s, v4.16b, v0.4b[2]
        SDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[2]
        SDOT    v19.4s, v4.16b, v3.4b[2]
        SDOT    v20.4s, v5.16b, v0.4b[2]
        SDOT    v21.4s, v5.16b, v1.4b[2]
        SDOT    v22.4s, v5.16b, v2.4b[2]
        SDOT    v23.4s, v5.16b, v3.4b[2]
        SDOT    v24.4s, v6.16b, v0.4b[2]
        SDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[2]
        SDOT    v27.4s, v6.16b, v3.4b[2]
        SDOT    v28.4s, v7.16b, v0.4b[2]
        SDOT    v29.4s, v7.16b, v1.4b[2]
        SDOT    v30.4s, v7.16b, v2.4b[2]
        SDOT    v31.4s, v7.16b, v3.4b[2]

        SDOT    v16.4s, v4.16b, v0.4b[3]
        SDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[3]
        SDOT    v19.4s, v4.16b, v3.4b[3]
        SDOT    v20.4s, v5.16b, v0.4b[3]
        SDOT    v21.4s, v5.16b, v1.4b[3]
        SDOT    v22.4s, v5.16b, v2.4b[3]
        SDOT    v23.4s, v5.16b, v3.4b[3]
        SDOT    v24.4s, v6.16b, v0.4b[3]
        SDOT    v25.4s, v6.16b, v1.4b[3]
        SDOT    v26.4s, v6.16b, v2.4b[3]
        SDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // flags are consumed by B.HS below
        SDOT    v28.4s, v7.16b, v0.4b[3]
        SDOT    v29.4s, v7.16b, v1.4b[3]
        SDOT    v30.4s, v7.16b, v2.4b[3]
        SDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    2b

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 has wrapped below zero here, but its low 4 bits still equal
        // the low 4 bits of kc. kc is a multiple of 4, so only bits 3 and 2
        // can be set.
        TST     x0, 15
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantization: int32 -> float, multiply by the per-channel
        // scale, convert back to int32 rounding to nearest (FCVTNS), then
        // narrow with saturation to int16 and int8.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // v4 is reused for the 4th scale vector
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Pack int32 -> int16: columns 0-7 of each row land in v16-v19,
        // columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // NOTE(review): this 16-bit value is added after scaling, so it is
        // presumably the output zero point - confirm against the params
        // struct definition.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Pack int16 -> int8: v0-v3 hold the 16 output bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // cn_stride

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 3             // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16, flags used by B.LO and B.HI
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        // Each row pointer advances by cn_stride. Neither ST1 nor SUB
        // changes the flags, so B.HI below still tests the SUBS on nc.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b
        RET

        # Remainder- 8 bytes of A
        .p2align 3
4:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 5f

        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 3b

        # Remainder- 4 bytes of A
5:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Store odd width
        // x1 holds nc - 16 here. Its low 4 bits equal those of the
        // remaining column count, so bits 3..0 select stores of 8, 4, 2
        // and 1 bytes. After each partial store the next unwritten bytes
        // are moved down to the bottom of v0-v3.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif