// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Summary (derived from the instructions below):
//   The kernel works on 4 row pointers of A/C and on groups of 16 output
//   columns.  Row pointers for rows >= mr are set equal to the previous row,
//   so the code always processes 4 rows.  For every column group it
//     1. loads 16 int32 values from w and copies them into the accumulators
//        of all 4 rows,
//     2. accumulates with SDOT over kc bytes of A (kc rounded up to a
//        multiple of 4) against 16-byte vectors read sequentially from w,
//     3. converts the accumulators to float, multiplies them by 16 float
//        scales read from w, and converts back with FCVTNS
//        (round to nearest, ties to even),
//     4. saturating-narrows to 16 bits, saturating-adds the 16-bit value at
//        params+0, saturating-narrows to 8 bits, then clamps with the bytes
//        at params+2 (SMAX) and params+3 (SMIN),
//     5. stores 16 bytes per row and advances each C pointer by cn_stride,
//        or stores only the remaining nc (< 16) bytes for the last group.
//   Only x0-x9, x11-x13, x15, v0-v7 and v16-v31 are written, so nothing is
//   spilled to the stack.

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# params structure is 4 bytes
#  struct {
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } neon;
// NOTE(review): the clamp below uses the signed SMAX/SMIN instructions, so
// output_min/output_max are compared as signed bytes here even though the
// layout comment above says uint8_t - confirm against the C declaration.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x15 v1
# A2  x13 v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6  v16 v20 v24 v28
# C1  x8  v17 v21 v25 v29
# C2  x9  v18 v22 v26 v30
# C3  x7  v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
// Within a row the accumulators are packed for output in the order
// v16, v20, v24, v28 (4 int32 lanes each, 16 columns in total); the same
// order applies to rows 1-3.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round-up of kc started by the ADD above

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2: ADD/CSEL/BIC leave them untouched)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride (x4 no longer holds a_stride)
        ADD     x7, x9, x7              // c3 = c2 + cm_stride (x7 no longer holds cm_stride)
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 4 vectors (64 bytes) are read from w; each one is replicated to the
        // accumulators of rows 1-3 with MOV.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes?
        B.LO    3f

        # Main loop - 16 bytes of A
        // Each iteration reads 16 bytes from every A row and 16 vectors
        // (256 bytes) from w.  Lane index [i] of v0-v3 selects the i-th group
        // of 4 consecutive A bytes; v4-v7 hold the w vectors for that group.
        // Loads of the next w vectors are interleaved with the SDOTs.
        .p2align 3
1:
        LDR     q0, [x3], 16
        LDR     q4, [x5], 16
        LDR     q1, [x15], 16
        LDR     q2, [x13], 16
        LDR     q3, [x4], 16
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]

        SDOT    v16.4s, v4.16b, v0.4b[2]
        SDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[2]
        SDOT    v19.4s, v4.16b, v3.4b[2]
        SDOT    v20.4s, v5.16b, v0.4b[2]
        SDOT    v21.4s, v5.16b, v1.4b[2]
        SDOT    v22.4s, v5.16b, v2.4b[2]
        SDOT    v23.4s, v5.16b, v3.4b[2]
        SDOT    v24.4s, v6.16b, v0.4b[2]
        SDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[2]
        SDOT    v27.4s, v6.16b, v3.4b[2]
        SDOT    v28.4s, v7.16b, v0.4b[2]
        SDOT    v29.4s, v7.16b, v1.4b[2]
        SDOT    v30.4s, v7.16b, v2.4b[2]
        SDOT    v31.4s, v7.16b, v3.4b[2]

        SDOT    v16.4s, v4.16b, v0.4b[3]
        SDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[3]
        SDOT    v19.4s, v4.16b, v3.4b[3]
        SDOT    v20.4s, v5.16b, v0.4b[3]
        SDOT    v21.4s, v5.16b, v1.4b[3]
        SDOT    v22.4s, v5.16b, v2.4b[3]
        SDOT    v23.4s, v5.16b, v3.4b[3]
        SDOT    v24.4s, v6.16b, v0.4b[3]
        SDOT    v25.4s, v6.16b, v1.4b[3]
        SDOT    v26.4s, v6.16b, v2.4b[3]
        SDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; sets the flags consumed by B.HS below (SDOT does not alter them)
        SDOT    v28.4s, v7.16b, v0.4b[3]
        SDOT    v29.4s, v7.16b, v1.4b[3]
        SDOT    v30.4s, v7.16b, v2.4b[3]
        SDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    1b                      // loop while the subtraction did not borrow

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 is now k - 16 (negative); its low 4 bits equal the number of
        // A bytes still to process.
        TST     x0, 15
        B.NE    3f

2:
        // Requantization: int32 -> float, multiply by scale, float -> int32.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales applied to v16-v19
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales applied to v20-v23
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales applied to v24-v27
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // v4 is reused: scales applied to v28-v31
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Float -> signed int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16.  Low halves first ...
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // broadcast the 16-bit value at params+0 (output_zero_point per the layout above)

        // ... then high halves, so v16-v19 hold columns 0-7 and v24-v27
        // hold columns 8-15 of rows 0-3.
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the broadcast 16-bit value.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8; v0-v3 receive the 16 output bytes
        // of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer (undo the +2 and +1 post-increments)

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are used by B.LO below and by B.NE after the stores
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left: partial store

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12     // c0 += cn_stride after the store
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1 above (ST1/SUB leave them untouched): repeat while nc != 0
        RET

        # Remainder- 4 to 12 bytes of A
        .p2align 3
3:
        # Is there a remainder?- 8 bytes of A
        // Bit 3 of x0 (k - 16) is set when 8 or 12 bytes remain.
        TBZ     x0, 3, 4f

        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        // Bit 2 of x0 is set when a further 4 bytes remain; otherwise go
        // straight to requantization.
        TBZ     x0, 2, 2b

        # Remainder- 4 bytes of A
4:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        SDOT    v16.4s, v4.16b, v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b, v1.4b[0]
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        // Reached only when fewer than 16 columns were left, so x1 (nc - 16)
        // is negative and its low 4 bits equal the remaining column count.
        // Bits 3..0 select stores of 8, 4, 2 and 1 bytes per row; after each
        // store the not-yet-written bytes are moved down to lane 0 with DUP.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif