// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Signed 8-bit GEMM microkernel. Each pass over the code below produces a tile
// of up to 4 rows x 16 columns of int8 output: SDOT accumulates dot products
// of 4 signed bytes into 32-bit lanes (v16-v31), the accumulators are then
// requantized to int8 and stored.
//
// Layout of the code:
//   setup      round kc up to a multiple of 4 and derive the row pointers
//              a1..a3 / c1..c3 (a row index at or beyond mr reuses the
//              pointers of the previous row)
//   label 0    start of a 16-column block: load 16 x 32-bit initial
//              accumulator values from w and replicate them to all 4 rows
//   label 1    main loop: 16 bytes of every A row and 256 bytes of w per pass
//   label 3/4  tails of 8 and 4 bytes of A
//   label 2    requantize (shift, multiply, shift, add zero point, clamp)
//              and store a full 4 x 16 tile
//   label 5-9  partial store when fewer than 16 columns remain
//
// The only registers written are x0-x9, x11-x13, x15, v0-v7 and v16-v31, and
// nothing is stored to the stack.

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
//
// Notes on the table above:
//   kc "x2 / x0": x2 holds kc (rounded up to a multiple of 4) for the whole
//   call; x0 is reused as the running byte counter k once mr has been consumed
//   by the pointer setup.
//   params: the value loaded into x11 is used as an address; the fields are
//   read from memory through it with LD1R.

# params structure is 16 bytes
#  struct {
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    int8_t output_min;
#    int8_t output_max;
#  } rndnu_neon;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
//
// Accumulator naming: for row r, v(16+r) holds columns 0-3, v(20+r) columns
// 4-7, v(24+r) columns 8-11 and v(28+r) columns 12-15 (see the narrowing
// sequence at label 2, which packs them in that order).

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        CMP x0, 2                       // if mr < 2
        ADD x2, x2, 3                   // kc = (kc + 3) & ~3
        ADD x15, x3, x4                 // a1 = a0 + a_stride
        ADD x8, x6, x7                  // c1 = c0 + cm_stride
        CSEL x15, x3, x15, LO           //   a1 = a0
        CSEL x8, x6, x8, LO             //   c1 = c0
        BIC x2, x2, 3                   // second half of the kc round-up

        ADD x13, x15, x4                // a2 = a1 + a_stride
        ADD x9, x8, x7                  // c2 = c1 + cm_stride
                                        // if mr <= 2
        // ADD, CSEL and BIC do not write the flags, so LS below still refers
        // to the CMP x0, 2 at the top.
        CSEL x13, x15, x13, LS          //   a2 = a1
        CSEL x9, x8, x9, LS             //   c2 = c1

        // The 9th and 10th arguments are read from the stack.
        LDP x12, x11, [sp]              // cn_stride, params

        CMP x0, 4                       // if mr < 4
        ADD x4, x13, x4                 // a3 = a2 + a_stride (x4 no longer holds a_stride)
        ADD x7, x9, x7                  // c3 = c2 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x13, x4, LO            //   a3 = a2
        CSEL x7, x9, x7, LO             //   c3 = c2

        .p2align 3
0:
        // Start of one 16-column block. w (x5) is consumed sequentially:
        // first 64 bytes of initial accumulator values, then the B data.
        # Load initial bias from w into accumulators
        LDP q16, q20, [x5], 32          // row 0, columns 0-3 and 4-7
        MOV v17.16b, v16.16b
        MOV v18.16b, v16.16b
        LDP q24, q28, [x5], 32          // row 0, columns 8-11 and 12-15
        MOV v19.16b, v16.16b
        MOV v21.16b, v20.16b
        MOV v22.16b, v20.16b
        MOV v23.16b, v20.16b
        MOV v25.16b, v24.16b
        MOV v26.16b, v24.16b
        // x0 becomes the byte counter; the flags set here are consumed by the
        // B.LO below (MOV does not write the flags).
        SUBS x0, x2, 16                 // k = kc - 16
        MOV v27.16b, v24.16b
        MOV v29.16b, v28.16b
        MOV v30.16b, v28.16b
        MOV v31.16b, v28.16b
        # Is there at least 16 bytes?
        B.LO 3f

        # Main loop - 16 bytes of A
        // Each pass loads 16 bytes from every A row (v0-v3) and 256 bytes of
        // B (four groups of 64 bytes in v4-v7). Group i is multiplied against
        // 4-byte element [i] of each A register. B loads for the next group
        // are interleaved with the SDOTs of the current one.
        .p2align 3
1:
        LDR q0, [x3], 16
        LDR q4, [x5], 16
        LDR q1, [x15], 16
        LDR q2, [x13], 16
        LDR q3, [x4], 16
        LDR q5, [x5], 16
        SDOT v16.4s, v4.16b, v0.4b[0]
        SDOT v17.4s, v4.16b, v1.4b[0]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[0]
        SDOT v19.4s, v4.16b, v3.4b[0]
        SDOT v20.4s, v5.16b, v0.4b[0]
        SDOT v21.4s, v5.16b, v1.4b[0]
        SDOT v22.4s, v5.16b, v2.4b[0]
        SDOT v23.4s, v5.16b, v3.4b[0]
        SDOT v24.4s, v6.16b, v0.4b[0]
        SDOT v25.4s, v6.16b, v1.4b[0]
        LDP q4, q5, [x5], 32
        SDOT v26.4s, v6.16b, v2.4b[0]
        SDOT v27.4s, v6.16b, v3.4b[0]
        SDOT v28.4s, v7.16b, v0.4b[0]
        SDOT v29.4s, v7.16b, v1.4b[0]
        SDOT v30.4s, v7.16b, v2.4b[0]
        SDOT v31.4s, v7.16b, v3.4b[0]

        SDOT v16.4s, v4.16b, v0.4b[1]
        SDOT v17.4s, v4.16b, v1.4b[1]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[1]
        SDOT v19.4s, v4.16b, v3.4b[1]
        SDOT v20.4s, v5.16b, v0.4b[1]
        SDOT v21.4s, v5.16b, v1.4b[1]
        SDOT v22.4s, v5.16b, v2.4b[1]
        SDOT v23.4s, v5.16b, v3.4b[1]
        SDOT v24.4s, v6.16b, v0.4b[1]
        SDOT v25.4s, v6.16b, v1.4b[1]
        LDP q4, q5, [x5], 32
        SDOT v26.4s, v6.16b, v2.4b[1]
        SDOT v27.4s, v6.16b, v3.4b[1]
        SDOT v28.4s, v7.16b, v0.4b[1]
        SDOT v29.4s, v7.16b, v1.4b[1]
        SDOT v30.4s, v7.16b, v2.4b[1]
        SDOT v31.4s, v7.16b, v3.4b[1]

        SDOT v16.4s, v4.16b, v0.4b[2]
        SDOT v17.4s, v4.16b, v1.4b[2]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[2]
        SDOT v19.4s, v4.16b, v3.4b[2]
        SDOT v20.4s, v5.16b, v0.4b[2]
        SDOT v21.4s, v5.16b, v1.4b[2]
        SDOT v22.4s, v5.16b, v2.4b[2]
        SDOT v23.4s, v5.16b, v3.4b[2]
        SDOT v24.4s, v6.16b, v0.4b[2]
        SDOT v25.4s, v6.16b, v1.4b[2]
        LDP q4, q5, [x5], 32
        SDOT v26.4s, v6.16b, v2.4b[2]
        SDOT v27.4s, v6.16b, v3.4b[2]
        SDOT v28.4s, v7.16b, v0.4b[2]
        SDOT v29.4s, v7.16b, v1.4b[2]
        SDOT v30.4s, v7.16b, v2.4b[2]
        SDOT v31.4s, v7.16b, v3.4b[2]

        SDOT v16.4s, v4.16b, v0.4b[3]
        SDOT v17.4s, v4.16b, v1.4b[3]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[3]
        SDOT v19.4s, v4.16b, v3.4b[3]
        SDOT v20.4s, v5.16b, v0.4b[3]
        SDOT v21.4s, v5.16b, v1.4b[3]
        SDOT v22.4s, v5.16b, v2.4b[3]
        SDOT v23.4s, v5.16b, v3.4b[3]
        SDOT v24.4s, v6.16b, v0.4b[3]
        SDOT v25.4s, v6.16b, v1.4b[3]
        SDOT v26.4s, v6.16b, v2.4b[3]
        SDOT v27.4s, v6.16b, v3.4b[3]
        // k -= 16, issued early; the SDOTs that follow do not write the
        // flags, so B.HS below tests this subtraction.
        SUBS x0, x0, 16
        SDOT v28.4s, v7.16b, v0.4b[3]
        SDOT v29.4s, v7.16b, v1.4b[3]
        SDOT v30.4s, v7.16b, v2.4b[3]
        SDOT v31.4s, v7.16b, v3.4b[3]
        B.HS 1b                         // loop while at least 16 more bytes remain

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 has wrapped below zero here, but only multiples of 16 were
        // subtracted from kc, so its low 4 bits still equal kc & 15. kc is a
        // multiple of 4, so that is 0, 4, 8 or 12.
        TST x0, 15
        B.NE 3f

2:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks through the params structure with post-incremented
        // broadcast loads: +4, +4, +4, +2, +1 and a final load without
        // increment. Field names below follow the struct layout documented at
        // the top of the file.
        LD1R {v4.4s}, [x11], 4          // offset 0: right_pre_shift
        SQSHL v16.4s, v16.4s, v4.4s     // shift to upper bits
        SQSHL v17.4s, v17.4s, v4.4s
        SQSHL v18.4s, v18.4s, v4.4s
        SQSHL v19.4s, v19.4s, v4.4s
        SQSHL v20.4s, v20.4s, v4.4s
        SQSHL v21.4s, v21.4s, v4.4s
        SQSHL v22.4s, v22.4s, v4.4s
        SQSHL v23.4s, v23.4s, v4.4s
        LD1R {v5.4s}, [x11], 4          // offset 4: multiplier
        SQSHL v24.4s, v24.4s, v4.4s
        SQSHL v25.4s, v25.4s, v4.4s
        SQSHL v26.4s, v26.4s, v4.4s
        SQSHL v27.4s, v27.4s, v4.4s
        SQSHL v28.4s, v28.4s, v4.4s
        SQSHL v29.4s, v29.4s, v4.4s
        SQSHL v30.4s, v30.4s, v4.4s
        SQSHL v31.4s, v31.4s, v4.4s
        LD1R {v6.4s}, [x11], 4          // offset 8: right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL v16.4s, v16.4s, v6.4s     // signed rounding shift left
        SRSHL v17.4s, v17.4s, v6.4s
        SRSHL v18.4s, v18.4s, v6.4s
        SRSHL v19.4s, v19.4s, v6.4s
        SRSHL v20.4s, v20.4s, v6.4s
        SRSHL v21.4s, v21.4s, v6.4s
        SRSHL v22.4s, v22.4s, v6.4s
        SRSHL v23.4s, v23.4s, v6.4s
        SRSHL v24.4s, v24.4s, v6.4s
        SRSHL v25.4s, v25.4s, v6.4s
        SRSHL v26.4s, v26.4s, v6.4s
        SRSHL v27.4s, v27.4s, v6.4s
        SRSHL v28.4s, v28.4s, v6.4s
        SRSHL v29.4s, v29.4s, v6.4s
        SRSHL v30.4s, v30.4s, v6.4s
        SRSHL v31.4s, v31.4s, v6.4s

        // Saturating narrow 32 -> 16 bit. After the SQXTN2 group below,
        // v16-v19 hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN v16.4h, v16.4s
        SQXTN v17.4h, v17.4s
        SQXTN v18.4h, v18.4s
        SQXTN v19.4h, v19.4s
        SQXTN v24.4h, v24.4s
        SQXTN v25.4h, v25.4s
        SQXTN v26.4h, v26.4s
        SQXTN v27.4h, v27.4s
        // v6 is reused: the post-shift value is no longer needed.
        LD1R {v6.8h}, [x11], 2          // offset 12: output_zero_point, added below

        SQXTN2 v16.8h, v20.4s
        SQXTN2 v17.8h, v21.4s
        SQXTN2 v18.8h, v22.4s
        SQXTN2 v19.8h, v23.4s
        SQXTN2 v24.8h, v28.4s
        SQXTN2 v25.8h, v29.4s
        SQXTN2 v26.8h, v30.4s
        SQXTN2 v27.8h, v31.4s

        SQADD v16.8h, v16.8h, v6.8h
        SQADD v17.8h, v17.8h, v6.8h
        SQADD v18.8h, v18.8h, v6.8h
        SQADD v19.8h, v19.8h, v6.8h
        SQADD v24.8h, v24.8h, v6.8h
        SQADD v25.8h, v25.8h, v6.8h
        SQADD v26.8h, v26.8h, v6.8h
        SQADD v27.8h, v27.8h, v6.8h
        LD1R {v4.16b}, [x11], 1         // clamp min value

        // Saturating narrow 16 -> 8 bit: v0-v3 end up holding the 16 output
        // bytes of rows 0-3.
        SQXTN v0.8b, v16.8h
        SQXTN v1.8b, v17.8h
        SQXTN v2.8b, v18.8h
        SQXTN v3.8b, v19.8h
        LD1R {v5.16b}, [x11]            // clamp max value
        SQXTN2 v0.16b, v24.8h
        SQXTN2 v1.16b, v25.8h
        SQXTN2 v2.16b, v26.8h
        SQXTN2 v3.16b, v27.8h
        // 15 = 4 + 4 + 4 + 2 + 1, the sum of the post-increments above.
        SUB x11, x11, 15                // rewind params pointer

        SMAX v0.16b, v0.16b, v4.16b
        SMAX v1.16b, v1.16b, v4.16b
        SMAX v2.16b, v2.16b, v4.16b
        SMAX v3.16b, v3.16b, v4.16b
        // nc -= 16. These flags feed both B.LO 5f (fewer than 16 columns were
        // left) and B.NE 0b further down; SMIN, ST1 and SUB in between do not
        // write the flags.
        SUBS x1, x1, 16
        SMIN v0.16b, v0.16b, v5.16b
        SMIN v1.16b, v1.16b, v5.16b
        SMIN v2.16b, v2.16b, v5.16b
        SMIN v3.16b, v3.16b, v5.16b
        B.LO 5f

        # Store full 4 x 16
        // Each store advances its C pointer by cn_stride (x12); each A pointer
        // is moved back by the rounded kc so the next column block rereads the
        // same A rows.
        ST1 {v0.16b}, [x6], x12
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v1.16b}, [x8], x12
        SUB x15, x15, x2                // a1 -= kc
        ST1 {v2.16b}, [x9], x12
        SUB x13, x13, x2                // a2 -= kc
        ST1 {v3.16b}, [x7], x12
        SUB x4, x4, x2                  // a3 -= kc
        B.NE 0b                         // more columns left (nc != 0 after the subtraction)
        RET

        # Remainder- 8 bytes of A
        .p2align 3
3:
        # Is there a remainder?- 8 bytes of A
        // Bit 3 of x0 is set when 8 or 12 bytes are left (see the note on the
        // low bits of x0 before label 2).
        TBZ x0, 3, 4f

        // 8 bytes of every A row, 128 bytes of B.
        LDR d0, [x3], 8
        LDR q4, [x5], 16
        LDR d1, [x15], 8
        LDR d2, [x13], 8
        LDR d3, [x4], 8
        LDR q5, [x5], 16
        SDOT v16.4s, v4.16b, v0.4b[0]
        SDOT v17.4s, v4.16b, v1.4b[0]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[0]
        SDOT v19.4s, v4.16b, v3.4b[0]
        SDOT v20.4s, v5.16b, v0.4b[0]
        SDOT v21.4s, v5.16b, v1.4b[0]
        SDOT v22.4s, v5.16b, v2.4b[0]
        SDOT v23.4s, v5.16b, v3.4b[0]
        SDOT v24.4s, v6.16b, v0.4b[0]
        SDOT v25.4s, v6.16b, v1.4b[0]
        LDP q4, q5, [x5], 32
        SDOT v26.4s, v6.16b, v2.4b[0]
        SDOT v27.4s, v6.16b, v3.4b[0]
        SDOT v28.4s, v7.16b, v0.4b[0]
        SDOT v29.4s, v7.16b, v1.4b[0]
        SDOT v30.4s, v7.16b, v2.4b[0]
        SDOT v31.4s, v7.16b, v3.4b[0]
        SDOT v16.4s, v4.16b, v0.4b[1]
        SDOT v17.4s, v4.16b, v1.4b[1]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[1]
        SDOT v19.4s, v4.16b, v3.4b[1]
        SDOT v20.4s, v5.16b, v0.4b[1]
        SDOT v21.4s, v5.16b, v1.4b[1]
        SDOT v22.4s, v5.16b, v2.4b[1]
        SDOT v23.4s, v5.16b, v3.4b[1]
        SDOT v24.4s, v6.16b, v0.4b[1]
        SDOT v25.4s, v6.16b, v1.4b[1]
        SDOT v26.4s, v6.16b, v2.4b[1]
        SDOT v27.4s, v6.16b, v3.4b[1]
        SDOT v28.4s, v7.16b, v0.4b[1]
        SDOT v29.4s, v7.16b, v1.4b[1]
        SDOT v30.4s, v7.16b, v2.4b[1]
        SDOT v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        // Bit 2 clear: nothing further to accumulate, go requantize.
        TBZ x0, 2, 2b

        # Remainder- 4 bytes of A
        // 4 bytes of every A row, 64 bytes of B.
4:
        LDR s0, [x3], 4
        LDR q4, [x5], 16
        LDR s1, [x15], 4
        LDR s2, [x13], 4
        LDR s3, [x4], 4
        SDOT v16.4s, v4.16b, v0.4b[0]
        LDR q5, [x5], 16
        SDOT v17.4s, v4.16b, v1.4b[0]
        SDOT v18.4s, v4.16b, v2.4b[0]
        SDOT v19.4s, v4.16b, v3.4b[0]
        SDOT v20.4s, v5.16b, v0.4b[0]
        LDP q6, q7, [x5], 32
        SDOT v21.4s, v5.16b, v1.4b[0]
        SDOT v22.4s, v5.16b, v2.4b[0]
        SDOT v23.4s, v5.16b, v3.4b[0]
        SDOT v24.4s, v6.16b, v0.4b[0]
        SDOT v25.4s, v6.16b, v1.4b[0]
        SDOT v26.4s, v6.16b, v2.4b[0]
        SDOT v27.4s, v6.16b, v3.4b[0]
        SDOT v28.4s, v7.16b, v0.4b[0]
        SDOT v29.4s, v7.16b, v1.4b[0]
        SDOT v30.4s, v7.16b, v2.4b[0]
        SDOT v31.4s, v7.16b, v3.4b[0]
        B 2b

        # Store odd width
        // Reached when fewer than 16 columns were left. x1 has wrapped below
        // zero after the subtraction of 16, but its low 4 bits still equal the
        // number of columns left. Bits 3..0 select stores of 8, 4, 2 and 1
        // bytes per row; after each store the not-yet-written upper part of
        // v0-v3 is moved down to element 0. This path always ends in RET.
        .p2align 3
5:
        TBZ x1, 3, 6f
        STR d0, [x6], 8
        STR d1, [x8], 8
        DUP d0, v0.d[1]
        DUP d1, v1.d[1]
        STR d2, [x9], 8
        STR d3, [x7], 8
        DUP d2, v2.d[1]
        DUP d3, v3.d[1]
6:
        TBZ x1, 2, 7f
        STR s0, [x6], 4
        STR s1, [x8], 4
        DUP s0, v0.s[1]
        DUP s1, v1.s[1]
        STR s2, [x9], 4
        STR s3, [x7], 4
        DUP s2, v2.s[1]
        DUP s3, v3.s[1]
7:
        TBZ x1, 1, 8f
        STR h0, [x6], 2
        STR h1, [x8], 2
        DUP h0, v0.h[1]
        DUP h1, v1.h[1]
        STR h2, [x9], 2
        STR h3, [x7], 2
        DUP h2, v2.h[1]
        DUP h3, v3.h[1]
8:
        TBZ x1, 0, 9f
        STR b0, [x6]
        STR b1, [x8]
        STR b2, [x9]
        STR b3, [x7]
9:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif