// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params  [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v6
# A1  x15  v1  v7
# B    x5  v4  v5  v8  v9
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1   x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data
#
# B vectors and A vectors are double-buffered (v4/v5 vs v8/v9, v0/v1 vs v6/v7):
# 64-bit B/A data is staged through GPRs (LDR x16/x17 + INS) because on
# Cortex-A53 integer loads dual-issue with NEON arithmetic, while NEON loads
# do not.

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

        # Clamp C pointers
        LDP x10, x8, [sp]              // Load cn_stride, a_offset
        CMP x0, 2                      // if mr < 2
        LDP x12, x11, [sp, 16]         // Load zero, params pointer
        ADD x7, x6, x7                 // c1 = c0 + cm_stride
        STP d8, d9, [sp, -80]!         // allocate 80-byte frame, save d8-d9
        ADD x2, x2, 7                  // kc = (kc + 7) & ~7
        STP d10, d11, [sp, 16]
        CSEL x7, x6, x7, LO            //   c1 = c0 when mr < 2 (row 1 aliases row 0)
        STP d12, d13, [sp, 32]
        BIC x2, x2, 7                  // second half of kc round-up to multiple of 8
        STP d14, d15, [sp, 48]
        STP x20, x21, [sp, 64]         // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each pair of rows shares one bias lane; SADALP pairwise-accumulation
        # later folds the per-lane partial sums back down to 8 channels.
        LDP s16, s18, [x5], 8
        MOV v17.16b, v16.16b
        MOV v19.16b, v18.16b
        LDP s20, s22, [x5], 8
        MOV v21.16b, v20.16b
        MOV v23.16b, v22.16b
        LDP s24, s26, [x5], 8
        MOV v25.16b, v24.16b
        MOV v27.16b, v26.16b
        LDP s28, s30, [x5], 8
        MOV v29.16b, v28.16b
        MOV v31.16b, v30.16b
        MOV x9, x3                     // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        LDP x13, x15, [x4], 16
        CMP x13, x12                   // if a0 == zero
        ADD x13, x13, x8               // a0 += a_offset
        CSEL x13, x12, x13, EQ         //   a0 = zero, else += a0 + a_offset
        CMP x15, x12                   // if a1 == zero
        ADD x15, x15, x8               // a1 += a_offset
        CSEL x15, x12, x15, EQ         //   a1 = zero, else += a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS x0, x2, 16                // k = kc - 16
        B.LO 5f

        # Prologue: load A0, A1 and 2 B's
        LDP d4, d5, [x5]               // Read B
        LDP d0, d6, [x13], 16
        LDP d1, d7, [x15], 16
//      LDP d8, d9, [x5, 64]
        LDR x17, [x5, 64]              // Read B
        LDR x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS x0, x0, 16                // k = k - 16
        B.LO 3f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/sadalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        # NOTE(review): template comment said "includes prfm" but no PRFM is
        # emitted in this generated variant.
        LDR d9, [x5, 72]               // Read B
        INS v8.d[0], x17
        SMULL v2.8h, v4.8b, v0.8b
        SMULL v3.8h, v4.8b, v1.8b
        LDR x17, [x5, 80]
        SMULL v10.8h, v5.8b, v0.8b
        SMULL v11.8h, v5.8b, v1.8b
        LDR d5, [x5, 24]
        INS v4.d[0], x16
        SMLAL v2.8h, v8.8b, v6.8b
        SMLAL v3.8h, v8.8b, v7.8b
        LDR x16, [x5, 32]
        SMLAL v10.8h, v9.8b, v6.8b
        SMLAL v11.8h, v9.8b, v7.8b
        SADALP v16.4s, v2.8h
        SADALP v17.4s, v3.8h
        SADALP v18.4s, v10.8h
        SADALP v19.4s, v11.8h

        # BLOCK 1 - 18 cycles
        LDR d9, [x5, 88]
        INS v8.d[0], x17
        SMULL v12.8h, v4.8b, v0.8b
        SMULL v13.8h, v4.8b, v1.8b
        LDR x17, [x5, 96]
        SMULL v14.8h, v5.8b, v0.8b
        SMULL v15.8h, v5.8b, v1.8b
        LDR d5, [x5, 40]
        INS v4.d[0], x16
        SMLAL v12.8h, v8.8b, v6.8b
        SMLAL v13.8h, v8.8b, v7.8b
        LDR x16, [x5, 48]
        SMLAL v14.8h, v9.8b, v6.8b
        SMLAL v15.8h, v9.8b, v7.8b
        SADALP v20.4s, v12.8h
        SADALP v21.4s, v13.8h
        SADALP v22.4s, v14.8h
        SADALP v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # B offsets >= 128 below pre-read the next iteration's data; x5 is
        # only advanced by 128 at the end of BLOCK 3.
        LDR d9, [x5, 104]
        INS v8.d[0], x17
        SMULL v2.8h, v4.8b, v0.8b
        SMULL v3.8h, v4.8b, v1.8b
        LDR x17, [x5, 112]
        SMULL v10.8h, v5.8b, v0.8b
        SMULL v11.8h, v5.8b, v1.8b
        LDR d5, [x5, 56]
        INS v4.d[0], x16
        SMLAL v2.8h, v8.8b, v6.8b
        SMLAL v3.8h, v8.8b, v7.8b
        LDR x16, [x5, 128]
        SMLAL v10.8h, v9.8b, v6.8b
        SMLAL v11.8h, v9.8b, v7.8b
        SADALP v24.4s, v2.8h
        LDR x20, [x13], 8              // Read A0
        SADALP v25.4s, v3.8h
        LDR x21, [x15], 8              // Read A1
        SADALP v26.4s, v10.8h
        SADALP v27.4s, v11.8h
        SUBS x0, x0, 16

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        LDR d9, [x5, 120]
        INS v8.d[0], x17
        SMULL v12.8h, v4.8b, v0.8b
        SMULL v13.8h, v4.8b, v1.8b
        LDR x17, [x5, 192]             // Read B
        SMULL v14.8h, v5.8b, v0.8b
        SMULL v15.8h, v5.8b, v1.8b
        LDR d5, [x5, 136]              // Read B
        INS v4.d[0], x16
        SMLAL v12.8h, v8.8b, v6.8b
        SMLAL v13.8h, v8.8b, v7.8b
        LDR x16, [x5, 144]
        SMLAL v14.8h, v9.8b, v6.8b
        SMLAL v15.8h, v9.8b, v7.8b
        LDR d6, [x13], 8               // Read A0
        INS v0.d[0], x20
        LDR d7, [x15], 8               // Read A1
        INS v1.d[0], x21
        SADALP v28.4s, v12.8h
        SADALP v29.4s, v13.8h
        ADD x5, x5, 128
        SADALP v30.4s, v14.8h
        SADALP v31.4s, v15.8h
        B.HS 2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR d9, [x5, 72]               // Read B
        INS v8.d[0], x17
        SMULL v2.8h, v4.8b, v0.8b
        SMULL v3.8h, v4.8b, v1.8b
        LDR x17, [x5, 80]
        SMULL v10.8h, v5.8b, v0.8b
        SMULL v11.8h, v5.8b, v1.8b
        LDR d5, [x5, 24]
        INS v4.d[0], x16
        SMLAL v2.8h, v8.8b, v6.8b
        SMLAL v3.8h, v8.8b, v7.8b
        LDR x16, [x5, 32]
        SMLAL v10.8h, v9.8b, v6.8b
        SMLAL v11.8h, v9.8b, v7.8b
        SADALP v16.4s, v2.8h
        SADALP v17.4s, v3.8h
        SADALP v18.4s, v10.8h
        SADALP v19.4s, v11.8h

        # BLOCK 1 - 18 cycles
        LDR d9, [x5, 88]
        INS v8.d[0], x17
        SMULL v12.8h, v4.8b, v0.8b
        SMULL v13.8h, v4.8b, v1.8b
        LDR x17, [x5, 96]
        SMULL v14.8h, v5.8b, v0.8b
        SMULL v15.8h, v5.8b, v1.8b
        LDR d5, [x5, 40]
        INS v4.d[0], x16
        SMLAL v12.8h, v8.8b, v6.8b
        SMLAL v13.8h, v8.8b, v7.8b
        LDR x16, [x5, 48]
        SMLAL v14.8h, v9.8b, v6.8b
        SMLAL v15.8h, v9.8b, v7.8b
        SADALP v20.4s, v12.8h
        SADALP v21.4s, v13.8h
        SADALP v22.4s, v14.8h
        SADALP v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR d9, [x5, 104]
        INS v8.d[0], x17
        SMULL v2.8h, v4.8b, v0.8b
        SMULL v3.8h, v4.8b, v1.8b
        LDR x17, [x5, 112]
        SMULL v10.8h, v5.8b, v0.8b
        SMULL v11.8h, v5.8b, v1.8b
        LDR d5, [x5, 56]
        INS v4.d[0], x16
        SMLAL v2.8h, v8.8b, v6.8b
        SMLAL v3.8h, v8.8b, v7.8b
        SMLAL v10.8h, v9.8b, v6.8b
        SMLAL v11.8h, v9.8b, v7.8b
        SADALP v24.4s, v2.8h
        SADALP v25.4s, v3.8h
        SADALP v26.4s, v10.8h
        SADALP v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR d9, [x5, 120]
        INS v8.d[0], x17
        SMULL v12.8h, v4.8b, v0.8b
        SMULL v13.8h, v4.8b, v1.8b
        SMULL v14.8h, v5.8b, v0.8b
        SMULL v15.8h, v5.8b, v1.8b
        SMLAL v12.8h, v8.8b, v6.8b
        SMLAL v13.8h, v8.8b, v7.8b
        SMLAL v14.8h, v9.8b, v6.8b
        SMLAL v15.8h, v9.8b, v7.8b
        SADALP v28.4s, v12.8h
        SADALP v29.4s, v13.8h
        ADD x5, x5, 128
        SADALP v30.4s, v14.8h
        SADALP v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        TBNZ x0, 3, 5f

        # ks loop
        SUBS x9, x9, 16                // ks -= MR * sizeof(int8_t*)
        B.HI 1b

4:
        # Add columns
        # ADDP tree reduces the 2x-wide SADALP partial sums down to
        # v0/v1 = row 0 channels 0-7, v2/v3 = row 1 channels 0-7.
        ADDP v16.4s, v16.4s, v18.4s
        ADDP v20.4s, v20.4s, v22.4s
        LD1R {v4.4s}, [x11], 4         // params: right_pre_shift - TODO confirm field name
        ADDP v24.4s, v24.4s, v26.4s
        ADDP v28.4s, v28.4s, v30.4s
        LD1R {v7.4s}, [x11], 4         // params: multiplier - TODO confirm field name
        ADDP v17.4s, v17.4s, v19.4s
        ADDP v21.4s, v21.4s, v23.4s
        ADDP v25.4s, v25.4s, v27.4s
        ADDP v29.4s, v29.4s, v31.4s
        ADDP v0.4s, v16.4s, v20.4s
        ADDP v1.4s, v24.4s, v28.4s
        ADDP v2.4s, v17.4s, v21.4s
        ADDP v3.4s, v25.4s, v29.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R {v5.4s}, [x11], 4         // params: right_post_shift - TODO confirm field name
        SQSHL v0.4s, v0.4s, v4.4s      // shift to upper bits
        SQSHL v1.4s, v1.4s, v4.4s
        SQSHL v2.4s, v2.4s, v4.4s
        SQSHL v3.4s, v3.4s, v4.4s
        SQDMULH v0.4s, v0.4s, v7.4s    // scale without rounding
        SQDMULH v1.4s, v1.4s, v7.4s
        SQDMULH v2.4s, v2.4s, v7.4s
        SQDMULH v3.4s, v3.4s, v7.4s
        SRSHL v0.4s, v0.4s, v5.4s      // signed rounding shift left
        SRSHL v1.4s, v1.4s, v5.4s
        SRSHL v2.4s, v2.4s, v5.4s
        SRSHL v3.4s, v3.4s, v5.4s

        LD1R {v5.8h}, [x11], 2         // params: output zero point (added via SQADD below)
        SQXTN v0.4h, v0.4s
        SQXTN v2.4h, v2.4s
        SQXTN2 v0.8h, v1.4s
        SQXTN2 v2.8h, v3.4s
        SUBS x1, x1, 8                 // nc -= 8
        SQADD v0.8h, v0.8h, v5.8h
        SQADD v1.8h, v2.8h, v5.8h
        SQXTN v0.8b, v0.8h             // v0 low = row 0, high = row 1
        SQXTN2 v0.16b, v1.8h
        LD1R {v1.16b}, [x11], 1        // params: output min (SMAX clamp below)
        LD1R {v2.16b}, [x11]           // params: output max (SMIN clamp below)
        SMAX v0.16b, v0.16b, v1.16b
        SUB x11, x11, 15               // rewind params pointer (4+4+4+2+1 bytes consumed)
        SMIN v0.16b, v0.16b, v2.16b
        B.LO 6f

        # Store full 2 x 8
        ST1 {v0.d}[1], [x7], x10       // row 1 from upper half of v0
        ST1 {v0.8b}, [x6], x10         // row 0 from lower half of v0

        SUB x4, x4, x3                 // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        # Simpler non-pipelined schedule: only one 8-byte group remains.
        .p2align 3
5:
        LDR d0, [x13], 8
        LDP d4, d5, [x5]
        LDR d1, [x15], 8
        LDP d6, d7, [x5, 16]
        SMULL v2.8h, v4.8b, v0.8b
        SMULL v3.8h, v4.8b, v1.8b
        SMULL v10.8h, v5.8b, v0.8b
        SMULL v11.8h, v5.8b, v1.8b
        SMULL v12.8h, v6.8b, v0.8b
        SADALP v16.4s, v2.8h
        SMULL v13.8h, v6.8b, v1.8b
        SADALP v17.4s, v3.8h
        SMULL v14.8h, v7.8b, v0.8b
        SADALP v18.4s, v10.8h
        SMULL v15.8h, v7.8b, v1.8b
        SADALP v19.4s, v11.8h
        LDP d4, d5, [x5, 32]
        SMULL v2.8h, v4.8b, v0.8b
        SADALP v20.4s, v12.8h
        SMULL v3.8h, v4.8b, v1.8b
        SADALP v21.4s, v13.8h
        SMULL v10.8h, v5.8b, v0.8b
        SADALP v22.4s, v14.8h
        SMULL v11.8h, v5.8b, v1.8b
        SADALP v23.4s, v15.8h
        LDP d6, d7, [x5, 48]
        SMULL v12.8h, v6.8b, v0.8b
        SADALP v24.4s, v2.8h
        SMULL v13.8h, v6.8b, v1.8b
        SADALP v25.4s, v3.8h
        SMULL v14.8h, v7.8b, v0.8b
        SADALP v26.4s, v10.8h
        SMULL v15.8h, v7.8b, v1.8b
        SADALP v27.4s, v11.8h
        ADD x5, x5, 64
        SADALP v28.4s, v12.8h
        SADALP v29.4s, v13.8h
        SADALP v30.4s, v14.8h
        SADALP v31.4s, v15.8h

        # ks loop
        SUBS x9, x9, 16                // ks -= MR * sizeof(int8_t*)
        B.HI 1b
        B 4b

        # Store odd width
        # Progressively store 4/2/1 bytes per row based on the low bits of nc,
        # rotating v0 with EXT so the next chunk is always at the lane base.
        .p2align 3
6:
        TBZ x1, 2, 7f
        ST1 {v0.s}[2], [x7], 4
        STR s0, [x6], 4
        EXT v0.16b, v0.16b, v0.16b, 4

7:
        TBZ x1, 1, 8f
        ST1 {v0.h}[4], [x7], 2
        STR h0, [x6], 2
        EXT v0.16b, v0.16b, v0.16b, 2
8:
        TBZ x1, 0, 9f
        ST1 {v0.b}[8], [x7]
        STR b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif