// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# Overview (as implemented below):
#   Computes a tile of 2 rows by up to 8 int8 outputs per pass of the nc loop
#   (label 0).  The 16 accumulators are seeded from w, then for every pair of
#   A row pointers read through a (ks loop, label 1) kc bytes of each row are
#   multiplied against w with SMULL/SMLAL and accumulated into 32-bit lanes
#   with SADALP.  Afterwards the lanes are summed with ADDP, scaled in
#   floating point, narrowed to int8 with saturation, clamped and stored.
#
#   kc is rounded up to a multiple of 8.  An A pointer equal to zero is used
#   as is; any other A pointer has a_offset added to it.
#
#   The params block is read as four consecutive fields: a 32-bit float
#   multiplier, a 16-bit addend (presumably the output zero point - confirm
#   against the params definition), an 8-bit lower bound and an 8-bit upper
#   bound.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v6
# A1  x15  v1  v7
# B    x5  v4  v5  v8  v9
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1   x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0    v2 v10 v12 v14
# temp1    v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

        # Clamp C pointers
        # The stack arguments are read before sp is moved by the first STP.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Reserve 80 bytes and save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // second half of the kc round-up
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each LDP puts one 32-bit bias in lane 0 of two row 0 accumulators
        # (a scalar S register load clears the other lanes); the row 1
        # accumulators start as copies of the row 0 ones.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc is 8: only the remainder runs

        # Prologue: load A0, A1 and 2 B's
        # v0/v1 get the first 8 bytes of A0/A1 and v6/v7 the next 8 bytes.
        # v4/v5 come from the first 64 bytes of the 128-byte B slice and
        # v8/v9 from the second 64 bytes.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        // In place of LDP d8, d9, [x5, 64]: the low half goes through x17 and
        // is inserted with INS in BLOCK 0, d9 is loaded there from [x5, 72].
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]           // B for BLOCK 1, inserted into v4 in BLOCK 0

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        # Each block consumes 16 + 16 bytes of B and updates 4 accumulators.
        # Loads for the following block are interleaved with the arithmetic;
        # 64-bit values loaded into x16/x17/x20/x21 are moved into the low half
        # of a vector register with INS.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles (the template note mentions prfm, but no PRFM
        # instruction is issued in this file)
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // First B of the next 128-byte slice
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s, v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Offsets 128 and above address the next B slice because x5 is only
        # advanced near the end of this block.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b                      // Loop while the SUBS in BLOCK 2 did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        SADALP  v25.4s, v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8 and k has only had multiples of 16 subtracted,
        # so bit 3 of k is set exactly when one 8-byte group is left.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Pairwise adds fold the 4 lanes of each accumulator into one sum:
        # v0/v1 end up with the 8 row 0 sums and v2/v3 with the 8 row 1 sums.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4       // 32-bit multiplier, broadcast
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2       // 16-bit addend, broadcast
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO 6f and B.HI 0b
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // row 0 -> low 8 bytes of v0
        SQXTN2  v0.16b, v1.8h           // row 1 -> high 8 bytes of v0
        LD1R    {v1.16b}, [x11], 1      // 8-bit lower bound
        LD1R    {v2.16b}, [x11]         // 8-bit upper bound
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7             // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10    // row 1, then c1 += cn_stride
        ST1     {v0.8b}, [x6], x10      // row 0, then c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // flags still from SUBS x1, x1, 8

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        # One 8-byte group of each A row against 64 bytes of B, using SMULL
        # only; v6/v7 hold B here rather than A.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of nc select a 4, 2 and 1 byte store per row.  Row 0
        # is taken from the low half of v0 and row 1 from the high half; EXT
        # rotates v0 so the next unstored bytes move to the start of each half.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif