// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Signed 8-bit GEMM micro-kernel producing a tile of up to 2 rows x 8 columns
// of C per pass of the outer loop.  For every 8-column tile it:
//   1. loads eight 32-bit bias values from w into the accumulators,
//   2. accumulates int8 x int8 products over kc (rounded up to a multiple of
//      8 bytes), 16 bytes of K per main-loop/epilogue pass plus an optional
//      8-byte remainder,
//   3. reduces each accumulator horizontally, converts to float, multiplies by
//      eight per-channel float scales read from w right after the B data,
//   4. rounds back to int32, adds a 16-bit value read from params, narrows to
//      int8 with saturation, clamps with the two byte values that follow in
//      params, and stores 8 (or fewer) bytes per row.

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data

// Layout of one 128-byte B group as consumed below (offsets from x5):
//   0..63    eight 8-byte vectors multiplied by the first 8 bytes of A (v0/v1)
//   64..127  eight 8-byte vectors multiplied by the second 8 bytes of A (v6/v7)
// Half of the B vectors and the next A0/A1 values are fetched into general
// purpose registers (x16/x17/x20/x21) and moved into NEON registers with INS.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp A and C pointers
        // The 80-byte frame holds d8-d15 at [sp, 0..63] and x20/x21 at
        // [sp, 64]; the register saves are interleaved with the pointer setup.
        // None of the instructions between CMP and the two CSELs write flags.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -80]!      // allocate frame, save d8/d9
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        // Outer loop: one iteration per tile of up to 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each scalar load puts one 32-bit bias in lane 0 of a row-0
        // accumulator (a scalar load clears the remaining lanes); the matching
        // row-1 accumulator starts as a copy.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        // Stack arguments sit above the 80-byte frame.  x11 is reloaded on
        // every tile because the requantization code post-increments it.
        LDP     x10, x11, [sp, 80]      // cn_stride, params
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f                      // flags from SUBS above: kc < 16

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x3], 16        // Read A0
        LDR     x17, [x5, 64]           // Read B
        LDP     d1, d7, [x4], 16        // Read A1
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        // Each block handles two output columns for both rows: SMULL forms
        // 16-bit products with the first 8 bytes of A, SMLAL adds products
        // with the second 8 bytes, and SADALP pairwise-adds the 16-bit sums
        // into the 32-bit accumulators.  B for the following block is loaded
        // while the current block computes.

        .p2align 3
1:
        # BLOCK 0 - 18 cycles - includes prfm
        // Accumulates into v16/v18 (row 0) and v17/v19 (row 1).
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of use
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        // Accumulates into v20/v22 (row 0) and v21/v23 (row 1).
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A0 ahead of use
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x4, 128]    // prefetch A1 ahead of use
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        // Accumulates into v24/v26 (row 0) and v25/v27 (row 1).  Also starts
        // fetching the next 16 bytes of A0/A1 (first halves go to x20/x21)
        // and the first B value of the next 128-byte group.
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // next group, offset 0
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x3], 8            // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x4], 8            // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        // Accumulates into v28/v30 (row 0) and v29/v31 (row 1).  v0/v1/v6/v7
        // are overwritten with the next A data only after their last use in
        // this block, and v4/v5/x16/x17 are left holding the same offsets of
        // the next B group that the prologue loads for the first group.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x4], 8             // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // advance w past this B group
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop
        // Consumes the 16 bytes of A and the B values already in flight; it
        // does not read A or touch the next B group.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // advance w past this B group
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        // kc is a multiple of 8, so bit 3 of the k counter tells whether one
        // 8-byte step is still outstanding.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        // Three levels of pairwise adds reduce the four lanes of every
        // accumulator to one sum and pack them as:
        //   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        //   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        // int32 -> float, multiply by the 8 scales (q4 = columns 0-3,
        // q5 = columns 4-7; the same scales serve both rows), then convert
        // back to int32 with round-to-nearest, ties to even (FCVTNS).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // params is read sequentially: one 16-bit value that is added to the
        // results (presumably the output zero point - confirm against the
        // params struct), then one byte used as the lower clamp bound and one
        // byte used as the upper clamp bound.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s            // saturating narrow to 16 bits, row 0
        SQXTN   v2.4h, v2.4s            // row 1
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // v0 bytes 0-7 = row 0
        SQXTN2  v0.16b, v1.8h           // v0 bytes 8-15 = row 1
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 2 x 8
        // The stores advance c0/c1 by cn_stride; a0/a1 are rewound by the
        // rounded-up kc so the next tile re-reads the same rows of A.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
        RET

        # Remainder - 8 bytes of A
        // Processes a single 8-byte K step for all 8 columns: only SMULL is
        // needed, B is read from the first 64 bytes at x5, and w advances by
        // 64 bytes.  Also entered directly from the top of the tile when
        // kc < 16.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        // Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-byte store per row.
        // Row 0 is stored from the bottom of v0 and row 1 from byte 8; after
        // each partial store EXT rotates v0 so the next unwritten bytes of
        // both rows move back to those positions.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif