// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data
# Also used: x0 = remaining-K counter (mr is consumed by the first CMP),
# x2 = kc rounded up to a multiple of 8, x10 = cn_stride, x11 = params cursor.

# Overview (derived from the code below)
#   Produces a tile of 2 rows by 8 columns of int8 output per pass of the
#   nc loop (label 0). K is consumed 16 bytes per row in the main loop and
#   epilogue, plus an optional 8-byte remainder (label 4).
#   Data read through x5 for every group of 8 columns, in this order:
#     Bias    8 x int32, 32 bytes (initial accumulator values)
#     Weights 64 bytes per 8 bytes of K (8 columns x 8 int8)
#     Scales  8 x float, 32 bytes (one multiplier per column)
#   Each accumulator register holds 4 int32 partial sums for one
#   (row, column) pair; label 3 reduces them with ADDP before requantizing.
#
# Stack frame (80 bytes, allocated by the first STP)
#   Offset  0  d8,  d9
#   Offset 16  d10, d11
#   Offset 32  d12, d13
#   Offset 48  d14, d15
#   Offset 64  x20, x21
#   Offset 80  cn_stride  (the caller's [sp])
#   Offset 88  params     (the caller's [sp + 8])


BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

        # Clamp A and C pointers
        # When mr < 2, row 1 aliases row 0 for both input and output, so the
        # second row is computed from, and stored to, the same memory as row 0.
        # The callee-saved register spills are interleaved with this setup.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -80]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar LDP fills lane 0 of two row-0 accumulators (the scalar
        # load clears the remaining lanes); the MOVs copy them to row 1.
        # None of the loads or moves below alter the flags set by SUBS, so
        # the B.LO at the end still tests kc < 16.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 80]      // cn_stride, params (reloaded every pass; x11 is advanced below)
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f

        # Prologue: load A0, A1 and the first B values
        # v0/v1 hold K bytes 0-7 of rows 0/1 and v6/v7 hold K bytes 8-15.
        # B for the second K group (offset 64) and for the next column
        # (offset 16) is staged in x17/x16 and moved into v8/v4 with INS
        # inside the blocks below.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x3], 16        // Read A0
        LDR     x17, [x5, 64]           // Read B
        LDP     d1, d7, [x4], 16        // Read A1
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # Cost noted in the template: 4 groups of 4 mul/mla/adalp + 2 load
        # = 18 cycles each, 2 loads for A0 = +2 cycles.
        # Total 18 * 4 + 2 = 74 cycles.
        #
        # Every block handles two output columns for both rows:
        #   SMULL   int8 x int8 products of K bytes 0-7 into int16 lanes
        #   SMLAL   adds the products of K bytes 8-15 to the same int16 lanes
        #   SADALP  adds adjacent int16 pairs into the int32 accumulators
        # Loads for the following block are interleaved with the arithmetic,
        # half of them through a general-purpose register followed by INS
        # (presumably to suit Cortex-A53 load issue rules - confirm).
        # NOTE(review): SMULL followed by SMLAL can exceed the int16 range
        # only if both products are (-128) * (-128); presumably weight packing
        # excludes -128 - confirm against the packing code.

        .p2align 3
1:
        # BLOCK 0 - 18 cycles
        # NOTE(review): the template comment says this block includes a prfm,
        # but no PRFM instruction appears in this kernel.
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # Also starts fetching the next 16 bytes of A (first halves go to
        # x20/x21) and the first B value of the next pass (offset 128).
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        LDR     x20, [x3], 8            // Read A0
        SADALP  v25.4s, v3.8h
        LDR     x21, [x4], 8            // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS at the end of block 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # v0/v1/v6/v7 are overwritten with the next A data only after the
        # last SMULL/SMLAL that reads the current values.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x4], 8             // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // advance B by 16 K bytes x 8 columns
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop
        # (blocks 2 and 3 do not fetch A or B data for a following pass).

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        SADALP  v25.4s, v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8 and x0 has only had multiples of 16
        # subtracted, so bit 3 of x0 is set exactly when 8 bytes are left.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # First level: reduce each accumulator from 4 to 2 partial sums and
        # pair up neighbouring columns. Second level: finish the reduction so
        # v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        # v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        # Convert the int32 sums to float and multiply by the 8 column
        # scales; both rows use the same scales (v4 = columns 0-3,
        # v5 = columns 4-7).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        # Back to int32, rounding to nearest with ties to even
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Narrow with saturation to int16, add the 16-bit value read from
        # params (presumably the output zero point - confirm against the
        # params struct), narrow with saturation to int8, then clamp.
        # After the final narrow, v0 bytes 0-7 are row 0 and bytes 8-15 are
        # row 1.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound (applied with SMAX)
        LD1R    {v2.16b}, [x11]         // upper clamp bound (applied with SMIN)
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 2 x 8
        # Output pointers advance by cn_stride; A pointers are rewound by the
        # rounded kc so the next column group re-reads the same rows.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        # Reached either directly from label 0 (kc < 16) or after the
        # epilogue. Processes one 64-byte group of B (8 columns x 8 K bytes)
        # with SMULL + SADALP only, then joins the requantization at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 columns per row.
        # Row 0 is taken from the low half of v0 and row 1 from the high
        # half; EXT rotates v0 so the next unwritten columns of both rows
        # move to the start of their halves.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif