// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9       (the 8-byte remainder path uses v4-v7 for B)
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15

# Overview
# Each pass of the outer loop (label 0) produces one tile of 2 rows by 8
# columns of int8 output and consumes 8 from nc.
#
# Data read from w per pass, in order:
#   32 bytes   eight 32-bit bias values, one per column
#   64 bytes   of B for every 8 bytes of K (two 8-byte vectors per LDP)
#   32 bytes   eight 32-bit float scales, one per column
#
# Data read from params: a 16-bit value at offset 0 that is added to the
# results, then one byte at offset 2 used as the lower clamp bound and one
# byte at offset 3 used as the upper clamp bound.
#
# The loop counter kc is rounded up to a multiple of 8 and A is always read in
# 8-byte units, so up to 7 bytes beyond kc can be read from each row of A.
# NOTE(review): presumably callers guarantee those bytes are readable and that
# w is padded to match - confirm against the packing code.
#
# NOTE(review): two int8 x int8 products are summed in 16 bits (SMULL then
# SMLAL) before being widened by SADALP. That sum only overflows when both
# products are -128 * -128, so presumably one operand is restricted to avoid
# it - confirm.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

        # Clamp A and C pointers
        # The saves of d8-d15 (64 bytes of stack) are interleaved with the
        # pointer setup. When mr < 2, row 1 aliases row 0 for both A and C.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -64]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of an accumulator and zeroes the other
        # lanes. The row 0 accumulators are then copied to row 1.
        # The flags set by SUBS stay live until the B.LO below; none of the
        # loads or moves in between modify them.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        // The stack arguments sit above the 64 bytes pushed for d8-d15.
        // Reloaded on every pass because x11 is advanced while reading params.
        LDP     x10, x11, [sp, 64]      // cn_stride, params
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f                      // kc < 16: only the 8-byte remainder

        # Prologue: load A0, A1 and 2 B's
        # v0/v1 hold the first 8 bytes of K for rows 0/1 and v6/v7 the second
        # 8 bytes. v4/v5 hold B for the first 8 bytes of K (offset 0) and
        # v8/v9 hold B for the second 8 bytes of K (offset 64).
        LDP     d4, d5, [x5]
        LDP     d0, d6, [x3], 16
        LDP     d1, d7, [x4], 16
        LDP     d8, d9, [x5, 64]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # Processes 16 bytes of K for 2 rows by 8 columns, two columns at a
        # time. For each pair of columns: SMULL forms 16-bit products with the
        # first 8 bytes of K, SMLAL adds the products for the second 8 bytes,
        # and SADALP pairwise-adds the 16-bit sums into the 32-bit
        # accumulators. Loads for the next pair are interleaved with the
        # arithmetic, and the loads at the bottom prepare the next iteration.
        .p2align 3
1:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // w += 128 (16 bytes of K x 8 columns)
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        SMLAL   v14.8h, v9.8b, v6.8b    // last use of v6 before it is reloaded
        LDP     d0, d6, [x3], 16        // Read A0
        SMLAL   v15.8h, v9.8b, v7.8b    // last use of v7 before it is reloaded

        SADALP  v28.4s, v12.8h
        LDP     d1, d7, [x4], 16        // Read A1
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]        // Read B
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
2:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128             // w += 128 (16 bytes of K x 8 columns)

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        # kc is a multiple of 8, so x0 is -8 here when 8 bytes of K remain and
        # -16 when none remain. Bit 3 is set only in the -8 case.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Each accumulator holds 4 partial sums for one output element. Two
        # levels of pairwise adds reduce them to one 32-bit sum per element:
        #   v0 = row 0 columns 0-3    v1 = row 0 columns 4-7
        #   v2 = row 1 columns 0-3    v3 = row 1 columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        # The sums are converted to float and multiplied by the per-column
        # scales: v4 covers columns 0-3 and v5 covers columns 4-7, and the same
        # scales are applied to both rows.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // Convert back to int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // Saturating narrow to int16, add the 16-bit value from params with
        // saturation (presumably the output zero point - confirm against the
        // params struct), then saturating narrow to int8.
        // After the narrowing, v0 bytes 0-7 are row 0 and bytes 8-15 are row 1.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound
        LD1R    {v2.16b}, [x11]         // upper clamp bound
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 2 x 8
        # Both C pointers advance by cn_stride and both A pointers are rewound
        # by the rounded kc for the next pass. ST1 and SUB leave the flags from
        # SUBS x1 intact for B.HI.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder - 8 bytes of A
        # Reached when rounded kc is below 16, or from the epilogue when 8
        # bytes of K are left. Only SMULL is needed because there is a single
        # 8-byte group of K. v6/v7 hold B here, not A. Control returns to the
        # column reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // w += 64 (8 bytes of K x 8 columns)
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here, whose low 3 bits equal those of the remaining
        # column count. Bits 2, 1 and 0 select stores of 4, 2 and 1 bytes per
        # row. After each partial store, EXT rotates v0 so that the next
        # unwritten row 0 byte is at lane 0 and the next row 1 byte at lane 8.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif