// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Computes a tile of up to 2 rows by 8 columns per pass of the outer loop.
# Products of 8-bit A and B values are summed in 16-bit lanes (SMULL/SMLAL)
# and then pairwise-accumulated into 32-bit lanes (SADALP). The 32-bit sums
# are converted to float, multiplied by a scale, converted back to integers
# (FCVTNS), narrowed with saturation, offset by a 16-bit value and clamped to
# 8-bit lower and upper bounds before being stored.
# When mr < 2 the second row aliases the first (a1 = a0, c1 = c0).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15


BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

        # Clamp A and C pointers
        # The d8-d15 saves are interleaved with the pointer setup. The first
        # STP allocates the whole 64-byte save area.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -64]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7

        .p2align 3
0:
        # Outer loop: one pass per group of 8 output columns (nc is reduced
        # by 8 further down, and B.HI 0b returns here while columns remain).
        # Load initial bias from w into accumulators
        # Each LDP to S registers writes lane 0 of two row-0 accumulators and
        # clears their remaining lanes; the MOVs copy them to the row-1
        # accumulators.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        # The stack arguments sit 64 bytes above sp because of the d8-d15 save
        # area. They are reloaded on every pass because x11 is advanced by the
        # post-incrementing LD1R loads when the params are applied.
        LDP     x10, x11, [sp, 64]      // cn_stride, params
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]
        LDP     d0, d6, [x3], 16
        LDP     d1, d7, [x4], 16
        LDP     d8, d9, [x5, 64]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # Each pass consumes 16 bytes from each A row and 128 bytes of B.
        # The B data paired with the first 8 bytes of A is read at offsets
        # 0, 16, 32, 48 and the data paired with the second 8 bytes of A at
        # those offsets plus 64. Loads for the next pass are interleaved with
        # the final multiplies of this pass.
        # NOTE(review): each SMULL/SMLAL pair sums two 8-bit products in one
        # 16-bit lane; presumably the inputs are constrained so that this
        # cannot overflow - confirm against the weight packing.
        .p2align 3
1:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16              // k -= 16; flags are used by B.HS below
        SMLAL   v14.8h, v9.8b, v6.8b
        LDP     d0, d6, [x3], 16        // Read A0
        SMLAL   v15.8h, v9.8b, v7.8b

        SADALP  v28.4s, v12.8h
        LDP     d1, d7, [x4], 16        // Read A1
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]        // Read B
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
2:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        # kc was rounded up to a multiple of 8, so k is now -16 or -8 and
        # bit 3 of k is set exactly when 8 bytes of A remain.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Two levels of pairwise adds reduce the four partial sums held per
        # accumulator. Afterwards v0/v1 hold row 0 columns 0-3/4-7 and v2/v3
        # hold row 1 columns 0-3/4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # Params are read sequentially through x11: a 32-bit float scale, a
        # 16-bit value added after narrowing to 16 bits, then two 8-bit clamp
        # bounds (lower bound for SMAX, upper bound for SMIN).
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags are used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // row 0 -> bytes 0-7 of v0
        SQXTN2  v0.16b, v1.8h           // row 1 -> bytes 8-15 of v0
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 2 x 8
        ST1     {v0.8b}, [x6], x10      // row 0, then c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1, then c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder - 8 bytes of A
        # Reached directly when the rounded kc is 8, or from the epilogue when
        # 8 bytes of A are left. Consumes 8 bytes from each A row and 64 bytes
        # of B, then rejoins the column reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here; its low 3 bits equal those of nc, so bits 2, 1
        # and 0 select stores of 4, 2 and 1 columns. Row 0 is read from byte 0
        # of v0 and row 1 from byte 8; each EXT rotates v0 so the next unstored
        # columns of both rows move to those positions.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif