// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# Signed 8-bit IGEMM microkernel with min/max clamping for AArch64, built on
# the SDOT (by element) instruction. A is reached through the pointer array a.
#
# Each pass of the nc loop (label 0) produces up to 4 rows x 16 columns:
#   1. 16 int32 accumulator seeds are loaded from w and copied to all 4 rows.
#   2. For every group of 4 row pointers read from a (label 1), kc bytes of
#      each row (kc rounded up to a multiple of 4) are combined with B data
#      streamed from w: 8 bytes of A per main loop pass (label 2), plus one
#      optional 4 byte tail (label 4).
#   3. The sums are converted to float, multiplied by 16 per-column floats
#      read from w, converted back to int32, narrowed with saturation, offset
#      by a 16-bit value from params, narrowed to 8 bits and clamped to the
#      min and max bytes from params (label 3).
#   4. The tile is stored whole, or in 8/4/2/1 byte pieces when fewer than 16
#      columns remain (labels 5-9).

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them, so nothing is saved or restored.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
#
# x0 holds mr only until the C pointers are clamped. It is then reused as the
# k counter inside the ks loop and as cn_stride for the stores.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp C pointers
        # A row index at or beyond mr gets the pointer of the row below it, so
        # every store stays inside rows the caller asked for. The stack
        # argument loads and the kc rounding are interleaved with this logic.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // second half of the kc round-up

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Four q registers = 16 int32 lanes, one per output column. The same
        # values seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to zero is used as is. Any other pointer is advanced
        # by a_offset. The ADD runs unconditionally and CSEL discards its
        # result when the comparison matched.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else += a3 + a_offset

        # Is there at least 8 bytes for main loop?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f                      // kc < 8: only the 4 byte tail runs

        # Main loop - 8 bytes of A
        # Each pass reads 8 bytes from every A row and 128 bytes of B from w.
        # Lane [0] of v0-v3 holds the first 4 bytes of a row and lane [1] the
        # next 4. The B loads are interleaved with the SDOTs that use them.
        .p2align 3
2:
        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32        // B for the second 4 bytes of A
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SUBS    x0, x0, 8               // k -= 8; SDOT below leaves flags alone
        SDOT    v31.4s, v7.16b, v3.4b[1]
        B.HS    2b                      // loop while no borrow (8 more bytes)

        # Is there a remainder?- 4 bytes of A
        # kc is a multiple of 4, so x0 is now -8 or -4. Bit 2 is set only for
        # -4, meaning 4 bytes of each A row are still unprocessed.
        TBNZ    x0, 2, 4f

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b                      // more A pointer groups remain

3:
        # Requantize. Entered by falling through from the ks loop above or by
        # the branch at the end of the remainder block.
        # int32 sums -> float
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        # Multiply every row by the scale vector of its column group.
        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 reused)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        # float -> int32, rounding to nearest with ties to even
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Saturating narrow int32 -> int16. After the SQXTN2 group below,
        # v16-v19 hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (16-bit value at params + 0;
                                        // presumably the output zero point - TODO confirm)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value (byte at params + 2)

        # Saturating narrow int16 -> int8. v0-v3 end up holding the 16 output
        # bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value (byte at params + 3)
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // cn_stride
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 3             // rewind params pointer (undo the +2 and +1)
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        # Rows are written from C3 down to C0. When row pointers were clamped
        # to alias a lower row, the lower row is therefore written last.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1, x1, 16. None of the SMIN, ST1 or
        # SUB instructions in between updates them.
        B.HI    0b
        RET

        # Remainder- 4 bytes of A
        # Reached when kc < 8, or when 4 bytes are left after the main loop.
        # Consumes 4 bytes of every A row and 64 bytes of B.
        .p2align 3
4:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       3b                      // all A pointer groups done: requantize

        # Store odd width
        # x1 is nc - 16 here, so its low 4 bits equal those of the remaining
        # column count. Each set bit stores 8, 4, 2 or 1 bytes per row, after
        # which DUP moves the next unwritten bytes down to the low lanes.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif