// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
//     size_t mr,                                  x0
//     size_t nc,                                  x1
//     size_t kc,                                  x2 / x0
//     const int8_t* restrict a,                   x3
//     size_t a_stride,                            x4
//     const void* restrict w,                     x5
//     int8_t* restrict c,                         x6
//     size_t cm_stride,                           x7
//     size_t cn_stride,                           [sp] -> x12
//     const union xnn_qs8_minmax_params params)   [sp + 8] -> x11
//
// Computes a tile of up to 4 rows by 16 columns of int8 output per pass of
// the outer loop, walking the packed weights at w and repeating until nc
// columns have been produced.
//
// Callee-saved state: d8-d15 and x19-x30 need to be preserved if used, and
// x18 is reserved by the OS. This kernel uses none of them.
//
// Register map
//   row   A pointer   A data   C pointer   accumulators
//   0     x3          v0       x6          v16 v20 v24 v28
//   1     x15         v1       x8          v17 v21 v25 v29
//   2     x13         v2       x9          v18 v22 v26 v30
//   3     x4          v3       x7          v19 v23 v27 v31
//   B (weights) pointer x5, data v4 v5 v6 v7
//   unused: v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

        // ---- Row pointer setup --------------------------------------------
        // Rows 1..3 are derived from the previous row plus the stride. When
        // mr is too small for a row to exist, that row aliases the previous
        // one so every load and store below stays on a valid row.
        // kc is also rounded up to a multiple of 4: kc = (kc + 3) & ~3.
        CMP     x0, 2                   // flags for mr vs 2 (LO: mr < 2, LS: mr <= 2)
        ADD     x2, x2, 3               // kc + 3 (first half of the round-up)
        ADD     x15, x3, x4             // a1 candidate = a0 + a_stride
        ADD     x8, x6, x7              // c1 candidate = c0 + cm_stride
        CSEL    x15, x3, x15, LO        // mr < 2: a1 aliases a0
        CSEL    x8, x6, x8, LO          // mr < 2: c1 aliases c0
        BIC     x2, x2, 3               // clear low two bits (second half of the round-up)

        ADD     x13, x15, x4            // a2 candidate = a1 + a_stride
        ADD     x9, x8, x7              // c2 candidate = c1 + cm_stride
        // Flags are still those of the compare against 2.
        CSEL    x13, x15, x13, LS       // mr <= 2: a2 aliases a1
        CSEL    x9, x8, x9, LS          // mr <= 2: c2 aliases c1

        LDP     x12, x11, [sp]          // x12 = cn_stride, x11 = params

        CMP     x0, 4                   // flags for mr vs 4
        ADD     x4, x13, x4             // a3 candidate = a2 + a_stride
        ADD     x7, x9, x7              // c3 candidate = c2 + cm_stride
        CSEL    x4, x13, x4, LO         // mr < 4: a3 aliases a2
        CSEL    x7, x9, x7, LO          // mr < 4: c3 aliases c2

        // ---- Outer loop: one pass per 16 output columns --------------------
        .p2align 3
0:
        // Seed the accumulators. 64 bytes of bias are read from w into the
        // row 0 accumulators and copied to rows 1..3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 8               // k = kc - 8; x0 is reused as the k counter
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        // Fewer than 8 bytes of A per row: go straight to the 4 byte step.
        B.LO    3f

        // ---- Inner loop: 8 bytes of A per row, 128 bytes of w --------------
        // Lane [0] of v0..v3 holds the first 4 bytes of each row and lane [1]
        // the next 4. Weight loads are interleaved with the dot products.
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SUBS    x0, x0, 8               // k -= 8
        SDOT    v31.4s, v7.16b, v3.4b[1]
        B.HS    1b                      // repeat while the subtraction did not borrow

        // Bit 2 of k set here means a 4 byte remainder of A is still pending.
        TBNZ    x0, 2, 3f

        // ---- Requantization ------------------------------------------------
2:
        // int32 accumulators -> float. The four per channel scale vectors
        // are read from w, interleaved with the conversions and multiplies.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 is free again)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // float -> int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. Afterwards v16..v19 hold columns
        // 0-7 of rows 0..3 and v24..v27 hold columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // broadcast the 16 bit "add bias" value from params

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the broadcast 16 bit value.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // broadcast the clamp minimum

        // Saturating narrow int16 -> int8. v0..v3 end up holding the 16
        // output bytes of rows 0..3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // broadcast the clamp maximum
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // undo the 2 + 1 byte advance of the params pointer

        // Clamp to [min, max]. The nc update sits between the two halves;
        // SMIN does not touch the flags, so B.LO still sees its result.
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    4f                      // fewer than 16 columns were left: partial store

        // ---- Full 4 x 16 store ---------------------------------------------
        // Each C pointer advances by cn_stride and each A pointer is moved
        // back by the padded kc so the next pass re-reads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags from the nc update: loop while nc != 0
        RET


        // ---- Remainder: 4 bytes of A per row, 64 bytes of w ----------------
        .p2align 3
3:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        SDOT    v16.4s, v4.16b, v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b, v1.4b[0]
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b                      // continue with requantization

        // ---- Partial store: fewer than 16 columns --------------------------
        // Bits 3..0 of x1 select chunks of 8, 4, 2 and 1 bytes. After each
        // chunk is written the next unwritten lane is moved down to lane 0.
        .p2align 3
4:
        TBZ     x1, 3, 5f               // skip the 8 byte chunk if bit 3 is clear
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
5:
        TBZ     x1, 2, 6f               // skip the 4 byte chunk if bit 2 is clear
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
6:
        TBZ     x1, 1, 7f               // skip the 2 byte chunk if bit 1 is clear
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
7:
        TBZ     x1, 0, 8f               // skip the final byte if bit 0 is clear
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
8:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif