// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1 x15  v1
# A2 x13  v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes?
        B.LO    3f

        # Main loop - 8 bytes of A
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SUBS    x0, x0, 8
        SDOT    v31.4s, v7.16b, v3.4b[1]
        B.HS    1b

        # Is there a remainder? - 4 bytes of A
        TBNZ    x0, 2, 3f

2:
        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    4f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b
        RET


        # Remainder - 4 bytes of A
        .p2align 3
3:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        SDOT    v16.4s, v4.16b, v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b, v1.4b[0]
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif