// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x8c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# Overview
#   Computes up to 4 rows by 8 columns of output per pass of the outer loop
#   (label 0), consuming A in groups of 4 bytes along k with UDOT.
#   kc is rounded up to a multiple of 4 on entry; x0 is reused as the
#   remaining-k counter once mr has been consumed by the pointer clamping.
#   Rows beyond mr alias the previous row (both the A and the C pointer), so
#   the kernel always processes four rows.

# Params are read through x11 in this order (sizes taken from the loads below):
#   bytes 0-3    kernel zero point, replicated into v7 once on entry
#   bytes 4-7    shift applied with SSHL
#   bytes 8-11   multiplier applied with SQDMULH
#   bytes 12-15  shift applied with SRSHL
#   bytes 16-17  value added with SQADD after narrowing to 16 bits
#   byte  18     clamp min
#   byte  19     clamp max

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x15 v1
# A2  x13 v2
# A3  x4  v3
# B   x5  v4  v5  v6
# C0  x6  v16 v20
# C1  x8  v17 v21
# C2  x9  v18 v22
# C3  x7  v19 v23
# zero_point  v7 v24 v25 v26 v27
# unused v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        # The flags set by each CMP stay live across the ADD/CSEL/BIC/LDP
        # instructions that follow, none of which write the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3

        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        # Outer loop: one pass per block of 8 output columns
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The same 8 bias values seed all four rows; v24-v27 start at zero
        # and collect the per-row zero point correction.
        LDP     q16, q20, [x5], 32
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0

        # Are there at least 16 bytes?
        B.LO    30f

        # Main loop - 16 bytes of A
        # Each iteration reads 16 bytes from every A row and 128 bytes of w
        # (32 bytes of w per 4-byte group of k). Loads of w are interleaved
        # with the UDOTs; the instruction order is deliberate.
        .p2align 3
1:
        LDR     q0, [x3], 16
        LDR     q4, [x5], 16
        LDR     q1, [x15], 16
        LDR     q2, [x13], 16
        LDR     q3, [x4], 16
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5], 16
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5], 16
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        LDR     q5, [x5], 16
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        LDR     q6, [x5], 16
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        UDOT    v16.4s, v5.16b, v0.4b[2]
        UDOT    v17.4s, v5.16b, v1.4b[2]
        LDR     q4, [x5], 16
        UDOT    v18.4s, v5.16b, v2.4b[2]
        UDOT    v19.4s, v5.16b, v3.4b[2]
        UDOT    v20.4s, v6.16b, v0.4b[2]
        UDOT    v21.4s, v6.16b, v1.4b[2]
        LDR     q5, [x5], 16
        UDOT    v22.4s, v6.16b, v2.4b[2]
        UDOT    v23.4s, v6.16b, v3.4b[2]
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        SUBS    x0, x0, 16
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        B.HS    1b

        # x0 is now negative, but subtracting multiples of 16 leaves bits 2
        # and 3 equal to those of kc, so they select the 8 and 4 byte tails.
        # Is there a remainder?- 8 bytes of A
        TBNZ    x0, 3, 3f
        # Is there a remainder?- 4 bytes of A
        TBNZ    x0, 2, 4f

2:
        # Two pairwise adds leave the sum of all four lanes in every lane,
        # giving one zero point correction per row.
        ADDP    v0.4s, v24.4s, v24.4s
        ADDP    v1.4s, v25.4s, v25.4s
        ADDP    v2.4s, v26.4s, v26.4s
        ADDP    v3.4s, v27.4s, v27.4s
        ADDP    v24.4s, v0.4s, v0.4s
        ADDP    v25.4s, v1.4s, v1.4s
        ADDP    v26.4s, v2.4s, v2.4s
        ADDP    v27.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s

        # Narrow 32 -> 16 bits with signed saturation; each row ends up as
        # eight 16-bit values in one register (v16-v19).
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16 -> 8 bits with unsigned saturation.
        # v0 = row 0 (low half) and row 1 (high half); v1 = rows 2 and 3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        # The post-incrementing loads above advanced x11 by 4+4+4+2+1 = 15,
        # so this returns it to just past the kernel zero point.
        SUB     x11, x11, 15            // rewind params pointer

        # The flags from SUBS below survive the UMIN/ST1/SUB instructions
        # and drive both B.LO (partial block) and B.NE (more columns).
        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    5f

        # Store full 4 x 8
        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # the rounded kc so the next column block rereads the same rows.
        ST1     {v0.8b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v1.8b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v1.d}[1], [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b
        RET

        # Remainder- 4-12 bytes of A
        # Label 30 is entered when kc < 16; bit 3 of x0 clear means only a
        # 4-byte tail remains.
        .p2align 3
30:     TBZ     x0, 3, 4f

3:
        # 8 bytes of A per row, 64 bytes of w. LDR d0-d3 clears the upper
        # half of each A register, so the full-width zero point UDOT only
        # accumulates the bytes just loaded.
        LDR     d0, [x3], 8
        LDR     q4, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5, 16]
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5, 32]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5, 48]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        ADD     x5, x5, 64
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 2b

4:
        # 4 bytes of A per row, 32 bytes of w. LDR s0-s3 clears the rest of
        # each A register.
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       2b

        # Store odd width
        # x1 went below zero in the SUBS above; subtracting 8 leaves bits 0-2
        # equal to the number of columns still to write. Stores of 4, 2 and 1
        # bytes are issued per set bit. EXT rotates each register so the next
        # unwritten bytes of both packed rows move to byte 0 and byte 8.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x8], 4
        STR     s1, [x9], 4
        ST1     {v1.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x8], 2
        STR     h1, [x9], 2
        ST1     {v1.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x8]
        STR     b1, [x9]
        ST1     {v1.b}[8], [x7]
8:
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif