// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4 v5 v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

// Overview (derived from the code below):
//   - Each pass of the outer loop (label 0) produces up to 4 rows x 16 columns
//     of int8 output, accumulated in int32 in v16-v31.
//   - w (x5) is consumed sequentially: 16 int32 bias values (64 bytes), then
//     16 int8 weights for every byte of A along k.
//   - k is consumed 8 bytes per row at a time (prologue, main loop at label 1,
//     epilogue at label 2); 1 to 7 leftover bytes are handled at label 4.
//   - Label 3 requantizes the accumulators with six values read in order from
//     params (x11), clamps, and stores either 16 columns or, at label 5,
//     the last 1 to 15 columns.
//   - x0 holds mr only during pointer clamping; afterwards it is reused as the
//     k counter / k remainder.
//   - Only caller-saved registers are written (x0-x17 subset, v0-v6, v16-v31),
//     so nothing is spilled to the stack.
// NOTE(review): loads, INS and SXTL are interleaved with the multiply-accumulate
// instructions on purpose; given the a53 temp register note above this is
// presumably a Cortex-A53 scheduling pattern. Do not reorder without benchmarking.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so they recompute and
        // rewrite the same output instead of touching memory outside mr rows.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        // The flags are still those of CMP x0, 2 above (ADD does not set flags).
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        // Outer loop: one iteration per block of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 16 int32 values land in v16, v20, v24, v28 (row 0) and are copied to
        // the accumulators of rows 1-3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        B.LO    4f                      // kc < 8: only the remainder code runs

        # Prologue
        // Preload 8 bytes of each A row and the first 16 bytes of B (d4, d6),
        // widen them to 16 bit, and fetch the next 8 bytes of B into x17.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    2f                      // fewer than 16 bytes of k: epilogue only

        # Main loop - 8 bytes of A
        // Each iteration multiplies the 8 preloaded A values per row (lanes
        // h[0]..h[7] of v0-v3) by 128 bytes of B, rotating B through v4, v5, v6.
        // B for the next step is loaded while the current step is computed,
        // either directly into a D register or via x17 followed by INS.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]  // lane 0, columns 0-7
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]  // lane 0, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]  // lane 1
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]  // lane 2
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]  // lane 3
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]  // lane 4
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]  // lane 5
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]  // lane 6
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // advance B past the 8 k steps of this iteration

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]               // start preloading B for the next 8 bytes of k
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x10, [x3], 8            // next 8 bytes of A0 (moved into v0 below)

        SMLAL   v16.4s, v4.4h, v0.h[7]  // lane 7
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8           // next 8 bytes of A2 (moved into v2 below)
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        // Each A register is overwritten only after its last h[7] use above.
        LDR     d1, [x15], 8
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8
        INS     v2.d[0], x17

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    1b                      // loop while at least 8 more bytes of k remain

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // params is read sequentially with post-increment: three 32-bit values
        // (used by SQSHL, SQDMULH, SRSHL), one 16-bit value (SQADD), then two
        // bytes (SMAX, SMIN). Each value is broadcast to all lanes by LD1R.
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow int32 -> int16: columns 0-7 of each row end up in
        // v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8: one 16-byte output row per register
        // (v0-v3 hold rows 0-3).
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // 15 = 4 + 4 + 4 + 2 + 1, the total of the post-increments above.
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags drive B.LO and B.NE below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x12); each A pointer is moved
        // back by kc so the next column block rereads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1: repeat while nc != 0
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either from the top (kc < 8) or after the epilogue.
        // One 16-byte row of B is consumed per remaining byte of k.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // LD1 {.8b} loads a full 8 bytes per row but post-increments by x0 only;
        // lanes at index >= x0 are loaded and never multiplied.
        // NOTE(review): this reads up to 7 bytes beyond the k range of each row;
        // presumably callers guarantee that over-read is safe - confirm.
        LD1     {v0.8b}, [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b}, [x4], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // Each CMP below serves two exits: B.LO right after it and B.EQ one step
        // later (the loads and SIMD instructions in between leave flags intact).
        CMP     x0, 2
        B.LO    3b                      // remainder was 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder was 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder was 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder was 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder was 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder was 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder was 7

        # Store odd width
        // x1 is nc - 16 here; its low four bits equal those of the 1..15
        // remaining columns. Bits 3, 2, 1, 0 select 8-, 4-, 2- and 1-byte stores,
        // and after each store DUP moves the next unwritten bytes to lane 0.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif