// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

// Summary of what the instruction stream below does.
//
// For every block of 16 output columns:
//   * the 4x16 int32 accumulators are seeded from 64 bytes read from w (the
//     same 16 values for all four rows);
//   * int8 values of A and B are sign-extended to 16 bits and accumulated
//     with widening multiply-adds, 8 values of k per step, followed by a
//     1..7 value remainder;
//   * the sums are converted to float, multiplied by four vectors of
//     per-column scales read from w, and converted back to int32 with
//     FCVTNS (round to nearest, ties to even);
//   * the results are narrowed with saturation to int16, offset by a 16-bit
//     value read from params, narrowed with saturation to int8 and clamped
//     to the min and max bytes that follow that value in params.
//
// x0 carries mr only until the row pointers are clamped; afterwards it is
// reused as the k counter (hence "x2 / x0" in the prototype above).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4 v5 v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp A and C pointers
        // Rows beyond mr reuse the pointers of the previous row.
        CMP     x0, 2                   // flags: mr compared with 2
        LDP     x12, x11, [sp]          // x12 = cn_stride, x11 = params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        // mr < 2: a1 = a0
        CSEL    x8, x6, x8, LO          // mr < 2: c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // flags still come from CMP x0, 2
        CSEL    x13, x15, x13, LS       // mr <= 2: a2 = a1
        CSEL    x9, x8, x9, LS          // mr <= 2: c2 = c1

        CMP     x0, 4                   // flags: mr compared with 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         // mr < 4: a3 = a2
        CSEL    x7, x9, x7, LO          // mr < 4: c3 = c2

        .p2align 3
0:
        # Seed the accumulators of all four rows from w
        LDP     q16, q20, [x5], 32      // columns 0-3 and 4-7
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32      // columns 8-11 and 12-15
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Fewer than 8 bytes of A per row: only the remainder code runs
        B.LO    4f

        # Prologue
        // Fetch 8 bytes of every A row and the first 16 bytes of B, and
        // start the x17 pipeline with the following 8 bytes of B.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Fewer than 8 further bytes: skip the main loop, run the epilogue
        B.LO    2f

        # Main loop - 8 bytes of A
        //
        // Each iteration consumes 128 bytes of B (8 values of k times 16
        // columns) and preloads the A and B data of the next iteration.
        // B registers rotate as follows (columns 0-7 / columns 8-15):
        //   k=0: v4/v6   k=1: v5/v4   k=2: v6/v5   k=3: v4/v6
        //   k=4: v5/v4   k=5: v6/v5   k=6: v4/v6   k=7: v4/v5
        // One half of every B row is loaded with LDR d, the other half
        // through x17 followed by INS; A0 and A2 use x10 and x17 the same
        // way.
        // NOTE(review): presumably this split targets Cortex-A53 issue
        // rules; the rationale is not stated in this file. Keep the
        // instruction order exactly as generated.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w advances past the 8 B rows

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]               // next iteration: B row 0, columns 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x10, [x3], 8            // next 8 bytes of A0

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next iteration: B row 0, columns 8-15
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8           // next 8 bytes of A2
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x15], 8            // next 8 bytes of A1
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8             // next 8 bytes of A3
        INS     v2.d[0], x17

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]           // restart the x17 pipeline (B row 1)
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k = k - 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w advances past the 8 B rows
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

        // Requantization: int32 -> float, scale per column, back to int32,
        // then saturating narrows to int16 and int8 with offset and clamp.
3:
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // v4 is free again: scales for columns 12-15
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // 16-bit offset added by SQADD below

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // undo the 2 + 1 byte post-increments

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16, flags used by B.LO and B.NE
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        // Each C pointer steps by cn_stride; each A pointer is rewound by kc
        // so the next column block rereads the same rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // columns remain: next block of 16
        RET

        # Remainder- 1 to 7 bytes of A
        // LD1 fetches 8 bytes per row but post-increments by the remainder
        // only. One 16-byte B row is consumed per value of k, and the code
        // returns to label 3 as soon as the remainder count is reached, so
        // lanes at or beyond the remainder are never used as multipliers.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b}, [x4], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2 (flags from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4 (flags from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6 (flags from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 holds nc - 16 here; subtracting 16 leaves the low four bits of
        // nc unchanged, so bits 3..0 select stores of 8, 4, 2 and 1 bytes.
        // After each partial store the unstored lanes are moved down to
        // lane 0 for the next, narrower store.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif