// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// NOTE(review): this file is marked auto-generated; lasting changes presumably
// belong in the template named above - confirm before hand-editing.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

// Overview of the code below.
//
// Computes up to 4 rows x 16 columns of int8 output per pass over kc bytes of
// each A row, then repeats for the next 16 columns while nc lasts.
//
//   0:  start of a 16-column tile: 16 int32 values are read from w and copied
//       into all four rows of accumulators (v16-v31).
//   1:  main loop: 8 bytes of every A row and 128 bytes of w per iteration,
//       with the loads for the next iteration interleaved.
//   2:  epilogue: the last full group of 8, without loads for a next group.
//   4:  remainder: 1 to 7 trailing bytes of A, one k step (16 bytes of w) each.
//   3:  requantization: int32 -> float, multiply by 16 per-channel float scales
//       read from w, convert back to int32 rounding to nearest (FCVTNS),
//       saturating narrow to int16, saturating add of the 16-bit params value,
//       saturating narrow to int8, then clamp to the params min/max.
//   5:  partial store when fewer than 16 columns remain.
//
// w as consumed per tile: 64 bytes of int32, kc * 16 bytes of int8, 64 bytes
// of float scales. params as consumed: 16-bit value at +0, int8 min at +2,
// int8 max at +3.
//
// Only x0-x17 and v0-v6 / v16-v31 are written and the stack is only read, so
// nothing has to be saved or restored.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so they load and store
        // the same addresses as a valid row.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        // From here on x0 no longer holds mr; it is reused as the k counter.

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        B.LO    4f                      // kc < 8: remainder path only

        # Prologue
        // Loads 8 bytes of each A row and the first 24 bytes of w. x5 is not
        // advanced here; the loops address w at fixed offsets from x5 and add
        // 128 once per group of 8.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    2f

        # Main loop - 8 bytes of A
        // Each k step multiplies 16 widened weights (two of v4/v5/v6) by lane k
        // of v0-v3. The three B registers rotate: while two are being used the
        // third is refilled, either directly (LDR d) or through x17 (LDR x17
        // followed by INS) and then widened with SXTL.
        // NOTE(review): the LDR x + INS split is presumably there so loads can
        // issue alongside NEON work on Cortex-A53 - confirm against the template.
        // The instruction order is part of that schedule; do not reorder.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x3, 128]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        PRFM    PLDL1KEEP, [x4, 128]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x5, 448]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x5, 512]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w advances past this group of 8 k steps

        // From here the loads fetch data for the NEXT iteration: w at the new
        // x5 and 8 more bytes of each A row.
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x10, [x3], 8

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x15], 8
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8
        INS     v2.d[0], x17

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    1b                      // another full group of 8 is available

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        // Requantization. x5 now points at the 16 per-channel float scales that
        // follow the int8 weights of this tile.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        // Columns 0-3 use v4, 4-7 use v5, 8-11 use v6, 12-15 use the reloaded v4.
        LDR     q6, [x5], 16
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.NE below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        // Each C pointer advances by cn_stride; each A pointer is rewound by kc
        // so the next tile rereads the same A rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1 (ST1/SUB leave them alone)
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached with kc < 8 (from label 0) or after the epilogue when kc is
        // not a multiple of 8. Each LD1 below reads a full 8 bytes from its A
        // row while only the first x0 lanes are used; the pointer advances by
        // x0 only. One k step consumes 16 bytes of w.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b}, [x4], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2 (flags from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4 (flags from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6 (flags from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 holds (remaining columns - 16) here; its low 4 bits equal those of
        // the remaining column count, so bits 3..0 select 8/4/2/1-byte stores.
        // After each store the upper part of the vector is moved down with DUP.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif