// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

# Structure of the kernel (local labels):
#   0:  nc loop   - seeds the 16 int32 accumulators from 64 bytes read at w
#   1:  ks loop   - loads 4 A row pointers; a row equal to zero is used as is,
#                   any other row gets a_offset added
#   2:  main loop - consumes 8 bytes of each A row and 128 bytes of w per pass
#   3:  epilogue  - last 8 byte group, same math as 2: without the preloads
#   5:  remainder - handles kc & 7 (1 to 7) trailing bytes, 16 bytes of w each
#   4:  ks loop tail, then int32 -> float -> scale -> int32 -> int8 with clamp
#   6: to 10:     - partial store when fewer than 16 columns remain
#
# x11 holds the params pointer on entry but doubles as a scratch register for
# 64-bit loads of w inside the prologue, main loop and epilogue.  It is
# therefore reloaded from the stack at the end of the epilogue.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

        # Clamp C pointers
        # The stack loads below happen before x20-x21 are pushed, so their
        # offsets are relative to sp at function entry.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Rows 1 to 3 start as copies of the row 0 accumulators.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue
        # Loads and sign-extends the first 8 bytes of every A row and the
        # first two 8 byte groups of w.  The third group of w is fetched into
        # x11 and inserted into v5 by the first INS of the main loop/epilogue.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        SXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        # v4, v5 and v6 rotate as the current, next and following 8 byte
        # groups of w.  Each group is loaded either with LDR d or with
        # LDR x11 followed by INS (see the register usage note on x11/x21).
        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x14, 128]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        PRFM    PLDL1KEEP, [x20, 128]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x5, 448]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x5, 512]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x11, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w advances past this 8 byte step

        # From here on, loads at [x5, ...] and from the A pointers fetch the
        # operands of the NEXT 8 byte step while the last group finishes.
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x21, [x13], 8           // next a0, inserted into v0 below

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // next a2, inserted into v2 below
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8
        INS     v0.d[0], x21
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8
        INS     v2.d[0], x11

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
3:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        # params sat at [sp + 24] on entry; the 16 byte push of x20-x21
        # moved it to [sp + 40].  x11 was used as a load temp above.
        LDR     x11, [sp, 40]           // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Requantize: int32 accumulators -> float, multiply by the per
        # channel scale, round back to int32 (FCVTNS).
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 reused)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Saturating narrow int32 -> int16, add the 16-bit value from params,
        # then saturating narrow int16 -> int8.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags drive both branches below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # None of the instructions since SUBS x1 set flags, so HI still
        # means more than 16 columns were left before this tile.
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        # Each step below consumes one byte of every A row and 16 bytes of w.
        # One CMP feeds two exits: B.LO right after it and the B.EQ that
        # follows the next step (no instruction in between sets flags).
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7

        # Store odd width
        # Bits 3..0 of x1 select an 8, 4, 2 and 1 byte store per row.  After
        # each partial store the next unwritten bytes are moved down to the
        # bottom of v0-v3 with DUP.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif