// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

// Overview (derived from the code below):
//   - Computes a tile of up to 4 rows x 16 columns of int8 output.
//   - Row pointers for A are fetched indirectly, four at a time, from the
//     pointer array `a`; a pointer equal to `zero` is used as-is, any other
//     pointer has a_offset added.
//   - int8 A and B bytes are sign-extended to 16 bits (SXTL) and accumulated
//     into 32-bit lanes with SMLAL/SMLAL2 by-element.
//   - After the ks loop the accumulators are converted to float, multiplied by
//     16 per-column float scales read from `w`, rounded back to int32 (FCVTNS),
//     saturating-narrowed to int16, offset by a 16-bit value from params,
//     saturating-narrowed to int8 and clamped to [min, max] from params.
//   - Label map: 0 = nc loop, 1 = ks loop, 2 = main k loop, 3 = epilogue,
//     4 = ks loop tail + requantization, 5 = kc remainder, 6-10 = partial store.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp C pointers
        // Rows beyond mr alias the previous row's pointer, so their stores land
        // on the same memory as the last valid row.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack (sp moves down by 16)
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 16 x int32 are read (64 bytes); every row starts from the same values.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f                      // kc < 8: only the remainder path runs

        # Prologue
        // Preload the first 8 bytes of each A row and the k=0 slice of B, plus
        // the first half of the k=1 slice into x11, so the loop/epilogue can
        // start multiplying immediately.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]            // B k=0: cols 0-7 -> v4, cols 8-15 -> v6
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        SXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]           // B k=1, cols 0-7 (inserted into v5 later)
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes (k = 0..7) from each of the four A
        // rows and 128 bytes of B (16 bytes per k: columns 0-7, then 8-15).
        // v4/v5/v6 rotate as B staging registers: the loads for the next k step
        // are interleaved with the multiply-accumulates of the current one.
        // Half of the 64-bit loads go through x11/x21 followed by INS rather
        // than directly into a vector register (see "temp for Cortex-A53 loads"
        // in the register table; presumably this helps dual-issue on that core
        // -- not verifiable from this file). Instruction order is deliberate.
        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]            // B k=1, cols 8-15
        INS     v5.d[0], x11            // B k=1, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]           // B k=2, cols 0-7
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]            // B k=2, cols 8-15
        INS     v6.d[0], x11            // B k=2, cols 0-7
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]           // B k=3, cols 0-7
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]            // B k=3, cols 8-15
        INS     v4.d[0], x11            // B k=3, cols 0-7
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]           // B k=4, cols 0-7
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]            // B k=4, cols 8-15
        INS     v5.d[0], x11            // B k=4, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]           // B k=5, cols 0-7
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]            // B k=5, cols 8-15
        INS     v6.d[0], x11            // B k=5, cols 0-7
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]           // B k=6, cols 0-7
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]           // B k=6, cols 8-15
        INS     v4.d[0], x11            // B k=6, cols 0-7
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x11, [x5, 112]          // B k=7, cols 0-7
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]           // B k=7, cols 8-15
        INS     v4.d[0], x11            // B k=7, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // advance B past this 8-deep slice

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]               // next iteration: B k=0, cols 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x21, [x13], 8           // next 8 bytes of A0 (inserted into v0 below)

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next iteration: B k=0, cols 8-15
        INS     v4.d[0], x11            // next iteration: B k=0, cols 0-7
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // next 8 bytes of A2 (inserted into v2 below)
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8            // next 8 bytes of A1
        INS     v0.d[0], x21            // v0 is no longer read in this iteration
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8            // next 8 bytes of A3
        INS     v2.d[0], x11

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]           // next iteration: B k=1, cols 0-7
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
        // The last group instead computes the kc remainder and reloads the
        // params pointer, because x11 was used as a load temporary above.

        .p2align 3
3:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]            // B k=1, cols 8-15
        INS     v5.d[0], x11            // B k=1, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]           // B k=2, cols 0-7
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]            // B k=2, cols 8-15
        INS     v6.d[0], x11            // B k=2, cols 0-7
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]           // B k=3, cols 0-7
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]            // B k=3, cols 8-15
        INS     v4.d[0], x11            // B k=3, cols 0-7
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]           // B k=4, cols 0-7
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]            // B k=4, cols 8-15
        INS     v5.d[0], x11            // B k=4, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]           // B k=5, cols 0-7
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]            // B k=5, cols 8-15
        INS     v6.d[0], x11            // B k=5, cols 0-7
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]           // B k=6, cols 0-7
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]           // B k=6, cols 8-15
        INS     v4.d[0], x11            // B k=6, cols 0-7
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]          // B k=7, cols 0-7
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]           // B k=7, cols 8-15
        INS     v4.d[0], x11            // B k=7, cols 0-7
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // advance B past this 8-deep slice
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        // params was at [sp + 24] on entry; the x20/x21 push lowered sp by 16.
        LDR     x11, [sp, 40]           // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder? - 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantization: int32 -> float, multiply by per-column scale,
        // round back to int32 with FCVTNS.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for cols 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for cols 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for cols 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for cols 12-15 (v4 reused)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16; each row ends up in two registers:
        // v16-v19 hold cols 0-7, v24-v27 hold cols 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // 16-bit value at params+0, broadcast; presumably the output zero
        // point (the params struct layout is not visible in this file).
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8: v0-v3 = rows 0-3, 16 columns each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // The two post-incremented LD1Rs above advanced x11 by 2 + 1 bytes.
        SUB     x11, x11, 3             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Store full 4 x 16
        // Each C pointer is post-incremented by cn_stride (x10).
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        // SUB (not SUBS) leaves the flags from "SUBS x1, x1, 16" intact.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly when kc < 8 or after the epilogue when
        // kc % 8 != 0. Each LD1 reads a full 8 bytes from the A row but only
        // advances the pointer by the remainder, so up to 7 bytes past the
        // valid data are read (NOTE(review): callers presumably guarantee that
        // over-read is safe -- confirm). B is consumed 16 bytes per k step.
        // CMP results are reused by the B.EQ that follows each second group;
        // the SMLAL/SXTL/LDP instructions in between do not modify flags.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6: done

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7: done

        # Store odd width
        // x1 now holds nc - 16 (wrapped); its low four bits equal the number
        // of columns still to write. Bits 3..0 select an 8/4/2/1-byte store;
        // after each store the not-yet-written lanes are moved down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif