// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point;
#    uint8_t padding[3];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# Note: the kernel broadcasts the first 32-bit word of this structure and
# subtracts it byte-wise from the weights, so the 3 bytes labelled padding are
# used as zero points too. Presumably they are expected to replicate
# kernel_zero_point - confirm against the code that initializes the params.
#
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1 x15  v1
# A2 x13  v2
# A3  x4  v3
# B   x5  v4  v5  v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero_point  v7
# unused v8 v9 v10 v11 v12 v13 v14 v15

// Overview of the code below:
//   * Row pointers a1..a3 and c1..c3 are derived from a0/c0 and the strides.
//     Rows at index >= mr alias the previous row, so 4 rows are always
//     computed and stored.
//   * Label 0: outer loop over tiles of 16 output columns. The 16 int32
//     accumulators (4 rows x 4 vectors) are seeded from w.
//   * Labels 1 and 2: main loop and epilogue, 8 bytes of K per row per pass.
//     A bytes are zero-extended (UXTL); weight bytes have v7 subtracted while
//     being widened (USUBL); products are accumulated with SMLAL/SMLAL2.
//   * Label 4: remainder of 1 to 7 bytes of K.
//   * Label 3: requantize, narrow to 8 bits, clamp and store a full tile.
//   * Label 5: partial store when fewer than 16 columns remain.
// Accumulator columns: v16-v19 hold columns 0-3, v20-v23 columns 4-7,
// v24-v27 columns 8-11 and v28-v31 columns 12-15 (one register per row).

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        // a1 = a0
        CSEL    x8, x6, x8, LO          // c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       // a2 = a1
        CSEL    x9, x8, x9, LS          // c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         // a3 = a2
        CSEL    x7, x9, x7, LO          // c3 = c2
        // Broadcast the first 32-bit word of params into v7 and advance x11 by 4.
        // This executes once, before the outer loop; the rewind at label 3
        // returns x11 to this post-increment position.
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Row 0 accumulators are loaded from w and copied to rows 1-3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        B.LO    4f                      // kc < 8: only the remainder path runs

        # Prologue
        // Preload 8 bytes of each A row and the first 16 weights (x5 is not
        // advanced here). A is zero-extended to 16 bits; the weights are
        // widened with v7 subtracted.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    2f

        # Main loop - 8 bytes of A
        // Each K step uses one 16-bit lane (h[0]..h[7]) of v0-v3 and 16 weights
        // held as two 8-lane halves that rotate through v4/v5/v6. Loads (LDR)
        // and zero-point subtraction (USUBL) for upcoming weights are
        // interleaved with the multiply-accumulates. The tail of the loop
        // preloads the next 8 bytes of A and the first 16 weights of the next
        // pass, so the instruction order here must not be changed.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d4, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        LDR     d5, [x5, 120]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128             // w += 8 K steps * 16 weight bytes

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d4, [x5]                // preload first weights of the next pass
        SMLAL   v24.4s, v5.4h, v0.h[7]
        LDR     d6, [x5, 8]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d0, [x3], 8             // preload next 8 bytes of A row 0
        SMLAL   v26.4s, v5.4h, v2.h[7]
        LDR     d1, [x15], 8
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d2, [x13], 8

        UXTL    v0.8h, v0.8b
        LDR     d3, [x4], 8
        UXTL    v1.8h, v1.8b
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    1b                      // loop while the subtraction did not borrow

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d4, [x5, 112]
        USUBL   v4.8h, v4.8b, v7.8b
        LDR     d5, [x5, 120]
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w += 8 K steps * 16 weight bytes
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks through the params fields with post-indexed loads; per the
        // structure layout in the file header the fields read here are
        // right_pre_shift, multiplier, right_post_shift, output_zero_point,
        // output_min and output_max.
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow 32 -> 16 bits: columns 0-3 and 8-11 fill the low
        // halves here, columns 4-7 and 12-15 fill the high halves below.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (output_zero_point)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow signed 16 -> unsigned 8 bits; v0-v3 end up holding
        // the 16 output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        // The loads above advanced x11 by 4 + 4 + 4 + 2 + 1 = 15 bytes (the
        // last load does not post-increment), so subtracting 15 restores the
        // value x11 had on entry to label 3.
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left: partial store

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x12); each A pointer is moved
        // back by kc so the next tile re-reads the same rows of A.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1, x1, 16: loop while nc != 0
        RET

        # Remainder- 1 to 7 bytes of A
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // Each LD1 below reads a full 8 bytes of A but advances the row
        // pointer by the remainder only; just the first x0 lanes are consumed.
        LD1     {v0.8b}, [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b}, [x4], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder was 1

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // flags still from CMP x0, 2: remainder was 2

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder was 3

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // flags still from CMP x0, 4: remainder was 4

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder was 5

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // flags still from CMP x0, 6: remainder was 6

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder was 7

        # Store odd width
        // x1 holds nc - 16 here (negative); subtracting 16 leaves the low 4
        // bits unchanged, so bits 3..0 still select 8, 4, 2 and 1 column
        // stores. After each partial store the upper part of v0-v3 is moved
        // down so the next store writes the following columns.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif