// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview
#   Each pass of the outer loop (label 0) produces one tile of 4 rows by
#   8 columns of C:
#     1. the 8 accumulator registers are seeded from the first 8 floats at w,
#     2. A * B products are accumulated over kc with FMLA (by element),
#     3. the result is clamped with the two values loaded from params,
#     4. the tile is stored and the loop repeats while columns remain in nc.
#   kc is counted in bytes of one row of A (32 bytes = 8 floats).
#   w is consumed strictly sequentially through x5: 8 floats that seed the
#   accumulators, then 8 floats of B for every float of A consumed.
#   Rows at index >= mr are not skipped; their A and C pointers alias the
#   previous row (see the pointer clamping below), so all 4 rows are always
#   computed and stored.
#   NOTE(review): bits 0-1 of kc are never tested, so kc is presumably always
#   a multiple of 4 bytes - confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# General purpose register usage beyond the arguments
# x0  holds mr only until the pointers are clamped, then the k byte counter
# x1  remaining columns; decremented by 8 per tile
# x2  kc, kept unchanged so the A pointers can be rewound after each tile

# A pointers
# x3 a0
# x11 a1
# x12 a2
# x4 a3 / a_stride

# C pointers
# x6 c0
# x9 c1
# x10 c2
# x7 c3 / cm_stride

# Vector register usage
# A0 v0 v4
# A1 v1 v5
# A2 v2 v6
# A3 v3 v7
# B v8 v9 v10 v11
# B v12 v13 v14 v15
# B v16 v17 v18 v19
# B v20 v21 v22 v23
# C v24 v25
# C v26 v27
# C v28 v29
# C v30 v31
# Clamp v4 v5
#
# v0-v3 with v16-v23 form the first group of 4 floats of A and its B rows.
# v4-v7 with v8-v15 form the second group.
# v4/v5 are shared between the second group of A and the clamp values, so the
# clamp values are reloaded after the last use of v4/v5 as A data.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        # This read of the stack arguments is done before sp is moved by the
        # d8-d15 save below.
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at [x8] and broadcasts the first
        # to all lanes of v4 and the second to all lanes of v5.
        # NOTE(review): v4 is applied with FMAX and v5 with FMIN, so params is
        # presumably laid out as {min, max} - confirm against the definition
        # of xnn_f32_minmax_params.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        # Pre-indexed store allocates 64 bytes; v8-v15 are used for B below.
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Each row pointer is first computed as previous row + stride and then
        # replaced by the previous row pointer when that row index is >= mr.
        CMP x0, 2 // if mr < 2
        ADD x11, x3, x4 // a1 = a0 + a_stride
        ADD x9, x6, x7 // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO // a1 = a0
        CSEL x9, x6, x9, LO // c1 = c0

        ADD x12, x11, x4 // a2 = a1 + a_stride
        ADD x10, x9, x7 // c2 = c1 + cm_stride
        // if mr <= 2
        // The two CSELs below still use the flags from CMP x0, 2 above;
        // ADD and CSEL do not modify the flags.
        CSEL x12, x11, x12, LS // a2 = a1
        CSEL x10, x9, x10, LS // c2 = c1

        // From here x4 and x7 stop being strides and become the row 3 pointers.
        CMP x0, 4 // if mr < 4
        ADD x4, x12, x4 // a3 = a2 + a_stride
        ADD x7, x10, x7 // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO // a3 = a2
        CSEL x7, x10, x7, LO // c3 = c2

0:
        # Outer loop over blocks of 8 columns.
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all 4 rows.
        LDP q24, q25, [x5], 32
        MOV v26.16b, v24.16b
        MOV v27.16b, v25.16b
        MOV v28.16b, v24.16b
        MOV v29.16b, v25.16b
        MOV v30.16b, v24.16b
        MOV v31.16b, v25.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 is reused as the k counter from here on; mr is no longer needed.
        SUBS x0, x2, 32 // k = kc - 32
        B.LO 3f

        # Prologue - loads only, no FMA.
        # Read first block of 4 A and B.
        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q18, q19, [x5], 32
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32

        # Is there at least 32. yes do main loop
        # A further 32 bytes are required because each main loop iteration
        # also preloads the first group for the next iteration.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Loads are interleaved between the FMLAs; the instruction order is
        # presumably tuned for the Cortex-A75 named in the kernel, so keep it
        # intact when regenerating or editing the template.
1:
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        # The loads into q4/q5 below overwrite the clamp values in v4/v5.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4. FMA for second 4, loads for 1st block of 4.
        FMLA v24.4s, v8.4s, v4.s[0]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        LDP q18, q19, [x5], 32
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q20, q21, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q22, q23, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v24.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        // Counter update placed ahead of the last two FMLAs; FMLA does not
        // touch the flags, so B.HS below still sees the result of this SUBS.
        SUBS x0, x0, 32
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # Same arithmetic as one main loop iteration, but nothing is
        # preloaded for a following iteration.
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4. FMA for second 4, noloads
        FMLA v24.4s, v8.4s, v4.s[0]
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v24.4s, v10.4s, v4.s[1]
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # The four FMLAs above are the last reads of v4/v5 as A data, so the
        # clamp values can be restored here. The remaining FMLAs below only
        # read v6/v7.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # x0 may be negative here, but only multiples of 32 were subtracted
        # from kc, so bits 4, 3 and 2 of x0 still equal those of kc.
        # The remainder paths use only v0-v3 and v16-v23, leaving the clamp
        # values in v4/v5 untouched.
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        LDP q22, q23, [x5], 32
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]

6:
        # Clamp
        # Result is min(max(acc, v4), v5) for every accumulator.
        # The SUBS below (nc -= 8) sets the flags consumed by both B.LO 7f and
        # B.HI 0b further down; none of the FMAX, FMIN, STP, SUB or ADD
        # instructions in between modify the flags.
        FMAX v24.4s, v24.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Taken to label 7 when fewer than 8 columns remained before the SUBS.
        B.LO 7f

        # Each C pointer advances by cn_stride (x14) and each A pointer is
        # rewound by kc so the next block of columns rereads the same rows.
        STP q24, q25, [x6]
        SUB x3, x3, x2 // a0 -= kc
        ADD x6, x6, x14
        STP q26, q27, [x9]
        SUB x11, x11, x2 // a1 -= kc
        ADD x9, x9, x14
        STP q28, q29, [x10]
        SUB x12, x12, x2 // a2 -= kc
        ADD x10, x10, x14
        STP q30, q31, [x7]
        SUB x4, x4, x2 // a3 -= kc
        ADD x7, x7, x14

        // Loop again only while columns remain (nc was strictly above 8).
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 holds nc - 8 here (negative); its low 3 bits equal those of the
        # remaining column count, so bits 2, 1 and 0 select stores of 4, 2 and
        # 1 columns. After each partial store the unstored upper part of the
        # row is moved down into the low lanes for the next step.
7:
        TBZ x1, 2, 8f
        STR q24, [x6], 16
        MOV v24.16b, v25.16b
        STR q26, [x9], 16
        MOV v26.16b, v27.16b
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        STR d24, [x6], 8
        STR d26, [x9], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x10], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s24, [x6]
        STR s26, [x9]
        STR s28, [x10]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        # The partial tile is always the last one, so this path returns
        # instead of looping.
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif