// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# LINT.IfChange
# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Indirect GEMM micro-kernel producing a 4 row x 8 column tile of C per
# pass of the nc loop.
#   - Accumulators are initialised from 8 bias floats read from w, then
#     updated with FMLA using A rows reached through the pointer array a
#     and packed B read sequentially from w.
#   - kc and ks are consumed as byte counts: k steps by 32 bytes
#     (8 floats of A), ks steps by 32 bytes (4 pointers of 8 bytes).
#   - An A pointer equal to zero is used as is; any other A pointer has
#     a_offset added before use.
#   - Results are clamped with FMAX against v4 and FMIN against v5, both
#     loaded from params with LD2R, then stored to the four C rows.
#   - Rows at or beyond mr reuse the C pointer of the previous row.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Other scalar registers
# x0  mr on entry, then k (remaining bytes of kc)
# x8  params pointer
# x9  p (remaining bytes of ks)
# x10 cn_stride
# x11 a_offset
# x12 zero

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# B   v20 v21 v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
#
# v4-v7 hold the second block of A in the main loop and epilogue, so the
# clamp values in v4/v5 are reloaded near the end of the epilogue.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75

        # Load cn_stride, a_offset
        # (stack arguments are read before sp is moved below)
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Allocates an 80 byte frame: x20 at [sp], d8-d15 at [sp, 16..79]
        STR     x20, [sp, -80]!

        # Save d8-d15 on stack
        STP     d8, d9, [sp, 16]
        STP     d10, d11, [sp, 32]
        STP     d12, d13, [sp, 48]
        STP     d14, d15, [sp, 64]

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

0:
        # nc loop entry: one 4 x 8 output tile per pass.
        # Load initial bias from w into accumulators
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        MOV     x9, x3                  // p = ks

1:
        # ks loop entry.
        # Load next 4 A pointers
        LDP     x20, x13, [x4], 16
        LDP     x14, x15, [x4], 16

        CMP     x20, x12                // if a0 == zero
        ADD     x20, x20, x11           // a0 += a_offset
        CSEL    x20, x12, x20, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x13, x12                // if a1 == zero
        ADD     x13, x13, x11           // a1 += a_offset
        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x14, x12                // if a2 == zero
        ADD     x14, x14, x11           // a2 += a_offset
        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x15, x12                // if a3 == zero
        ADD     x15, x15, x11           // a3 += a_offset
        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f

        # Prologue
        # Read first block of 4 floats (16 bytes) of each A row and the
        # matching 4 x 8 floats of B.
        LDR     q0, [x20], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        LDP     q18, q19, [x5], 32
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32

        # Are there at least 32 more bytes of k? Yes: run the main loop.
        # No: the epilogue alone finishes the pending 8 floats.
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A
        # When PREFETCH is set, the first block also issues PRFM on the
        # packed weights at x5 + 128, 192, 256 and 320.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v24.4s, v8.4s, v4.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        LDR     q0, [x20], 16
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        LDR     q1, [x13], 16
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x14], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x15], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // k -= 32; FMLA below leaves flags intact
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

        B.HS    2b

3:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA    v24.4s, v8.4s, v4.s[0]
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4 and v5 are no longer read as A data past this point; the
        # remaining FMLAs use only v6 and v7.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # Only multiples of 32 were subtracted from k, so bits 4, 3 and 2
        # of x0 still match those of kc.
        TBZ     x0, 4, 5f

        LDR     q0, [x20], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ     x0, 3, 6f

        LDR     d0, [x20], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ     x0, 2, 7f

        LDR     s0, [x20], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]

7:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v24.4s, v24.4s, v4.4s
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS    x1, x1, 8
        B.LO    8f

        STP     q30, q31, [x7]
        ADD     x7, x7, x10
        STP     q28, q29, [x17]
        ADD     x17, x17, x10
        STP     q26, q27, [x16]
        ADD     x16, x16, x10
        STP     q24, q25, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Branches on the flags from SUBS x1 above; the STP, ADD and SUB
        # in between do not modify flags.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 floats per
        # row; after each store the not yet written lanes are moved down.
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x17], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x16], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b

9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        STR     d28, [x17], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x16], 8
        STR     d24, [x6], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30, [x7]
        STR     s28, [x17]
        STR     s26, [x16]
        STR     s24, [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
# LINT.ThenChange(4x8-aarch64-neonfma-cortex-a75.cc)

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif