// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Overview of what the code below does:
#   - nc loop (label 0): seed the accumulators v20-v22 with 12 floats read
#     from w, then run the ks loop, clamp and store up to 12 floats to c.
#   - ks loop (label 1): fetch one A row pointer from a, then accumulate
#     A[k] * B[k][0..11] over kc bytes of A.  B is streamed from w (x5),
#     12 floats per k step.
#   - Units as used by the code: kc is counted in bytes of A (16 bytes is
#     4 floats), ks in bytes of the pointer array (8 per pointer), nc in
#     output floats, cn_stride and a_offset in bytes.
#   - x0 (mr) is reused as the k counter, x7 (cm_stride) is reused as a GPR
#     shadow for B, and x8 holds the params pointer only until min/max are
#     loaded, after which it becomes the a0 pointer.
#   NOTE(review): the remainder paths test only bits 3 and 2 of k, so kc is
#   assumed to be nonzero and a multiple of 4 bytes - confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched below, so nothing is saved or restored.

# A pointer
# x8 a0

# C pointer
# x6 c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x14 x15 x16  first set of B
# B   v5  v6  v7    x17 x13 x7
# B  v23 v24 v25    x14 x15 x16  second set of B (same x as first set)
# B  v17 v18 v19    x17 x13 x7
# C  v20 v21 v22
# Clamp: v30 is the FMAX operand (lower bound), v31 is the FMIN operand
# (upper bound).
#
# Each B vector is assembled from two 64-bit loads: the low half goes
# straight into the d register, the high half goes into the GPR shadow listed
# above and is merged later with INS vN.d[1], xM.
# NOTE(review): presumably the split loads are chosen to suit Cortex-A53
# issue rules - confirm against the Cortex-A53 optimization guide.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # LD2R reads two consecutive floats at [x8] and broadcasts the first
        # to every lane of v30 and the second to every lane of v31.
        LD2R {v30.4s, v31.4s}, [x8]

# nc loop head: one pass per group of up to 12 output columns.
0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        # Prefetch the B data that follows the bias in w.
        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        MOV x9, x3  // p = ks

# ks loop head: one pass per A row pointer.
1:
        # Load next A pointer
        LDR x8, [x4], 8

        # Pointers equal to zero are used as-is; every other pointer gets
        # a_offset added.  ADD does not modify the flags set by CMP.
        CMP x8, x12                // if a0 == zero
        ADD x8, x8, x11            // a0 += a_offset
        CSEL x8, x12, x8, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x8], 8          // a0

        LDR d2, [x5]             // vb0x0123
        LDR x14, [x5, 8]

        LDR d3, [x5, 16]         // vb0x4567
        LDR x15, [x5, 24]

        LDR d4, [x5, 32]         // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]         // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]         // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]         // vb1x89AB
        LDR x7, [x5, 88]
        INS v2.d[1], x14         // complete v2; v3-v7 are completed in the next block
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16  // 4 floats for main loop
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Each iteration consumes 4 floats of A and 192 bytes of B.
2:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x8], 8          // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x14        // v23 was loaded in block 2
        LDR x7, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x8], 8          // a0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]         // vb0x0123
        INS v17.d[1], x17
        LDR x14, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]        // vb0x4567
        INS v18.d[1], x13
        LDR x15, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]        // vb0x89AB
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]        // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]        // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16          // sets the flags tested by B.HS below
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]        // vb1x89AB
        INS v2.d[1], x14
        LDR x7, [x5, 184]
        ADD x5, x5, 192
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d1, [x8], 8          // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x14        // v23 was loaded in block 2
        LDR x7, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15               // low 4 bits of k hold the leftover byte count

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

# ks loop tail; the remainder paths at labels 5 and 6 branch back here.
4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v30.4s
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        SUBS x1, x1, 12
        B.LO 7f

        # ST1 and SUB leave the flags from SUBS intact for B.HI below.
        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x10  // c0 += cn_stride
        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b
        RET

# Remainder of k, reached with fewer than 4 floats of A left.  Only the low
# bits of x0 are inspected, so x0 may be negative here.
5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x8], 8          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b            // no single float left: back to the ks loop
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x8], 4          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

# Partial store for the last group of fewer than 12 columns.  Bits 3..0 of nc
# select stores of 8, 4, 2 and 1 floats; after each store the next unstored
# data is moved down into v20.
7:
        ADD x1, x1, 12           // undo the SUBS above so x1 is the remaining nc
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]        // move the upper 2 floats down for the final store

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif