// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Overview of what the code below does:
#   For each group of up to 8 output floats (nc loop, label 0):
#     - v16/v17 are seeded with 8 floats read from w; v18/v19 are zeroed.
#     - For each pointer read from a (ks loop, label 1; x9 counts down by 8
#       per pointer), kc bytes of A are multiplied against weights streamed
#       from w and accumulated. A pointer equal to zero is used as is; any
#       other pointer has a_offset added first.
#     - v16 += v18 and v17 += v19, the result is clamped with FMAX against
#       v30 and FMIN against v31, then 8 floats (or the 4/2/1 float tail) are
#       stored at c.
#   kc is counted in bytes: x0 = kc - 32 and drops by 32 per 8 floats of A.
#   Bits 0-1 of the counter are never examined.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x8  a0   (x8 first holds the params pointer, until min/max are loaded)

# C pointer
# x6  c0

# Vector registers
# v0  v1       A values (4 floats each)
# v16 v17      accumulators, seeded from w
# v18 v19      second set of accumulators, seeded with 0
# v20 - v27    B (weights), 4 floats each
# v30          clamp lower bound (first float at params)
# v31          clamp upper bound (second float at params)

# A53 based on a53/75 but with LD64

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values.
        # LD2R broadcasts the first float at [x8] to every lane of v30 and the
        # second float to every lane of v31.
        LD2R    {v30.4s, v31.4s}, [x8]

0:
        # nc loop entry: one pass per group of up to 8 output floats.
        # Load initial bias from w into accumulators
        LDP     q16, q17, [x5], 32
        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
        PRFM    PLDL1KEEP, [x5, 64]     // prefetch upcoming weights
        MOVI    v19.4s, 0
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]
        PRFM    PLDL1KEEP, [x5, 256]
        PRFM    PLDL1KEEP, [x5, 320]
        PRFM    PLDL1KEEP, [x5, 384]
        PRFM    PLDL1KEEP, [x5, 448]
        PRFM    PLDL1KEEP, [x5, 512]
        PRFM    PLDL1KEEP, [x5, 576]

        MOV     x9, x3                  // p = ks

1:
        # ks loop entry: one pass per A pointer.
        # Load next A pointer
        LDR     x8, [x4], 8

        # The compare is done on the raw pointer, before the offset is added;
        # ADD does not change the flags that CSEL consumes.
        CMP     x8, x12                 // if a0 == zero
        ADD     x8, x8, x11             // a0 += a_offset
        CSEL    x8, x12, x8, EQ         // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f                      // fewer than 8 floats: remainder only

        # Prologue
        # Read first block of A and B: 8 vectors of B and 4 floats of A.
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32
        LDR     q0, [x8], 16

        # Are there at least 8 more floats of A? Yes: do the main loop.
        # Otherwise go straight to the epilogue.
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is paired with a load that refills a register which an
        # earlier FMLA has already consumed.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x8], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v0.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v0.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDR     q26, [x5], 16
        LDR     q27, [x5], 16

        # Prefetch weights further ahead in w
        PRFM    PLDL1KEEP, [x5, 384]
        PRFM    PLDL1KEEP, [x5, 448]
        PRFM    PLDL1KEEP, [x5, 512]
        PRFM    PLDL1KEEP, [x5, 576]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v16.4s, v20.4s, v1.s[0]
        LDR     q0, [x8], 16
        FMLA    v17.4s, v21.4s, v1.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v1.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v1.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v1.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v1.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v1.s[3]
        SUBS    x0, x0, 32              // k -= 32; flags consumed by B.HS below
        LDR     q26, [x5], 16
        LDR     q27, [x5], 16
        B.HS    2b

3:
        # Epilogue
        # Consumes the A and B values already loaded (q0, q20-q27) plus one
        # more block of 4 floats of A, without reading ahead for another
        # iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x8], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v0.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v0.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDR     q26, [x5], 16

        # Second block of 4.  Only the last B load (q27) remains.
        FMLA    v16.4s, v20.4s, v1.s[0]
        LDR     q27, [x5], 16
        FMLA    v17.4s, v21.4s, v1.s[0]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v16.4s, v24.4s, v1.s[2]
        FMLA    v17.4s, v25.4s, v1.s[2]
        TST     x0, 31                  // any bytes left below a full 32?
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # The vector FMLAs above leave the flags set by TST untouched.
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(void*)
        B.HI    1b

        # Fold the second set of accumulators into the first
        FADD    v16.4s, v16.4s, v18.4s
        FADD    v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX    v16.4s, v16.4s, v30.4s
        FMAX    v17.4s, v17.4s, v30.4s
        FMIN    v16.4s, v16.4s, v31.4s
        FMIN    v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS    x1, x1, 8
        B.LO    8f                      // fewer than 8 floats left: partial store

        ST1     {v16.16b, v17.16b}, [x6], x10   // c0 advances by cn_stride
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; ST1 and SUB do not set flags.
        B.HI    0b

        RET

5:
        # Remainder handling. Only multiples of 32 have been subtracted from
        # x0, so bits 4, 3 and 2 of x0 equal the same bits of kc.

        # Is there a remainder of 4 floats of A (16 bytes)?
        # When bit 4 is clear, skip ahead to the 2 float check.
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     q0, [x8], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        LDR     q24, [x5], 16
        LDR     q25, [x5], 16
        LDR     q26, [x5], 16
        LDR     q27, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v26.4s, v0.s[3]
        FMLA    v19.4s, v27.4s, v0.s[3]

6:
        # Is there a remainder of 2 floats of A (8 bytes)?
        TBZ     x0, 3, 7f
        # Remainder- 2 floats of A (8 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     d0, [x8], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
7:
        # Is there a remainder of 1 float of A (4 bytes)?
        # When bit 2 is clear, rejoin the ks loop.
        TBZ     x0, 2, 4b
        # Remainder- 1 float of A (4 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s0, [x8], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        B       4b

8:
        # Store odd channels
        # Reached when fewer than 8 floats remain. x1 now holds nc - 8, whose
        # low 3 bits equal the low 3 bits of nc.
        TBZ     x1, 2, 9f
        STR     q16, [x6], 16           // store 4 floats
        MOV     v16.16b, v17.16b        // bring the next 4 results into v16

9:
        TBZ     x1, 1, 10f
        STR     d16, [x6], 8            // store 2 floats
        DUP     d16, v16.d[1]           // move the upper 2 floats down

10:
        TBZ     x1, 0, 11f
        STR     s16, [x6], 4            // store 1 float
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif