// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
#
# Summary: indirect GEMM micro-kernel for a single output row (MR = 1) and
# 8 output columns per pass.  For every block of 8 columns the accumulators
# are seeded from w, then for each of the ks indirection entries kc bytes of A
# are multiplied against the packed weights streamed from w.  The result is
# clamped to [min, max] and stored to c.
#
# Units, as used by the code below:
#   kc is counted in bytes (it is compared against 32 = 8 floats).
#   ks is counted in bytes of the indirection buffer (decremented by 8 per pointer).
#   nc is counted in output columns (decremented by 8 per pass).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches x0-x12, v0-v1, v16-v27 and v30-v31, so nothing is spilled.

# Register usage
# A pointer
# x8  a0       (x8 first holds the params pointer; it is reused as a0 once min/max are loaded)
# v0, v1       two alternating groups of 4 floats of A

# B (weights)
# x5  w        post-incremented stream of bias + packed weights
# v20-v27      4 k-steps x 8 columns of B

# C pointer
# x6  c0
# v16, v17     primary accumulators (columns 0-3 and 4-7)
# v18, v19     secondary accumulators, summed into v16/v17 before the clamp

# Clamp
# v30 min, v31 max

# A53 based on a53/75 but with LD64
# NOTE(review): every load in this file is a 128-bit LDR q / LDP q; confirm what
# "LD64" refers to against the template.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values: v30 = first float at params (used by FMAX as the lower
        # bound), v31 = second float (used by FMIN as the upper bound), each broadcast
        # to all 4 lanes.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # nc loop entry: one pass per block of up to 8 output columns.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        MOVI v19.4s, 0

        MOV x9, x3  // p = ks

1:
        # ks loop entry: one pass per pointer in the indirection buffer.
        # Load next A pointer
        LDR x8, [x4], 8

        # The ADD does not modify the flags, so CSEL still sees the CMP result.
        CMP x8, x12           // if a0 == zero
        ADD x8, x8, x11       // a0 += a_offset
        CSEL x8, x12, x8, EQ  //   a0 = zero, else += a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # Otherwise handle all of kc in the remainder code at 5.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue
        # Read first block of A (4 floats, 16 bytes) and B (4 k-steps x 8 columns, 128 bytes).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Are there at least 16 floats (64 bytes) in total?  If yes do the main loop,
        # which consumes 8 floats per iteration and always leaves 8 for the epilogue.
        # Otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # FMLA and loads are interleaved one for one.  Each B register is reloaded
        # only after the FMLA that consumed its previous contents.  The template name
        # indicates this order is tuned for Cortex-A53; do not reorder.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16


        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        # Flags set here are consumed by B.HS below; the loads in between leave them intact.
        SUBS x0, x0, 32
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        B.HS 2b

3:
        # Epilogue
        # Consumes the 4 floats of A and the B block already in registers, plus one
        # further block of 4, without preloading anything for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  No A loads; only the last pending B load (q27) remains.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        # Only multiples of 32 were subtracted from kc, so the low 5 bits of x0 still
        # hold kc % 32.  The flags survive the two FMLA below until B.NE.
        TST x0, 31
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Fold the secondary accumulators into the primary ones.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        # Fewer than 8 columns left: go to the partial store at 8.
        SUBS x1, x1, 8
        B.LO 8f

        # c0 advances by cn_stride (x10) after the store.
        ST1 {v16.16b, v17.16b}, [x6], x10
        # Rewind the indirection pointer to its first entry for the next column block.
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # ST1 and SUB leave the flags from SUBS x1 untouched: loop while columns remain.
        B.HI 0b

        RET

5:
        # Remainder handling.  Bits 4, 3 and 2 of x0 select 16, 8 and 4 leftover bytes of A.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

6:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 7f
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Is there a remainder?- 1 float of A (4 bytes).  If not, resume the ks loop.
        TBZ x0, 2, 4b
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

8:
        # Store odd channels
        # x1 is now nc - 8 (negative); its low 3 bits still equal the number of
        # columns left.  Store 4, then 2, then 1 floats as selected by bits 2, 1, 0,
        # moving the not yet stored lanes down after each partial store.
        TBZ x1, 2, 9f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s16, [x6], 4
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif