// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches x0-x12, v0-v1 and v16-v31, so nothing is saved.

# A pointer
# x8  a0   (x8 first holds the params pointer, then is reused for a0)

# C pointer
# x6  c0

# Loop counters
# x0  k = bytes of kc still to process for the current A pointer
# x9  p = bytes of ks still to process for the current block of columns

# Vector register usage
# A0     v0  v1
# B      v20 v21 v22 v23 v24 v25 v26 v27
# C      v16 v17   accumulators, initialised from the bias stored in w
# C      v18 v19   second accumulator set, zero initialised, added into
#                  v16 v17 before the clamp
# Clamp  v30 v31   v30 is the FMAX operand, v31 is the FMIN operand

# A53 based on a53/75 but with LD64

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # LD2R reads two consecutive floats and broadcasts the first to every
        # lane of v30 and the second to every lane of v31.
        LD2R {v30.4s, v31.4s}, [x8]

        # nc loop - one pass per block of up to 8 output columns
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 64]
        MOVI v19.4s, 0
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
          PRFM PLDL1KEEP, [x5, 384]
          PRFM PLDL1KEEP, [x5, 448]
          PRFM PLDL1KEEP, [x5, 512]
          PRFM PLDL1KEEP, [x5, 576]

        MOV x9, x3  // p = ks

        # ks loop - one pass per entry of the indirection buffer a
1:
        # Load next A pointer
        LDR x8, [x4], 8

        # The offset is added unconditionally and then discarded by CSEL when
        # the pointer is the shared zero buffer.
        CMP x8, x12            // flags = (a0 == zero)
        ADD x8, x8, x11        // a0 += a_offset (ADD leaves the flags alone)
        CSEL x8, x12, x8, EQ   // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue
        # Read first block of A (4 floats) and B (8 vectors of 4 floats).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Are there at least 8 more floats (32 bytes)?  yes do main loop
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is paired with a load that refills a register whose
        # previous value has already been consumed.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 384]
          PRFM PLDL1KEEP, [x5, 448]
          PRFM PLDL1KEEP, [x5, 512]
          PRFM PLDL1KEEP, [x5, 576]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        B.HS 2b  // LDR does not touch the flags set by SUBS

3:
        # Epilogue
        # Consumes the block preloaded in v0 and v20-v27 plus one more block
        # of 4 floats of A, without preloading anything for a further pass.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  Only the last B load (q27) is left to issue.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        TST x0, 31  // low 5 bits of k = leftover bytes of kc
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // p -= MR * sizeof(void*)
        B.HI 1b

        # Fold the second accumulator set into the first
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 8f

        ST1 {v16.16b, v17.16b}, [x6], x10  // c0 += cn_stride
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # ST1 and SUB leave the flags alone, so this still tests SUBS x1 above.
        B.HI 0b

        RET

        # Remainder of kc.  Only bits 4, 3 and 2 of k are inspected below.
        # k differs from kc by a multiple of 32, so those bits match kc.
5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

6:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 7f
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

8:
        # Store odd width (1 to 7 columns)
        # x1 holds nc - 8 here; its low 3 bits equal those of the remaining nc.
        # Bit 2 - store 4 floats, then move the upper 4 results down
        TBZ x1, 2, 9f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        # Bit 1 - store 2 floats, then move the next 2 results down
        TBZ x1, 1, 10f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        # Bit 0 - store the last float
        TBZ x1, 0, 11f
        STR s16, [x6], 4
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif