// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# LINT.IfChange
# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x8  a0

# C pointer
# x6  c0

# Vector register usage
# A0     v0  v1
# B      v20 v21 v22 v23 v24 v25 v26 v27
# C      v16 v17   (start from the values loaded from w)
#        v18 v19   (second accumulator set, starts at zero, folded in before clamping)
# Clamp  v30 (lower bound, used by FMAX)  v31 (upper bound, used by FMIN)

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        # x8 only holds the params pointer until the LD2R below; afterwards it
        # is reused as the a0 pointer.
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        LD2R    {v30.4s, v31.4s}, [x8]

0:
        # Outer (nc) loop entry.
        # Load initial bias from w into accumulators
        LDP     q16, q17, [x5], 32
        MOVI    v18.4s, 0  // second set of C for pipelining FMLA
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5]
        MOVI    v19.4s, 0
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 64]
          PRFM    PLDL1KEEP, [x5, 128]
          PRFM    PLDL1KEEP, [x5, 192]

        MOV     x9, x3  // p = ks

1:
        # Load next A pointer
        LDR     x8, [x4], 8

        # The comparison is done before the offset is added, so the CSEL picks
        # the untouched zero pointer when the loaded pointer equals zero.
        CMP     x8, x12                 // if a0 == zero
        ADD     x8, x8, x11             // a0 += a_offset
        CSEL    x8, x12, x8, EQ         //   a0 = zero, else a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32  // k = kc - 32
        B.LO    4f

        # Prologue - first 4 floats of A (16 bytes) and 8 vectors of B.
        # Read first block of A and B.
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32
        LDR     q0, [x8], 16

        # Are there at least another 8 floats (32 bytes)?  yes do main loop
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x8], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDP     q22, q23, [x5], 32
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDP     q24, q25, [x5], 32
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v18.4s, v26.4s, v0.s[3]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDP     q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v16.4s, v20.4s, v1.s[0]
        LDR     q0, [x8], 16
        FMLA    v17.4s, v21.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        LDP     q22, q23, [x5], 32
        FMLA    v16.4s, v24.4s, v1.s[2]
        FMLA    v17.4s, v25.4s, v1.s[2]
        LDP     q24, q25, [x5], 32
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v18.4s, v26.4s, v1.s[3]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v19.4s, v27.4s, v1.s[3]
        SUBS    x0, x0, 32
        LDP     q26, q27, [x5], 32      // LDP leaves the SUBS flags intact for B.HS
        B.HS    2b

3:
        # Epilogue

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x8], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDP     q22, q23, [x5], 32
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDP     q24, q25, [x5], 32
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v18.4s, v26.4s, v0.s[3]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDP     q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA    v16.4s, v20.4s, v1.s[0]
        FMLA    v17.4s, v21.4s, v1.s[0]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v16.4s, v24.4s, v1.s[2]
        FMLA    v17.4s, v25.4s, v1.s[2]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]

4:
        # Only multiples of 32 have been subtracted from kc to form x0, so
        # bits 4, 3 and 2 of x0 still match those of kc and flag the leftover
        # 16, 8 and 4 bytes of A.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ    x0, 4, 6f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ    x0, 3, 7f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ    x0, 2, 9f

5:
        # ks loop
        SUBS    x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI    1b

        # Fold the second accumulator set into the first.
        FADD    v16.4s, v16.4s, v18.4s
        FADD    v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX    v16.4s, v16.4s, v30.4s
        FMAX    v17.4s, v17.4s, v30.4s
        FMIN    v16.4s, v16.4s, v31.4s
        FMIN    v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS    x1, x1, 8
        B.LO    10f

        STP     q16, q17, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3  // a -= ks

        # nc loop
        # STP, ADD and SUB above do not set flags, so this branch still tests
        # the result of SUBS x1, x1, 8.
        B.HI    0b

        RET

6:
        # Remainder- 4 floats of A (16 bytes)
        LDP     q20, q21, [x5], 32
        LDR     q0, [x8], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v26.4s, v0.s[3]
        FMLA    v19.4s, v27.4s, v0.s[3]

        # Skip the 2 float remainder when bit 3 of x0 is clear.
        TBZ     x0, 3, 8f
7:
        # Remainder- 2 floats of A (8 bytes)
        LDP     q20, q21, [x5], 32
        LDR     d0, [x8], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
8:
        # Back to the ks loop when there is no single float left.
        TBZ     x0, 2, 5b
9:
        # Remainder- 1 float of A (4 bytes)
        LDP     q20, q21, [x5], 32
        LDR     s0, [x8], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        B       5b

10:
        # Store odd channels
        # x1 holds nc - 8 here; subtracting 8 leaves bits 2, 1 and 0 unchanged,
        # and they select a 4, 2 and 1 float store in turn.
        TBZ     x1, 2, 11f
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b        // move the upper 4 results down

11:
        TBZ     x1, 1, 12f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]           // move the upper 2 results down

12:
        TBZ     x1, 0, 13f
        STR     s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
# LINT.ThenChange(1x8-aarch64-neonfma-cortex-a75.cc)

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif