// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 v5

# A53 based on A57/A75 but with LD64
// NOTE(review): every B load visible in this kernel is a 128-bit LDR q / LDP q
// form; the only 64-bit load is the 2-float A remainder (LDR d0).  Confirm
// whether the LD64 remark above still describes this variant.

// Overview (derived from the code below):
//   For each group of 8 output columns the accumulators v16/v17 are seeded
//   with the 8 floats found first at w (the bias).  Then, for every float of
//   a0, the next 8 floats of w are multiplied by it and accumulated.  The sum
//   is clamped to [v4, v5] and stored to c0.
//   kc is handled as a byte count (32 bytes = 8 floats of A); nc is handled as
//   a count of output columns (8 are consumed per pass).
//
// Register usage:
//   x0        k, remaining bytes of A (overwrites the unused mr argument)
//   x1        nc, remaining output columns
//   x2        kc, bytes of A per row; used to rewind a0
//   x3        a0, read pointer into A
//   x5        w, read pointer into bias + B; only ever advances
//   x6        c0, write pointer into C
//   x8        params pointer
//   x14       cn_stride
//   v0, v1    4 floats of A each, used alternately
//   v4, v5    clamp lower / upper bound, broadcast to all 4 lanes
//   v16, v17  primary accumulators (columns 0-3 and 4-7)
//   v18, v19  secondary accumulators, added into v16/v17 before clamping
//   v20-v27   8 vectors of B = 4 k-steps x 8 columns
//
// None of the callee-saved registers (d8-d15, x19-x30) is touched, so the
// function needs no stack frame.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53

        # Load cn_stride, params pointer
        // Both values are read from the stack: x14 = [sp], x8 = [sp + 8].
        LDP x14, x8, [sp]

        # Load min/max values
        // LD2R reads two consecutive floats at [x8] and replicates the first
        // into every lane of v4 and the second into every lane of v5.
        LD2R {v4.4s, v5.4s}, [x8]
0:
        // Outer loop entry: one pass per group of up to 8 output columns.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        // Prefetch hints for w at 64-byte steps, 64..576 bytes past x5,
        // interleaved with zeroing the secondary accumulators.
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]
        PRFM PLDL1KEEP, [x5, 384]
        PRFM PLDL1KEEP, [x5, 448]
        PRFM PLDL1KEEP, [x5, 512]
        PRFM PLDL1KEEP, [x5, 576]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        // kc < 32 bytes: skip the pipelined path and go straight to the
        // remainder handling at 3.
        B.LO 3f

        # 16 prologue
        # Read first block of 1 A and B.
        // Preload 8 vectors of B (4 k-steps x 8 columns) and 4 floats of A so
        // the first FMLA block has its operands ready.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        // i.e. at least 32 more bytes of A beyond the 32 that the prologue and
        // epilogue handle; otherwise run only the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        // Each FMLA consumes a B register loaded earlier, and every LDR
        // refills a register whose last use has already been issued.
        // Even k-steps accumulate into v16/v17, odd k-steps into v18/v19.
        // NOTE(review): the FMLA/LDR interleave is presumably tuned for the
        // Cortex-A53 pipeline - benchmark before reordering anything here.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        // Prefetch hints 384..576 bytes ahead of the current w pointer; one
        // loop iteration reads 256 bytes of w (16 LDR q).
        PRFM PLDL1KEEP, [x5, 384]
        PRFM PLDL1KEEP, [x5, 448]
        PRFM PLDL1KEEP, [x5, 512]
        PRFM PLDL1KEEP, [x5, 576]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        // k -= 32; the flags survive the two loads below and drive B.HS.
        SUBS x0, x0, 32
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        B.HS 1b

2:
        # Epilogue
        // Same shape as one main-loop iteration, but the second block does not
        // load A or B for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  no loads
        // NOTE(review): one load remains in this block - the last B vector
        // (q27) is fetched right after the first FMLA below.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        // Only multiples of 32 have been subtracted from kc to form x0, so
        // bits 4, 3 and 2 of x0 still equal those of kc and select the
        // 16-, 8- and 4-byte remainders of A.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        // Fold the secondary accumulators into the primary pair.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        // FMAX against v4 applies the lower bound, FMIN against v5 the upper.
        // SUBS sets the flags consumed by B.LO and B.HI below; none of the
        // instructions in between modifies them.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        // Fewer than 8 columns were left: take the partial-store path.
        B.LO 9f

        // Store 8 floats and advance c0 by cn_stride.
        ST1 {v16.16b, v17.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc

        // More columns remain (nc was > 8 before the subtract): next group.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        // No 2-float remainder: skip ahead to the 1-float check at 7.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        // No 1-float remainder: accumulation is complete, go clamp and store.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        // x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so
        // they still encode how many columns (1-7) are left to write.
9:
        // Bit 2 set: store 4 floats, then move the upper 4 results down.
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        // Bit 1 set: store 2 floats, then shift the next 2 into the low half.
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        // Bit 0 set: store the final single float.
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53

// On ELF targets emit an empty .note.GNU-stack section (conventionally this
// tells the linker that the object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif