// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# F32 GEMM microkernel producing up to 4 rows x 8 columns of C per outer
# iteration. For each block of 8 columns the accumulators are seeded with
# 8 floats read from w, then for every float of A in a row the next 8 floats
# of w are multiplied by it and accumulated. Results are clamped with
# FMAX against v4 and FMIN against v5 before being stored.
#
# The w pointer (x5) only ever moves forward: 32 bytes of bias followed by
# 32 bytes per k step, for each block of 8 columns in turn.
# NOTE(review): the code consumes kc in units of 16, 8 and 4 bytes only, so it
# presumably expects kc to be a non-zero multiple of 4 bytes - confirm with the
# caller contract.

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched below, so nothing is saved or restored.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0    v0
# A1    v1
# A2    v2
# A3    v3
# B     v20 v21 v22 v23 v24 v25 v26 v27
# C0    v16 v17
# C1    v18 v19
# C2    v28 v29
# C3    v30 v31
# Clamp v4 (operand of FMAX, lower bound) v5 (operand of FMIN, upper bound)

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128

        # Load cn_stride, params pointer
        # Both are stack arguments: x14 = [sp], x8 = [sp + 8].
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive 32-bit elements at x8 and broadcasts the
        # first to every lane of v4 and the second to every lane of v5.
        # NOTE(review): presumably params is laid out as {min, max} - confirm
        # against the xnn_f32_minmax_params definition.
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so the kernel always
        # runs four rows; an aliased row reads the same A data and writes the
        # same C addresses as the row it aliases.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags are still those of CMP x0, 2;
                                 //             ADD and CSEL do not modify them)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride (x4 stops being a_stride here)
        ADD x7, x10, x7          // c3 = c2 + cm_stride (x7 stops being cm_stride here)
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one iteration per block of 8 output columns.
0:
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all four rows.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        # Is there at least 4 floats (16 bytes)?
        # From here on x0 is reused as the k byte counter; mr is no longer needed.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Each iteration reads 16 bytes from every A row and 4 x 32 bytes from w.
        # The w loads are interleaved with the FMLAs that consume earlier loads.
1:
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q24, q25, [x5], 32
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q26, q27, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        SUBS x0, x0, 16          // k -= 16; the FMLAs below leave the flags intact
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]
        B.HS 1b                  // loop while the subtraction did not borrow

        # The counter has now gone below zero, but its low 4 bits still equal
        # the number of leftover bytes of kc (0 to 15).
        TST x0, 15
        B.NE 3f

2:
        # Clamp
        # SUBS on nc is scheduled here; its flags are consumed by B.LO 5f and
        # B.HI 0b further down, and nothing in between modifies the flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO 5f                  // fewer than 8 columns were left: partial store

        # Each C pointer advances by cn_stride (x14); each A pointer is rewound
        # by kc so the next column block rereads the same rows of A.
        ST1 {v16.16b, v17.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc
        ST1 {v18.16b, v19.16b}, [x9], x14
        SUB x11, x11, x2  // a1 -= kc
        ST1 {v28.16b, v29.16b}, [x10], x14
        SUB x12, x12, x2  // a2 -= kc
        ST1 {v30.16b, v31.16b}, [x7], x14
        SUB x4, x4, x2  // a3 -= kc

        B.HI 0b                  // more than 8 columns were left: next block
        RET                      // exactly 8 columns were left: done

        # Remainder- 2 floats of A (8 bytes)
3:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bit 3 of the counter selects the 8-byte step. When it is clear the
        # code goes straight to the 4-byte step at label 4 without testing
        # bit 2.
        TBZ x0, 3, 4f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 2b

        # Remainder- 1 float of A (4 bytes)
4:
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 2b


        # Store odd width
        # Reached only when fewer than 8 columns were left. x1 has already had
        # 8 subtracted, but its low 3 bits are unchanged by that, so bits 2, 1
        # and 0 select stores of 4, 2 and 1 floats per row. After each partial
        # store the not yet written elements are moved down to the low end of
        # the first register of each row.
5:
        TBZ x1, 2, 6f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        STR d16, [x6], 8
        STR d18, [x9], 8
        DUP d16, v16.d[1]        // bring the upper two floats down
        DUP d18, v18.d[1]
        STR d28, [x10], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        STR s16, [x6]
        STR s18, [x9]
        STR s28, [x10]
        STR s30, [x7]

8:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128

// Emit an empty .note.GNU-stack section on ELF targets (GNU toolchain
// convention for objects that do not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif