// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# 4x8 f32 indirect GEMM (IGEMM) microkernel with min/max clamping.
# Computes C[4][8] += sum over ks indirection steps of A[4][kc] * W, where the
# A row pointers come from the indirection buffer `a` and rows equal to `zero`
# skip the a_offset adjustment. NEON FMA, loading A 64 bits (2 floats) at a time.
#
# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# NOTE(review): this kernel uses only volatile registers (x0-x17, v0-v7,
# v20-v31), so no prologue/epilogue save/restore is required.

# A pointers
# x8  a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B    v20 v21 v22 v23
# C    v24 v25
# C    v26 v27
# C    v28 v29
# C    v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Clamp C pointers
        # Rows beyond mr alias the previous row, so out-of-range rows are
        # computed but stored on top of valid rows (harmlessly duplicated).
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load min/max values
        # LD2R de-interleaves two consecutive floats from params and
        # broadcasts them: v4 = {min x4}, v5 = {max x4}.
        LD2R {v4.4s, v5.4s}, [x8]

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        LDP q24, q25, [x5], 32
        MOV v26.16b, v24.16b
        MOV v27.16b, v25.16b
        MOV v28.16b, v24.16b
        MOV v29.16b, v25.16b
        MOV v30.16b, v24.16b
        MOV v31.16b, v25.16b

        MOV x9, x3               // p = ks

1:
        # Load next 4 A pointers
        LDP x8, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # For each row: if the indirection entry is the `zero` buffer, use it
        # as-is (padding row); otherwise apply a_offset to the real row.
        CMP x8, x12              // if a0 == zero
        ADD x8, x8, x11          // a0 += a_offset
        CSEL x8, x12, x8, EQ     //   a0 = zero, else += a0 + a_offset
        CMP x13, x12             // if a1 == zero
        ADD x13, x13, x11        // a1 += a_offset
        CSEL x13, x12, x13, EQ   //   a1 = zero, else += a1 + a_offset
        CMP x14, x12             // if a2 == zero
        ADD x14, x14, x11        // a2 += a_offset
        CSEL x14, x12, x14, EQ   //   a2 = zero, else += a2 + a_offset
        CMP x15, x12             // if a3 == zero
        ADD x15, x15, x11        // a3 += a_offset
        CSEL x15, x12, x15, EQ   //   a3 = zero, else += a3 + a_offset

        # Is there at least 2 floats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        # Main loop - 2 floats of A (8 bytes)
        # Each iteration: load 2 floats per A row and 4 B rows (8 floats each),
        # issue 16 FMAs (2 k-steps x 4 rows x 2 accumulator halves).
2:
        LDR d0, [x8], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v24.4s, v20.4s, v0.s[0]
        FMLA v25.4s, v21.4s, v0.s[0]
        FMLA v26.4s, v20.4s, v1.s[0]
        FMLA v27.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v24.4s, v22.4s, v0.s[1]
        FMLA v25.4s, v23.4s, v0.s[1]
        FMLA v26.4s, v22.4s, v1.s[1]
        FMLA v27.4s, v23.4s, v1.s[1]
        SUBS x0, x0, 8
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        B.HS 2b

        # Is there a remainder?- 1 float of A (4 bytes)
        # On loop exit x0 = (kc mod 8) - 8, i.e. -8 or -4 for 4-byte-aligned
        # kc; bit 2 is set exactly when 4 bytes (1 float) remain.
        TBNZ x0, 2, 4f

3:
        # ks loop
        SUBS x9, x9, 32          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 5f

        # STP/ADD below do not set flags, so the condition from
        # SUBS x1, x1, 8 is still live for the B.HI at the bottom.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q26, q27, [x16]
        ADD x16, x16, x10
        STP q24, q25, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3           // a -= ks  (rewind indirection buffer for next nc tile)

        # nc loop
        B.HI 0b
        RET

        # Remainder- 1 float of A
4:
        LDR s0, [x8], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v24.4s, v20.4s, v0.s[0]
        FMLA v25.4s, v21.4s, v0.s[0]
        FMLA v26.4s, v20.4s, v1.s[0]
        FMLA v27.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 3b

        # Store odd width
        # nc < 8: store 4, then 2, then 1 columns, shifting the high half of
        # each accumulator pair down as columns are consumed.
5:
        TBZ x1, 2, 6f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q26, [x16], 16
        MOV v26.16b, v27.16b
        STR q24, [x6], 16
        MOV v24.16b, v25.16b

6:
        TBZ x1, 1, 7f
        STR d30, [x7], 8
        STR d28, [x17], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x16], 8
        STR d24, [x6], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]

7:
        TBZ x1, 0, 8f
        STR s30, [x7]
        STR s28, [x17]
        STR s26, [x16]
        STR s24, [x6]
8:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif