// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x2-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64(
//     size_t mr,                         x0
//     size_t nc,                         x1
//     size_t kc,                         x2 / x0
//     size_t ks,                         x3 / x9
//     const float**restrict a,           x4
//     const float*restrict w,            x5
//     float*restrict c,                  x6
//     size_t cm_stride,                  x7
//     size_t cn_stride,                  [sp] -> x10
//     size_t a_offset,                   [sp + 8] -> x11
//     const float* zero,                 [sp + 16] -> x12
//     const xnn_f32_minmax_params params [sp + 24] -> (x8)
//
// Syntax: GNU as, AArch64, run through the C preprocessor.
// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches none of them and never adjusts sp.
//
// Register roles
//   x0        mr on entry, then reused as the running k byte counter
//   x9        p, the running ks byte counter
//   x8        params pointer during setup, then a0
//   x13-x15   a1, a2, a3
//   x6        c0
//   x16       c1
//   x17       c2
//   x7        cm_stride on entry, then c3
//   v0-v3     A0-A3 (one register per row of A)
//   v20 v21   B
//   v24 v25   C accumulators, row 0
//   v26 v27   C accumulators, row 1
//   v28 v29   C accumulators, row 2
//   v30 v31   C accumulators, row 3
//   v4  v5    clamp bounds (v4 feeds FMAX, v5 feeds FMIN)

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64

        // Fetch the four stack-passed arguments.
        LDP     x10, x11, [sp]              // x10 = cn_stride, x11 = a_offset
        LDP     x12, x8, [sp, 16]           // x12 = zero, x8 = params

        // Derive c1..c3 from c0. A row index that is >= mr aliases the
        // previous row pointer, so stores to it land on a valid row.
        CMP     x0, 2                       // flags = mr vs 2
        ADD     x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO            // mr < 2  -> c1 = c0

        // Two consecutive floats from params, each replicated across a
        // 2-lane vector: first -> v4, second -> v5.
        LD2R    {v4.2s, v5.2s}, [x8]

        ADD     x17, x16, x7                // c2 = c1 + cm_stride
        CSEL    x17, x16, x17, LS           // mr <= 2 -> c2 = c1 (flags still from CMP x0, 2)

        CMP     x0, 4                       // flags = mr vs 4
        ADD     x7, x17, x7                 // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL    x7, x17, x7, LO             // mr < 4  -> c3 = c2

        // ---- nc loop: one 4x2 output tile per iteration ----
0:
        // Seed every row with the 2-float bias read from w. The odd
        // numbered accumulators start at zero and are added in after the
        // ks loop.
        LDR     d24, [x5], 8
        MOV     v26.8b, v24.8b
        MOV     v28.8b, v24.8b
        MOV     v30.8b, v24.8b
        MOVI    v25.2s, 0
        MOVI    v27.2s, 0
        MOVI    v29.2s, 0
        MOVI    v31.2s, 0

        MOV     x9, x3                      // p = ks

        // ---- ks loop: consume 4 row pointers from a per iteration ----
1:
        LDP     x8, x13, [x4], 16           // a0, a1
        LDP     x14, x15, [x4], 16          // a2, a3

        // A pointer equal to zero is used unchanged; any other pointer
        // gets a_offset added.
        CMP     x8, x12
        ADD     x8, x8, x11
        CSEL    x8, x12, x8, EQ             // a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x13, x12
        ADD     x13, x13, x11
        CSEL    x13, x12, x13, EQ           // a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x14, x12
        ADD     x14, x14, x11
        CSEL    x14, x12, x14, EQ           // a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x15, x12
        ADD     x15, x15, x11
        CSEL    x15, x12, x15, EQ           // a3 = (a3 == zero) ? zero : a3 + a_offset

        // Fewer than 2 floats (8 bytes) of K? Go straight to the tail.
        SUBS    x0, x2, 8                   // k = kc - 8
        B.LO    4f

        // ---- main loop: 2 floats (8 bytes) of each A row per pass ----
2:
        LDR     d0, [x8], 8
        LDP     d20, d21, [x5], 16          // 2 x 2 floats of B
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        SUBS    x0, x0, 8                   // k -= 8; flags consumed by B.HS below
        FMLA    v24.2s, v20.2s, v0.s[0]
        FMLA    v26.2s, v20.2s, v1.s[0]
        FMLA    v28.2s, v20.2s, v2.s[0]
        FMLA    v30.2s, v20.2s, v3.s[0]
        FMLA    v25.2s, v21.2s, v0.s[1]
        FMLA    v27.2s, v21.2s, v1.s[1]
        FMLA    v29.2s, v21.2s, v2.s[1]
        FMLA    v31.2s, v21.2s, v3.s[1]
        B.HS    2b

        // Bit 2 of k set -> one float (4 bytes) of A is still pending.
        TBNZ    x0, 2, 4f

3:
        // Next group of A pointers, if any remain.
        SUBS    x9, x9, 32                  // p -= MR * sizeof(void*)
        B.HI    1b

        // Merge the secondary accumulators into the primary ones.
        FADD    v24.2s, v24.2s, v25.2s
        FADD    v26.2s, v26.2s, v27.2s
        FADD    v28.2s, v28.2s, v29.2s
        FADD    v30.2s, v30.2s, v31.2s

        // Clamp with v4 (FMAX) and v5 (FMIN). The SUBS is interleaved
        // here; no later instruction on this path writes the flags, so
        // they feed both B.LO 5f and B.HI 0b.
        FMAX    v24.2s, v24.2s, v4.2s
        SUBS    x1, x1, 2                   // nc -= 2
        FMAX    v26.2s, v26.2s, v4.2s
        FMAX    v28.2s, v28.2s, v4.2s
        FMAX    v30.2s, v30.2s, v4.2s
        FMIN    v24.2s, v24.2s, v5.2s
        FMIN    v26.2s, v26.2s, v5.2s
        FMIN    v28.2s, v28.2s, v5.2s
        FMIN    v30.2s, v30.2s, v5.2s

        // Fewer than 2 columns were left -> single-column store.
        B.LO    5f

        // Full 4 x 2 store, rows 3 down to 0; each C pointer then steps
        // by cn_stride.
        STR     d30, [x7]
        ADD     x7, x7, x10
        STR     d28, [x17]
        ADD     x17, x17, x10
        STR     d26, [x16]
        ADD     x16, x16, x10
        STR     d24, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3                  // a -= ks (back to the first A pointer)

        // More columns to produce?
        B.HI    0b
        RET

        // ---- K tail: 1 float of each A row, 2 floats of B ----
4:
        LDR     s0, [x8], 4
        LDR     d20, [x5], 8
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v24.2s, v20.2s, v0.s[0]
        FMLA    v26.2s, v20.2s, v1.s[0]
        FMLA    v28.2s, v20.2s, v2.s[0]
        FMLA    v30.2s, v20.2s, v3.s[0]
        B       3b

        // ---- odd width: store one float per row, then return ----
5:
        STR     s30, [x7]
        STR     s28, [x17]
        STR     s26, [x16]
        STR     s24, [x6]
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif