// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const uint8_t*restrict a,  x3
//     size_t a_stride,           x4
//     const void*restrict w,     x5
//     uint8_t*restrict c,        x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> x14
//     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
//
// Computes an up-to-4-row by 8-column tile of C per pass: the accumulators
// are seeded from w, FMLA-accumulated over K, clamped to [min, max] and
// stored. kc is counted in bytes (it is stepped by 8 per two floats and is
// subtracted from the A pointers); nc is counted in columns.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches only x0-x12, x14, v0-v5 and v20-v31, so nothing is
// saved or restored.

// A pointers
//   x3  a0
//   x11 a1
//   x12 a2
//   x4  a3 / a_stride

// C pointers
//   x6  c0
//   x9  c1
//   x10 c2
//   x7  c3 / cm_stride

// Vector register usage
//   A0     v0
//   A1     v1
//   A2     v2
//   A3     v3
//   B      v20 v21 v22 v23
//   C row0 v24 v25
//   C row1 v26 v27
//   C row2 v28 v29
//   C row3 v30 v31
//   Clamp  v4 (min) v5 (max)

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64

        // Fetch the two stack-passed arguments: cn_stride and the params pointer.
        ldp     x14, x8, [sp]

        // Derive the row pointers. Rows at index >= mr alias the previous row,
        // so they re-read and re-write valid memory instead of running past
        // the caller's buffers.
        cmp     x0, 2                       // flags for mr vs 2 (used by LO and LS below)
        add     x11, x3, x4                 // a1 = a0 + a_stride
        add     x9, x6, x7                  // c1 = c0 + cm_stride
        csel    x11, x3, x11, lo            // mr < 2: a1 = a0
        csel    x9, x6, x9, lo              // mr < 2: c1 = c0

        // Broadcast the clamp bounds: v4 = first float at params, v5 = second.
        ld2r    {v4.4s, v5.4s}, [x8]

        add     x12, x11, x4                // a2 = a1 + a_stride
        add     x10, x9, x7                 // c2 = c1 + cm_stride
        // The flags still hold the compare against 2; nothing in between
        // writes them.
        csel    x12, x11, x12, ls           // mr <= 2: a2 = a1
        csel    x10, x9, x10, ls            // mr <= 2: c2 = c1

        cmp     x0, 4                       // flags for mr vs 4
        add     x4, x12, x4                 // a3 = a2 + a_stride (x4 stops being a_stride)
        add     x7, x10, x7                 // c3 = c2 + cm_stride (x7 stops being cm_stride)
        csel    x4, x12, x4, lo             // mr < 4: a3 = a2
        csel    x7, x10, x7, lo             // mr < 4: c3 = c2

0:
        // Start of one 8-column tile. The first 8 floats at w are the bias;
        // every row starts from the same bias vector pair.
        ldp     q24, q25, [x5], 32
        mov     v26.16b, v24.16b
        mov     v27.16b, v25.16b
        mov     v28.16b, v24.16b
        mov     v29.16b, v25.16b
        mov     v30.16b, v24.16b
        mov     v31.16b, v25.16b

        // Skip the unrolled loop when fewer than 2 floats (8 bytes) of K exist.
        subs    x0, x2, 8                   // k = kc - 8
        b.lo    3f

        // Main loop: each pass consumes 2 floats (8 bytes) from every A row
        // and 16 floats (64 bytes) of B from w.
1:
        ldr     d0, [x3], 8
        ldp     q20, q21, [x5], 32
        ldr     d1, [x11], 8
        ldr     d2, [x12], 8
        ldr     d3, [x4], 8
        fmla    v24.4s, v20.4s, v0.s[0]
        fmla    v25.4s, v21.4s, v0.s[0]
        fmla    v26.4s, v20.4s, v1.s[0]
        fmla    v27.4s, v21.4s, v1.s[0]
        ldp     q22, q23, [x5], 32
        fmla    v28.4s, v20.4s, v2.s[0]
        fmla    v29.4s, v21.4s, v2.s[0]
        fmla    v30.4s, v20.4s, v3.s[0]
        fmla    v31.4s, v21.4s, v3.s[0]
        fmla    v24.4s, v22.4s, v0.s[1]
        fmla    v25.4s, v23.4s, v0.s[1]
        fmla    v26.4s, v22.4s, v1.s[1]
        fmla    v27.4s, v23.4s, v1.s[1]
        subs    x0, x0, 8                   // k -= 8; flags feed the b.hs below
        fmla    v28.4s, v22.4s, v2.s[1]
        fmla    v29.4s, v23.4s, v2.s[1]
        fmla    v30.4s, v22.4s, v3.s[1]
        fmla    v31.4s, v23.4s, v3.s[1]
        b.hs    1b

        // Bit 2 of k set means one more float (4 bytes) of K is pending.
        tbnz    x0, 2, 3f

2:
        // Clamp every accumulator to [v4, v5]. The nc decrement is slotted in
        // here; its flags drive both the b.lo and the b.hi further down, and
        // no instruction in between writes the flags.
        fmax    v24.4s, v24.4s, v4.4s
        subs    x1, x1, 8                   // nc -= 8
        fmax    v25.4s, v25.4s, v4.4s
        fmax    v26.4s, v26.4s, v4.4s
        fmax    v27.4s, v27.4s, v4.4s
        fmax    v28.4s, v28.4s, v4.4s
        fmax    v29.4s, v29.4s, v4.4s
        fmax    v30.4s, v30.4s, v4.4s
        fmax    v31.4s, v31.4s, v4.4s
        fmin    v24.4s, v24.4s, v5.4s
        fmin    v25.4s, v25.4s, v5.4s
        fmin    v26.4s, v26.4s, v5.4s
        fmin    v27.4s, v27.4s, v5.4s
        fmin    v28.4s, v28.4s, v5.4s
        fmin    v29.4s, v29.4s, v5.4s
        fmin    v30.4s, v30.4s, v5.4s
        fmin    v31.4s, v31.4s, v5.4s

        // Fewer than 8 columns were left: take the partial-width store path.
        b.lo    4f

        // Full 4 x 8 store. Each C pointer advances by cn_stride and each A
        // pointer is rewound by kc bytes for the next tile. w (x5) is not
        // rewound; it keeps advancing into the next tile's data.
        st1     {v24.16b, v25.16b}, [x6], x14
        sub     x3, x3, x2                  // a0 -= kc
        st1     {v26.16b, v27.16b}, [x9], x14
        sub     x11, x11, x2                // a1 -= kc
        st1     {v28.16b, v29.16b}, [x10], x14
        sub     x12, x12, x2                // a2 -= kc
        st1     {v30.16b, v31.16b}, [x7], x14
        sub     x4, x4, x2                  // a3 -= kc

        // Columns remain (nc was above 8 before the decrement): next tile.
        b.hi    0b

        ret

        // K remainder: 1 float (4 bytes) per A row against 8 floats of B.
3:
        ldr     s0, [x3], 4
        ldp     q20, q21, [x5], 32
        ldr     s1, [x11], 4
        ldr     s2, [x12], 4
        ldr     s3, [x4], 4
        fmla    v24.4s, v20.4s, v0.s[0]
        fmla    v25.4s, v21.4s, v0.s[0]
        fmla    v26.4s, v20.4s, v1.s[0]
        fmla    v27.4s, v21.4s, v1.s[0]
        fmla    v28.4s, v20.4s, v2.s[0]
        fmla    v29.4s, v21.4s, v2.s[0]
        fmla    v30.4s, v20.4s, v3.s[0]
        fmla    v31.4s, v21.4s, v3.s[0]
        b       2b

        // Partial-width store. x1 holds nc - 8 here; subtracting 8 leaves
        // bits 0-2 untouched, so they still encode the 4 / 2 / 1 column
        // pieces that remain to be written.
4:
        tbz     x1, 2, 5f
        // Write 4 columns per row, then slide the upper 4 down into place.
        str     q24, [x6], 16
        mov     v24.16b, v25.16b
        str     q26, [x9], 16
        mov     v26.16b, v27.16b
        str     q28, [x10], 16
        mov     v28.16b, v29.16b
        str     q30, [x7], 16
        mov     v30.16b, v31.16b

5:
        tbz     x1, 1, 6f
        // Write 2 columns per row, then move the high 64 bits down.
        str     d24, [x6], 8
        str     d26, [x9], 8
        dup     d24, v24.d[1]
        dup     d26, v26.d[1]
        str     d28, [x10], 8
        str     d30, [x7], 8
        dup     d28, v28.d[1]
        dup     d30, v30.d[1]

6:
        tbz     x1, 0, 7f
        // Write the final single column of each row.
        str     s24, [x6]
        str     s26, [x9]
        str     s28, [x10]
        str     s30, [x7]
7:
        ret

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif