// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Single-row GEMM micro-kernel with output clamping.
# For every block of up to 8 output columns the kernel:
#   1. seeds the accumulators with 8 floats read from w,
#   2. adds a0[k] * (8 floats from w) for every float k of the A row,
#   3. clamps the 8 results between v4 and v5,
#   4. stores 8 floats to c0 (or the low 4/2/1 pieces for a partial block).
# All counters and strides are handled in bytes: kc is consumed 16/8/4 bytes
# at a time and a0 is rewound by kc bytes after each column block.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Accumulators
# v16 v17  primary set - seeded from w, fed by A lanes 0 and 2
# v18 v19  secondary set - zeroed, fed by A lanes 1 and 3, folded in at label 4

# A values
# v0 v1    4 floats each, used alternately by the pipelined loop

# B values
# v20-v27  four pairs, one pair (8 columns) per float of A

# Clamp v4 v5
# v4 is the lower bound (FMAX operand), v5 is the upper bound (FMIN operand)

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at x8 and broadcasts the first to
        # every lane of v4 and the second to every lane of v5.  Done once,
        # outside the column loop.
        LD2R {v4.4s, v5.4s}, [x8]

        # Outer loop: one iteration per block of 8 output columns
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        MOVI v19.4s, 0

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 8 floats: skip the pipelined code, go to remainder handling
        B.LO 3f

        # Prologue - preload 16 bytes (4 floats) of A and the matching B
        # Read first block of 1 A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 16 floats (64 bytes) in total?  Yes - do main loop,
        # otherwise only the epilogue runs.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA block uses data loaded by the previous block while the
        # interleaved loads fetch data for the next block.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32  // k -= 32; flags decide whether to iterate again
        LDP q26, q27, [x5], 32
        B.HS 1b

2:
        # Epilogue
        # Same as one main loop iteration, minus the loads for a following
        # iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Remainder dispatch.  Only multiples of 32 were subtracted from kc to
        # form x0, so bits 4, 3 and 2 of x0 still equal those bits of kc even
        # though x0 has gone below zero by now.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the secondary accumulators into the primary ones.
        # SUBS sets the flags tested by B.LO and B.HI further down; none of
        # the instructions in between writes the condition flags.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8  // nc -= 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path instead
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14  // c0 += cn_stride

        SUB x3, x3, x2 // a0 -= kc

        # More columns remain: process the next block of 8
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # No 2 float remainder: skip ahead to the 1 float test
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # No 1 float remainder: accumulation is complete, go clamp and store
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 holds nc - 8 here (below zero); its low three bits equal those of
        # the number of columns left.  Each step stores the low part of v16
        # and then moves the next unsaved values down into it.
9:
        # Bit 2 set: store 4 floats, then continue with the upper 4 results
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        # Bit 1 set: store 2 floats, then bring the upper 64 bits down
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        # Bit 0 set: store the final float
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif