// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// NOTE(review): this file is generated; any change made here should also be
// made in the template named above, otherwise regeneration will drop it.


#include <xnnpack/assembly.h>

// Syntax: GNU as / AArch64, preprocessed with the C preprocessor.
// All comments use `//` so that no comment line begins with `#`, which the
// preprocessor would otherwise lex as a directive line.

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
//     size_t mr,                 x0   (never read; x0 is reused as k counter)
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const int8_t* restrict a,  x3
//     size_t a_stride,           (x4) (never read)
//     const void* restrict w,    x5
//     int8_t* restrict c,        x6
//     size_t cm_stride,          (x7) (never read; x7 is reused as a temporary)
//     size_t cn_stride,          [sp] -> x10
//     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
//
// What the code below does, per pass of the outer loop (label 0):
//   1. Loads 8 x 32-bit initial accumulator values from w into lane 0 of
//      v16 v18 v20 v22 v24 v26 v28 v30 (one register per output column).
//   2. Walks K in steps of 8 bytes of A.  For every 8 bytes of A it consumes
//      64 bytes of w: 8 bytes for each of the 8 output columns.  Products are
//      formed as int8 x int8 -> int16 (SMULL/SMLAL) and pairwise-accumulated
//      into int32 (SADALP).
//   3. Reduces each accumulator register to one int32 (ADDP), converts to
//      float, multiplies by 8 per-column float scales read from w, converts
//      back with round-to-nearest-even (FCVTNS), saturating-narrows to int16,
//      saturating-adds a 16-bit value from params, saturating-narrows to
//      int8 and clamps between two int8 bounds from params.
//   4. Stores up to 8 bytes to c, advances c by cn_stride, rewinds a by the
//      rounded kc, and repeats while more than 8 columns remained.
//
// w as read by this code: 32 bytes (8 x int32), then (rounded kc / 8) groups
// of 64 bytes, then 32 bytes (8 x float); the next column block follows.
// params as read by this code: 2 bytes (int16 addend), 1 byte (lower clamp),
// 1 byte (upper clamp).
// NOTE(review): the int16 addend is presumably the output zero point - confirm
// against the params struct definition.
//
// NOTE(review): SMULL followed by SMLAL sums two int8 x int8 products in a
// 16-bit lane before SADALP widens them.  That sum only exceeds int16 when
// both products are (-128) * (-128); presumably the packed weights exclude
// -128 - confirm with the weight packing code.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This function uses none of them and does not touch sp.

// Register usage
// A0  x3  v0  v6
// B   x5  v4  v5  v2  v3
// C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
// temp0   v17 v19 v21 v23
// x16, x17, x7 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

        LDP     x10, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        BIC     x2, x2, 7

        .p2align 3
0:
        // Load initial bias from w into accumulators.
        // Scalar S-register loads clear the remaining lanes of each vector.
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        // Is there at least 16 bytes for epilogue?
        B.LO    4f                      // kc == 8: only the 8-byte remainder

        // Prologue: load A0 and the first B values
        LDP     d0, d6, [x3], 16        // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        // Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        // Main loop - 16 bytes of A
        // 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        // 1 load for A0 = +1 cycle.  Total 41 cycles.
        //
        // B values bound for v4/v2 are loaded into x16/x17 and moved in with
        // INS; values bound for v5/v3 are loaded directly with LDR d.
        // The instruction order is part of the Cortex-A53 scheduling; keep it.

        .p2align 3
1:
        // BLOCK 0 - 6 cycles (columns 0-1)
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        // BLOCK 1 - 10 cycles (columns 2-3; accumulate columns 0-1)
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        // BLOCK 2 - 10 cycles (columns 4-5; accumulate columns 2-3)
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // first B value of the next 16 bytes
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        // BLOCK 3 - 15 cycles (columns 6-7; accumulate columns 4-7)
        // Also reloads A0 and B for the next pass (main loop or epilogue).
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x3], 8             // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h
        B.HS    1b                      // flags are still those of SUBS x0

        // Epilogue
        // Same as main loop except no loads at end of loop

        .p2align 3
2:
        // BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        // BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        // BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        // BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        // Is there a remainder? - 8 bytes of A
        // Here x0 is -16 or -8; bit 3 is set only for -8.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        // Add columns: reduce each accumulator register to a single int32.
        // v0 receives outputs 0-3, v1 receives outputs 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        // Load per channel scale values from weights
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2       // 16-bit addend from params
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO/B.HI
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound from params
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]        // upper clamp bound from params
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 3             // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // fewer than 8 columns were left

        // Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        B.HI    0b                      // more columns remain after this store
        RET

        // Remainder - 8 bytes of A
        // One 8-byte A chunk against 64 bytes of B, no software pipelining.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        // Store odd width
        // x1 holds nc - 8 here; its low three bits equal those of nc.
        // Store 4, 2 and 1 bytes as selected by bits 2, 1 and 0, rotating
        // the stored bytes out of v0 after each partial store.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (no flags set).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
