// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Purpose
#   Produces one row of 8-bit outputs, 8 columns per pass of the outer loop.
#   Signed 8-bit values from a and w are multiplied and summed into 32-bit
#   accumulators, converted to float, multiplied by a scale read from params,
#   converted back to integer, offset by a 16-bit value with saturation,
#   clamped between two bytes read from params and stored through c.
#
# Notes on the arguments, as used by the code below
#   mr is never read: x0 is overwritten with the k counter before any use.
#   a_stride (x4) is never referenced, and cm_stride is never read either:
#   x7 is reused as a scratch register for A data.
#   kc is rounded up to a multiple of 8 before use, so a and w are read in
#   whole 8-byte groups.
#   NOTE(review): this presumably relies on a and w being readable up to the
#   rounded kc - confirm against the packing code and the callers.
#   params is read as 4 bytes (scale), 2 bytes (offset), 1 byte (lower clamp)
#   and 1 byte (upper clamp), in that order.
#
# Layout of w, as implied by the offsets used below
#   8 x 32-bit initial accumulator values, then for every 8 bytes of k a
#   64-byte group holding 8 bytes for each of the 8 columns.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers is touched here, so nothing is saved on the stack.

# Register usage
# A0  x3  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data
# The remainder path additionally uses v6 and v7 for B.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

        LDP     x10, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        BIC     x2, x2, 7

        # Outer loop: one pass per group of up to 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load writes lane 0 and clears the remaining lanes, so
        # every accumulator starts as one bias value plus three zero lanes.
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        # Is there at least 16 bytes for epilogue?
        # kc is a multiple of 8, so falling short here means kc == 8.
        B.LO    4f

        # Prologue: load A0 and 4 B's
        # x16 additionally preloads the B data consumed by BLOCK 1.
        LDP     d0, d6, [x3], 16        // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp (SMULL/SMLAL/SADALP) + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        #
        # Each BLOCK handles two columns: SMULL forms the 16-bit products for
        # the first 8 bytes of k (v0), SMLAL adds the products for the second
        # 8 bytes (v6), and SADALP in the following BLOCK folds the 16-bit
        # sums pairwise into the 32-bit accumulators.  Loads for later BLOCKs
        # are interleaved; 64-bit values destined for v0, v2 and v4 go through
        # x7, x17 and x16 and are moved in with INS.
        # NOTE(review): SMULL followed by SMLAL keeps the sum of two products
        # in 16 bits, which wraps only when both products are (-128)*(-128).
        # Presumably the inputs exclude that case - confirm.

        .p2align 3
1:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // First B of the next iteration
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        # Also reloads A0 and the B registers that the next iteration (or the
        # epilogue) expects to find already loaded.
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x3], 8             // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder?- 8 bytes of A
        # k is -16 or -8 at this point; bit 3 is set only for -8, which means
        # one more 8-byte group of k is left.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Pairwise adds reduce the four partial sums of every column, leaving
        # columns 0-3 in v0 and columns 4-7 in v1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Apply params - scale, bias and clamp
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4       // 32-bit scale
        SCVTF   v1.4s, v1.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2       // 16-bit offset added after narrowing
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags are consumed below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]        // upper clamp
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 7             // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        B.HI    0b                      // more columns remain
        RET

        # Remainder - 8 bytes of A
        # Handles one 8-byte group of k (64 bytes of w) without pipelining,
        # then rejoins the common reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of the number
        # of columns left, so bits 2, 1 and 0 select stores of 4, 2 and 1
        # bytes.  EXT rotates the bytes already stored out of the low lanes.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif