// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0   v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data

# Overview
# Signed 8-bit GEMM micro-kernel that produces one output row, 8 columns per
# pass. Products are accumulated in int32, then scaled in fp32, rounded,
# offset and clamped back to int8.
#
# Arguments as used by this code:
#   mr, a_stride and cm_stride are never read. x0 is reused as the k counter
#   and x7 as a scratch register for A loads.
#   kc is rounded up to a multiple of 8, so a and w are always consumed in
#   whole 8-byte steps.
#   NOTE(review): when kc is not a multiple of 8 this reads past kc bytes of
#   a - presumably the caller guarantees that is safe; confirm.
#   w is consumed, per group of 8 columns, as 8 x 32-bit initial accumulator
#   values followed by 64 bytes (8 columns x 8 bytes) for every 8 bytes of kc.
#   params is read as a 32-bit float scale at +0, a 16-bit value at +4 that is
#   added after narrowing to int16, and two bytes at +6 and +7 used as the
#   lower and upper clamp.
#
# NOTE(review): each SMULL + SMLAL pair sums two int8 x int8 products in an
# int16 lane before widening. That only stays in range when both products are
# not 16384 at once (for example weights limited to [-127, 127]) - confirm
# against the weight packing code.
# NOTE(review): B and A data is loaded with 64-bit LDR into x16/x17/x7 and
# then moved with INS, presumably to schedule better next to the NEON
# multiplies on Cortex-A53 - confirm against the A53 optimization guide.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

        LDP     x10, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        BIC     x2, x2, 7

        # Outer loop: one pass per group of 8 output columns
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each 32-bit value goes to the low lane of one column accumulator
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        # Is there at least 16 bytes for epilogue?
        # Taken only when kc == 8: go straight to the 8-byte remainder
        B.LO    4f

        # Prologue: load A0 and 4 B's
        # v0 / v6 hold A bytes 0-7 / 8-15. B offset 8*col holds the weights
        # that pair with v0, offset 64 + 8*col the weights that pair with v6.
        LDP     d0, d6, [x3], 16        // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        #
        # Software pipelined: every block multiplies two columns while loading
        # the B data for the next two, and the SADALP that folds a block's
        # int16 products into the int32 accumulators is issued in the
        # following block. B offsets of 128 and above belong to the next
        # iteration (x5 advances by 128 at the end of BLOCK 3).

        .p2align 3
1:
        # BLOCK 0 - 6 cycles
        # Products for columns 0-1; loads B for columns 2-3 (and column 4 into x16)
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        # Products for columns 2-3; accumulates columns 0-1 into v16/v18;
        # prefetches B ahead of x5
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v18.4s, v19.8h
        PRFM    PLDL1KEEP, [x5, 512]
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        # Products for columns 4-5; accumulates columns 2-3 into v20/v22;
        # prefetches A ahead of x3
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        PRFM    PLDL1KEEP, [x3, 128]
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // column 0 of the next iteration
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        # Products for columns 6-7; accumulates columns 4-5 and 6-7; reloads
        # A and the first B values so the next iteration starts like the prologue
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x3], 8             // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128             // advance w past 16 K bytes x 8 columns
        SADALP  v30.4s, v23.8h
        B.HS    1b                      // loop while the subtraction did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder? - 8 bytes of A
        # x0 is negative here; bit 3 is set exactly when 8 bytes of K remain
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Each accumulator holds 4 partial sums of one column. Pairwise adds
        # leave columns 0-3 in v0 and columns 4-7 in v1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Apply params - scale, bias and clamp
        SCVTF   v0.4s, v0.4s            // int32 -> fp32
        LD1R    {v4.4s}, [x11], 4       // broadcast 32-bit scale
        SCVTF   v1.4s, v1.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s

        FCVTNS  v0.4s, v0.4s            // fp32 -> int32, round to nearest, ties to even
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2       // broadcast 16-bit value added below
        SQXTN   v0.4h, v0.4s            // saturating narrow to int16
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the 16-bit param
        LD1R    {v1.16b}, [x11], 1      // broadcast lower clamp byte
        SQXTN   v0.8b, v0.8h            // saturating narrow to int8
        LD1R    {v17.16b}, [x11]        // broadcast upper clamp byte
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 7             // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // fewer than 8 columns left: partial store

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // store 8 bytes, then c += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        B.HI    0b                      // more columns remain
        RET

        # Remainder - 8 bytes of A
        # One 8-byte slice of A in v0 against 64 bytes of B covering all 8
        # columns, then rejoin the reduction at label 3
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits match the remaining column
        # count, so bits 2, 1 and 0 select 4-, 2- and 1-byte stores. EXT
        # rotates the bytes already stored out of the low lanes.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif