// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
//     size_t mr,                 x0  (overwritten without being read)
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0 (x0 is reused as the running k counter)
//     const int8_t* restrict a,  x3
//     size_t a_stride,           (x4)  not referenced by this 1-row kernel
//     const void* restrict w,    x5
//     int8_t* restrict c,        x6
//     size_t cm_stride,          (x7)  not referenced by this 1-row kernel
//     size_t cn_stride,          [sp] -> x10
//     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers are touched below, so nothing is spilled.

// Register usage
//   A0     x3   v0  v6                  (remainder path: v0 only)
//   B      x5   v4  v5  v2  v3          (remainder path: v4 v5 v6 v7)
//   C0     x6   v16 v18 v20 v22 v24 v26 v28 v30   one accumulator per channel
//   temp0       v17 v19 v21 v23         16-bit products before widening
//   k      x0                           bytes of K left, biased by -32 in the loop
//   cn_stride   x10
//   params      x11
//
// Layout of w as consumed by the loads below:
//   8 x 32-bit bias, then for every 8 bytes of K a 64-byte panel holding
//   8 bytes for each of the 8 channels, then 8 x 32-bit per-channel scales.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

        LDP     x10, x11, [sp]          // x10 = cn_stride, x11 = params
        ADD     x2, x2, 7               // round kc up to a multiple of 8:
        BIC     x2, x2, 7               //   kc = (kc + 7) & ~7

        .p2align 3
0:
        // Start of one 8-channel tile: seed the accumulators with the bias.
        // A scalar S-register load clears the rest of the vector, so each
        // bias lands in lane 0 and the other three lanes start at zero.
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        // Fewer than 16 bytes of K in total: only the 8-byte remainder runs.
        B.LO    4f

        // Prologue: preload 16 bytes of A0 and the first two channels of B
        // for both 8-byte halves (the second half sits 64 bytes further on).
        LDP     d0, d6, [x3], 16        // A0: k[0..7] -> v0, k[8..15] -> v6
        LDP     d4, d5, [x5]            // B: channels 0,1 for k[0..7]
        LDP     d2, d3, [x5, 64]        // B: channels 0,1 for k[8..15]

        // Fewer than 32 bytes of K: skip the loop, go straight to the epilogue.
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        // Main loop - consumes 16 bytes of A0 and 128 bytes of B per pass.
        // Each BLOCK handles two channels: SMULL on the first 8 bytes of K,
        // SMLAL on the next 8 bytes, then SADALP to widen into the 32-bit
        // accumulators. The SADALPs are issued one block late so they overlap
        // with the next block's multiplies.
        // Cycle figures are the estimates carried over from the template
        // (4 groups of 2 SMULL/SMLAL/SADALP, plus the A0 load; 36 in total).

        .p2align 3
1:
        // BLOCK 0 - 4 cycles (channels 0,1; preload B for channels 2,3)
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        // BLOCK 1 - 6 cycles (channels 2,3; retire 0,1; preload B for 4,5)
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        // BLOCK 2 - 6 cycles (channels 4,5; retire 2,3; preload B for 6,7)
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        // BLOCK 3 - 14 cycles (channels 6,7; retire 4,5 and 6,7; reload A0
        // and the first B pair for the next pass)
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // w += 128: step over both K panels
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16; flags feed the B.HS below
        SADALP  v26.4s, v19.8h
        LDP     d4, d5, [x5]            // B: next channels 0,1 for k[0..7]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d0, d6, [x3], 16        // A0: next 16 bytes
        SADALP  v28.4s, v21.8h
        LDP     d2, d3, [x5, 64]        // B: next channels 0,1 for k[8..15]
        SADALP  v30.4s, v23.8h
        B.HS    1b

        // Epilogue - the final 16 bytes of K.
        // Same as the main loop, except nothing is preloaded at the end.

        .p2align 3
2:
        // BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        // BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        // BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        // BLOCK 3 - 8 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // w += 128
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        // x0 is now -16 (nothing left) or -8 (one more 8-byte group of K).
        // Bit 3 is set only for -8, so it selects the remainder path.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        // Horizontal reduction: fold the four partial sums held for each
        // channel, leaving channels 0-3 in v0 and channels 4-7 in v1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        // Requantize in fp32: convert to float and multiply by the
        // per-channel scales that follow the B panels in w.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16            // scales for channels 0-3
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16            // scales for channels 4-7
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s

        // Back to int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        // params is read as one 16-bit value followed by two bytes:
        //   16-bit: added with saturation after narrowing to int16
        //           (presumably the output zero point - confirm against params)
        //   byte 0: lower clamp (SMAX), byte 1: upper clamp (SMIN)
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags drive B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 3             // undo the 2 + 1 byte post-increments
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // fewer than 8 columns left: partial store

        // Full tile: store 8 outputs and advance c by cn_stride.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc (rounded), rewind for next tile
        B.HI    0b                      // more columns remain (nc was > 8)
        RET

        // Remainder - a single 8-byte group of K against all 8 channels.
        // Reached from the top when kc < 16 and from the epilogue when
        // 8 bytes of K are left over.
        .p2align 3
4:
        LDR     d0, [x3], 8             // A0: last 8 bytes
        LDP     d4, d5, [x5]            // B: channels 0,1
        LDP     d6, d7, [x5, 16]        // B: channels 2,3
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]        // B: channels 4,5
        LDP     d6, d7, [x5, 48]        // B: channels 6,7
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64              // w += 64: step over this K panel
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        // Partial tile: x1 holds nc - 8, whose low three bits equal those of
        // the remaining column count. Store 4, 2 and 1 bytes as each bit
        // dictates, rotating the consumed bytes out of v0 after each store.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif