// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
#
# Signed 8-bit GEMM microkernel producing one row of up to 8 output columns
# per pass.  K is consumed in groups of 8 bytes per column.
#
# Registers shown in parentheses (x4, x7) are never referenced by this kernel.
# The incoming mr value in x0 is never read; x0 is reused as the K counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x3, x5, x6, x10, x11, v0-v7 and a subset of
# v16-v30, so nothing is saved or restored.

# Register usage
# A0  x3  v0  v6
# B   x5  v4  v5  v2  v3        (remainder path uses v4 v5 v6 v7)
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0  v17 v19 v21 v23
#
# Each C0 register accumulates one output column as 4 x 32-bit partial sums:
# v16 = column 0, v18 = column 1, ... v30 = column 7.
# Requantization reuses v0, v1, v4, v5 and v17 as scratch.


BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

        LDP     x10, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        BIC     x2, x2, 7

        # Outer loop over groups of 8 output columns
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Eight 32-bit values (32 bytes) are read.  A scalar load writes lane 0
        # of the vector register and clears the other lanes.
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        # Is there at least 16 bytes for epilogue?
        # Flags come from the SUBS above; the loads in between leave them intact.
        B.LO    4f

        # Prologue: load A0 and 4 B's
        # v0 and v6 receive the first and second 8 bytes of A.  B for the first
        # 8 bytes of K sits at [x5, 0..63] (8 bytes per column); B for the
        # second 8 bytes of K sits at [x5, 64..127].
        LDP     d0, d6, [x3], 16        // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp = 6 cycles.
        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
        # NOTE(review): the cycle figures here and on the BLOCK headers are kept
        # from the template comments and were not re-derived.  The mention of A1
        # looks inherited from a multi-row variant, since this kernel only loads
        # A0 - confirm.
        #
        # Every block handles two columns: SMULL forms the 16-bit products for
        # the first 8 bytes of K, SMLAL adds the products for the second 8
        # bytes, and SADALP (issued one block later) pairwise-adds the 16-bit
        # lanes into the 32-bit column accumulator.
        # NOTE(review): two int8 products are summed in 16 bits before
        # widening; this presumably relies on a value-range restriction on the
        # inputs so that the sum cannot overflow - confirm against the packing
        # code.

        .p2align 3
1:
        # BLOCK 0 - 4 cycles
        # Columns 0 and 1; then fetch B for columns 2 and 3.
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        # Columns 2 and 3; accumulate columns 0 and 1; fetch B for columns 4, 5.
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        # Columns 4 and 5; accumulate columns 2 and 3; fetch B for columns 6, 7.
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 14 cycles
        # Columns 6 and 7; accumulate columns 4 to 7.  The loads for the next
        # iteration are placed after the last use of the registers they
        # overwrite.  SUBS sets the flags consumed by B.HS at the bottom.
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d0, d6, [x3], 16        // Read A0
        SADALP  v28.4s, v21.8h
        LDP     d2, d3, [x5, 64]        // Read B
        SADALP  v30.4s, v23.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop
        # It consumes the 16 bytes of A and first B vectors that were already
        # loaded by the prologue or by the last main loop iteration.

        .p2align 3
2:
        # BLOCK 0 - 4 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 80]

        # BLOCK 1 - 6 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 96]

        # BLOCK 2 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDP     d2, d3, [x5, 112]

        # BLOCK 3 - 8 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # Is there a remainder? - 8 bytes of A
        # The rounded kc is a multiple of 8 and x0 only ever changes by 16, so
        # bit 3 of x0 is set exactly when one 8 byte group of K is left over.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Pairwise adds reduce each 4-lane accumulator to one 32-bit sum:
        # v0 = columns 0-3, v1 = columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Apply params - scale, bias and clamp
        # Params are read in order as: 32-bit float multiplier, 16-bit value
        # added with saturation, 8-bit lower bound, 8-bit upper bound.
        # NOTE(review): the 16-bit value is presumably the output zero point -
        # confirm against the params struct definition.
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s

        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1 bytes)
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // fewer than 8 columns were left

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10      // c advances by cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        B.HI    0b                      // more columns remain
        RET

        # Remainder - 8 bytes of A
        # One 8 byte group of K against 64 bytes of B (8 bytes per column).
        # There is no second K group here, so SMULL is not followed by SMLAL.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        # Store odd width
        # x1 now holds nc - 8, whose low three bits equal those of the number
        # of columns left.  Write 4, 2 and 1 bytes as selected by bits 2, 1 and
        # 0; EXT rotates the bytes already written out of the low lanes.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif