// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Summary: computes one row of 8 int8 outputs per pass of the outer loop.
# Each pass starts the accumulators from 8 int32 values read from w, adds
# the int8 dot products of A against 8 packed B columns (8 bytes of k per
# column per step), requantizes, clamps, and stores up to 8 bytes to c.
#
# Arguments shown in parentheses are never read as arguments: x4 is not
# referenced at all, x0 (mr) is overwritten with the k counter before any
# read, and x7 (cm_stride) is overwritten as a scratch register.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched below, so nothing is saved or restored.

# Register usage
# A0 x3  v0  v6
# B  x5  v4  v5  v2  v3
# C0 x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0  v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data
#
# NOTE(review): 64-bit halves of A and B are loaded into x7/x16/x17 and then
# moved into vector registers with INS. Presumably this pairing suits the
# Cortex-A53 issue rules; the rationale is not established by this file.


BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

        LDP     x10, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        BIC     x2, x2, 7

        # Outer loop: one pass per group of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of an accumulator and zeroes the rest.
        LDP     s16, s18, [x5], 8
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        # Is there at least 16 bytes for epilogue?
        # Branch taken when kc < 16, i.e. only a single 8 byte step remains.
        B.LO    4f

        # Prologue: load A0 and 4 B's
        # v0/v6 hold the two 8 byte halves of A. v4/v5 and v2/v3 hold the
        # first two B columns for each half; x16 holds the third column.
        LDP     d0, d6, [x3], 16        // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        #
        # Each iteration consumes 16 bytes of A and 128 bytes of B. For every
        # pair of columns, SMULL forms int16 products for the first 8 bytes
        # of k, SMLAL adds the products for the second 8 bytes, and SADALP
        # pairwise-adds the int16 lanes into the int32 accumulator. Loads
        # for the following pair are interleaved with the arithmetic.

        .p2align 3
1:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        PRFM    PLDL1KEEP, [x5, 448]    // Prefetch B ahead of the read position
        SADALP  v18.4s, v19.8h
        PRFM    PLDL1KEEP, [x5, 512]
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        PRFM    PLDL1KEEP, [x3, 128]    // Prefetch A ahead of the read position
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]          // First B column of the next iteration
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        # Finishes the last column pair and reloads A and the leading B
        # values for the next iteration, reproducing the prologue state
        # relative to x5 + 128.
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x3], 8             // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128             // Advance w past the 128 bytes consumed
        SADALP  v30.4s, v23.8h
        B.HS    1b                      // Loop while the SUBS above did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop
        # Processes the final 16 bytes of A that were already loaded.

        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder? - 8 bytes of A
        # x0 is negative here; bit 3 is set exactly when 8 bytes are left.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Each accumulator holds 4 partial sums for one output column. Two
        # levels of ADDP reduce them so that v0 holds columns 0-3 and v1
        # holds columns 4-7. The params loads are interleaved:
        # v4 = first 32-bit param, v7 = second 32-bit param.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        LD1R    {v4.4s}, [x11], 4
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        LD1R    {v7.4s}, [x11], 4
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
        SQSHL   v1.4s, v1.4s, v4.4s
        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
        SQDMULH v1.4s, v1.4s, v7.4s
        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
        SRSHL   v1.4s, v1.4s, v5.4s

        # Narrow to int16, add the 16-bit param with saturation, narrow to
        # int8, then clamp: SMAX applies the first byte param as the lower
        # bound and SMIN applies the second byte param as the upper bound.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, 15             // rewind params pointer
        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    5f                      // Fewer than 8 columns were left

        # Store full 1 x 8
        # ST1 and SUB leave the flags untouched, so B.HI still tests the
        # SUBS on nc above and loops while columns remain.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        B.HI    0b
        RET

        # Remainder - 8 bytes of A
        # Multiplies one 8 byte slice of A against 64 bytes of B (8 columns)
        # and accumulates, then joins the common reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here, which has the same low three bits as nc.
        # Store 4, 2 and 1 bytes according to bits 2, 1 and 0. After each
        # partial store EXT rotates the stored bytes out of the low lanes.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif