// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// QC8 (per-channel-quantized int8) indirect GEMM, 1 row (MR=1) x 8 columns
// (NR=8), fp32 requantization, NEONv8 (VCVTN), with PLD prefetch, scheduled
// for Cortex-A35.
//
// void xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
//     size_t mr,                             (r0)
//     size_t nc,                              r1
//     size_t kc,                             (r2) -> sp + 56 -> r5
//     size_t ks,                             (r3) -> sp + 60 -> r14
//     const int8_t**restrict a,   sp + 88 -> r2
//     const void*restrict w,      sp + 92 -> r9
//     int8_t*restrict c,          sp + 96 -> r11
//     size_t cm_stride,           sp + 100 -> r6
//     size_t cn_stride,           sp + 104 -> r12
//     size_t a_offset,            sp + 108 -> (r5)
//     const int8_t* zero,         sp + 112 -> r7
//     xnn_qs8_minmax_params*params); sp + 116 -> (r5)

// AAPCS32: d8-d15, r4-r11, r14(lr) need to be preserved if used.
// r13(sp), r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0          // int8 activation row, widened to int16
// B    r9  d8-d9 q4  q5      // int8 weights, widened to int16 (double-buffered)
// C0  r11 d16-d17 q8  d18-d19 q9   // int32 accumulators
//     q2, q3 acc2            // second accumulator set for pipelining MLAL;
//                            // summed into q8/q9 before requantization
// d13 holds the loaded params; d24-d25 (q12) hold output_min/output_max dups.
// Unused: r4, r8, r10, d15, q1, q10-q11, q13-q15

// params structure is 4 bytes
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}       // +32
        SUB     sp, sp, 8                               // +8 (alignment pad)
        VPUSH   {d8-d13}                                // +48 = 88

        # Reload stack arguments (offsets are relative to sp after the 88-byte push)
        LDR     r2, [sp, 88]            // a
        LDR     r9, [sp, 92]            // w
        LDR     r11, [sp, 96]           // c
        LDR     r6, [sp, 100]           // cm_stride
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7, [sp, 112]           // zero
        LDR     r5, [sp, 116]           // params
        MOV     r14, r3                 // p = ks

        # Load params values (4 bytes broadcast into both halves of d13)
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 112]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Outer (nc) loop: load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        VMOV.I32 q3, 0

        .p2align 3
1:
        # ks loop: load next A pointer from the indirection buffer
        LDR     r3, [r2, 0]

        # Add a_offset, unless the pointer is the zero buffer
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3, r7                  // if a0 == zero
        ADD     r3, r3, r5              // a0 += a_offset
        MOVEQ   r3, r7                  //   a0 = zero, else += a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!             // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8}, [r9]!             // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes for weights.
        // Loads of the NEXT B row are interleaved with MLALs of the CURRENT
        // row, alternating between q4 (d8/d9) and q5 (d10/d11).

        .p2align 3
2:
        // Extend int8 -> int16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!             // A0 (next 8 bytes, d0 lanes consumed)
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8               // flags stay live across BLOCK 7

        // BLOCK 7
        VLD1.8  {d8}, [r9]!             // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue - same as main loop but does not fetch the next A / B0

        .p2align 3
3:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore kc remainder; Z set if none

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks (reload for next nc iteration)

        # Merge the pipelined second accumulator set
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization: per-channel fp32 multipliers follow the
        # weights in w
        VLD1.8  {q0-q1}, [r9]!

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0             // multiplier
        VMUL.F32 q9, q9, q1

        VCVTN.S32.F32 q8, q8            // round-to-nearest-even (NEONv8)
        VCVTN.S32.F32 q9, q9

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8              // saturating narrow int32 -> int16
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0            // add zero point with saturation

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0, q8               // saturating narrow int16 -> int8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f                      // nc < 8: partial store
        VST1.8  {d0}, [r11], r12
        SUB     r2, r2, r14             // a -= ks (rewind indirection buffer)
        BHI     0b                      // nc > 0: next 8 output channels

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        # Process kc-remainder channels one at a time, one A lane per B row
        VLD1.8  {d0}, [r3]
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width (nc 1-7): store 4/2/1 bytes, rotating d0 after each
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif