// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// QC8 (per-channel-quantized int8) GEMM microkernel, 1 row (MR=1) by 8
// columns (NR=8), with fp32 requantization via the "magic bias" trick and
// PLD prefetching, scheduled for in-order dual-issue cores (Cortex-A7).
//
// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                                     (r2) -> r5
//     const int8_t* restrict a,                      r3
//     size_t a_stride,                   sp + 96 -> (unused, MR=1)
//     const void* restrict w,            sp + 100 -> r9
//     int8_t* restrict c,                sp + 104 -> r11
//     size_t cm_stride,                  sp + 108 -> (unused, MR=1)
//     size_t cn_stride,                  sp + 112 -> r7
//     xnn_qs8_minmax_params params)      sp + 116 -> (r5)

// AAPCS32: d8-d15, r4-r11, r14(lr) need to be preserved if used.
// r13(sp), r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3  d0-d1 q0  (int8 activations, widened to int16 in place)

// B   r9  d8-d9 q4 q5  (int8 weights, widened to int16; double-buffered)

// C0  r11 d16-d17 q8  d18-d19 q9  (int32 accumulators)
//     q2, q3 acc2  (second accumulator set so VMLAL chains can overlap)

// Unused r4, r6, r8, r10, r12, d15
// NOTE(review): q0-q3 are also reused as scratch during quantization
// (q0-q1 hold per-channel scales, q2-q3 the magic-bias constants), and
// d24/d25 (q12) hold the output_min/output_max clamp bounds below, so a
// blanket "q10-q15, q1-q3 unused" claim would be stale.

// params structure is 10 bytes
//  struct {
//    float magic_bias;                            d12[0]
//    int32_t magic_bias_less_output_zero_point;   d12[1]
//    int8_t output_min;                           d13[6]
//    int8_t output_max;                           d13[7]
//  } xnn_qs8_minmax_params.neon;

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}               // 16
        SUB     sp, sp, 32                      // +32 (alignment/scratch pad)
        VPUSH   {d8-d13}                        // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLDM    r5!, {d12}              // QC8 neon params: magic_bias / bias_less_zp
        VLD1.16 {d13[]}, [r5]           // output_min/max pair replicated across d13
        LDR     r7, [sp, 112]           // cn_stride

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Outer loop over output columns (nc), 8 per iteration.
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8, q9
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        PLD     [r3, 64]                // Prefetch A
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!             // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8}, [r9]!             // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes for weights (8 rows B0-B7 of 8 int8 each).
        // Software-pipelined: each BLOCK loads the next B row while the
        // multiply-accumulate for the previous one issues.

        .p2align 3
1:
        // Extend int8 -> int16 for VMLAL.S16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!             // A0 for the next iteration
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8}, [r9]!             // B0 for the next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue: same as the main loop body, but without the
        // next-iteration A0/B0 preloads.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore k; Z set iff no remainder

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     4f

3:
        # Fold the second accumulator set into the first.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization:
        # acc_fp = (float)acc * per_channel_scale; adding magic_bias places
        # the rounded integer in the low mantissa bits, so a saturating
        # integer subtract of magic_bias_less_output_zero_point yields the
        # requantized value with the output zero point applied.
        VLD1.8  {q0-q1}, [r9]!          // 8 per-channel fp32 scales from w

        VDUP.32 q2, d12[0]              // magic_bias
        VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0             // multiplier
        VMUL.F32 q9, q9, q1

        VADD.F32 q8, q8, q2             // magic_bias
        VADD.F32 q9, q9, q2

        VQSUB.S32 q8, q8, q3            // magic_bias_less_output_zero_point
        VQSUB.S32 q9, q9, q3


        VQMOVN.S32 d16, q8              // saturate int32 -> int16
        VQMOVN.S32 d17, q9


        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0, q8               // saturate int16 -> int8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24             // clamp to [output_min, output_max]

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f                      // nc < 8: partial store
        VST1.8  {d0}, [r11], r7         // store row, advance c by cn_stride
        SUB     r3, r3, r2              // rewind a by kc for next column tile
        BHI     0b                      // nc > 0 remaining

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder- 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        // Full 8-byte load, but a advances by only r5 bytes.
        // NOTE(review): reads up to 7 bytes past the remainder — presumably
        // the caller guarantees readable padding after a; confirm.
        VLD1.8  {d0}, [r3], r5
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width: emit 4, then 2, then 1 bytes based on the bits
        # of the remaining nc, shifting d0 down after each partial store.
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif