// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// QC8 (per-channel quantized int8) 1x8 GEMM microkernel with fp32
// requantization, tuned for Cortex-A35 (NEONv8, with PLD prefetch).
//
// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35(
//     size_t mr,                                 r0
//     size_t nc,                                 r1
//     size_t kc,                                 (r2) -> r5
//     const int8_t*restrict a,                   r3
//     size_t a_stride,               sp + 96 -> (unused)
//     const void*restrict w,         sp + 100 -> r9
//     int8_t*restrict c,             sp + 104 -> r11
//     size_t cm_stride,              sp + 108 -> (unused)
//     size_t cn_stride,              sp + 112 -> r7
//     xnn_qs8_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0            // activation row (int8, widened to int16)

// B    r9  d8-d9 q4 q5         // weights (int8, widened to int16)

// C0  r11 d16-d17 q8  d18-d19 q9   // primary int32 accumulators
//     q2, q3 acc2                  // second accumulator set, merged at the end
//                                  // so the two MLAL chains can overlap

// Unused r4, r6, r8, r10, r12, d15, q10-q15, q1-q3

// params structure is 4 bytes (loaded as one 32-bit lane into d13):
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
        # Push 96 bytes total so stack arguments start at sp + 96
        PUSH    {r5, r7, r9, r11}               // 16
        SUB     sp, sp, 32                      // +32 (reserved, not written)
        VPUSH   {d8-d13}                        // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params (zero_point/min/max)
        LDR     r7, [sp, 112]           // cn_stride

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8, q9
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        PLD     [r3, 64]                // Prefetch A
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0}, [r3]!             // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8}, [r9]!             // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes for weights.
        // Each BLOCK loads the next B row while multiply-accumulating the
        // previous one, alternating between accumulator sets q8/q9 and q2/q3.

        .p2align 3
1:
        // Extend int8 -> int16 for VMLAL.S16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0}, [r3]!             // A0 for next iteration
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8}, [r9]!             // B0 for next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue - same as the main loop body but does not reload A/B0
        // for a subsequent iteration.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8}, [r9]!             // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10}, [r9]!            // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore remainder count (k was over-decremented)

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     4f

3:
        // Merge the two accumulator sets before requantization
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization: acc * per-channel fp32 multiplier,
        # round-to-nearest-even back to int32, then narrow with saturation.
        VLD1.8  {q0-q1}, [r9]!          // 8 fp32 per-channel multipliers from w

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0             // multiplier
        VMUL.F32 q9, q9, q1

        VCVTN.S32.F32 q8, q8            // round to nearest, ties to even
        VCVTN.S32.F32 q9, q9

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8              // int32 -> int16 (saturating)
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0            // add zero point (saturating)

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0, q8               // int16 -> int8 (saturating)

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24             // clamp to [output_min, output_max]

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f                      // fewer than 8 outputs left -> partial store
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2              // rewind A by kc for the next column tile
        BHI     0b                      // more columns remain

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // restore the 32 bytes reserved at entry
        ADD     sp, sp, 16              // (SUB sp, sp, 32 undone in two steps)
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder- 1 to 7 bytes of A.  One final 8-byte A load, then use
        # only the first r5 lanes; falls back into 3: for requantization.
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        VLD1.8  {d0}, [r3], r5          // advance A by remainder only
        VLD1.8  {d8}, [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8}, [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width (nc remainder 1-7): store 4, then 2, then 1 byte,
        # shifting consumed bytes out of d0 with VEXT between stores.
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // restore the 32 bytes reserved at entry
        ADD     sp, sp, 16              // (SUB sp, sp, 32 undone in two steps)
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif