// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                              (r2) -> r5
//     const int8_t*restrict a,                       r3
//     size_t a_stride,                   sp + 96 -> (unused)
//     const void*restrict w,             sp + 100 -> r9
//     int8_t*restrict c,                 sp + 104 -> r11
//     size_t cm_stride,                  sp + 108 -> (unused)
//     size_t cn_stride,                  sp + 112 -> r7
//     xnn_qs8_minmax_params params)      sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3   d0-d1 q0

// B   r9   d8-d9 q4  d10-d11 q5

// C0  r11  d16-d17 q8  d18-d19 q9
//          q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d12, d14-d15 (q7), q10-q11, q13-q15

// params structure is 4 bytes
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}               // 16
        SUB     sp, sp, 32                      // +32
        VPUSH   {d8-d13}                        // +48 = 96

        LDR     r11, [sp, 104]                  // c
        LDR     r9, [sp, 100]                   // w
        LDR     r5, [sp, 116]                   // params

        # Load params values
        VLD1.32 {d13[]}, [r5]                   // QC8 neonv8 params
        LDR     r7, [sp, 112]                   // cn_stride


        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}                  // Bias
        VMOV.I32 q2, 0                          // second set of C for pipelining FMLA
        SUBS    r5, r2, 8                       // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                              // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!                    // A0
        SUBS    r5, r5, 8                       // k = k - 8
        VLD1.8  {d8},  [r9]!                    // B0
        BLO     2f                              // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!                   // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!                    // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!                   // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!                    // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!                    // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!                   // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!                    // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!                   // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!                    // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                   // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                    // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                   // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                    // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                   // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                    // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                   // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? 1-7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
        VLD1.8  {q0-q1},  [r9]!

        VCVT.F32.S32 q8, q8
        VCVT.F32.S32 q9, q9

        VMUL.F32 q8, q8, q0                     // multiplier
        VMUL.F32 q9, q9, q1

        VCVTN.S32.F32 q8, q8
        VCVTN.S32.F32 q9, q9

        VDUP.16 q0, d13[2]                      // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8  d24, d13[6]                     // output_min

        VQMOVN.S16 d0, q8

        VDUP.8  d25, d13[7]                     // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16                      // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7                       // kc remainder 1 to 7

        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16                      // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif