// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                                     (r2) -> r5
//     const uint8_t*restrict a,                      r3
//     size_t a_stride,                   sp + 96 -> (unused)
//     const void*restrict w,             sp + 100 -> r9
//     uint8_t*restrict c,                sp + 104 -> r11
//     size_t cm_stride,                  sp + 108 -> (unused)
//     size_t cn_stride,                  sp + 112 -> r7
//     xnn_qu8_conv_minmax_params params) sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used.  r13 (sp), r15 (pc) are reserved.

// Based on the cortex_a53 microkernel, but with Neon loads

// Register usage
// A0  r3  d0-d1 q0

// B   r9  d8-d11 q4 q5

// C0  r11 d16-d17 q8  d18-d19 q9
//     q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d15, q1, q10-q11, q13-q15

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];  d14
#    int32_t right_pre_shift;       d12[0]
#    int32_t multiplier;            d12[1]
#    int32_t right_post_shift;      d13[0]
#    int16_t output_zero_point;     d13[2]
#    uint8_t output_min;            d13[6]
#    uint8_t output_max;            d13[7]
#  } rndnu_neon;
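
# The RNDNU requantization applied at label 3 below is, as a C-like
# sketch (helper names are illustrative, not actual XNNPACK functions;
# the stored shift counts are non-positive, so VQSHL/VRSHL shift right):
#   acc = sat_shl(acc, right_pre_shift);               // VQSHL.S32
#   acc = sat_doubling_mul_high(acc, multiplier);      // VQDMULH.S32
#   acc = rounding_shl(acc, right_post_shift);         // VRSHL.S32
#   s16 = sat_add(sat_narrow(acc), output_zero_point); // VQMOVN.S32, VQADD.S16
#   out = clamp(sat_narrow_u8(s16), output_min, output_max);  // VQMOVUN.S16, VMAX/VMIN.U8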

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 96 bytes
        PUSH {r5, r7, r9, r11}          // 16
        SUB sp, sp, 24                  // +24
        VPUSH {d8-d14}                  // +56 = 96

        LDR r11, [sp, 104]      // c
        LDR r9, [sp, 100]       // w
        LDR r5, [sp, 116]       // params

        # Load params values
        VLD1.32 {d14[]}, [r5]!  // QU8 kernel_zero_point
        VLDM r5, {d12-d13}      // RNDNU params
        LDR r7, [sp, 112]       // cn_stride

        PLD [r9,  64]           // Prefetch B
        PLD [r9, 128]
        PLD [r9, 192]
        PLD [r9, 256]
        PLD [r9, 320]
        PLD [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM r9!, {d16-d19}     // Bias
        VMOV.I32 q2, 0          // second set of C for pipelining FMLA
        SUBS r5, r2, 8          // k = kc - 8
        VMOV.I32 q3, 0
        PLD [r3, 64]            // Prefetch A
        BLO 4f                  // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8 {d0}, [r3]!      // A0
        SUBS r5, r5, 8          // k = k - 8
        VLD1.8 {d8}, [r9]!      // B0
        BLO 2f                  // fewer than 8 channels remaining?

        // Main loop - 8 bytes of A.
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        PLD [r9, 448]

        // BLOCK 0
        VLD1.8 {d10}, [r9]!     // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8 {d8}, [r9]!      // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8 {d10}, [r9]!     // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8 {d8}, [r9]!      // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8 {d0}, [r3]!      // A0
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8 {d10}, [r9]!     // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8 {d8}, [r9]!      // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8 {d10}, [r9]!     // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 7
        VLD1.8 {d8}, [r9]!      // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS r5, r5, 8
        BHS 1b

        // Epilogue

        .p2align 3
2:
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14

        VLD1.8 {d10}, [r9]!     // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8 {d8}, [r9]!      // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8 {d10}, [r9]!     // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        VLD1.8 {d8}, [r9]!      // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VSUBL.U8 q4, d8, d14

        VLD1.8 {d10}, [r9]!     // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8 {d8}, [r9]!      // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8 {d10}, [r9]!     // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        ADDS r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? 1-7 bytes of A
        BNE 4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization
        VDUP.32 q0, d12[0]      // right_pre_shift

        VQSHL.S32 q8, q8, q0
        VQSHL.S32 q9, q9, q0

        VDUP.32 q2, d13[0]      // right_post_shift

        VQDMULH.S32 q8, q8, d12[1]  // multiplier
        VQDMULH.S32 q9, q9, d12[1]

        VRSHL.S32 q8, q8, q2
        VRSHL.S32 q9, q9, q2

        VDUP.16 q0, d13[2]      // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8 d24, d13[6]      // output_min

        VQMOVUN.S16 d0, q8

        VDUP.8 d25, d13[7]      // output_max

        VMAX.U8 d0, d0, d24

        SUBS r1, r1, 8

        VMIN.U8 d0, d0, d25

        # Store full 1 x 8
        BLO 5f
        VST1.8 {d0}, [r11], r7
        SUB r3, r3, r2
        BHI 0b

        VPOP {d8-d14}
        ADD sp, sp, 8           // skip pad of 8
        ADD sp, sp, 16
        POP {r5, r7, r9, r11}
        BX lr

        # Remainder: 1 to 7 bytes of A
        .p2align 3
4:
        AND r5, r5, 7           // kc remainder 1 to 7

        VLD1.8 {d0}, [r3], r5
        VLD1.8 {d8}, [r9]!

        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP r5, 2
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP r5, 4
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP r5, 6
        BLO 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ 3b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B 3b

        # Store odd width
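        # nc is 1-7 here: store the 8-byte result in d0 as 4-, 2-, and
        # 1-byte pieces, using VEXT to rotate the remaining bytes down to
        # lane 0 after each partial store.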
        .p2align 3
5:
        TST r1, 4
        BEQ 6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 4
6:
        TST r1, 2
        BEQ 7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 2
7:
        TST r1, 1
        BEQ 8f
        VST1.8 {d0[0]}, [r11]
8:
        VPOP {d8-d14}
        ADD sp, sp, 8           // skip pad of 8
        ADD sp, sp, 16
        POP {r5, r7, r9, r11}
        BX lr

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif