// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// QU8 (unsigned 8-bit) indirect GEMM, 1 row x 8 columns per iteration, with
// RNDNU requantization, NEON MLAL-lane multiplies, and prefetch (PRFM),
// scheduled for Cortex-A7.
//
// void xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                            (r0)  - unused, MR is fixed at 1
//     size_t nc,                             r1   - number of output columns remaining
//     size_t kc,                            (r2) -> sp + 56 -> r5
//     size_t ks,                            (r3) -> sp + 60 -> r14
//     const uint8_t**restrict a,       sp + 88 -> r2   - indirection buffer of A row pointers
//     const void*restrict w,           sp + 92 -> r9   - packed weights (bias + B)
//     uint8_t*restrict c,              sp + 96 -> r11  - output pointer
//     size_t cm_stride,               sp + 100 -> r6   - loaded but unused (MR == 1)
//     size_t cn_stride,               sp + 104 -> r12
//     size_t a_offset,                sp + 108 -> (r5)
//     const uint8_t* zero,            sp + 112 -> r7   - sentinel row for padding
//     xnn_qs8_conv_minmax_params*params)  sp + 116 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0  r3  d0-d1 q0          - one row of A, widened u8 -> s16
// B   r9  d8-d9 q4  q5      - two B vectors in flight (software pipelined)
// C0  r11 d16-d17 q8  d18-d19 q9   - accumulators
//     q2, q3 acc2           - second accumulator pair, merged at the end
// d24/d25 (q12)             - output_min / output_max for clamping
// Unused: r4, r8, r10, d15, q1, q10-q11, q13-q15

// params structure is 20 bytes
//  struct {
//    uint8_t kernel_zero_point[4];  d14
//    int32_t right_pre_shift;       d12[0]
//    int32_t multiplier;            d12[1]
//    int32_t right_post_shift;      d13[0]
//    int16_t output_zero_point;     d13[2]
//    uint8_t output_min;            d13[6]
//    uint8_t output_max;            d13[7]
//  } rndnu_neon;

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        PUSH {r2, r3, r5, r6, r7, r9, r11, lr}      // +32
        VPUSH {d8-d14}                              // +56 = 88

        # Reload stack arguments (offsets account for the 88 bytes pushed above).
        LDR r2, [sp, 88]            // a
        LDR r9, [sp, 92]            // w
        LDR r11, [sp, 96]           // c
        LDR r6, [sp, 100]           // cm_stride (unused for MR == 1)
        LDR r12, [sp, 104]          // cn_stride
        LDR r7, [sp, 112]           // zero
        LDR r5, [sp, 116]           // params
        MOV r14, r3                 // p = ks

        # Load params values
        VLD1.32 {d14[]}, [r5]!      // QU8 kernel_zero_point, splatted to all 4 lanes
        VLDM r5, {d12-d13}          // RNDNU params

        # Prefetch packed weights.
        # NOTE(review): 112 breaks the 64-byte cadence (64, 128, 192, ...) used by
        # sibling kernels — confirm against the generating template.
        PLD [r9, 64]                // Prefetch B
        PLD [r9, 112]
        PLD [r9, 192]
        PLD [r9, 256]
        PLD [r9, 320]
        PLD [r9, 384]

        .p2align 3
0:
        # Outer (nc) loop: produce one 1x8 tile of C.
        # Load initial bias from w into accumulators
        VLDM r9!, {d16-d19}         // Bias
        VMOV.I32 q2, 0              // second set of C for pipelining FMLA
        VMOV.I32 q3, 0

        .p2align 3
1:
        # ks loop: one A pointer from the indirection buffer per iteration.
        # Load next A pointer
        LDR r3, [r2, 0]

        # Add a_offset, unless this is the zero-padding row (a0 == zero).
        LDR r5, [sp, 108]           // a_offset
        ADD r2, r2, 4
        CMP r3, r7                  // if a0 == zero
        ADD r3, r3, r5              // a0 += a_offset
        MOVEQ r3, r7                // a0 = zero, else += a0 + a_offset

        LDR r5, [sp, 56]            // kc
        SUBS r5, r5, 8              // kc - 8
        BLO 5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8 {d0}, [r3]!          // A0
        SUBS r5, r5, 8              // k = k - 8
        VLD1.8 {d8}, [r9]!          // B0
        BLO 3f                      // less than 8 channels?

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes for weights.
        // B loads alternate between q4 (d8) and q5 (d10) so each VSUBL widen
        // overlaps the next load; flags from SUBS in BLOCK 6 drive BHS below.

        .p2align 3
2:
        // Extend: widen A to s16, subtract kernel_zero_point from B.
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        PLD [r9, 448]

        // BLOCK 0
        VLD1.8 {d10}, [r9]!         // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8 {d8}, [r9]!          // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8 {d10}, [r9]!         // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8 {d8}, [r9]!          // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8 {d0}, [r3]!          // A0 for the next iteration
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8 {d10}, [r9]!         // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8 {d8}, [r9]!          // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8 {d10}, [r9]!         // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        SUBS r5, r5, 8              // k -= 8; flags consumed by BHS below

        // BLOCK 7
        VLD1.8 {d8}, [r9]!          // B0 for the next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS 2b

        // Epilogue: same schedule as the main loop, but no A/B preload for a
        // following iteration (BLOCK 3 omits the A0 reload, BLOCK 7 omits B0).

        .p2align 3
3:
        // Extend
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        PLD [r9, 448]

        // BLOCK 0
        VLD1.8 {d10}, [r9]!         // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8 {d8}, [r9]!          // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8 {d10}, [r9]!         // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8 {d8}, [r9]!          // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8 {d10}, [r9]!         // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8 {d8}, [r9]!          // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8 {d10}, [r9]!         // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        ADDS r5, r5, 8              // restore k; Z set iff kc was a multiple of 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE 6f

4:
        # ks loop
        SUBS r14, r14, 4            // ks -= MR * sizeof(void*)
        BHI 1b

        LDR r14, [sp, 60]           // p = ks, for the next tile

        # Merge the pipelined accumulator pair.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization: saturating pre-shift, doubling multiply-high,
        # rounding post-shift, add zero point, narrow with saturation, clamp.
        VDUP.32 q0, d12[0]          // right_pre_shift

        VQSHL.S32 q8, q8, q0
        VQSHL.S32 q9, q9, q0

        VDUP.32 q2, d13[0]          // right_post_shift

        VQDMULH.S32 q8, q8, d12[1]  // multiplier
        VQDMULH.S32 q9, q9, d12[1]

        VRSHL.S32 q8, q8, q2
        VRSHL.S32 q9, q9, q2

        VDUP.16 q0, d13[2]          // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8, q8, q0

        VDUP.8 d24, d13[6]          // output_min

        VQMOVUN.S16 d0, q8

        VDUP.8 d25, d13[7]          // output_max

        VMAX.U8 d0, d0, d24

        SUBS r1, r1, 8              // nc -= 8; borrow means partial final tile

        VMIN.U8 d0, d0, d25

        # Store full 1 x 8
        BLO 7f
        VST1.8 {d0}, [r11], r12
        SUB r2, r2, r14             // a -= ks, rewind indirection buffer
        BHI 0b

        VPOP {d8-d14}
        ADD sp, sp, 8               // skip r2, r3
        POP {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        # (A rows are assumed readable for a full 8 bytes; only the first
        # r5 lanes are multiplied.)
        .p2align 3
5:
        AND r5, r5, 7               // kc remainder 1 to 7
6:
        VLD1.8 {d0}, [r3]
        VLD1.8 {d8}, [r9]!

        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP r5, 2
        BLO 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP r5, 4
        BLO 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP r5, 6
        BLO 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ 4b

        VLD1.8 {d8}, [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B 4b

        # Store odd width (1-7 columns): emit 4/2/1 bytes, rotating d0 down
        # after each partial store.
        .p2align 3
7:
        TST r1, 4
        BEQ 8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 4
8:
        TST r1, 2
        BEQ 9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8 q0, q0, q0, 2

9:
        TST r1, 1
        BEQ 10f
        VST1.8 {d0[0]}, [r11]

10:
        VPOP {d8-d14}
        ADD sp, sp, 8               // skip r2, r3
        POP {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif