// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96 -> (r11)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r6
//     size_t cm_stride,         sp + 108 -> (r7)
//     size_t cn_stride,         sp + 112 -> r11
//     const union xnn_f32_default_params params)  sp + 116 -> (r11)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Overview
//
// Computes up to a 4 (rows) x 4 (columns) tile of C per pass using scalar VFP
// multiply-accumulate. For every block of 4 output columns the kernel:
//   1. loads 4 bias floats from w and copies them into all 4 row accumulators,
//   2. accumulates A * B over kc, consuming 2 floats (8 bytes) of each A row and
//      2 x 4 floats of w per main-loop iteration, plus an optional 1-float tail,
//   3. stores the tile, advances the C pointers by cn_stride, rewinds the A
//      pointers by kc and repeats while columns remain.
// kc is counted in bytes (it is decremented by 8 per 2 floats).
// NOTE(review): the remainder logic assumes kc is a non-zero multiple of
// sizeof(float) - confirm against the caller's contract.
// The stack slot at sp + 116 (params) is never loaded by the code below.

// Register usage

// A0   r3  s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3   r0  s6-s7  d3

// B    r9  s8,  s9, s10, s11  d4-d5
// B       s12, s13, s14, s15  d6-d7

// C0   r6 s16-s17  d8  s18-s19  d9
// C1   r4 s20-s21 d10  s22-s23 d11
// C2   r8 s24-s25 d12  s26-s27 d13
// C3   r7 s28-s29 d14  s30-s31 d15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        # Push 96 bytes
        PUSH   {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH  {d8-d15}                            // +64 = 96

        // Stack arguments now sit 96 bytes above sp because of the pushes above.
        LDR     r11, [sp, 96]         // Load a_stride
        LDRD    r6, r7, [sp, 104]     // Load c and cm_stride

        # Clamp A and C pointers
        // Rows at or beyond mr reuse the previous row's A and C pointers.
        CMP    r0, 2                 // if mr >= 2
        ADD    r12, r3, r11          //   a1 = a0 + a_stride
        ADD    r4, r6, r7            //   c1 = c0 + cm_stride
        MOVLO  r12, r3               // a1
        MOVLO  r4, r6                // c1

        LDR     r9, [sp, 100]        // Load w

        // The flags from "CMP r0, 2" are still live here: the intervening
        // ADD/MOVLO/LDR instructions do not update them, so MOVLS tests mr <= 2.
                                     // if mr > 2
        ADD    r10, r12, r11         //   a2 = a1 + a_stride
        ADD    r8, r4, r7            //   c2 = c1 + cm_stride
        MOVLS  r10, r12              // a2
        MOVLS  r8, r4                // c2

        // From here on r0 (mr), r7 (cm_stride) and r11 (a_stride) are repurposed
        // as a3, c3 and cn_stride respectively; the originals are not needed again.
        CMP    r0, 4                 // if mr >=4
        ADD    r0, r10, r11          //   a3 = a2 + a_stride
        ADD    r7, r8, r7            //   c3 = c2 + cm_stride
        LDR    r11, [sp, 112]        // Load cn_stride
        MOVLO  r0, r10               // a3
        MOVLO  r7, r8                // c3

0:
        # Load initial bias from w into accumulators
        VLDM        r9!, {d8-d9}     // Bias
        SUBS        r5, r2, 8        // r5 = kc - 8 bytes; flags consumed by BLO below
        VMOV.F64    d10, d8
        VMOV.F64    d12, d8
        VMOV.F64    d14, d8
        VMOV.F64    d11, d9
        VMOV.F64    d13, d9
        VMOV.F64    d15, d9
        BLO         3f               // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
1:
        VLDM        r3!, {d0}        // A0
        VLDM        r9!, {d4-d5}     // B0
        VLDM       r12!, {d1}        // A1
        VLDM       r10!, {d2}        // A2
        VLDM        r0!, {d3}        // A3

        // First K element (s0, s2, s4, s6) times B0 columns 0-1.
        VMLA.F32    s16, s8, s0
        VMLA.F32    s17, s9, s0
        VMLA.F32    s20, s8, s2
        VMLA.F32    s21, s9, s2
        VMLA.F32    s24, s8, s4
        VMLA.F32    s25, s9, s4
        VMLA.F32    s28, s8, s6
        VMLA.F32    s29, s9, s6

        VLDM        r9!, {d6-d7}     // B1

        // First K element times B0 columns 2-3.
        VMLA.F32    s18, s10, s0
        VMLA.F32    s19, s11, s0
        VMLA.F32    s22, s10, s2
        VMLA.F32    s23, s11, s2
        VMLA.F32    s26, s10, s4
        VMLA.F32    s27, s11, s4
        VMLA.F32    s30, s10, s6
        VMLA.F32    s31, s11, s6

        // Second K element (s1, s3, s5, s7) times B1 columns 0-1.
        VMLA.F32    s16, s12, s1
        VMLA.F32    s17, s13, s1
        VMLA.F32    s20, s12, s3
        VMLA.F32    s21, s13, s3
        VMLA.F32    s24, s12, s5
        VMLA.F32    s25, s13, s5
        VMLA.F32    s28, s12, s7
        VMLA.F32    s29, s13, s7

        // Flags set here are consumed by BHS at the bottom of the loop; the
        // VMLA instructions in between do not touch the integer flags.
        SUBS        r5, r5, 8

        // Second K element times B1 columns 2-3.
        VMLA.F32    s18, s14, s1
        VMLA.F32    s19, s15, s1
        VMLA.F32    s22, s14, s3
        VMLA.F32    s23, s15, s3
        VMLA.F32    s26, s14, s5
        VMLA.F32    s27, s15, s5
        VMLA.F32    s30, s14, s7
        VMLA.F32    s31, s15, s7

        BHS         1b

        # Is there a remainder?- 1 float of A (4 bytes)
        // r5 is negative here; bit 2 is set when exactly 4 bytes of K remain.
        TST         r5, 4
        BNE         3f

2:

        SUBS        r1, r1, 4        // nc -= 4; flags consumed by BLO and by BHI below
        BLO         4f

        # Store full 4 x 4
        // Each store is interleaved with rewinding an A pointer by kc (r2) and
        // advancing a C pointer by cn_stride (r11) for the next column block.
        // None of these instructions update the flags set by SUBS above.
        VSTM        r6, {d8-d9}
        SUB         r0, r0, r2
        ADD         r6, r11
        VSTM        r4, {d10-d11}
        SUB         r10, r10, r2
        ADD         r4, r11
        VSTM        r8, {d12-d13}
        SUB         r12, r12, r2
        ADD         r8, r11
        VSTM        r7, {d14-d15}
        SUB         r3, r3, r2
        ADD         r7, r11
        BHI         0b               // more columns remain (nc > 0 after the subtract)

        VPOP        {d8-d15}
        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
        BX          lr

3:
        # Remainder- 1 float of A (4 bytes)
        VLDM        r3!, {s0}        // A0
        VLDM        r9!, {d6-d7}     // B
        VLDM        r12!, {s1}       // A1
        VLDM        r10!, {s2}       // A2
        VLDM        r0!, {s3}        // A3

        VMLA.F32    s16, s12, s0
        VMLA.F32    s17, s13, s0
        VMLA.F32    s18, s14, s0
        VMLA.F32    s19, s15, s0

        VMLA.F32    s20, s12, s1
        VMLA.F32    s21, s13, s1
        VMLA.F32    s22, s14, s1
        VMLA.F32    s23, s15, s1

        VMLA.F32    s24, s12, s2
        VMLA.F32    s25, s13, s2
        VMLA.F32    s26, s14, s2
        VMLA.F32    s27, s15, s2

        VMLA.F32    s28, s12, s3
        VMLA.F32    s29, s13, s3
        VMLA.F32    s30, s14, s3
        VMLA.F32    s31, s15, s3

        B           2b

        # Store odd width
        // Reached with r1 = nc - 4 (negative); its low two bits equal those of
        // the remaining column count, so they select a 2-float and/or 1-float store.
4:
        TST         r1, 2
        BEQ         5f
        VSTM        r6!, {d8}        // store columns 0-1 of each row ...
        VMOV.F32    s16, s18         // ... then move column 2 down for the 1-float store
        VSTM        r4!, {d10}
        VMOV.F32    s20, s22
        VSTM        r8!, {d12}
        VMOV.F32    s24, s26
        VSTM        r7!, {d14}
        VMOV.F32    s28, s30

5:
        TST         r1, 1
        BEQ         6f
        VSTR        s16, [r6]
        VSTR        s20, [r4]
        VSTR        s24, [r8]
        VSTR        s28, [r7]

6:
        VPOP        {d8-d15}
        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
        BX          lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif