// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

# LINT.IfChange
// void xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon${"_prfm" if PREFETCH else ""}_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96  -> (r7)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r11
//     size_t cm_stride,         sp + 108 -> (r6)
//     size_t cn_stride,         sp + 112 -> r7
//     const union xnn_f32_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Overview
// Computes a tile of up to 4 rows by 8 columns of single precision output.
// For every group of 8 columns the accumulators are seeded with 8 bias values
// read from w, then for each float of A one 8-float row of B (also read from
// w) is multiplied by the A scalar of each row and accumulated. The result is
// clamped to the two bounds read from params and stored to C.
//
// Units, as used by the code below:
//   kc is counted in bytes (the main loop consumes 8 bytes = 2 floats of A per
//      iteration and the A pointers are rewound by kc after each tile).
//   nc is counted in floats (8 is subtracted per stored tile).
//
// Register reuse, in program order:
//   r0  mr, then A3 pointer
//   r5  params pointer, then remaining kc counter
//   r6  cm_stride, then C3 pointer
//   r7  a_stride, then cn_stride
// The stack offsets in the signature above are relative to sp after the
// 96 byte push in the prologue.

// Register usage
// A0   r3  d0
// A1  r12  d1
// A2  r10  d2
// A3   r0  d3

// B    r9  d8,  d9, d10, d11
// B       d12, d13, d14, d15

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// Clamp (r5) d4 d5 d6 d7

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon${"_prfm" if PREFETCH else ""}_ld64
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        # Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH   {d8-d15}                            // +64 = 96

        LDR     r7, [sp, 96]         // a_stride
        LDR     r11, [sp, 104]       // c
        LDR     r6, [sp, 108]        // cm_stride
        LDR     r9, [sp, 100]        // w
        LDR     r5, [sp, 116]        // params

        # Clamp A and C pointers
        // Rows at or beyond mr reuse the pointer of the previous row, so the
        // code below can always process four rows.
        // The flags of this single CMP feed both the MOVLO pair (mr < 2) and
        // the MOVLS pair (mr <= 2); the ADDs in between do not update flags.
        CMP     r0, 2                // if mr >= 2
        ADD     r12, r3, r7          //   a1 = a0 + a_stride
        ADD     r4, r11, r6          //   c1 = c0 + cm_stride
        MOVLO   r12, r3              // a1
        MOVLO   r4, r11              // c1
                                     // if mr > 2
        ADD     r10, r12, r7         //   a2 = a1 + a_stride
        ADD     r8, r4, r6           //   c2 = c1 + cm_stride
        MOVLS   r10, r12             // a2
        MOVLS   r8, r4               // c2

        CMP     r0, 4                // if mr >=4
        ADD     r0, r10, r7          //   a3 = a2 + a_stride (mr is dead from here on)
        ADD     r6, r8, r6           //   c3 = c2 + cm_stride (cm_stride is dead from here on)
        MOVLO   r0, r10              // a3
        MOVLO   r6, r8               // c3

        # Load min/max values
        // First params float is broadcast to q2 (used with VMAX, lower bound),
        // second params float is broadcast to q3 (used with VMIN, upper bound).
        VLD1.32 {d4[], d5[]}, [r5]!
        LDR     r7, [sp, 112]        // cn_stride (a_stride is no longer needed)
        VLD1.32 {d6[], d7[]}, [r5]

        // Outer loop: one iteration per group of 8 output columns.
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}       // Bias
        SUBS    r5, r2, 8            // r5 = kc - 8 bytes; flags consumed by BLO 3f below
        VMOV    q10, q8
        VMOV    q11, q9
        VMOV    q12, q8
        VMOV    q13, q9
        VMOV    q14, q8
        VMOV    q15, q9

        $if PREFETCH:
          PLD     [r3,  0]             // Prefetch A
          PLD     [r3, 64]
          PLD     [r12,  0]
          PLD     [r12, 64]
          PLD     [r10,  0]
          PLD     [r10, 64]
          PLD     [r0,  0]
          PLD     [r0, 64]
          PLD     [r9,   0]            // Prefetch B
          PLD     [r9,  64]
          PLD     [r9, 128]
          PLD     [r9, 192]
          PLD     [r9, 256]
          PLD     [r9, 320]
          PLD     [r9, 384]
          PLD     [r9, 448]
        BLO     3f                   // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
        // Each iteration loads 2 floats per A row and two 8-float rows of B
        // (q4-q5 pair with lane 0, q6-q7 pair with lane 1).
1:
        VLD1.32 {d0}, [r3]!          // A0
        VLDM    r9!, {d8-d11}        // B0
        VLD1.32 {d1}, [r12]!         // A1
        VLD1.32 {d2}, [r10]!         // A2
        VLD1.32 {d3}, [ r0]!         // A3
        VLDM    r9!, {d12-d15}       // B1

        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        $if PREFETCH:
          PLD     [r3, 128]            // Prefetch A0
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q13, q5, d2[0]
        $if PREFETCH:
          PLD     [r12, 128]           // Prefetch A1
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q12, q4, d2[0]
        $if PREFETCH:
          PLD     [r10, 128]           // Prefetch A2
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        $if PREFETCH:
          PLD     [r0, 128]            // Prefetch A3
        VMLA.F32 q8, q6, d0[1]
        VMLA.F32 q9, q7, d0[1]
        $if PREFETCH:
          PLD     [r9, 448]            // Prefetch B
        VMLA.F32 q10, q6, d1[1]
        VMLA.F32 q11, q7, d1[1]
        SUBS    r5, r5, 8            // flags consumed by BHS 1b; NEON ops in between leave them intact
        VMLA.F32 q12, q6, d2[1]
        VMLA.F32 q13, q7, d2[1]
        VMLA.F32 q14, q6, d3[1]
        VMLA.F32 q15, q7, d3[1]
        BHS     1b

        # Is there a remainder?- 1 float of A (4 bytes)
        // r5 went below zero when the loop exited. Assuming kc is a multiple
        // of 4 bytes it is now -8 (nothing left, bit 2 clear) or -4 (one float
        // left, bit 2 set).
        TST     r5, 4
        BNE     3f

2:
        # Clamp
        VMAX.F32 q8,  q8, q2
        SUBS    r1, r1, 8            // nc -= 8; flags consumed by BLO 4f and BHI 0b below
        VMAX.F32 q9,  q9, q2
        VMAX.F32 q10, q10, q2
        VMAX.F32 q11, q11, q2
        VMAX.F32 q12, q12, q2
        VMAX.F32 q13, q13, q2
        VMAX.F32 q14, q14, q2
        VMAX.F32 q15, q15, q2
        VMIN.F32 q8,  q8, q3
        VMIN.F32 q9,  q9, q3
        VMIN.F32 q10, q10, q3
        VMIN.F32 q11, q11, q3
        VMIN.F32 q12, q12, q3
        VMIN.F32 q13, q13, q3
        VMIN.F32 q14, q14, q3
        VMIN.F32 q15, q15, q3

        # Store full 4 x 8
        // Fewer than 8 columns were left: take the partial store path.
        BLO     4f
        // Each store advances its C pointer by cn_stride; each SUB rewinds an
        // A pointer by kc so the next column group starts from the same A data.
        VST1.32 {d16-d19}, [r11], r7
        SUB     r0, r0, r2
        VST1.32 {d20-d23}, [r4], r7
        SUB     r10, r10, r2
        VST1.32 {d24-d27}, [r8], r7
        SUB     r12, r12, r2
        VST1.32 {d28-d31}, [r6], r7
        SUB     r3, r3, r2
        BHI     0b                   // more columns remain (nc was > 8)

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

3:
        # Remainder- 1 float of A (4 bytes)
        // Loads land in lane 0 of d0-d3 (s0, s2, s4, s6) so the same by-lane
        // multiplies as in the main loop can be used with one row of B.
        VLDM    r3!,  {s0}       // A0
        VLDM    r9!, {d8-d11}    // B0
        VLDM    r12!, {s2}       // A1
        VLDM    r10!, {s4}       // A2
        VLDM    r0!, {s6}        // A3
        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q12, q4, d2[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        B       2b

        # Store odd width
        // r1 holds nc - 8 here; subtracting 8 leaves the low three bits of the
        // remaining column count unchanged, so they select 4, 2 and 1 float
        // stores. After each partial store the not yet written values are
        // moved down into the low registers of each row.
4:
        TST     r1, 4
        BEQ     5f
        VST1.32 {d16-d17}, [r11]!
        VST1.32 {d20-d21},  [r4]!
        VMOV    q8,  q9
        VMOV    q10, q11
        VST1.32 {d24-d25},  [r8]!
        VST1.32 {d28-d29},  [r6]!
        VMOV    q12, q13
        VMOV    q14, q15

5:
        TST     r1, 2
        BEQ     6f
        VST1.32 {d16}, [r11]!
        VST1.32 {d20},  [r4]!
        VMOV    d16, d17
        VMOV    d20, d21
        VST1.32 {d24},  [r8]!
        VST1.32 {d28},  [r6]!
        VMOV    d24, d25
        VMOV    d28, d29

6:
        TST     r1, 1
        BEQ     7f
        VST1.32 {d16[0]}, [r11]
        VST1.32 {d20[0]},  [r4]
        VST1.32 {d24[0]},  [r8]
        VST1.32 {d28[0]},  [r6]

7:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon${"_prfm" if PREFETCH else ""}_ld64
# LINT.ThenChange(4x8-aarch32-neon-ld64.cc)

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif