// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                                     r2 -> r5
//     const uint8_t*restrict a,                      r3
//     size_t a_stride,                    sp + 96 -> (r11)
//     const void*restrict w,              sp + 100 -> r9
//     uint8_t*restrict c,                 sp + 104 -> r6
//     size_t cm_stride,                   sp + 108 -> (r7)
//     size_t cn_stride,                   sp + 112 -> r11
//     const union xnn_f32_default_params params)  sp + 116 -> (r11)

// d8-d15, r4-r11, r14(lr) need to be preserved if used.  r13(sp), r15(pc) are reserved.

// Register usage

// A0  r3   s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3  r0   s6-s7  d3

// B   r9   s8,  s9,  s10, s11  d4-d5
// B        s12, s13, s14, s15  d6-d7

// C0  r6   s16-s17  d8   s18-s19  d9
// C1  r4   s20-s21  d10  s22-s23  d11
// C2  r8   s24-s25  d12  s26-s27  d13
// C3  r7   s28-s29  d14  s30-s31  d15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        # Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH   {d8-d15}                            // +64 = 96

        LDR     r11, [sp, 96]       // Load a_stride
        LDRD    r6, r7, [sp, 104]   // Load c and cm_stride

        # Clamp A and C pointers
        CMP     r0, 2               // if mr >= 2
        ADD     r12, r3, r11        //   a1 = a0 + a_stride
        ADD     r4, r6, r7          //   c1 = c0 + cm_stride
        MOVLO   r12, r3             //   a1
        MOVLO   r4, r6              //   c1

        LDR     r9, [sp, 100]       // Load w

                                    // if mr > 2
        ADD     r10, r12, r11       //   a2 = a1 + a_stride
        ADD     r8, r4, r7          //   c2 = c1 + cm_stride
        MOVLS   r10, r12            //   a2
        MOVLS   r8, r4              //   c2

        CMP     r0, 4               // if mr >= 4
        ADD     r0, r10, r11        //   a3 = a2 + a_stride
        ADD     r7, r8, r7          //   c3 = c2 + cm_stride
        LDR     r11, [sp, 112]      // Load cn_stride
        MOVLO   r0, r10             //   a3
        MOVLO   r7, r8              //   c3

0:
        # Load initial bias from w into accumulators
        VLDM        r9!, {d8-d9}    // Bias
        SUBS        r5, r2, 8
        VMOV.F64    d10, d8
        VMOV.F64    d12, d8
        VMOV.F64    d14, d8
        VMOV.F64    d11, d9
        VMOV.F64    d13, d9
        VMOV.F64    d15, d9
        BLO         3f              // less than 2 channels?
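
        // Note: the main loop below consumes 2 floats of each A row and two
        // packed 4-wide rows of B per iteration.  A rough scalar sketch of
        // what one iteration accumulates (indexing is illustrative only, not
        // taken from this file):
        //
        //   for (size_t row = 0; row < 4; row++) {        // A0..A3 -> C0..C3
        //     for (size_t col = 0; col < 4; col++) {      // 4 output columns
        //       c[row][col] += a[row][k + 0] * b[k + 0][col];  // B0: s8-s11
        //       c[row][col] += a[row][k + 1] * b[k + 1][col];  // B1: s12-s15
        //     }
        //   }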

        # Main loop - 2 floats of A (8 bytes)
1:
        VLDM    r3!, {d0}           // A0
        VLDM    r9!, {d4-d5}        // B0
        VLDM    r12!, {d1}          // A1
        VLDM    r10!, {d2}          // A2
        VLDM    r0!, {d3}           // A3

        VMLA.F32 s16, s8, s0
        VMLA.F32 s17, s9, s0
        VMLA.F32 s20, s8, s2
        VMLA.F32 s21, s9, s2
        VMLA.F32 s24, s8, s4
        VMLA.F32 s25, s9, s4
        VMLA.F32 s28, s8, s6
        VMLA.F32 s29, s9, s6

        VLDM    r9!, {d6-d7}        // B1

        VMLA.F32 s18, s10, s0
        VMLA.F32 s19, s11, s0
        VMLA.F32 s22, s10, s2
        VMLA.F32 s23, s11, s2
        VMLA.F32 s26, s10, s4
        VMLA.F32 s27, s11, s4
        VMLA.F32 s30, s10, s6
        VMLA.F32 s31, s11, s6

        VMLA.F32 s16, s12, s1
        VMLA.F32 s17, s13, s1
        VMLA.F32 s20, s12, s3
        VMLA.F32 s21, s13, s3
        VMLA.F32 s24, s12, s5
        VMLA.F32 s25, s13, s5
        VMLA.F32 s28, s12, s7
        VMLA.F32 s29, s13, s7

        SUBS    r5, r5, 8

        VMLA.F32 s18, s14, s1
        VMLA.F32 s19, s15, s1
        VMLA.F32 s22, s14, s3
        VMLA.F32 s23, s15, s3
        VMLA.F32 s26, s14, s5
        VMLA.F32 s27, s15, s5
        VMLA.F32 s30, s14, s7
        VMLA.F32 s31, s15, s7

        BHS     1b

        # Is there a remainder? - 1 float of A (4 bytes)
        TST     r5, 4
        BNE     3f

2:
        SUBS    r1, r1, 4
        BLO     4f

        # Store full 4 x 4
        VSTM    r6, {d8-d9}
        SUB     r0, r0, r2
        ADD     r6, r11
        VSTM    r4, {d10-d11}
        SUB     r10, r10, r2
        ADD     r4, r11
        VSTM    r8, {d12-d13}
        SUB     r12, r12, r2
        ADD     r8, r11
        VSTM    r7, {d14-d15}
        SUB     r3, r3, r2
        ADD     r7, r11
        BHI     0b

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

3:
        # Remainder - 1 float of A (4 bytes)
        VLDM    r3!, {s0}           // A0
        VLDM    r9!, {d6-d7}        // B
        VLDM    r12!, {s1}          // A1
        VLDM    r10!, {s2}          // A2
        VLDM    r0!, {s3}           // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0

        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1

        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2

        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3

        B       2b

        # Store odd width
4:
        TST     r1, 2
        BEQ     5f
        VSTM    r6!, {d8}
        VMOV.F32 s16, s18
        VSTM    r4!, {d10}
        VMOV.F32 s20, s22
        VSTM    r8!, {d12}
        VMOV.F32 s24, s26
        VSTM    r7!, {d14}
        VMOV.F32 s28, s30

5:
        TST     r1, 1
        BEQ     6f
        VSTR    s16, [r6]
        VSTR    s20, [r4]
        VSTR    s24, [r8]
        VSTR    s28, [r7]

6:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
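
// A rough, hand-written C reference for this microkernel's contract (a sketch,
// not part of XNNPACK and not generated from it).  It assumes kc is given in
// bytes and that w is packed per 4-column block as [4 bias floats | 4 weights
// per float of kc], matching the VLDM pattern above; the pointer clamping for
// mr < 4 just makes the extra rows alias the last valid one, so they can be
// ignored here.
//
//   void gemm_4x4_reference(size_t mr, size_t nc, size_t kc,
//                           const float* a, size_t a_stride,
//                           const float* w,
//                           float* c, size_t cm_stride, size_t cn_stride)
//   {
//     do {
//       const size_t n = nc < 4 ? nc : 4;            // columns in this block
//       for (size_t m = 0; m < mr; m++) {
//         const float* am = (const float*) ((uintptr_t) a + m * a_stride);
//         float* cm = (float*) ((uintptr_t) c + m * cm_stride);
//         for (size_t j = 0; j < n; j++) {
//           float acc = w[j];                         // bias
//           for (size_t k = 0; k < kc / sizeof(float); k++) {
//             acc += am[k] * w[4 + k * 4 + j];        // packed weights
//           }
//           cm[j] = acc;
//         }
//       }
//       w += 4 + (kc / sizeof(float)) * 4;            // next weight block
//       c = (float*) ((uintptr_t) c + cn_stride);     // next column block
//       nc -= n;
//     } while (nc != 0);
//   }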