// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                                      r0
//     size_t nc,                                      r1
//     size_t kc,                                      r2 -> r5
//     const uint8_t*restrict a,                       r3
//     size_t a_stride,                    sp + 96 -> (r11)
//     const void*restrict w,              sp + 100 -> r9
//     uint8_t*restrict c,                 sp + 104 -> r6
//     size_t cm_stride,                   sp + 108 -> (r7)
//     size_t cn_stride,                   sp + 112 -> r11
//     const union xnn_f32_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Register usage

// A0  r3   s0-s1   d0
// A1  r12  s2-s3   d1
// A2  r10  s4-s5   d2
// A3  r0   s6-s7   d3

// B   r9   s12, s13, s14, s15  d6-d7
// B        s10, s11, s12, s13  d5-d6

// C0  r6   s16-s17  d8   s18-s19  d9
// C1  r4   s20-s21  d10  s22-s23  d11
// C2  r8   s24-s25  d12  s26-s27  d13
// C3  r7   s28-s29  d14  s30-s31  d15

// Clamp (r5)  s8, s9  d4

// A hedged scalar C sketch of the computation is included as a comment block
// at the end of this file.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        # Push 96 bytes
        PUSH  {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH {d8-d15}                            // +64 = 96

        LDR   r11, [sp, 96]        // Load a_stride
        LDRD  r6, r7, [sp, 104]    // Load c and cm_stride
        LDR   r5, [sp, 116]        // Load params

        # Clamp A and C pointers
        CMP   r0, 2                // if mr >= 2
        ADD   r12, r3, r11         // a1 = a0 + a_stride
        ADD   r4, r6, r7           // c1 = c0 + cm_stride
        MOVLO r12, r3              // a1
        MOVLO r4, r6               // c1

        LDR   r9, [sp, 100]        // Load w

                                   // if mr > 2 (flags still set by CMP r0, 2)
        ADD   r10, r12, r11        // a2 = a1 + a_stride
        ADD   r8, r4, r7           // c2 = c1 + cm_stride
        MOVLS r10, r12             // a2
        MOVLS r8, r4               // c2

        VLDR  d4, [r5]             // Load min/max values

        CMP   r0, 4                // if mr >= 4
        ADD   r0, r10, r11         // a3 = a2 + a_stride
        ADD   r7, r8, r7           // c3 = c2 + cm_stride
        LDR   r11, [sp, 112]       // Load cn_stride
        MOVLO r0, r10              // a3
        MOVLO r7, r8               // c3

0:
        # Load initial bias from w into accumulators
        VLDM  r9!, {d8-d9}         // Bias
        SUBS  r5, r2, 8
        VMOV.F64 d10, d8
        VMOV.F64 d12, d8
        VMOV.F64 d14, d8
        VMOV.F64 d11, d9
        VMOV.F64 d13, d9
        VMOV.F64 d15, d9
        BLO   3f                   // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
1:
        VLDM  r3!, {d0}            // A0
        VLDM  r9!, {d6-d7}         // B0
        VLDM  r12!, {d1}           // A1
        VLDM  r10!, {d2}           // A2
        VLDM  r0!, {d3}            // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s20, s12, s2
        VMLA.F32 s21, s13, s2
        VMLA.F32 s24, s12, s4
        VMLA.F32 s25, s13, s4
        VMLA.F32 s28, s12, s6
        VMLA.F32 s29, s13, s6

        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0
        VMLA.F32 s22, s14, s2
        VMLA.F32 s23, s15, s2
        VLDM  r9!, {d5-d6}         // B1
        VMLA.F32 s26, s14, s4
        VMLA.F32 s27, s15, s4
        VMLA.F32 s30, s14, s6
        VMLA.F32 s31, s15, s6

        VMLA.F32 s16, s10, s1
        VMLA.F32 s17, s11, s1
        VMLA.F32 s20, s10, s3
        VMLA.F32 s21, s11, s3
        VMLA.F32 s24, s10, s5
        VMLA.F32 s25, s11, s5
        VMLA.F32 s28, s10, s7
        VMLA.F32 s29, s11, s7

        SUBS  r5, r5, 8

        VMLA.F32 s18, s12, s1
        VMLA.F32 s19, s13, s1
        VMLA.F32 s22, s12, s3
        VMLA.F32 s23, s13, s3
        VMLA.F32 s26, s12, s5
        VMLA.F32 s27, s13, s5
        VMLA.F32 s30, s12, s7
        VMLA.F32 s31, s13, s7

        BHS   1b

        # Is there a remainder? - 1 float of A (4 bytes)
        TST   r5, 4
        BNE   3f

2:
        # Clamp accumulators to [min, max]: s8 = min (PL-conditional moves), s9 = max (MI-conditional moves)
        VCMPE.F32 s8, s16
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s17
        VMOVPL.F32 s16, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s18
        VMOVPL.F32 s17, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s19
        VMOVPL.F32 s18, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s20
        VMOVPL.F32 s19, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s21
        VMOVPL.F32 s20, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s22
        VMOVPL.F32 s21, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s23
        VMOVPL.F32 s22, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s24
        VMOVPL.F32 s23, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s25
        VMOVPL.F32 s24, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s26
        VMOVPL.F32 s25, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s27
        VMOVPL.F32 s26, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s28
        VMOVPL.F32 s27, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s29
        VMOVPL.F32 s28, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s30
        VMOVPL.F32 s29, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s8, s31
        VMOVPL.F32 s30, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s16
        VMOVPL.F32 s31, s8
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s17
        VMOVMI.F32 s16, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s18
        VMOVMI.F32 s17, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s19
        VMOVMI.F32 s18, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s20
        VMOVMI.F32 s19, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s21
        VMOVMI.F32 s20, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s22
        VMOVMI.F32 s21, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s23
        VMOVMI.F32 s22, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s24
        VMOVMI.F32 s23, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s25
        VMOVMI.F32 s24, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s26
        VMOVMI.F32 s25, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s27
        VMOVMI.F32 s26, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s28
        VMOVMI.F32 s27, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s29
        VMOVMI.F32 s28, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s30
        VMOVMI.F32 s29, s9
        VMRS  APSR_nzcv, FPSCR
        VCMPE.F32 s9, s31
        VMOVMI.F32 s30, s9
        VMRS  APSR_nzcv, FPSCR
        VMOVMI.F32 s31, s9

        SUBS  r1, r1, 4
        BLO   4f

        # Store full 4 x 4
        VSTM  r6, {d8-d9}
        SUB   r0, r0, r2
        ADD   r6, r11
        VSTM  r4, {d10-d11}
        SUB   r10, r10, r2
        ADD   r4, r11
        VSTM  r8, {d12-d13}
        SUB   r12, r12, r2
        ADD   r8, r11
        VSTM  r7, {d14-d15}
        SUB   r3, r3, r2
        ADD   r7, r11
        BHI   0b

        VPOP  {d8-d15}
        POP   {r4, r5, r6, r7, r8, r9, r10, r11}
        BX    lr

3:
        # Remainder - 1 float of A (4 bytes)
        VLDM  r3!, {s0}            // A0
        VLDM  r9!, {d6-d7}         // B
        VLDM  r12!, {s1}           // A1
        VLDM  r10!, {s2}           // A2
        VLDM  r0!, {s3}            // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0

        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1

        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2

        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3

        B     2b

        # Store odd width
4:
        TST   r1, 2
        BEQ   5f
        VSTM  r6!, {d8}
        VMOV.F32 s16, s18
        VSTM  r4!, {d10}
        VMOV.F32 s20, s22
        VSTM  r8!, {d12}
        VMOV.F32 s24, s26
        VSTM  r7!, {d14}
        VMOV.F32 s28, s30

5:
        TST   r1, 1
        BEQ   6f
        VSTR  s16, [r6]
        VSTR  s20, [r4]
        VSTR  s24, [r8]
        VSTR  s28, [r7]

6:
        VPOP  {d8-d15}
        POP   {r4, r5, r6, r7, r8, r9, r10, r11}
        BX    lr

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
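
// Reference sketch (not part of the kernel above; kept in a comment so the file
// still assembles).  This is a hedged, illustrative C approximation of what one
// pass of the micro-kernel computes: an mr x 4 tile of C = A*B, seeded from the
// 4 bias values packed at the head of w and clamped to [min, max].  The helper
// name gemm_4x4_tile_ref and its parameter list are assumptions made for
// illustration; the nc loop, the mr < 4 pointer reuse, and the nc % 4 tail
// handling of the assembly are omitted.  As in the assembly, kc, a_stride and
// cm_stride are byte counts.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   static void gemm_4x4_tile_ref(
//       size_t mr, size_t kc,               // kc = bytes of K per row of A
//       const float* a, size_t a_stride,    // a_stride in bytes
//       const float* w,                     // packed: 4 biases, then 4 floats per k step
//       float* c, size_t cm_stride,         // cm_stride in bytes
//       float min, float max)
//   {
//     const size_t k = kc / sizeof(float);
//     for (size_t i = 0; i < mr; i++) {
//       const float* ai = (const float*) ((uintptr_t) a + i * a_stride);
//       float* ci = (float*) ((uintptr_t) c + i * cm_stride);
//       for (size_t j = 0; j < 4; j++) {
//         float acc = w[j];                     // bias, cf. "VLDM r9!, {d8-d9}"
//         for (size_t p = 0; p < k; p++) {
//           acc += ai[p] * w[4 + p * 4 + j];    // the products formed by the VMLA.F32 chains
//         }
//         if (acc < min) acc = min;             // lower clamp (VCMPE/VMOVPL with s8)
//         if (acc > max) acc = max;             // upper clamp (VCMPE/VMOVMI with s9)
//         ci[j] = acc;
//       }
//     }
//   }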