// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Computes a single row of C, 16 half-precision columns per pass:
#     c0[n] = min(max(bias[n] + sum_k(a0[k] * w[k][n]), v4), v5)
#
# Units, as the code below uses them:
#     kc        - bytes of A per row (a0 is rewound by kc after each pass)
#     cn_stride - bytes added to c0 after each full 16-column store
#     nc        - output columns still to be written (decremented by 16)
#
# w is consumed sequentially: 16 bias halffloats (32 bytes) per pass, then
# 16 halffloats (32 bytes) for every halffloat of A.
#
# NOTE(review): kc is never validated here.  The code presumes a nonzero
# multiple of 2 bytes; confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x3, x5, x6, x8, x14, v0, v4, v5 and v16-v23,
# so nothing is saved or restored.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 (lower bound), v5 (upper bound)

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values: two consecutive halffloats at [x8], each
        # replicated across all 8 lanes (first -> v4, second -> v5)
        LD2R    {v4.8h, v5.8h}, [x8]

        # Outer loop - one pass per group of up to 16 output columns
0:
        # Load initial bias from w into accumulators
        LDP     q16, q17, [x5], 32

        MOVI    v18.8h, 0               // second set of C for pipelining FMLA
        MOVI    v19.8h, 0

        # Are there at least 2 halffloats (4 bytes) of A?
        SUBS    x0, x2, 4               // k = kc - 4

        B.LO    3f                      // kc < 4: only the remainder path runs

        # Main loop - 2 halffloats of A (4 bytes)
        # Even k accumulates into v16/v17, odd k into v18/v19
1:
        LDR     s0,  [x3], 4
        LDR     q20, [x5, 0]
        LDR     q21, [x5, 16]
        LDR     q22, [x5, 32]
        LDR     q23, [x5, 48]
        SUBS    x0, x0, 4               // flags consumed by B.HS below
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v17.8h, v21.8h, v0.h[0]
        FMLA    v18.8h, v22.8h, v0.h[1]
        FMLA    v19.8h, v23.8h, v0.h[1]
        ADD     x5, x5, 64
        B.HS    1b

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        # x0 is now -4 (none left, bit 1 clear) or -2 (one left, bit 1 set)
        TBNZ    x0, 1, 3f

        # Merge both accumulator sets, clamp and store
2:
        FADD    v16.8h, v16.8h, v18.8h
        FADD    v17.8h, v17.8h, v19.8h
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI

        # Clamp
        FMAX    v16.8h, v16.8h, v4.8h
        FMAX    v17.8h, v17.8h, v4.8h
        FMIN    v16.8h, v16.8h, v5.8h
        FMIN    v17.8h, v17.8h, v5.8h

        # Store full 1 x 16, unless fewer than 16 columns remained
        B.LO    4f

        STP     q16, q17, [x6]
        ADD     x6, x6, x14

        SUB     x3,  x3, x2             // a0 -= kc

        # No instruction since SUBS x1 has written the flags
        B.HI    0b                      // more columns left

        RET

3:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     h0, [x3], 2
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v17.8h, v21.8h, v0.h[0]
        B       2b

        # Store odd channels
        # Subtracting 16 left bits 0-3 of x1 equal to those of the remaining
        # column count, so each bit selects an 8, 4, 2 or 1 halffloat store.
        # After each store the not yet written lanes are moved down to lane 0.
4:
        TBZ     x1, 3, 5f
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b

5:
        TBZ     x1, 2, 6f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]

6:
        TBZ     x1, 1, 7f
        STR     s16, [x6], 4
        DUP     s16, v16.s[1]

7:
        TBZ     x1, 0, 8f
        STR     h16, [x6]
8:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif