// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
#     size_t channels,                   x0, x20
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3, x19
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               x7
#     const float* zero,                 [sp] -> x17
#     const xnn_f32_minmax_params params [sp + 8] -> (x16)
#
# The two stack arguments are read from [sp] and [sp + 8] on entry, before
# x19/x20 are pushed.
#
# For each of the output_width pixels, and for each group of 4 channels:
#   acc = bias + w0*i0 + w1*i1 + ... + w8*i8
#   out = min(max(acc, params[0]), params[1])
# The weights are consumed as 160 bytes per 4-channel group: one 4-float bias
# vector followed by nine 4-float weight vectors.
# A trailing group of 1 to 3 channels is computed on full 4-lane vectors and
# only the low 1 to 3 lanes are stored.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only x19 and x20 are used here, so they are the only registers saved.

# inputs
# i0  x8 v21
# i1  x9 v22
# i2 x10 v23
# i3 x11 v24
# i4 x12 v25
# i5 x13 v26
# i6 x14 v27
# i7 x15 v28
# i8 x16 v29   (x16 holds the params pointer only until the clamp values are loaded)

# weights
# x19 v0 (acc) v1 v2 v3 v4 v5 v6 v7 v16 v17

# Clamp v30 (lower bound, applied with FMAX) v31 (upper bound, applied with FMIN)

# unused v18 v19 v20

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        # Load zero, params pointer (must happen before sp is moved below)
        LDP x17, x16, [sp]

        # Save x19,x20 on stack
        STP x19, x20, [sp, -16]!

        # Load min/max values: first float broadcast to v30, second to v31
        LD2R {v30.4s, v31.4s}, [x16]

0:
        # Per-pixel loop.
        # Load 9 input pointers
        LDP x8, x9, [x2]
        LDP x10, x11, [x2, 16]
        LDP x12, x13, [x2, 32]
        LDP x14, x15, [x2, 48]
        LDR x16, [x2, 64]

        # A pointer equal to the zero buffer is used as is; every other
        # pointer is advanced by input_offset.
        CMP x8, x17                     // if i0 == zero
        ADD x8, x8, x7                  // i0 += input_offset
        CSEL x8, x17, x8, EQ            // i0 = zero, else i0 + input_offset
        CMP x9, x17                     // if i1 == zero
        ADD x9, x9, x7                  // i1 += input_offset
        CSEL x9, x17, x9, EQ            // i1 = zero, else i1 + input_offset
        CMP x10, x17                    // if i2 == zero
        ADD x10, x10, x7                // i2 += input_offset
        CSEL x10, x17, x10, EQ          // i2 = zero, else i2 + input_offset
        CMP x11, x17                    // if i3 == zero
        ADD x11, x11, x7                // i3 += input_offset
        CSEL x11, x17, x11, EQ          // i3 = zero, else i3 + input_offset
        CMP x12, x17                    // if i4 == zero
        ADD x12, x12, x7                // i4 += input_offset
        CSEL x12, x17, x12, EQ          // i4 = zero, else i4 + input_offset
        CMP x13, x17                    // if i5 == zero
        ADD x13, x13, x7                // i5 += input_offset
        CSEL x13, x17, x13, EQ          // i5 = zero, else i5 + input_offset
        CMP x14, x17                    // if i6 == zero
        ADD x14, x14, x7                // i6 += input_offset
        CSEL x14, x17, x14, EQ          // i6 = zero, else i6 + input_offset
        CMP x15, x17                    // if i7 == zero
        ADD x15, x15, x7                // i7 += input_offset
        CSEL x15, x17, x15, EQ          // i7 = zero, else i7 + input_offset
        CMP x16, x17                    // if i8 == zero
        ADD x16, x16, x7                // i8 += input_offset
        CSEL x16, x17, x16, EQ          // i8 = zero, else i8 + input_offset

        # input += input_stride
        ADD x2, x2, x5

        # x20 := c = channels
        # c -= 4
        SUBS x20, x0, 4
        # x19 := w = weights (rewound to the start for every pixel)
        MOV x19, x3

        # skip main loop if c < 4 (unsigned borrow from the SUBS above)
        B.LO 2f
1:
        # Main loop: 4 channels per iteration.
        LDR q21, [x8], 16               // load 9 inputs
        LDP q0, q1, [x19], 32           // load bias and 9 weights
        LDR q22, [x9], 16
        LDR q23, [x10], 16
        LDR q24, [x11], 16
        LDR q25, [x12], 16
        LDR q26, [x13], 16
        LDR q27, [x14], 16
        LDR q28, [x15], 16
        LDR q29, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q16, q17, [x19], 32

        FMLA v0.4S, v1.4S, v21.4S
        FMLA v0.4S, v2.4S, v22.4S
        FMLA v0.4S, v3.4S, v23.4S
        FMLA v0.4S, v4.4S, v24.4S
        FMLA v0.4S, v5.4S, v25.4S
        FMLA v0.4S, v6.4S, v26.4S
        FMLA v0.4S, v7.4S, v27.4S
        FMLA v0.4S, v16.4S, v28.4S
        FMLA v0.4S, v17.4S, v29.4S
        SUBS x20, x20, 4                // c -= 4; flags consumed by B.HS below

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S
        STR q0, [x4], 16
        B.HS 1b                         // FMAX/FMIN/STR leave the flags untouched

2:
        # Is there a remainder?- 1 to 3 channels
        # x20 is negative here (remainder - 4); its low two bits equal
        # channels & 3.
        TST x20, 3
        B.EQ 4f

        # NOTE(review): the remainder path still loads a full 16 bytes from
        # each input row and a full 160-byte weight group - presumably callers
        # permit the over-read; confirm.
        LDR q21, [x8], 16               // load 9 inputs
        LDP q0, q1, [x19], 32           // load bias and 9 weights
        LDR q22, [x9], 16
        LDR q23, [x10], 16
        LDR q24, [x11], 16
        LDR q25, [x12], 16
        LDR q26, [x13], 16
        LDR q27, [x14], 16
        LDR q28, [x15], 16
        LDR q29, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q16, q17, [x19], 32

        FMLA v0.4S, v1.4S, v21.4S
        FMLA v0.4S, v2.4S, v22.4S
        FMLA v0.4S, v3.4S, v23.4S
        FMLA v0.4S, v4.4S, v24.4S
        FMLA v0.4S, v5.4S, v25.4S
        FMLA v0.4S, v6.4S, v26.4S
        FMLA v0.4S, v7.4S, v27.4S
        FMLA v0.4S, v16.4S, v28.4S
        FMLA v0.4S, v17.4S, v29.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        TBZ x20, 1, 3f                  // bit 1 clear: exactly 1 channel left

        STR d0, [x4], 8                 // store 2 channels
        DUP d0, v0.D[1]                 // move the upper 2 lanes down
        TBZ x20, 0, 4f                  // bit 0 clear: exactly 2 channels, done
3:
        STR s0, [x4], 4                 // store the final single channel
4:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment (ADD does not alter the flags from SUBS)
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore x19,x20 from stack
        LDP x19, x20, [sp], 16
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif