// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
#     size_t channels,                   x0, x20
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3, x19
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               x7
#     const float* zero,                 [sp] -> x17
#     const xnn_f32_minmax_params params [sp + 8] -> (x16)
#
# Stack offsets above are relative to sp on entry: both stack arguments are
# loaded before x19/x20 are pushed.
#
# For each of output_width pixels, and for each group of 4 channels, computes
#   out = clamp(bias + sum over k in 0..8 of w[k] * in[k], min, max)
# using one fused multiply-add per tap, then stores the result to output.
#
# Weights are read sequentially, 10 vectors of 4 floats per channel group:
# the first vector seeds the accumulator (bias), the next 9 are the taps.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# inputs
# i0  x8 v21
# i1  x9 v22
# i2 x10 v23
# i3 x11 v24
# i4 x12 v25
# i5 x13 v26
# i6 x14 v27
# i7 x15 v28
# i8 x16 v29

# weights
# x19 v0 (acc) v1 v2 v3 v4 v5 v6 v7 v16 v17

# Clamp v30 v31

# unused v18 v19 v20

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        # Load zero, params pointer (the two stack-passed arguments)
        LDP x17, x16, [sp]

        # Save x19,x20 on stack (callee-saved, used for weights and channel count)
        STP x19, x20, [sp, -16]!

        # Load min/max values: v30 is used as the lower bound (FMAX),
        # v31 as the upper bound (FMIN). After this load the params pointer
        # is no longer needed, so x16 is reused below as input pointer i8.
        LD2R {v30.4s, v31.4s}, [x16]

0:
        # Per-pixel loop entry.
        # Load 9 input pointers
        LDP x8, x9, [x2]
        LDP x10, x11, [x2, 16]
        LDP x12, x13, [x2, 32]
        LDP x14, x15, [x2, 48]
        LDR x16, [x2, 64]

        # Apply input_offset to every pointer except those equal to zero,
        # which are left unchanged.
        CMP x8, x17                     // i0 == zero ?
        ADD x8, x8, x7                  // i0 + input_offset
        CSEL x8, x17, x8, EQ            // i0 = (i0 == zero) ? zero : i0 + input_offset
        CMP x9, x17                     // i1 == zero ?
        ADD x9, x9, x7                  // i1 + input_offset
        CSEL x9, x17, x9, EQ            // i1 = (i1 == zero) ? zero : i1 + input_offset
        CMP x10, x17                    // i2 == zero ?
        ADD x10, x10, x7                // i2 + input_offset
        CSEL x10, x17, x10, EQ          // i2 = (i2 == zero) ? zero : i2 + input_offset
        CMP x11, x17                    // i3 == zero ?
        ADD x11, x11, x7                // i3 + input_offset
        CSEL x11, x17, x11, EQ          // i3 = (i3 == zero) ? zero : i3 + input_offset
        CMP x12, x17                    // i4 == zero ?
        ADD x12, x12, x7                // i4 + input_offset
        CSEL x12, x17, x12, EQ          // i4 = (i4 == zero) ? zero : i4 + input_offset
        CMP x13, x17                    // i5 == zero ?
        ADD x13, x13, x7                // i5 + input_offset
        CSEL x13, x17, x13, EQ          // i5 = (i5 == zero) ? zero : i5 + input_offset
        CMP x14, x17                    // i6 == zero ?
        ADD x14, x14, x7                // i6 + input_offset
        CSEL x14, x17, x14, EQ          // i6 = (i6 == zero) ? zero : i6 + input_offset
        CMP x15, x17                    // i7 == zero ?
        ADD x15, x15, x7                // i7 + input_offset
        CSEL x15, x17, x15, EQ          // i7 = (i7 == zero) ? zero : i7 + input_offset
        CMP x16, x17                    // i8 == zero ?
        ADD x16, x16, x7                // i8 + input_offset
        CSEL x16, x17, x16, EQ          // i8 = (i8 == zero) ? zero : i8 + input_offset

        # input += input_stride
        ADD x2, x2, x5

        # x20 := c = channels
        # c -= 4
        SUBS x20, x0, 4
        # x19 := w = weights (restarted from the beginning for every pixel)
        MOV x19, x3

        # skip main loop if channels < 4 (MOV leaves the SUBS flags intact)
        B.LO 2f
1:
        # Main loop: 4 channels per iteration.
        LDR q21, [x8], 16               // load 9 inputs
        LDP q0, q1, [x19], 32           // load bias and 9 weights
        LDR q22, [x9], 16
        LDR q23, [x10], 16
        LDR q24, [x11], 16
        LDR q25, [x12], 16
        LDR q26, [x13], 16
        LDR q27, [x14], 16
        LDR q28, [x15], 16
        LDR q29, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q16, q17, [x19], 32

        FMLA v0.4S, v1.4S, v21.4S       // acc += w0 * i0
        FMLA v0.4S, v2.4S, v22.4S       // acc += w1 * i1
        FMLA v0.4S, v3.4S, v23.4S       // acc += w2 * i2
        FMLA v0.4S, v4.4S, v24.4S       // acc += w3 * i3
        FMLA v0.4S, v5.4S, v25.4S       // acc += w4 * i4
        FMLA v0.4S, v6.4S, v26.4S       // acc += w5 * i5
        FMLA v0.4S, v7.4S, v27.4S       // acc += w6 * i6
        FMLA v0.4S, v16.4S, v28.4S      // acc += w7 * i7
        FMLA v0.4S, v17.4S, v29.4S      // acc += w8 * i8
        SUBS x20, x20, 4                // c -= 4; flags are consumed by B.HS below

        FMAX v0.4S, v0.4S, v30.4S       // clamp to lower bound
        FMIN v0.4S, v0.4S, v31.4S       // clamp to upper bound
        STR q0, [x4], 16                // store 4 outputs
        B.HS 1b                         // loop while at least 4 channels remain

2:
        # Is there a remainder?- 1 to 3 channels
        # x20 holds (remaining - 4); subtracting 4 keeps the low two bits,
        # so bits 1:0 of x20 still equal the remainder count.
        TST x20, 3
        B.EQ 4f

        # Remainder path: a full 4-float vector is loaded from every input and
        # a full group of weights is read; only the low 1 to 3 lanes of the
        # result are stored.
        # NOTE(review): presumably callers guarantee these over-reads are
        # safe - confirm against the operator setup code.
        LDR q21, [x8], 16               // load 9 inputs
        LDP q0, q1, [x19], 32           // load bias and 9 weights
        LDR q22, [x9], 16
        LDR q23, [x10], 16
        LDR q24, [x11], 16
        LDR q25, [x12], 16
        LDR q26, [x13], 16
        LDR q27, [x14], 16
        LDR q28, [x15], 16
        LDR q29, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q16, q17, [x19], 32

        FMLA v0.4S, v1.4S, v21.4S
        FMLA v0.4S, v2.4S, v22.4S
        FMLA v0.4S, v3.4S, v23.4S
        FMLA v0.4S, v4.4S, v24.4S
        FMLA v0.4S, v5.4S, v25.4S
        FMLA v0.4S, v6.4S, v26.4S
        FMLA v0.4S, v7.4S, v27.4S
        FMLA v0.4S, v16.4S, v28.4S
        FMLA v0.4S, v17.4S, v29.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        TBZ x20, 1, 3f                  // bit 1 clear: exactly 1 channel left

        STR d0, [x4], 8                 // store 2 outputs
        DUP d0, v0.D[1]                 // move lanes 2,3 down to lanes 0,1
        TBZ x20, 0, 4f                  // bit 0 clear: exactly 2 channels, done
3:
        STR s0, [x4], 4                 // store the final single output
4:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment (ADD does not modify the SUBS flags)
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore x19,x20 from stack
        LDP x19, x20, [sp], 16
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif