1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ROW_TILE >= 1 7$assert ACCUMULATORS >= 1 8#include <assert.h> 9 10#include <arm_neon.h> 11 12#include <xnnpack/dwconv.h> 13#include <xnnpack/math.h> 14 15 16void xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 17 size_t input_height, 18 size_t input_width, 19 const void* input, 20 const void* weights, 21 const void* zero, 22 void* output, 23 uint32_t padding_top, 24 const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 25{ 26 assert(input_height != 0); 27 assert(input_width != 0); 28 assert(input_width % sizeof(__fp16) == 0); 29 assert(padding_top >= 0); 30 assert(padding_top <= 1); 31 32 const uint16x4_t vmask_even = vld1_u16(params->neonfp16arith.mask_even); 33 const uint16x4_t vmask_odd = vld1_u16(params->neonfp16arith.mask_odd); 34 const float16x4_t vmax = vld1_dup_f16(¶ms->neonfp16arith.max); 35 const float16x4_t vmin = vld1_dup_f16(¶ms->neonfp16arith.min); 36 37 const __fp16* w0 = (const __fp16*)weights; 38 const float16x8_t vw01234567 = vld1q_f16(w0); 39 const float16x4_t vw89 = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 8), vmov_n_u32(0), 0)); 40 41 const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(__fp16)); 42 $if ROW_TILE > 1: 43 const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(__fp16)) / 2, sizeof(__fp16)); 44 45 const __fp16* i0 = (const __fp16*) ((uintptr_t) input - ((-padding_top) & input_width)); 46 const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_width); 47 if XNN_UNPREDICTABLE(padding_top != 0) { 48 i0 = zero; 49 } 50 $for M in range(2, 1 + 2 * ROW_TILE): 51 const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_width); 52 53 __fp16* o0 = output; 54 $for M in range(1, ROW_TILE): 55 __fp16* o${M} = (__fp16*) ((uintptr_t) o${M-1} + output_width); 56 57 size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */; 58 size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; 59 do { 60 $for M in range(2, 1 + 2 * ROW_TILE): 61 if XNN_UNPREDICTABLE(padded_input_height < ${2 + M}) { 62 i${M} = zero; 63 $if M % 2 == 1: 64 o${(M - 1) // 2} = o${(M - 1) // 2 - 1}; 65 } 66 67 $for M in range(1 + 2 * ROW_TILE): 68 float16x4_t vi${M}x1357 = vmov_n_f16(0); 69 70 size_t w = input_width; 71 for (; w >= 8 * sizeof(__fp16); w -= 8 * sizeof(__fp16)) { 72 $for M in range(ROW_TILE): 73 float16x4_t vo${M}p0 = vdup_laneq_f16(vw01234567, 0); 74 75 $for M in range(1 + 2 * ROW_TILE): 76 const float16x4x2_t vi${M}x8ACE9BDF = vld2_f16(i${M}); i${M} += 8; 77 78 $for M in range(ROW_TILE): 79 $if ACCUMULATORS > 1: 80 float16x4_t vo${M}p1 = vmul_laneq_f16(vi${2*M}x8ACE9BDF.val[0], vw01234567, 2); 81 $else: 82 vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M}x8ACE9BDF.val[0], vw01234567, 2); 83 84 $for M in range(ROW_TILE): 85 $if ACCUMULATORS > 2: 86 float16x4_t vo${M}p2 = vmul_laneq_f16(vi${2*M+1}x8ACE9BDF.val[0], vw01234567, 5); 87 $else: 88 vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M+1}x8ACE9BDF.val[0], vw01234567, 5); 89 90 $for M in range(ROW_TILE): 91 $if ACCUMULATORS > 3: 92 float16x4_t vo${M}p3 = vmul_lane_f16(vi${2*M+2}x8ACE9BDF.val[0], vw89, 0); 93 $else: 94 vo${M}p${4 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${4 % ACCUMULATORS}, vi${2*M+2}x8ACE9BDF.val[0], vw89, 0); 95 96 $for M in range(1 + 2 * ROW_TILE): 97 const float16x4_t vi${M}x7BDF = vext_f16(vi${M}x1357, vi${M}x8ACE9BDF.val[1], 3); 98 vi${M}x1357 = vi${M}x8ACE9BDF.val[1]; 99 100 $for M in range(ROW_TILE): 101 $if ACCUMULATORS > 4: 102 float16x4_t vo${M}p4 = vmul_laneq_f16(vi${2*M}x7BDF, vw01234567, 1); 103 $else: 104 vo${M}p${5 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${5 % ACCUMULATORS}, vi${2*M}x7BDF, vw01234567, 1); 105 106 $for M in range(ROW_TILE): 107 $if ACCUMULATORS > 5: 108 float16x4_t vo${M}p5 = vmul_laneq_f16(vi${2*M+1}x7BDF, vw01234567, 4); 109 $else: 110 vo${M}p${6 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${6 % ACCUMULATORS}, vi${2*M+1}x7BDF, vw01234567, 4); 111 112 $for M in range(ROW_TILE): 113 $if ACCUMULATORS > 6: 114 float16x4_t vo${M}p6 = vmul_laneq_f16(vi${2*M+2}x7BDF, vw01234567, 5); 115 $else: 116 vo${M}p${7 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${7 % ACCUMULATORS}, vi${2*M+2}x7BDF, vw01234567, 7); 117 118 $for M in range(ROW_TILE): 119 vo${M}p${8 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${8 % ACCUMULATORS}, vi${2*M}x8ACE9BDF.val[1], vw01234567, 3); 120 121 $for M in range(ROW_TILE): 122 vo${M}p${9 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${9 % ACCUMULATORS}, vi${2*M+1}x8ACE9BDF.val[1], vw01234567, 6); 123 124 $for M in range(ROW_TILE): 125 vo${M}p${10 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${10 % ACCUMULATORS}, vi${2*M+2}x8ACE9BDF.val[1], vw89, 1); 126 127 $if ACCUMULATORS > 1: 128 $ACC_SLICE = 1 129 $while ACC_SLICE < ACCUMULATORS: 130 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 131 $if A + ACC_SLICE < ACCUMULATORS: 132 $for M in range(ROW_TILE): 133 vo${M}p${A} = vadd_f16(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 134 $ACC_SLICE *= 2 135 136 $for M in range(ROW_TILE): 137 float16x4_t vo${M} = vmax_f16(vo${M}p0, vmin); 138 139 $for M in range(ROW_TILE): 140 vo${M} = vmin_f16(vo${M}, vmax); 141 142 $for M in reversed(range(ROW_TILE)): 143 vst1_f16(o${M}, vo${M}); o${M} += 4; 144 } 145 // Last block has 0-7 pixels to process. 146 assert(w < 8 * sizeof(__fp16)); 147 if XNN_LIKELY(w != 0) { 148 $for M in range(ROW_TILE): 149 float16x4_t vo${M}p0 = vdup_laneq_f16(vw01234567, 0); 150 151 $for M in range(1 + 2 * ROW_TILE): 152 const float16x4x2_t vi${M}x8ACE9BDF = vld2_f16(i${M}); 153 154 $for M in range(1 + 2 * ROW_TILE): 155 const float16x4_t vi${M}x8ACE = vreinterpret_f16_u16(vand_u16(vmask_even, vreinterpret_u16_f16(vi${M}x8ACE9BDF.val[0]))); 156 const float16x4_t vi${M}x9BDF = vreinterpret_f16_u16(vand_u16(vmask_odd, vreinterpret_u16_f16(vi${M}x8ACE9BDF.val[1]))); 157 158 $for M in range(ROW_TILE): 159 $if ACCUMULATORS > 1: 160 float16x4_t vo${M}p1 = vmul_laneq_f16(vi${2*M}x8ACE, vw01234567, 2); 161 $else: 162 vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M}x8ACE, vw01234567, 2); 163 164 $for M in range(ROW_TILE): 165 $if ACCUMULATORS > 2: 166 float16x4_t vo${M}p2 = vmul_laneq_f16(vi${2*M+1}x8ACE, vw01234567, 5); 167 $else: 168 vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M+1}x8ACE, vw01234567, 5); 169 170 $for M in range(ROW_TILE): 171 $if ACCUMULATORS > 3: 172 float16x4_t vo${M}p3 = vmul_lane_f16(vi${2*M+2}x8ACE, vw89, 0); 173 $else: 174 vo${M}p${4 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${4 % ACCUMULATORS}, vi${2*M+2}x8ACE, vw89, 0); 175 176 $for M in range(1 + 2 * ROW_TILE): 177 const float16x4_t vi${M}x7BDF = vext_f16(vi${M}x1357, vi${M}x9BDF, 3); 178 179 $for M in range(ROW_TILE): 180 $if ACCUMULATORS > 4: 181 float16x4_t vo${M}p4 = vmul_laneq_f16(vi${2*M}x7BDF, vw01234567, 1); 182 $else: 183 vo${M}p${5 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${5 % ACCUMULATORS}, vi${2*M}x7BDF, vw01234567, 1); 184 185 $for M in range(ROW_TILE): 186 $if ACCUMULATORS > 5: 187 float16x4_t vo${M}p5 = vmul_laneq_f16(vi${2*M+1}x7BDF, vw01234567, 4); 188 $else: 189 vo${M}p${6 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${6 % ACCUMULATORS}, vi${2*M+1}x7BDF, vw01234567, 4); 190 191 $for M in range(ROW_TILE): 192 $if ACCUMULATORS > 6: 193 float16x4_t vo${M}p6 = vmul_laneq_f16(vi${2*M+2}x7BDF, vw01234567, 5); 194 $else: 195 vo${M}p${7 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${7 % ACCUMULATORS}, vi${2*M+2}x7BDF, vw01234567, 7); 196 197 $for M in range(ROW_TILE): 198 vo${M}p${8 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${8 % ACCUMULATORS}, vi${2*M}x9BDF, vw01234567, 3); 199 200 $for M in range(ROW_TILE): 201 vo${M}p${9 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${9 % ACCUMULATORS}, vi${2*M+1}x9BDF, vw01234567, 6); 202 203 $for M in range(ROW_TILE): 204 vo${M}p${10 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${10 % ACCUMULATORS}, vi${2*M+2}x9BDF, vw89, 1); 205 206 $if ACCUMULATORS > 1: 207 $ACC_SLICE = 1 208 $while ACC_SLICE < ACCUMULATORS: 209 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 210 $if A + ACC_SLICE < ACCUMULATORS: 211 $for M in range(ROW_TILE): 212 vo${M}p${A} = vadd_f16(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 213 $ACC_SLICE *= 2 214 215 $for M in range(ROW_TILE): 216 float16x4_t vo${M} = vmax_f16(vo${M}p0, vmin); 217 218 $for M in range(ROW_TILE): 219 vo${M} = vmin_f16(vo${M}, vmax); 220 221 w += 1 * sizeof(__fp16); 222 223 if XNN_LIKELY(w == 8 * sizeof(__fp16)) { 224 $for M in reversed(range(ROW_TILE)): 225 vst1_f16(o${M}, vo${M}); o${M} += 4; 226 } else { 227 if (w & (4 * sizeof(__fp16))) { 228 $for M in reversed(range(ROW_TILE)): 229 vst1_lane_u32((void*) o${M}, vreinterpret_u32_f16(vo${M}), 0); o${M} += 2; 230 231 $for M in range(ROW_TILE): 232 vo${M} = vext_f16(vo${M}, vo${M}, 2); 233 } 234 if (w & (2 * sizeof(__fp16))) { 235 $for M in reversed(range(ROW_TILE)): 236 vst1_lane_f16(o${M}, vo${M}, 0); o${M} += 1; 237 } 238 } 239 240 } 241 242 i0 = (const __fp16*) ((uintptr_t) i${2 * ROW_TILE} - input_decrement); 243 $for M in range(1, 1 + 2 * ROW_TILE): 244 i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_width); 245 246 $if ROW_TILE > 1: 247 o0 = o${ROW_TILE - 1}; 248 $for M in range(1, ROW_TILE): 249 o${M} = (__fp16*) ((uintptr_t) o${M-1} + output_width); 250 251 $if ROW_TILE > 1: 252 output_height = doz(output_height, ${ROW_TILE}); 253 padded_input_height = doz(padded_input_height, ${ROW_TILE * 2}); 254 $else: 255 output_height -= 1; 256 padded_input_height -= 2; 257 } while (output_height != 0); 258} 259