// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert ROW_TILE >= 1
$assert ACCUMULATORS >= 1
#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


$ARCH_SUFFIX = "_x86" if X86 else "_arm"

void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd${ARCH_SUFFIX}_loadsplat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 0);
  assert(padding_top <= 1);

  const v128_t vmask_even = wasm_v128_load(params->scalar.mask_even);
  const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);

  // Splat the bias and the 9 taps of the 3x3 kernel into separate registers (the "loadsplat" variant).
  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t vw89 = wasm_v128_load64_splat(weights + 8);
  const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
  const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1);
  const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2);
  const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3);
  const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0);
  const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1);
  const v128_t vk12 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2);
  const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3);
  const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0);
  const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1);

  const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
  $if ROW_TILE > 1:
    const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));

  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
  if XNN_UNPREDICTABLE(padding_top != 0) {
    i0 = zero;
  }
  $for M in range(2, 1 + 2 * ROW_TILE):
    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);

  float* o0 = output;
  $for M in range(1, ROW_TILE):
    float* o${M} = (float*) ((uintptr_t) o${M-1} + output_width);

  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    $for M in range(2, 1 + 2 * ROW_TILE):
      if XNN_UNPREDICTABLE(padded_input_height < ${2 + M}) {
        i${M} = zero;
        $if M % 2 == 1:
          o${(M - 1) // 2} = o${(M - 1) // 2 - 1};
      }

    // vi*x1357 carries the odd-index pixels of the previous block; it starts at zero, which supplies the left padding.
    $for M in range(1 + 2 * ROW_TILE):
      v128_t vi${M}x1357 = wasm_f32x4_const_splat(0.0f);

    size_t w = input_width;
    for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
      $for M in range(ROW_TILE):
        v128_t vo${M}p0 = vbias;

      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x89AB = wasm_v128_load(i${M});
        const v128_t vi${M}xCDEF = wasm_v128_load(i${M} + 4);
        i${M} += 8;

      // De-interleave the 8 consecutive pixels into even (8ACE) and odd (9BDF) columns for the stride-2 convolution.
      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x8ACE = wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 0, 2, 4, 6);
        const v128_t vi${M}x9BDF = wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 1, 3, 5, 7);

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 1:
          v128_t vo${M}p1 = wasm_f32x4_mul(vi${2*M}x8ACE, vk01);
        $else:
          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M}x8ACE, vk01));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 2:
          v128_t vo${M}p2 = wasm_f32x4_mul(vi${2*M+1}x8ACE, vk11);
        $else:
          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M+1}x8ACE, vk11));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 3:
          v128_t vo${M}p3 = wasm_f32x4_mul(vi${2*M+2}x8ACE, vk21);
        $else:
          vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x8ACE, vk21));

      // Shift in the last odd pixel of the previous block to form the left-neighbor (7BDF) column.
      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x7BDF = wasm_v32x4_shuffle(vi${M}x1357, vi${M}x9BDF, 3, 4, 5, 6);
        vi${M}x1357 = vi${M}x9BDF;

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 4:
          v128_t vo${M}p4 = wasm_f32x4_mul(vi${2*M}x7BDF, vk00);
        $else:
          vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x7BDF, vk00));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 5:
          v128_t vo${M}p5 = wasm_f32x4_mul(vi${2*M+1}x7BDF, vk10);
        $else:
          vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x7BDF, vk10));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 6:
          v128_t vo${M}p6 = wasm_f32x4_mul(vi${2*M+2}x7BDF, vk20);
        $else:
          vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x7BDF, vk20));

      $for M in range(ROW_TILE):
        vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x9BDF, vk02));

      $for M in range(ROW_TILE):
        vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x9BDF, vk12));

      $for M in range(ROW_TILE):
        vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x9BDF, vk22));

      $if ACCUMULATORS > 1:
        // Add up all partial accumulators into vo*p0.
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              $for M in range(ROW_TILE):
                vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
          $ACC_SLICE *= 2

      $if X86:
        $for M in range(ROW_TILE):
          v128_t vo${M} = wasm_f32x4_pmax(vmin, vo${M}p0);
        $for M in range(ROW_TILE):
          vo${M} = wasm_f32x4_pmin(vmax, vo${M});
      $else:
        $for M in range(ROW_TILE):
          v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin);
        $for M in range(ROW_TILE):
          vo${M} = wasm_f32x4_min(vo${M}, vmax);

      $for M in reversed(range(ROW_TILE)):
        wasm_v128_store(o${M}, vo${M}); o${M} += 4;
    }
    // Last block has 0-7 pixels to process.
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      $for M in range(ROW_TILE):
        v128_t vo${M}p0 = vbias;

      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x89AB = wasm_v128_load(i${M});
        const v128_t vi${M}xCDEF = wasm_v128_load(i${M} + 4);

      // Mask out the pixels that lie beyond the end of the row.
      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 0, 2, 4, 6));
        const v128_t vi${M}x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 1, 3, 5, 7));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 1:
          v128_t vo${M}p1 = wasm_f32x4_mul(vi${2*M}x8ACE, vk01);
        $else:
          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M}x8ACE, vk01));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 2:
          v128_t vo${M}p2 = wasm_f32x4_mul(vi${2*M+1}x8ACE, vk11);
        $else:
          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M+1}x8ACE, vk11));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 3:
          v128_t vo${M}p3 = wasm_f32x4_mul(vi${2*M+2}x8ACE, vk21);
        $else:
          vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x8ACE, vk21));

      $for M in range(1 + 2 * ROW_TILE):
        const v128_t vi${M}x7BDF = wasm_v32x4_shuffle(vi${M}x1357, vi${M}x9BDF, 3, 4, 5, 6);

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 4:
          v128_t vo${M}p4 = wasm_f32x4_mul(vi${2*M}x7BDF, vk00);
        $else:
          vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x7BDF, vk00));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 5:
          v128_t vo${M}p5 = wasm_f32x4_mul(vi${2*M+1}x7BDF, vk10);
        $else:
          vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x7BDF, vk10));

      $for M in range(ROW_TILE):
        $if ACCUMULATORS > 6:
          v128_t vo${M}p6 = wasm_f32x4_mul(vi${2*M+2}x7BDF, vk20);
        $else:
          vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x7BDF, vk20));

      $for M in range(ROW_TILE):
        vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x9BDF, vk02));

      $for M in range(ROW_TILE):
        vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x9BDF, vk12));

      $for M in range(ROW_TILE):
        vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x9BDF, vk22));

      $if ACCUMULATORS > 1:
        // Add up all partial accumulators into vo*p0.
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              $for M in range(ROW_TILE):
                vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
          $ACC_SLICE *= 2

      $if X86:
        $for M in range(ROW_TILE):
          v128_t vo${M} = wasm_f32x4_pmax(vmin, vo${M}p0);
        $for M in range(ROW_TILE):
          vo${M} = wasm_f32x4_pmin(vmax, vo${M});
      $else:
        $for M in range(ROW_TILE):
          v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin);
        $for M in range(ROW_TILE):
          vo${M} = wasm_f32x4_min(vo${M}, vmax);

      w += 1 * sizeof(float);
      if (w & (8 * sizeof(float))) {
        $for M in reversed(range(ROW_TILE)):
          wasm_v128_store(o${M}, vo${M}); o${M} += 4;
      } else {
        if (w & (4 * sizeof(float))) {
          $for M in reversed(range(ROW_TILE)):
            *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2;

          $for M in range(ROW_TILE):
            vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1);
        }
        if (w & (2 * sizeof(float))) {
          $for M in reversed(range(ROW_TILE)):
            *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i${2 * ROW_TILE} - input_decrement);
    $for M in range(1, 1 + 2 * ROW_TILE):
      i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);

    $if ROW_TILE > 1:
      o0 = o${ROW_TILE - 1};
      $for M in range(1, ROW_TILE):
        o${M} = (float*) ((uintptr_t) o${M-1} + output_width);

    $if ROW_TILE > 1:
      output_height = doz(output_height, ${ROW_TILE});
      padded_input_height = doz(padded_input_height, ${ROW_TILE * 2});
    $else:
      output_height -= 1;
      padded_input_height -= 2;
  } while (output_height != 0);
}