1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ROW_TILE >= 1 7$assert ACCUMULATORS >= 1 8#include <assert.h> 9 10#include <wasm_simd128.h> 11 12#include <xnnpack/dwconv.h> 13#include <xnnpack/math.h> 14 15 16$ARCH_SUFFIX = "_x86" if X86 else "_arm" 17 18void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd${ARCH_SUFFIX}_loadsplat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 19 size_t input_height, 20 size_t input_width, 21 const float* input, 22 const float* weights, 23 const float* zero, 24 float* output, 25 uint32_t padding_top, 26 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 27{ 28 assert(input_height != 0); 29 assert(input_width != 0); 30 assert(input_width % sizeof(float) == 0); 31 assert(padding_top == 2); 32 33 const v128_t vmask = wasm_v128_load(params->scalar.mask); 34 const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); 35 const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); 36 37 const v128_t vw0123 = wasm_v128_load(weights); 38 const v128_t vw4567 = wasm_v128_load(weights + 4); 39 const v128_t vw89AB = wasm_v128_load(weights + 8); 40 const v128_t vwCDEF = wasm_v128_load(weights + 12); 41 const v128_t vwGHIJ = wasm_v128_load(weights + 16); 42 const v128_t vwKLMN = wasm_v128_load(weights + 20); 43 const v128_t vwOP = wasm_v128_load64_splat(weights + 24); 44 const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 45 const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); 46 const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2); 47 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); 48 const v128_t vk03 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0); 49 const v128_t vk04 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1); 50 const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2); 51 const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); 52 const v128_t vk12 = wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0); 53 const v128_t vk13 = wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1); 54 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); 55 const v128_t vk20 = wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3); 56 const v128_t vk21 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0); 57 const v128_t vk22 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1); 58 const v128_t vk23 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2); 59 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); 60 const v128_t vk30 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0); 61 const v128_t vk31 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1); 62 const v128_t vk32 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2); 63 const v128_t vk33 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3); 64 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); 65 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); 66 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); 67 const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3); 68 const v128_t vk43 = wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0); 69 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); 70 71 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); 72 73 const float* i0 = zero; 74 const float* i1 = zero; 75 const float* i2 = input; 76 $for M in range(3, 4 + ROW_TILE): 77 const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 78 79 float* o0 = output; 80 $for M in range(1, ROW_TILE): 81 float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 82 83 size_t output_height = input_height; 84 do { 85 $for M in range(2, 3 + ROW_TILE): 86 if XNN_UNPREDICTABLE(output_height < ${M}) { 87 i${M+1} = zero; 88 $if M <= ROW_TILE: 89 o${M-1} = o${M-2}; 90 } 91 92 $for M in range(4 + ROW_TILE): 93 v128_t vi${M}x0123 = wasm_f32x4_const_splat(0.0f); 94 95 $for M in range(4 + ROW_TILE): 96 v128_t vi${M}x4567 = wasm_v128_load(i${M}); i${M} += 4; 97 98 size_t w = input_width; 99 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { 100 $for M in range(ROW_TILE): 101 v128_t vo${M}p0 = vbias; 102 103 $for M in range(4 + ROW_TILE): 104 const v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 105 106 $for M in range(ROW_TILE): 107 $if ACCUMULATORS > 1: 108 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 109 $else: 110 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 111 112 $for M in range(ROW_TILE): 113 $if ACCUMULATORS > 2: 114 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 115 $else: 116 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 117 118 $for M in range(ROW_TILE): 119 $if ACCUMULATORS > 3: 120 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 121 $else: 122 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 123 124 $for M in range(ROW_TILE): 125 $if ACCUMULATORS > 4: 126 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 127 $else: 128 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 129 130 $for M in range(ROW_TILE): 131 $if ACCUMULATORS > 6: 132 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 133 $else: 134 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 135 136 $for M in range(4 + ROW_TILE): 137 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 138 139 $for M in range(ROW_TILE): 140 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 141 142 $for M in range(ROW_TILE): 143 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 144 145 $for M in range(ROW_TILE): 146 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 147 148 $for M in range(ROW_TILE): 149 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 150 151 $for M in range(ROW_TILE): 152 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 153 154 $for M in range(4 + ROW_TILE): 155 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 156 vi${M}x0123 = vi${M}x4567; 157 158 $for M in range(ROW_TILE): 159 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 160 161 $for M in range(ROW_TILE): 162 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 163 164 $for M in range(ROW_TILE): 165 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 166 167 $for M in range(ROW_TILE): 168 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 169 170 $for M in range(ROW_TILE): 171 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 172 173 $for M in range(4 + ROW_TILE): 174 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 175 176 $for M in range(ROW_TILE): 177 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 178 179 $for M in range(ROW_TILE): 180 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 181 182 $for M in range(ROW_TILE): 183 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 184 185 $for M in range(ROW_TILE): 186 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 187 188 $for M in range(ROW_TILE): 189 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 190 191 $for M in range(4 + ROW_TILE): 192 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 193 vi${M}x4567 = vi${M}x89AB; 194 195 $for M in range(ROW_TILE): 196 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 197 198 $for M in range(ROW_TILE): 199 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 200 201 $for M in range(ROW_TILE): 202 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 203 204 $for M in range(ROW_TILE): 205 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 206 207 $for M in range(ROW_TILE): 208 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 209 210 $if ACCUMULATORS > 1: 211 $ACC_SLICE = 1 212 $while ACC_SLICE < ACCUMULATORS: 213 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 214 $if A + ACC_SLICE < ACCUMULATORS: 215 $for M in range(ROW_TILE): 216 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 217 $ACC_SLICE *= 2 218 219 $if X86: 220 $for M in range(ROW_TILE): 221 v128_t vo${M} = wasm_f32x4_pmax(vmin, vo${M}p0); 222 $for M in range(ROW_TILE): 223 vo${M} = wasm_f32x4_pmin(vmax, vo${M}); 224 $else: 225 $for M in range(ROW_TILE): 226 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 227 $for M in range(ROW_TILE): 228 vo${M} = wasm_f32x4_min(vo${M}, vmax); 229 230 $for M in reversed(range(ROW_TILE)): 231 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 232 } 233 // Always process the last block of 5..8 pixels. 234 if XNN_LIKELY(w > 4 * sizeof(float)) { 235 $for M in range(ROW_TILE): 236 v128_t vo${M}p0 = vbias; 237 238 $for M in range(4 + ROW_TILE): 239 v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 240 241 $for M in range(4 + ROW_TILE): 242 vi${M}x89AB = wasm_v128_and(vmask, vi${M}x89AB); 243 244 $for M in range(ROW_TILE): 245 $if ACCUMULATORS > 1: 246 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 247 $else: 248 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 249 250 $for M in range(ROW_TILE): 251 $if ACCUMULATORS > 2: 252 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 253 $else: 254 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 255 256 $for M in range(ROW_TILE): 257 $if ACCUMULATORS > 3: 258 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 259 $else: 260 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 261 262 $for M in range(ROW_TILE): 263 $if ACCUMULATORS > 4: 264 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 265 $else: 266 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 267 268 $for M in range(ROW_TILE): 269 $if ACCUMULATORS > 6: 270 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 271 $else: 272 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 273 274 $for M in range(4 + ROW_TILE): 275 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 276 277 $for M in range(ROW_TILE): 278 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 279 280 $for M in range(ROW_TILE): 281 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 282 283 $for M in range(ROW_TILE): 284 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 285 286 $for M in range(ROW_TILE): 287 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 288 289 $for M in range(ROW_TILE): 290 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 291 292 $for M in range(4 + ROW_TILE): 293 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 294 vi${M}x0123 = vi${M}x4567; 295 296 $for M in range(ROW_TILE): 297 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 298 299 $for M in range(ROW_TILE): 300 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 301 302 $for M in range(ROW_TILE): 303 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 304 305 $for M in range(ROW_TILE): 306 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 307 308 $for M in range(ROW_TILE): 309 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 310 311 $for M in range(4 + ROW_TILE): 312 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 313 314 $for M in range(ROW_TILE): 315 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 316 317 $for M in range(ROW_TILE): 318 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 319 320 $for M in range(ROW_TILE): 321 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 322 323 $for M in range(ROW_TILE): 324 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 325 326 $for M in range(ROW_TILE): 327 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 328 329 $for M in range(4 + ROW_TILE): 330 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 331 vi${M}x4567 = vi${M}x89AB; 332 333 $for M in range(ROW_TILE): 334 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 335 336 $for M in range(ROW_TILE): 337 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 338 339 $for M in range(ROW_TILE): 340 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 341 342 $for M in range(ROW_TILE): 343 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 344 345 $for M in range(ROW_TILE): 346 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 347 348 $if ACCUMULATORS > 1: 349 $ACC_SLICE = 1 350 $while ACC_SLICE < ACCUMULATORS: 351 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 352 $if A + ACC_SLICE < ACCUMULATORS: 353 $for M in range(ROW_TILE): 354 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 355 $ACC_SLICE *= 2 356 357 $if X86: 358 $for M in range(ROW_TILE): 359 v128_t vo${M} = wasm_f32x4_pmax(vmin, vo${M}p0); 360 $for M in range(ROW_TILE): 361 vo${M} = wasm_f32x4_pmin(vmax, vo${M}); 362 $else: 363 $for M in range(ROW_TILE): 364 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 365 $for M in range(ROW_TILE): 366 vo${M} = wasm_f32x4_min(vo${M}, vmax); 367 368 $for M in reversed(range(ROW_TILE)): 369 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 370 371 w -= 4 * sizeof(float); 372 } 373 assert(w >= 1 * sizeof(float)); 374 assert(w <= 4 * sizeof(float)); 375 { 376 $for M in range(ROW_TILE): 377 v128_t vo${M}p0 = vbias; 378 379 $for M in range(4 + ROW_TILE): 380 vi${M}x4567 = wasm_v128_and(vmask, vi${M}x4567); 381 382 $for M in range(ROW_TILE): 383 $if ACCUMULATORS > 1: 384 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 385 $else: 386 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 387 388 $for M in range(ROW_TILE): 389 $if ACCUMULATORS > 2: 390 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 391 $else: 392 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 393 394 $for M in range(ROW_TILE): 395 $if ACCUMULATORS > 3: 396 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 397 $else: 398 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 399 400 $for M in range(ROW_TILE): 401 $if ACCUMULATORS > 4: 402 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 403 $else: 404 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 405 406 $for M in range(ROW_TILE): 407 $if ACCUMULATORS > 6: 408 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 409 $else: 410 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 411 412 $for M in range(4 + ROW_TILE): 413 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 414 415 $for M in range(ROW_TILE): 416 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 417 418 $for M in range(ROW_TILE): 419 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 420 421 $for M in range(ROW_TILE): 422 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 423 424 $for M in range(ROW_TILE): 425 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 426 427 $for M in range(ROW_TILE): 428 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 429 430 $for M in range(4 + ROW_TILE): 431 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 432 433 $for M in range(ROW_TILE): 434 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 435 436 $for M in range(ROW_TILE): 437 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 438 439 $for M in range(ROW_TILE): 440 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 441 442 $for M in range(ROW_TILE): 443 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 444 445 $for M in range(ROW_TILE): 446 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 447 448 const v128_t vzero = wasm_f32x4_const_splat(0.0f); 449 $for M in range(4 + ROW_TILE): 450 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vzero, 1, 2, 3, 4); 451 452 $for M in range(ROW_TILE): 453 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 454 455 $for M in range(ROW_TILE): 456 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 457 458 $for M in range(ROW_TILE): 459 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 460 461 $for M in range(ROW_TILE): 462 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 463 464 $for M in range(ROW_TILE): 465 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 466 467 $for M in range(4 + ROW_TILE): 468 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x5678, vzero, 1, 2, 3, 4); 469 470 $for M in range(ROW_TILE): 471 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 472 473 $for M in range(ROW_TILE): 474 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 475 476 $for M in range(ROW_TILE): 477 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 478 479 $for M in range(ROW_TILE): 480 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 481 482 $for M in range(ROW_TILE): 483 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 484 485 $if ACCUMULATORS > 1: 486 $ACC_SLICE = 1 487 $while ACC_SLICE < ACCUMULATORS: 488 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 489 $if A + ACC_SLICE < ACCUMULATORS: 490 $for M in range(ROW_TILE): 491 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 492 $ACC_SLICE *= 2 493 494 $if X86: 495 $for M in range(ROW_TILE): 496 v128_t vo${M} = wasm_f32x4_pmax(vmin, vo${M}p0); 497 $for M in range(ROW_TILE): 498 vo${M} = wasm_f32x4_pmin(vmax, vo${M}); 499 $else: 500 $for M in range(ROW_TILE): 501 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 502 $for M in range(ROW_TILE): 503 vo${M} = wasm_f32x4_min(vo${M}, vmax); 504 505 if XNN_LIKELY(w & (4 * sizeof(float))) { 506 $for M in reversed(range(ROW_TILE)): 507 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 508 } else { 509 if (w & (2 * sizeof(float))) { 510 $for M in reversed(range(ROW_TILE)): 511 *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2; 512 513 $for M in range(ROW_TILE): 514 vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1); 515 } 516 if (w & (1 * sizeof(float))) { 517 $for M in reversed(range(ROW_TILE)): 518 *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1; 519 } 520 } 521 } 522 523 i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_decrement); 524 i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_decrement); 525 $for M in range(2, 4 + ROW_TILE): 526 i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 527 528 $if ROW_TILE > 1: 529 o0 = o${ROW_TILE - 1}; 530 $for M in range(1, ROW_TILE): 531 o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 532 533 $if ROW_TILE > 1: 534 output_height = doz(output_height, ${ROW_TILE}); 535 } while (${"--" if ROW_TILE == 1 else ""}output_height != 0); 536} 537