1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert CHANNEL_TILE % 4 == 0 7$assert KERNEL_TILE >= 2 8$assert ACCUMULATORS >= 1 9$assert ACTIVATION != "MINMAX" or ARCH in ["ARM", "X86", "RELAXED"] 10$assert not FMA or ARCH == "RELAXED" 11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 12#include <assert.h> 13 14#include <wasm_simd128.h> 15 16#include <xnnpack/dwconv.h> 17 18 19$assert ACTIVATION in ["LINEAR", "RELU", "MINMAX"] 20$if ACTIVATION == "MINMAX": 21$ WASM_F32X4_MIN={"ARM": "wasm_f32x4_min", "X86": "wasm_f32x4_pmin", "RELAXED": "__builtin_wasm_relaxed_min_f32x4"}[ARCH] 22$ WASM_F32X4_MAX={"ARM": "wasm_f32x4_max", "X86": "wasm_f32x4_pmax", "RELAXED": "__builtin_wasm_relaxed_max_f32x4"}[ARCH] 23$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower()) 24$ISA = "wasmsimd" if not FMA and (ACTIVATION in ["LINEAR", "RELU"] or ARCH != "RELAXED") else "wasmrelaxedsimd" 25$ARCH_SUFFIX = "" if not FMA and (ACTIVATION in ["LINEAR", "RELU"] or ARCH == "RELAXED") else "_" + ("fma" if FMA else ARCH.lower()) 26$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION] 27void xnn_f32_dwconv${ACTIVATION_SUFFIX}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}${ARCH_SUFFIX}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( 28 size_t channels, 29 size_t output_width, 30 const float** input, 31 const float* weights, 32 float* output, 33 size_t input_stride, 34 size_t output_increment, 35 size_t input_offset, 36 const float* zero, 37 const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 38{ 39 assert(channels != 0); 40 assert(output_width != 0); 41 42 $if ACTIVATION == "MINMAX": 43 const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); 44 const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); 45 $elif ACTIVATION == "RELU": 46 const v128_t vzero = wasm_i32x4_const_splat(0); 47 do { 48 $for K in range(KERNEL_TILE): 49 const float* i${K} = input[${K}]; 50 assert(i${K} != NULL); 51 if XNN_UNPREDICTABLE(i${K} != zero) { 52 i${K} = (const float*) ((uintptr_t) i${K} + input_offset); 53 } 54 input = (const float**) ((uintptr_t) input + input_stride); 55 56 size_t c = channels; 57 const float* w = weights; 58 for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 59 v128_t vacc${ABC[0:4]}p0 = wasm_v128_load(w); 60 $for C in range(4, CHANNEL_TILE, 4): 61 v128_t vacc${ABC[C:C+4]}p0 = wasm_v128_load(w + ${C}); 62 63 $for K in range(KERNEL_TILE): 64 65 const v128_t vi${K}x${ABC[0:4]} = wasm_v128_load(i${K}); 66 $for C in range(4, CHANNEL_TILE, 4): 67 const v128_t vi${K}x${ABC[C:C+4]} = wasm_v128_load(i${K} + ${C}); 68 i${K} += ${CHANNEL_TILE}; 69 70 $for C in range(0, CHANNEL_TILE, 4): 71 const v128_t vk${K}x${ABC[C:C+4]} = wasm_v128_load(w + ${(K + 1) * CHANNEL_TILE + C}); 72 $for C in range(0, CHANNEL_TILE, 4): 73 $if 1 <= K < ACCUMULATORS: 74 v128_t vacc${ABC[C:C+4]}p${K} = wasm_f32x4_mul(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}); 75 $else: 76 $if FMA: 77 vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = __builtin_wasm_fma_f32x4(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}); 78 $else: 79 vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = wasm_f32x4_add(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, wasm_f32x4_mul(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]})); 80 81 w += ${(KERNEL_TILE + 1) * CHANNEL_TILE}; 82 83 $if ACCUMULATORS > 1: 84 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 85 $ACC_SLICE = 1 86 $while ACC_SLICE < ACCUMULATORS: 87 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 88 $if A + ACC_SLICE < ACCUMULATORS: 89 $for C in range(0, CHANNEL_TILE, 4): 90 vacc${ABC[C:C+4]}p${A} = wasm_f32x4_add(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + ACC_SLICE}); 91 $ACC_SLICE *= 2 92 93 $if ACTIVATION == "MINMAX": 94 $for C in range(0, CHANNEL_TILE, 4): 95 v128_t vacc${ABC[C:C+4]} = ${WASM_F32X4_MAX}(vmin, vacc${ABC[C:C+4]}p0); 96 97 $for C in range(0, CHANNEL_TILE, 4): 98 vacc${ABC[C:C+4]} = ${WASM_F32X4_MIN}(vmax, vacc${ABC[C:C+4]}); 99 $elif ACTIVATION == "RELU": 100 $for C in range(0, CHANNEL_TILE, 4): 101 const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_max(vacc${ABC[C:C+4]}p0, vzero); 102 $elif ACTIVATION == "LINEAR": 103 $for C in range(0, CHANNEL_TILE, 4): 104 const v128_t vacc${ABC[C:C+4]} = vacc${ABC[C:C+4]}p0; 105 106 wasm_v128_store(output, vacc${ABC[0:4]}); 107 $for C in range(4, CHANNEL_TILE, 4): 108 wasm_v128_store(output + ${C}, vacc${ABC[C:C+4]}); 109 output += ${CHANNEL_TILE}; 110 } 111 $if CHANNEL_TILE > 4: 112 for (; c >= 4; c -= 4) { 113 v128_t vacc0123p0 = wasm_v128_load(w); 114 $for K in range(KERNEL_TILE): 115 116 const v128_t vi${K}x0123 = wasm_v128_load(i${K}); 117 i${K} += 4; 118 119 const v128_t vk${K}x0123 = wasm_v128_load(w + ${(K + 1) * CHANNEL_TILE}); 120 $if 1 <= K < ACCUMULATORS: 121 v128_t vacc0123p${K} = wasm_f32x4_mul(vi${K}x0123, vk${K}x0123); 122 $else: 123 $if FMA: 124 vacc0123p${K % ACCUMULATORS} = __builtin_wasm_fma_f32x4(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123); 125 $else: 126 vacc0123p${K % ACCUMULATORS} = wasm_f32x4_add(vacc0123p${K % ACCUMULATORS}, wasm_f32x4_mul(vi${K}x0123, vk${K}x0123)); 127 128 w += 4; 129 130 $if ACCUMULATORS > 1: 131 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 132 $ACC_SLICE = 1 133 $while ACC_SLICE < ACCUMULATORS: 134 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 135 $if A + ACC_SLICE < ACCUMULATORS: 136 vacc0123p${A} = wasm_f32x4_add(vacc0123p${A}, vacc0123p${A + ACC_SLICE}); 137 $ACC_SLICE *= 2 138 139 $if ACTIVATION == "MINMAX": 140 v128_t vacc0123 = ${WASM_F32X4_MAX}(vmin, vacc0123p0); 141 vacc0123 = ${WASM_F32X4_MIN}(vmax, vacc0123); 142 $elif ACTIVATION == "RELU": 143 const v128_t vacc0123 = wasm_i32x4_max(vacc0123p0, vzero); 144 $elif ACTIVATION == "LINEAR": 145 const v128_t vacc0123 = vacc0123p0; 146 147 wasm_v128_store(output, vacc0123); 148 output += 4; 149 } 150 if XNN_UNLIKELY(c != 0) { 151 v128_t vacc0123p0 = wasm_v128_load(w); 152 $for K in range(KERNEL_TILE): 153 154 const v128_t vi${K}x0123 = wasm_v128_load(i${K}); 155 const v128_t vk${K}x0123 = wasm_v128_load(w + ${(K+1) * CHANNEL_TILE}); 156 $if 1 <= K < ACCUMULATORS: 157 v128_t vacc0123p${K} = wasm_f32x4_mul(vi${K}x0123, vk${K}x0123); 158 $else: 159 $if FMA: 160 vacc0123p${K % ACCUMULATORS} = __builtin_wasm_fma_f32x4(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123); 161 $else: 162 vacc0123p${K % ACCUMULATORS} = wasm_f32x4_add(vacc0123p${K % ACCUMULATORS}, wasm_f32x4_mul(vi${K}x0123, vk${K}x0123)); 163 164 $if ACCUMULATORS > 1: 165 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 166 $ACC_SLICE = 1 167 $while ACC_SLICE < ACCUMULATORS: 168 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 169 $if A + ACC_SLICE < ACCUMULATORS: 170 vacc0123p${A} = wasm_f32x4_add(vacc0123p${A}, vacc0123p${A + ACC_SLICE}); 171 $ACC_SLICE *= 2 172 173 $if ACTIVATION == "MINMAX": 174 v128_t vacc0123 = ${WASM_F32X4_MAX}(vmin, vacc0123p0); 175 vacc0123 = ${WASM_F32X4_MIN}(vmax, vacc0123); 176 $elif ACTIVATION == "RELU": 177 v128_t vacc0123 = wasm_i32x4_max(vacc0123p0, vzero); 178 $elif ACTIVATION == "LINEAR": 179 v128_t vacc0123 = vacc0123p0; 180 181 if (c & 2) { 182 *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0); 183 vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3); 184 output += 2; 185 } 186 if (c & 1) { 187 *output = wasm_f32x4_extract_lane(vacc0123, 0); 188 output += 1; 189 } 190 } 191 192 output = (float*) ((uintptr_t) output + output_increment); 193 } while (--output_width != 0); 194} 195