1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2020 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert ROW_TILE >= 1 7*4bdc9457SAndroid Build Coastguard Worker$assert ACCUMULATORS >= 1 8*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 9*4bdc9457SAndroid Build Coastguard Worker 10*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/dwconv.h> 11*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker 14*4bdc9457SAndroid Build Coastguard Workervoid xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_${ROW_TILE}x1${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 15*4bdc9457SAndroid Build Coastguard Worker size_t input_height, 16*4bdc9457SAndroid Build Coastguard Worker size_t input_width, 17*4bdc9457SAndroid Build Coastguard Worker const float* input, 18*4bdc9457SAndroid Build Coastguard Worker const float* weights, 19*4bdc9457SAndroid Build Coastguard Worker const float* zero, 20*4bdc9457SAndroid Build Coastguard Worker float* output, 21*4bdc9457SAndroid Build Coastguard Worker uint32_t padding_top, 22*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) 23*4bdc9457SAndroid Build Coastguard Worker{ 24*4bdc9457SAndroid Build Coastguard Worker assert(input_height != 0); 25*4bdc9457SAndroid Build Coastguard Worker assert(input_width != 0); 26*4bdc9457SAndroid Build Coastguard Worker assert(input_width % sizeof(float) == 0); 27*4bdc9457SAndroid Build Coastguard Worker assert(padding_top == 2); 28*4bdc9457SAndroid Build Coastguard Worker 29*4bdc9457SAndroid Build Coastguard Worker const float vmin = params->scalar.min; 30*4bdc9457SAndroid Build Coastguard Worker const float vmax = params->scalar.max; 31*4bdc9457SAndroid Build Coastguard Worker 32*4bdc9457SAndroid Build Coastguard Worker const float vbias = weights[0]; 33*4bdc9457SAndroid Build Coastguard Worker $for R in range(5): 34*4bdc9457SAndroid Build Coastguard Worker $for S in range(5): 35*4bdc9457SAndroid Build Coastguard Worker const float vk${R}${S} = weights[${R*5+S+1}]; 36*4bdc9457SAndroid Build Coastguard Worker 37*4bdc9457SAndroid Build Coastguard Worker const float* i0 = zero; 38*4bdc9457SAndroid Build Coastguard Worker const float* i1 = zero; 39*4bdc9457SAndroid Build Coastguard Worker const float* i2 = input; 40*4bdc9457SAndroid Build Coastguard Worker $for M in range(3, 4 + ROW_TILE): 41*4bdc9457SAndroid Build Coastguard Worker const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 42*4bdc9457SAndroid Build Coastguard Worker 43*4bdc9457SAndroid Build Coastguard Worker float* o0 = output; 44*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_TILE): 45*4bdc9457SAndroid Build Coastguard Worker float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 46*4bdc9457SAndroid Build Coastguard Worker 47*4bdc9457SAndroid Build Coastguard Worker size_t output_height = input_height; 48*4bdc9457SAndroid Build Coastguard Worker do { 49*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, 3 + ROW_TILE): 50*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(output_height < ${M}) { 51*4bdc9457SAndroid Build Coastguard Worker i${M+1} = zero; 52*4bdc9457SAndroid Build Coastguard Worker $if M <= ROW_TILE: 53*4bdc9457SAndroid Build Coastguard Worker o${M-1} = o${M-2}; 54*4bdc9457SAndroid Build Coastguard Worker } 55*4bdc9457SAndroid Build Coastguard Worker 56*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 57*4bdc9457SAndroid Build Coastguard Worker float vi${M}x0 = 0.0f; 58*4bdc9457SAndroid Build Coastguard Worker 59*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 60*4bdc9457SAndroid Build Coastguard Worker float vi${M}x1 = 0.0f; 61*4bdc9457SAndroid Build Coastguard Worker 62*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 63*4bdc9457SAndroid Build Coastguard Worker float vi${M}x2 = *i${M}++; 64*4bdc9457SAndroid Build Coastguard Worker 65*4bdc9457SAndroid Build Coastguard Worker size_t w = input_width; 66*4bdc9457SAndroid Build Coastguard Worker if (w > 1 * sizeof(float)) { 67*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 68*4bdc9457SAndroid Build Coastguard Worker float vi${M}x3 = *i${M}++; 69*4bdc9457SAndroid Build Coastguard Worker 70*4bdc9457SAndroid Build Coastguard Worker for (; w > 2 * sizeof(float); w -= 1 * sizeof(float)) { 71*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 72*4bdc9457SAndroid Build Coastguard Worker const float vi${M}x4 = *i${M}++; 73*4bdc9457SAndroid Build Coastguard Worker 74*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 75*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 76*4bdc9457SAndroid Build Coastguard Worker $if K == 0: 77*4bdc9457SAndroid Build Coastguard Worker float vo${M}p0 = vbias + vi${M+K}x0 * vk${K}0; 78*4bdc9457SAndroid Build Coastguard Worker $elif K < ACCUMULATORS: 79*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K} = vi${M+K}x0 * vk${K}0; 80*4bdc9457SAndroid Build Coastguard Worker $else: 81*4bdc9457SAndroid Build Coastguard Worker vo${M}p${K % ACCUMULATORS} += vi${M+K}x0 * vk${K}0; 82*4bdc9457SAndroid Build Coastguard Worker 83*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 84*4bdc9457SAndroid Build Coastguard Worker vi${M}x0 = vi${M}x1; 85*4bdc9457SAndroid Build Coastguard Worker 86*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 87*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 88*4bdc9457SAndroid Build Coastguard Worker $if K+5 < ACCUMULATORS: 89*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K+5} = vi${M+K}x1 * vk${K}1; 90*4bdc9457SAndroid Build Coastguard Worker $else: 91*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+5) % ACCUMULATORS} += vi${M+K}x1 * vk${K}1; 92*4bdc9457SAndroid Build Coastguard Worker 93*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 94*4bdc9457SAndroid Build Coastguard Worker vi${M}x1 = vi${M}x2; 95*4bdc9457SAndroid Build Coastguard Worker 96*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 97*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 98*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+10) % ACCUMULATORS} += vi${M+K}x2 * vk${K}2; 99*4bdc9457SAndroid Build Coastguard Worker 100*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 101*4bdc9457SAndroid Build Coastguard Worker vi${M}x2 = vi${M}x3; 102*4bdc9457SAndroid Build Coastguard Worker 103*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 104*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 105*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+15) % ACCUMULATORS} += vi${M+K}x3 * vk${K}3; 106*4bdc9457SAndroid Build Coastguard Worker 107*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 108*4bdc9457SAndroid Build Coastguard Worker vi${M}x3 = vi${M}x4; 109*4bdc9457SAndroid Build Coastguard Worker 110*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 111*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 112*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+20) % ACCUMULATORS} += vi${M+K}x4 * vk${K}4; 113*4bdc9457SAndroid Build Coastguard Worker 114*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 115*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE = 1 116*4bdc9457SAndroid Build Coastguard Worker $while ACC_SLICE < ACCUMULATORS: 117*4bdc9457SAndroid Build Coastguard Worker $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 118*4bdc9457SAndroid Build Coastguard Worker $if A + ACC_SLICE < ACCUMULATORS: 119*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 120*4bdc9457SAndroid Build Coastguard Worker vo${M}p${A} += vo${M}p${A + ACC_SLICE}; 121*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE *= 2 122*4bdc9457SAndroid Build Coastguard Worker 123*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 124*4bdc9457SAndroid Build Coastguard Worker float vo${M} = math_max_f32(vo${M}p0, vmin); 125*4bdc9457SAndroid Build Coastguard Worker 126*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 127*4bdc9457SAndroid Build Coastguard Worker vo${M} = math_min_f32(vo${M}, vmax); 128*4bdc9457SAndroid Build Coastguard Worker 129*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 130*4bdc9457SAndroid Build Coastguard Worker *o${M}++ = vo${M}; 131*4bdc9457SAndroid Build Coastguard Worker } 132*4bdc9457SAndroid Build Coastguard Worker assert(w == 2 * sizeof(float)); 133*4bdc9457SAndroid Build Coastguard Worker { 134*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 135*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 136*4bdc9457SAndroid Build Coastguard Worker $if K == 0: 137*4bdc9457SAndroid Build Coastguard Worker float vo${M}p0 = vbias + vi${M+K}x0 * vk${K}0; 138*4bdc9457SAndroid Build Coastguard Worker $elif K < ACCUMULATORS: 139*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K} = vi${M+K}x0 * vk${K}0; 140*4bdc9457SAndroid Build Coastguard Worker $else: 141*4bdc9457SAndroid Build Coastguard Worker vo${M}p${K % ACCUMULATORS} += vi${M+K}x0 * vk${K}0; 142*4bdc9457SAndroid Build Coastguard Worker 143*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 144*4bdc9457SAndroid Build Coastguard Worker vi${M}x0 = vi${M}x1; 145*4bdc9457SAndroid Build Coastguard Worker 146*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 147*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 148*4bdc9457SAndroid Build Coastguard Worker $if K+5 < ACCUMULATORS: 149*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K+5} = vi${M+K}x1 * vk${K}1; 150*4bdc9457SAndroid Build Coastguard Worker $else: 151*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+5) % ACCUMULATORS} += vi${M+K}x1 * vk${K}1; 152*4bdc9457SAndroid Build Coastguard Worker 153*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 154*4bdc9457SAndroid Build Coastguard Worker vi${M}x1 = vi${M}x2; 155*4bdc9457SAndroid Build Coastguard Worker 156*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 157*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 158*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+10) % ACCUMULATORS} += vi${M+K}x2 * vk${K}2; 159*4bdc9457SAndroid Build Coastguard Worker 160*4bdc9457SAndroid Build Coastguard Worker $for M in range(4 + ROW_TILE): 161*4bdc9457SAndroid Build Coastguard Worker vi${M}x2 = vi${M}x3; 162*4bdc9457SAndroid Build Coastguard Worker 163*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 164*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 165*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+15) % ACCUMULATORS} += vi${M+K}x3 * vk${K}3; 166*4bdc9457SAndroid Build Coastguard Worker 167*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 168*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE = 1 169*4bdc9457SAndroid Build Coastguard Worker $while ACC_SLICE < ACCUMULATORS: 170*4bdc9457SAndroid Build Coastguard Worker $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 171*4bdc9457SAndroid Build Coastguard Worker $if A + ACC_SLICE < ACCUMULATORS: 172*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 173*4bdc9457SAndroid Build Coastguard Worker vo${M}p${A} += vo${M}p${A + ACC_SLICE}; 174*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE *= 2 175*4bdc9457SAndroid Build Coastguard Worker 176*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 177*4bdc9457SAndroid Build Coastguard Worker float vo${M} = math_max_f32(vo${M}p0, vmin); 178*4bdc9457SAndroid Build Coastguard Worker 179*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 180*4bdc9457SAndroid Build Coastguard Worker vo${M} = math_min_f32(vo${M}, vmax); 181*4bdc9457SAndroid Build Coastguard Worker 182*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 183*4bdc9457SAndroid Build Coastguard Worker *o${M}++ = vo${M}; 184*4bdc9457SAndroid Build Coastguard Worker } 185*4bdc9457SAndroid Build Coastguard Worker w -= 1 * sizeof(float); 186*4bdc9457SAndroid Build Coastguard Worker } 187*4bdc9457SAndroid Build Coastguard Worker assert(w == 1 * sizeof(float)); 188*4bdc9457SAndroid Build Coastguard Worker { 189*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 190*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 191*4bdc9457SAndroid Build Coastguard Worker $if K == 0: 192*4bdc9457SAndroid Build Coastguard Worker float vo${M}p0 = vbias + vi${M+K}x0 * vk${K}0; 193*4bdc9457SAndroid Build Coastguard Worker $elif K < ACCUMULATORS: 194*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K} = vi${M+K}x0 * vk${K}0; 195*4bdc9457SAndroid Build Coastguard Worker $else: 196*4bdc9457SAndroid Build Coastguard Worker vo${M}p${K % ACCUMULATORS} += vi${M+K}x0 * vk${K}0; 197*4bdc9457SAndroid Build Coastguard Worker 198*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 199*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 200*4bdc9457SAndroid Build Coastguard Worker $if K+5 < ACCUMULATORS: 201*4bdc9457SAndroid Build Coastguard Worker float vo${M}p${K+5} = vi${M+K}x1 * vk${K}1; 202*4bdc9457SAndroid Build Coastguard Worker $else: 203*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+5) % ACCUMULATORS} += vi${M+K}x1 * vk${K}1; 204*4bdc9457SAndroid Build Coastguard Worker 205*4bdc9457SAndroid Build Coastguard Worker $for K in range(5): 206*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 207*4bdc9457SAndroid Build Coastguard Worker vo${M}p${(K+10) % ACCUMULATORS} += vi${M+K}x2 * vk${K}2; 208*4bdc9457SAndroid Build Coastguard Worker 209*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 210*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE = 1 211*4bdc9457SAndroid Build Coastguard Worker $while ACC_SLICE < ACCUMULATORS: 212*4bdc9457SAndroid Build Coastguard Worker $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 213*4bdc9457SAndroid Build Coastguard Worker $if A + ACC_SLICE < ACCUMULATORS: 214*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 215*4bdc9457SAndroid Build Coastguard Worker vo${M}p${A} += vo${M}p${A + ACC_SLICE}; 216*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE *= 2 217*4bdc9457SAndroid Build Coastguard Worker 218*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 219*4bdc9457SAndroid Build Coastguard Worker float vo${M} = math_max_f32(vo${M}p0, vmin); 220*4bdc9457SAndroid Build Coastguard Worker 221*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 222*4bdc9457SAndroid Build Coastguard Worker vo${M} = math_min_f32(vo${M}, vmax); 223*4bdc9457SAndroid Build Coastguard Worker 224*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 225*4bdc9457SAndroid Build Coastguard Worker *o${M}++ = vo${M}; 226*4bdc9457SAndroid Build Coastguard Worker } 227*4bdc9457SAndroid Build Coastguard Worker 228*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_width); 229*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_width); 230*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 231*4bdc9457SAndroid Build Coastguard Worker i2 = i${ROW_TILE+1}; 232*4bdc9457SAndroid Build Coastguard Worker i3 = i${ROW_TILE+2}; 233*4bdc9457SAndroid Build Coastguard Worker i4 = i${ROW_TILE+3}; 234*4bdc9457SAndroid Build Coastguard Worker $for M in range(5, 4 + ROW_TILE): 235*4bdc9457SAndroid Build Coastguard Worker i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 236*4bdc9457SAndroid Build Coastguard Worker 237*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 238*4bdc9457SAndroid Build Coastguard Worker o0 = o${ROW_TILE - 1}; 239*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_TILE): 240*4bdc9457SAndroid Build Coastguard Worker o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 241*4bdc9457SAndroid Build Coastguard Worker 242*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 243*4bdc9457SAndroid Build Coastguard Worker output_height = doz(output_height, ${ROW_TILE}); 244*4bdc9457SAndroid Build Coastguard Worker } while (${"--" if ROW_TILE == 1 else ""}output_height != 0); 245*4bdc9457SAndroid Build Coastguard Worker} 246