1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert DATATYPE in ["QS8", "QU8"] 7*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE >= 1 8*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE <= 16 9*4bdc9457SAndroid Build Coastguard Worker$assert ROW_TILE >= 3 10*4bdc9457SAndroid Build Coastguard Worker$assert ROW_SUBTILE >= 3 11*4bdc9457SAndroid Build Coastguard Worker$assert ROW_SUBTILE <= ROW_TILE 12*4bdc9457SAndroid Build Coastguard Worker$assert REQUANTIZATION == "FP32" 13*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 14*4bdc9457SAndroid Build Coastguard Worker$if VARIANT == "LRINTF": 15*4bdc9457SAndroid Build Coastguard Worker #include <math.h> 16*4bdc9457SAndroid Build Coastguard Worker 17*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/gavgpool.h> 18*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 19*4bdc9457SAndroid Build Coastguard Worker 20*4bdc9457SAndroid Build Coastguard Worker 21*4bdc9457SAndroid Build Coastguard Worker$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower() 22*4bdc9457SAndroid Build Coastguard Worker$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" 23*4bdc9457SAndroid Build Coastguard Worker$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" 24*4bdc9457SAndroid Build Coastguard Worker$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" 25*4bdc9457SAndroid Build Coastguard Workervoid xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}( 26*4bdc9457SAndroid Build Coastguard Worker size_t rows, 27*4bdc9457SAndroid Build Coastguard Worker size_t channels, 28*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* input, 29*4bdc9457SAndroid Build Coastguard Worker size_t input_stride, 30*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* zero, 31*4bdc9457SAndroid Build Coastguard Worker int32_t* buffer, 32*4bdc9457SAndroid Build Coastguard Worker ${XINT8_T}* output, 33*4bdc9457SAndroid Build Coastguard Worker const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) 34*4bdc9457SAndroid Build Coastguard Worker{ 35*4bdc9457SAndroid Build Coastguard Worker assert(rows > ${ROW_TILE}); 36*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0); 37*4bdc9457SAndroid Build Coastguard Worker 38*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i0 = input; 39*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_TILE): 40*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); 41*4bdc9457SAndroid Build Coastguard Worker const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); 42*4bdc9457SAndroid Build Coastguard Worker 43*4bdc9457SAndroid Build Coastguard Worker const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias; 44*4bdc9457SAndroid Build Coastguard Worker int32_t* b = buffer; 45*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 1: 46*4bdc9457SAndroid Build Coastguard Worker size_t c = channels; 47*4bdc9457SAndroid Build Coastguard Worker do { 48*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = vinit_bias; 49*4bdc9457SAndroid Build Coastguard Worker $for M in range(2): 50*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 51*4bdc9457SAndroid Build Coastguard Worker 52*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_TILE): 53*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M-2}; 54*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 55*4bdc9457SAndroid Build Coastguard Worker 56*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE - 2, ROW_TILE): 57*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M}; 58*4bdc9457SAndroid Build Coastguard Worker 59*4bdc9457SAndroid Build Coastguard Worker *b++ = vacc; 60*4bdc9457SAndroid Build Coastguard Worker } while (--c != 0); 61*4bdc9457SAndroid Build Coastguard Worker $else: 62*4bdc9457SAndroid Build Coastguard Worker for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) { 63*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 64*4bdc9457SAndroid Build Coastguard Worker const int32_t vi0x${C} = (int32_t) i0[${C}]; 65*4bdc9457SAndroid Build Coastguard Worker i0 += ${CHANNEL_TILE}; 66*4bdc9457SAndroid Build Coastguard Worker 67*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 68*4bdc9457SAndroid Build Coastguard Worker int32_t vacc${C} = vi0x${C} + vinit_bias; 69*4bdc9457SAndroid Build Coastguard Worker const int32_t vi1x${C} = (int32_t) i1[${C}]; 70*4bdc9457SAndroid Build Coastguard Worker i1 += ${CHANNEL_TILE}; 71*4bdc9457SAndroid Build Coastguard Worker 72*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_TILE): 73*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 74*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${M-1}x${C}; 75*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; 76*4bdc9457SAndroid Build Coastguard Worker i${M} += ${CHANNEL_TILE}; 77*4bdc9457SAndroid Build Coastguard Worker 78*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 79*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${ROW_TILE-1}x${C}; 80*4bdc9457SAndroid Build Coastguard Worker 81*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 82*4bdc9457SAndroid Build Coastguard Worker b[${C}] = vacc${C}; 83*4bdc9457SAndroid Build Coastguard Worker b += ${CHANNEL_TILE}; 84*4bdc9457SAndroid Build Coastguard Worker } 85*4bdc9457SAndroid Build Coastguard Worker 86*4bdc9457SAndroid Build Coastguard Worker for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { 87*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_SUBTILE): 88*4bdc9457SAndroid Build Coastguard Worker i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 89*4bdc9457SAndroid Build Coastguard Worker 90*4bdc9457SAndroid Build Coastguard Worker int32_t* b = buffer; 91*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 1: 92*4bdc9457SAndroid Build Coastguard Worker size_t c = channels; 93*4bdc9457SAndroid Build Coastguard Worker do { 94*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = *b; 95*4bdc9457SAndroid Build Coastguard Worker $for M in range(2): 96*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 97*4bdc9457SAndroid Build Coastguard Worker 98*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_SUBTILE): 99*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M-2}; 100*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 101*4bdc9457SAndroid Build Coastguard Worker 102*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): 103*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M}; 104*4bdc9457SAndroid Build Coastguard Worker 105*4bdc9457SAndroid Build Coastguard Worker *b++ = vacc; 106*4bdc9457SAndroid Build Coastguard Worker } while (--c != 0); 107*4bdc9457SAndroid Build Coastguard Worker $else: 108*4bdc9457SAndroid Build Coastguard Worker for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) { 109*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 110*4bdc9457SAndroid Build Coastguard Worker int32_t vacc${C} = b[${C}]; 111*4bdc9457SAndroid Build Coastguard Worker const int32_t vi0x${C} = (int32_t) i0[${C}]; 112*4bdc9457SAndroid Build Coastguard Worker i0 += ${CHANNEL_TILE}; 113*4bdc9457SAndroid Build Coastguard Worker 114*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_SUBTILE): 115*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 116*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${M-1}x${C}; 117*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; 118*4bdc9457SAndroid Build Coastguard Worker i${M} += ${CHANNEL_TILE}; 119*4bdc9457SAndroid Build Coastguard Worker 120*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 121*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${ROW_SUBTILE-1}x${C}; 122*4bdc9457SAndroid Build Coastguard Worker 123*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 124*4bdc9457SAndroid Build Coastguard Worker b[${C}] = vacc${C}; 125*4bdc9457SAndroid Build Coastguard Worker b += ${CHANNEL_TILE}; 126*4bdc9457SAndroid Build Coastguard Worker } 127*4bdc9457SAndroid Build Coastguard Worker } 128*4bdc9457SAndroid Build Coastguard Worker 129*4bdc9457SAndroid Build Coastguard Worker i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); 130*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_SUBTILE): 131*4bdc9457SAndroid Build Coastguard Worker i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 132*4bdc9457SAndroid Build Coastguard Worker $if M % 2 == 1: 133*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < ${M+1}) { 134*4bdc9457SAndroid Build Coastguard Worker i${M} = zero; 135*4bdc9457SAndroid Build Coastguard Worker } 136*4bdc9457SAndroid Build Coastguard Worker $else: 137*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= ${M}) { 138*4bdc9457SAndroid Build Coastguard Worker i${M} = zero; 139*4bdc9457SAndroid Build Coastguard Worker } 140*4bdc9457SAndroid Build Coastguard Worker 141*4bdc9457SAndroid Build Coastguard Worker const float vscale = params->${PARAMS_STRUCT}.scale; 142*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 143*4bdc9457SAndroid Build Coastguard Worker const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; 144*4bdc9457SAndroid Build Coastguard Worker const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; 145*4bdc9457SAndroid Build Coastguard Worker const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; 146*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; 147*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 148*4bdc9457SAndroid Build Coastguard Worker const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; 149*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; 150*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; 151*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; 152*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 153*4bdc9457SAndroid Build Coastguard Worker const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; 154*4bdc9457SAndroid Build Coastguard Worker const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; 155*4bdc9457SAndroid Build Coastguard Worker const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; 156*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 1: 157*4bdc9457SAndroid Build Coastguard Worker do { 158*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = *buffer++; 159*4bdc9457SAndroid Build Coastguard Worker $for M in range(2): 160*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 161*4bdc9457SAndroid Build Coastguard Worker 162*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_SUBTILE): 163*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M-2}; 164*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 165*4bdc9457SAndroid Build Coastguard Worker 166*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): 167*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M}; 168*4bdc9457SAndroid Build Coastguard Worker 169*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 170*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 171*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 172*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 173*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 174*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 175*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 176*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 177*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 178*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 179*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 180*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 181*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 182*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 183*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 184*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 185*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 186*4bdc9457SAndroid Build Coastguard Worker 187*4bdc9457SAndroid Build Coastguard Worker *output++ = (${XINT8_T}) vout; 188*4bdc9457SAndroid Build Coastguard Worker } while (--channels != 0); 189*4bdc9457SAndroid Build Coastguard Worker $else: 190*4bdc9457SAndroid Build Coastguard Worker for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { 191*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 192*4bdc9457SAndroid Build Coastguard Worker int32_t vacc${C} = buffer[${C}]; 193*4bdc9457SAndroid Build Coastguard Worker const int32_t vi0x${C} = (int32_t) i0[${C}]; 194*4bdc9457SAndroid Build Coastguard Worker buffer += ${CHANNEL_TILE}; 195*4bdc9457SAndroid Build Coastguard Worker i0 += ${CHANNEL_TILE}; 196*4bdc9457SAndroid Build Coastguard Worker 197*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_SUBTILE): 198*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 199*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${M-1}x${C}; 200*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; 201*4bdc9457SAndroid Build Coastguard Worker i${M} += ${CHANNEL_TILE}; 202*4bdc9457SAndroid Build Coastguard Worker 203*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 204*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${ROW_SUBTILE-1}x${C}; 205*4bdc9457SAndroid Build Coastguard Worker 206*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 207*4bdc9457SAndroid Build Coastguard Worker float vfpacc${C} = (float) vacc${C} * vscale; 208*4bdc9457SAndroid Build Coastguard Worker 209*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 210*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 211*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); 212*4bdc9457SAndroid Build Coastguard Worker 213*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 214*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); 215*4bdc9457SAndroid Build Coastguard Worker 216*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 217*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} += vmagic_bias; 218*4bdc9457SAndroid Build Coastguard Worker 219*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 220*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}) - vmagic_bias_less_output_zero_point; 221*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 222*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 223*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} += vmagic_bias; 224*4bdc9457SAndroid Build Coastguard Worker 225*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 226*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}); 227*4bdc9457SAndroid Build Coastguard Worker 228*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 229*4bdc9457SAndroid Build Coastguard Worker vout${C} = math_max_s32(vout${C}, vmagic_min); 230*4bdc9457SAndroid Build Coastguard Worker 231*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 232*4bdc9457SAndroid Build Coastguard Worker vout${C} = math_min_s32(vout${C}, vmagic_max); 233*4bdc9457SAndroid Build Coastguard Worker 234*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 235*4bdc9457SAndroid Build Coastguard Worker vout${C} -= vmagic_bias_less_zero_point; 236*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 237*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 238*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); 239*4bdc9457SAndroid Build Coastguard Worker 240*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 241*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); 242*4bdc9457SAndroid Build Coastguard Worker 243*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 244*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc${C} = (int32_t) lrintf(vfpacc${C}); 245*4bdc9457SAndroid Build Coastguard Worker 246*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 247*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = vrndacc${C} + voutput_zero_point; 248*4bdc9457SAndroid Build Coastguard Worker 249*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 250*4bdc9457SAndroid Build Coastguard Worker output[${C}] = (${XINT8_T}) vout${C}; 251*4bdc9457SAndroid Build Coastguard Worker output += ${CHANNEL_TILE}; 252*4bdc9457SAndroid Build Coastguard Worker } 253*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(channels != 0) { 254*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 2: 255*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = *buffer; 256*4bdc9457SAndroid Build Coastguard Worker $for M in range(2): 257*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}; 258*4bdc9457SAndroid Build Coastguard Worker 259*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_SUBTILE): 260*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M-2}; 261*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}; 262*4bdc9457SAndroid Build Coastguard Worker 263*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): 264*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M}; 265*4bdc9457SAndroid Build Coastguard Worker 266*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 267*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 268*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 269*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 270*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 271*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 272*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 273*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 274*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 275*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 276*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 277*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 278*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 279*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 280*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 281*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 282*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 283*4bdc9457SAndroid Build Coastguard Worker 284*4bdc9457SAndroid Build Coastguard Worker *output = (${XINT8_T}) vout; 285*4bdc9457SAndroid Build Coastguard Worker $else: 286*4bdc9457SAndroid Build Coastguard Worker do { 287*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = *buffer++; 288*4bdc9457SAndroid Build Coastguard Worker $for M in range(2): 289*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 290*4bdc9457SAndroid Build Coastguard Worker 291*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, ROW_SUBTILE): 292*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M-2}; 293*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${M} = (int32_t) *i${M}++; 294*4bdc9457SAndroid Build Coastguard Worker 295*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): 296*4bdc9457SAndroid Build Coastguard Worker vacc += vi${M}; 297*4bdc9457SAndroid Build Coastguard Worker 298*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 299*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 300*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 301*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 302*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 303*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 304*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 305*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 306*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 307*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 308*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 309*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 310*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 311*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 312*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 313*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 314*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 315*4bdc9457SAndroid Build Coastguard Worker 316*4bdc9457SAndroid Build Coastguard Worker *output++ = (${XINT8_T}) vout; 317*4bdc9457SAndroid Build Coastguard Worker } while (--channels != 0); 318*4bdc9457SAndroid Build Coastguard Worker } 319*4bdc9457SAndroid Build Coastguard Worker} 320