1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert KERNEL_TILE >= 2 7*4bdc9457SAndroid Build Coastguard Worker$assert REQUANTIZATION == "FP32" 8*4bdc9457SAndroid Build Coastguard Worker$assert VARIANT in ["FMAGIC", "IMAGIC", "LRINTF"] 9*4bdc9457SAndroid Build Coastguard Worker$assert DATATYPE in ["QC8", "QS8", "QU8"] 10*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 11*4bdc9457SAndroid Build Coastguard Worker$if VARIANT == "LRINTF": 12*4bdc9457SAndroid Build Coastguard Worker #include <math.h> 13*4bdc9457SAndroid Build Coastguard Worker 14*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/dwconv.h> 15*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 16*4bdc9457SAndroid Build Coastguard Worker$if CHANNEL_TILE % 4 != 0: 17*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/unaligned.h> 18*4bdc9457SAndroid Build Coastguard Worker 19*4bdc9457SAndroid Build Coastguard Worker 20*4bdc9457SAndroid Build Coastguard Worker$PARAMS_STRUCT = REQUANTIZATION.lower() + "_scalar" + ("_" + VARIANT.lower() if VARIANT else "") 21*4bdc9457SAndroid Build Coastguard Worker$PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower() 22*4bdc9457SAndroid Build Coastguard Worker$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" 23*4bdc9457SAndroid Build Coastguard Worker$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" 24*4bdc9457SAndroid Build Coastguard Worker$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" 25*4bdc9457SAndroid Build Coastguard Workervoid xnn_${DATATYPE.lower()}_dwconv_minmax_${REQUANTIZATION.lower()}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"wasm" if WASM else "scalar"}_${VARIANT.lower()}( 26*4bdc9457SAndroid Build Coastguard Worker size_t channels, 27*4bdc9457SAndroid Build Coastguard Worker size_t output_width, 28*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}** input, 29*4bdc9457SAndroid Build Coastguard Worker const void* weights, 30*4bdc9457SAndroid Build Coastguard Worker ${XINT8_T}* output, 31*4bdc9457SAndroid Build Coastguard Worker size_t input_stride, 32*4bdc9457SAndroid Build Coastguard Worker size_t output_increment, 33*4bdc9457SAndroid Build Coastguard Worker size_t input_offset, 34*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* zero, 35*4bdc9457SAndroid Build Coastguard Worker const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) 36*4bdc9457SAndroid Build Coastguard Worker{ 37*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0); 38*4bdc9457SAndroid Build Coastguard Worker assert(output_width != 0); 39*4bdc9457SAndroid Build Coastguard Worker 40*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE != "QC8": 41*4bdc9457SAndroid Build Coastguard Worker const float vscale = params->${PARAMS_STRUCT}.scale; 42*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 43*4bdc9457SAndroid Build Coastguard Worker const float voutput_min_less_zero_point = params->${PARAMS_STRUCT}.output_min_less_zero_point; 44*4bdc9457SAndroid Build Coastguard Worker const float voutput_max_less_zero_point = params->${PARAMS_STRUCT}.output_max_less_zero_point; 45*4bdc9457SAndroid Build Coastguard Worker const float vmagic_bias = params->${PARAMS_STRUCT}.magic_bias; 46*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_bias_less_output_zero_point = params->${PARAMS_STRUCT}.magic_bias_less_output_zero_point; 47*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 48*4bdc9457SAndroid Build Coastguard Worker const float vmagic_bias = params->${PARAMS_STRUCT}.magic_bias; 49*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_min = params->${PARAMS_STRUCT}.magic_min; 50*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_max = params->${PARAMS_STRUCT}.magic_max; 51*4bdc9457SAndroid Build Coastguard Worker const int32_t vmagic_bias_less_zero_point = params->${PARAMS_STRUCT}.magic_bias_less_zero_point; 52*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 53*4bdc9457SAndroid Build Coastguard Worker const float voutput_min_less_zero_point = params->${PARAMS_STRUCT}.output_min_less_zero_point; 54*4bdc9457SAndroid Build Coastguard Worker const float voutput_max_less_zero_point = params->${PARAMS_STRUCT}.output_max_less_zero_point; 55*4bdc9457SAndroid Build Coastguard Worker const int32_t voutput_zero_point = params->${PARAMS_STRUCT}.output_zero_point; 56*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 57*4bdc9457SAndroid Build Coastguard Worker const int32_t vkernel_zero_point = params->${PARAMS_STRUCT}.kernel_zero_point; 58*4bdc9457SAndroid Build Coastguard Worker do { 59*4bdc9457SAndroid Build Coastguard Worker $for K in range(KERNEL_TILE): 60*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i${K} = input[${K}]; 61*4bdc9457SAndroid Build Coastguard Worker assert(i${K} != NULL); 62*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i${K} != zero) { 63*4bdc9457SAndroid Build Coastguard Worker i${K} = (const ${XINT8_T}*) ((uintptr_t) i${K} + input_offset); 64*4bdc9457SAndroid Build Coastguard Worker } 65*4bdc9457SAndroid Build Coastguard Worker input = (const ${XINT8_T}**) ((uintptr_t) input + input_stride); 66*4bdc9457SAndroid Build Coastguard Worker 67*4bdc9457SAndroid Build Coastguard Worker size_t c = channels; 68*4bdc9457SAndroid Build Coastguard Worker const void* w = weights; 69*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 1: 70*4bdc9457SAndroid Build Coastguard Worker do { 71*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = unaligned_load_s32(w); 72*4bdc9457SAndroid Build Coastguard Worker 73*4bdc9457SAndroid Build Coastguard Worker $for K in range(KERNEL_TILE): 74*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 75*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) (uint32_t) *i${K}++; 76*4bdc9457SAndroid Build Coastguard Worker $else: 77*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) *i${K}++; 78*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 79*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = (int32_t) (uint32_t) ((const ${XINT8_T}*) ((uintptr_t) w + sizeof(int32_t)))[${K}] - vkernel_zero_point; 80*4bdc9457SAndroid Build Coastguard Worker $else: 81*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = ((const ${XINT8_T}*) ((uintptr_t) w + sizeof(int32_t)))[${K}]; 82*4bdc9457SAndroid Build Coastguard Worker vacc += vi${K} * vk${K}; 83*4bdc9457SAndroid Build Coastguard Worker 84*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + sizeof(int32_t) + ${KERNEL_TILE} * sizeof(${XINT8_T})); 85*4bdc9457SAndroid Build Coastguard Worker 86*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QC8": 87*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE % 4 != 0: 88*4bdc9457SAndroid Build Coastguard Worker const float vscale = unaligned_load_f32(w); 89*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 1); 90*4bdc9457SAndroid Build Coastguard Worker $else: 91*4bdc9457SAndroid Build Coastguard Worker const float vscale = *((const float*) w); 92*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + 1); 93*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 94*4bdc9457SAndroid Build Coastguard Worker 95*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 96*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 97*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 98*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 99*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 100*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 101*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 102*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 103*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 104*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 105*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 106*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 107*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 108*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 109*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 110*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 111*4bdc9457SAndroid Build Coastguard Worker 112*4bdc9457SAndroid Build Coastguard Worker *output++ = (${XINT8_T}) vout; 113*4bdc9457SAndroid Build Coastguard Worker } while (--c != 0); 114*4bdc9457SAndroid Build Coastguard Worker $else: 115*4bdc9457SAndroid Build Coastguard Worker for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 116*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE % 4 != 0: 117*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 118*4bdc9457SAndroid Build Coastguard Worker int32_t vacc${C} = unaligned_indexed_load_s32(w, ${C}); 119*4bdc9457SAndroid Build Coastguard Worker $else: 120*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 121*4bdc9457SAndroid Build Coastguard Worker int32_t vacc${C} = ((const int32_t*) w)[${C}]; 122*4bdc9457SAndroid Build Coastguard Worker 123*4bdc9457SAndroid Build Coastguard Worker $for K in range(KERNEL_TILE): 124*4bdc9457SAndroid Build Coastguard Worker 125*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 126*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 127*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K}x${C} = (int32_t) (uint32_t) i${K}[${C}]; 128*4bdc9457SAndroid Build Coastguard Worker $else: 129*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K}x${C} = (int32_t) i${K}[${C}]; 130*4bdc9457SAndroid Build Coastguard Worker i${K} += ${CHANNEL_TILE}; 131*4bdc9457SAndroid Build Coastguard Worker 132*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 133*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 134*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K}x${C} = (int32_t) (uint32_t) ((const ${XINT8_T}*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t)))[${K * CHANNEL_TILE + C}] - vkernel_zero_point; 135*4bdc9457SAndroid Build Coastguard Worker $else: 136*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K}x${C} = (int32_t) ((const ${XINT8_T}*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t)))[${K * CHANNEL_TILE + C}]; 137*4bdc9457SAndroid Build Coastguard Worker 138*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 139*4bdc9457SAndroid Build Coastguard Worker vacc${C} += vi${K}x${C} * vk${K}x${C}; 140*4bdc9457SAndroid Build Coastguard Worker 141*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T})); 142*4bdc9457SAndroid Build Coastguard Worker 143*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 144*4bdc9457SAndroid Build Coastguard Worker float vfpacc${C} = (float) vacc${C}; 145*4bdc9457SAndroid Build Coastguard Worker 146*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QC8": 147*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE % 4 != 0: 148*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 149*4bdc9457SAndroid Build Coastguard Worker const float vscale${C} = unaligned_indexed_load_f32(w, ${C}); 150*4bdc9457SAndroid Build Coastguard Worker $else: 151*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 152*4bdc9457SAndroid Build Coastguard Worker const float vscale${C} = ((const float*) w)[${C}]; 153*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((const float*) w + ${CHANNEL_TILE}); 154*4bdc9457SAndroid Build Coastguard Worker 155*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 156*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} *= vscale${C}; 157*4bdc9457SAndroid Build Coastguard Worker $else: 158*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 159*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} *= vscale; 160*4bdc9457SAndroid Build Coastguard Worker 161*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 162*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 163*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); 164*4bdc9457SAndroid Build Coastguard Worker 165*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 166*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); 167*4bdc9457SAndroid Build Coastguard Worker 168*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 169*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} += vmagic_bias; 170*4bdc9457SAndroid Build Coastguard Worker 171*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 172*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}) - vmagic_bias_less_output_zero_point; 173*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 174*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 175*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} += vmagic_bias; 176*4bdc9457SAndroid Build Coastguard Worker 177*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 178*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}); 179*4bdc9457SAndroid Build Coastguard Worker 180*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 181*4bdc9457SAndroid Build Coastguard Worker vout${C} = math_max_s32(vout${C}, vmagic_min); 182*4bdc9457SAndroid Build Coastguard Worker 183*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 184*4bdc9457SAndroid Build Coastguard Worker vout${C} = math_min_s32(vout${C}, vmagic_max); 185*4bdc9457SAndroid Build Coastguard Worker 186*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 187*4bdc9457SAndroid Build Coastguard Worker vout${C} -= vmagic_bias_less_zero_point; 188*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 189*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 190*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); 191*4bdc9457SAndroid Build Coastguard Worker 192*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 193*4bdc9457SAndroid Build Coastguard Worker vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); 194*4bdc9457SAndroid Build Coastguard Worker 195*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 196*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc${C} = (int32_t) lrintf(vfpacc${C}); 197*4bdc9457SAndroid Build Coastguard Worker 198*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 199*4bdc9457SAndroid Build Coastguard Worker int32_t vout${C} = (int32_t) vrndacc${C} + voutput_zero_point; 200*4bdc9457SAndroid Build Coastguard Worker 201*4bdc9457SAndroid Build Coastguard Worker $for C in range(CHANNEL_TILE): 202*4bdc9457SAndroid Build Coastguard Worker output[${C}] = (${XINT8_T}) vout${C}; 203*4bdc9457SAndroid Build Coastguard Worker output += ${CHANNEL_TILE}; 204*4bdc9457SAndroid Build Coastguard Worker } 205*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) { 206*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE == 2: 207*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = unaligned_load_s32(w); 208*4bdc9457SAndroid Build Coastguard Worker 209*4bdc9457SAndroid Build Coastguard Worker $for K in range(KERNEL_TILE): 210*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 211*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) (uint32_t) *i${K}; 212*4bdc9457SAndroid Build Coastguard Worker $else: 213*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) *i${K}; 214*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 215*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = (int32_t) (uint32_t) ((const ${XINT8_T}*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t)))[${K * CHANNEL_TILE}] - vkernel_zero_point; 216*4bdc9457SAndroid Build Coastguard Worker $else: 217*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = (int32_t) ((const ${XINT8_T}*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t)))[${K * CHANNEL_TILE}]; 218*4bdc9457SAndroid Build Coastguard Worker vacc += vi${K} * vk${K}; 219*4bdc9457SAndroid Build Coastguard Worker 220*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QC8": 221*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE % 4 != 0: 222*4bdc9457SAndroid Build Coastguard Worker typedef XNN_UNALIGNED float unaligned_float; 223*4bdc9457SAndroid Build Coastguard Worker const float vscale = *((const unaligned_float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}))); 224*4bdc9457SAndroid Build Coastguard Worker $else: 225*4bdc9457SAndroid Build Coastguard Worker const float vscale = *((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}))); 226*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 227*4bdc9457SAndroid Build Coastguard Worker 228*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 229*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 230*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 231*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 232*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 233*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 234*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 235*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 236*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 237*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 238*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 239*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 240*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 241*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 242*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 243*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 244*4bdc9457SAndroid Build Coastguard Worker 245*4bdc9457SAndroid Build Coastguard Worker *output++ = (${XINT8_T}) vout; 246*4bdc9457SAndroid Build Coastguard Worker $else: 247*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* k = (const ${XINT8_T}*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t)); 248*4bdc9457SAndroid Build Coastguard Worker do { 249*4bdc9457SAndroid Build Coastguard Worker int32_t vacc = *((const int32_t*) w); 250*4bdc9457SAndroid Build Coastguard Worker w = (const void*) ((uintptr_t) w + sizeof(int32_t)); 251*4bdc9457SAndroid Build Coastguard Worker 252*4bdc9457SAndroid Build Coastguard Worker $for K in range(KERNEL_TILE): 253*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 254*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) (uint32_t) *i${K}++; 255*4bdc9457SAndroid Build Coastguard Worker $else: 256*4bdc9457SAndroid Build Coastguard Worker const int32_t vi${K} = (int32_t) *i${K}++; 257*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QU8": 258*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = (int32_t) (uint32_t) k[${K * CHANNEL_TILE}] - vkernel_zero_point; 259*4bdc9457SAndroid Build Coastguard Worker $else: 260*4bdc9457SAndroid Build Coastguard Worker const int32_t vk${K} = (int32_t) k[${K * CHANNEL_TILE}]; 261*4bdc9457SAndroid Build Coastguard Worker vacc += vi${K} * vk${K}; 262*4bdc9457SAndroid Build Coastguard Worker k += 1; 263*4bdc9457SAndroid Build Coastguard Worker 264*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "QC8": 265*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE % 4 != 0: 266*4bdc9457SAndroid Build Coastguard Worker const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + ${CHANNEL_TILE - 1} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}))); 267*4bdc9457SAndroid Build Coastguard Worker $else: 268*4bdc9457SAndroid Build Coastguard Worker const float vscale = *((const float*) ((uintptr_t) w + ${CHANNEL_TILE - 1} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}))); 269*4bdc9457SAndroid Build Coastguard Worker float vfpacc = (float) vacc * vscale; 270*4bdc9457SAndroid Build Coastguard Worker 271*4bdc9457SAndroid Build Coastguard Worker $if VARIANT == "FMAGIC": 272*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 273*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 274*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 275*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; 276*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "IMAGIC": 277*4bdc9457SAndroid Build Coastguard Worker vfpacc += vmagic_bias; 278*4bdc9457SAndroid Build Coastguard Worker int32_t vout = (int32_t) float_as_uint32(vfpacc); 279*4bdc9457SAndroid Build Coastguard Worker vout = math_max_s32(vout, vmagic_min); 280*4bdc9457SAndroid Build Coastguard Worker vout = math_min_s32(vout, vmagic_max); 281*4bdc9457SAndroid Build Coastguard Worker vout -= vmagic_bias_less_zero_point; 282*4bdc9457SAndroid Build Coastguard Worker $elif VARIANT == "LRINTF": 283*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); 284*4bdc9457SAndroid Build Coastguard Worker vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); 285*4bdc9457SAndroid Build Coastguard Worker const int32_t vrndacc = (int32_t) lrintf(vfpacc); 286*4bdc9457SAndroid Build Coastguard Worker int32_t vout = vrndacc + voutput_zero_point; 287*4bdc9457SAndroid Build Coastguard Worker 288*4bdc9457SAndroid Build Coastguard Worker *output++ = (${XINT8_T}) vout; 289*4bdc9457SAndroid Build Coastguard Worker } while (--c != 0); 290*4bdc9457SAndroid Build Coastguard Worker } 291*4bdc9457SAndroid Build Coastguard Worker 292*4bdc9457SAndroid Build Coastguard Worker output = (${XINT8_T}*) ((uintptr_t) output + output_increment); 293*4bdc9457SAndroid Build Coastguard Worker } while (--output_width != 0); 294*4bdc9457SAndroid Build Coastguard Worker} 295