// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert DATATYPE in ["QS8", "QU8"]
$assert BATCH_TILE % 4 == 0
$SIMD_TILE = BATCH_TILE // 4
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vcvt.h>


$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
$XINT8X4_T = {"QS8": "int8x4_t", "QU8": "uint8x4_t"}[DATATYPE]
$XINT16X2_T = {"QS8": "int16x2_t", "QU8": "uint16x2_t"}[DATATYPE]
$__XXTAB16 = {"QS8": "__sxtab16", "QU8": "__uxtab16"}[DATATYPE]
$__XSAT = {"QS8": "__ssat", "QU8": "__usat"}[DATATYPE]
void xnn_${DATATYPE.lower()}_vcvt_ukernel__armsimd32_x${BATCH_TILE}(
    size_t n,
    const ${XINT8_T}* x,
    ${XINT8_T}* y,
    const union xnn_${DATATYPE.lower()}_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  const ${XINT16X2_T} vminus_input_zero_point = (${XINT16X2_T}) params->armsimd32.minus_input_zero_point;
  const int32_t vbias = params->armsimd32.bias;
  const int32_t vmultiplier = params->armsimd32.multiplier;
  $if BATCH_TILE > 4:
    // Unrolled main loop: convert ${BATCH_TILE} elements per iteration.
    for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
      $for N in range(SIMD_TILE):
        const ${XINT8X4_T} vx${ABC[4*N:4*N+4]} = (${XINT8X4_T}) unaligned_indexed_load_u32(x, ${N});
      x += ${BATCH_TILE};

      // ${__XXTAB16} widens bytes 0 and 2 of each word to 16-bit lanes and adds the
      // negated input zero point; the word rotated by 8 supplies bytes 1 and 3.
      $for N in range(0, BATCH_TILE, 4):
        const ${XINT16X2_T} vx${ABC[N]}${ABC[N+2]} = ${__XXTAB16}(vminus_input_zero_point, vx${ABC[N:N+4]});
        const ${XINT16X2_T} vx${ABC[N+1]}${ABC[N+3]} = ${__XXTAB16}(vminus_input_zero_point, __ror(vx${ABC[N:N+4]}, 8));

      // SMLAWB/SMLAWT: multiply the 32-bit multiplier by the bottom/top 16-bit lane,
      // keep the upper 32 bits of the 48-bit product, and add the bias.
      $for N in range(0, BATCH_TILE, 4):
        int32_t vacc${ABC[N]} = __smlawb(vmultiplier, vx${ABC[N]}${ABC[N+2]}, vbias);
        int32_t vacc${ABC[N+1]} = __smlawb(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias);
        int32_t vacc${ABC[N+2]} = __smlawt(vmultiplier, vx${ABC[N]}${ABC[N+2]}, vbias);
        int32_t vacc${ABC[N+3]} = __smlawt(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias);

      // Arithmetic shift right by 1, then saturate to the 8-bit output range.
      $for N in range(BATCH_TILE):
        vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 1), 8);

      $for N in range(BATCH_TILE):
        y[${N}] = (${XINT8_T}) vacc${ABC[N]};
      y += ${BATCH_TILE};
    }
  // Convert 4 elements at a time.
  for (; n >= 4 * sizeof(${XINT8_T}); n -= 4 * sizeof(${XINT8_T})) {
    const ${XINT8X4_T} vx0123 = (${XINT8X4_T}) unaligned_load_u32(x);
    x += 4;

    const ${XINT16X2_T} vx02 = ${__XXTAB16}(vminus_input_zero_point, vx0123);
    const ${XINT16X2_T} vx13 = ${__XXTAB16}(vminus_input_zero_point, __ror(vx0123, 8));

    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
    int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);

    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);
    vacc2 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
    vacc3 = ${__XSAT}(math_asr_s32(vacc3, 1), 8);

    y[0] = (${XINT8_T}) vacc0;
    y[1] = (${XINT8_T}) vacc1;
    y[2] = (${XINT8_T}) vacc2;
    y[3] = (${XINT8_T}) vacc3;
    y += 4;
  }
  // Handle the final 1-3 elements.
  if XNN_UNLIKELY(n != 0) {
    const ${XINT8X4_T} vx0123 = (${XINT8X4_T}) unaligned_load_u32(x);

    const ${XINT16X2_T} vx02 = ${__XXTAB16}(vminus_input_zero_point, vx0123);
    const ${XINT16X2_T} vx13 = ${__XXTAB16}(vminus_input_zero_point, __ror(vx0123, 8));

    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);

    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);

    if (n & (2 * sizeof(${XINT8_T}))) {
      // Store two elements and move the third, already computed, into vacc0.
      y[0] = (${XINT8_T}) vacc0;
      y[1] = (${XINT8_T}) vacc1;
      vacc0 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
      y += 2;
    }
    if (n & (1 * sizeof(${XINT8_T}))) {
      y[0] = (${XINT8_T}) vacc0;
    }
  }
}