1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert SSE in [2, 4] 7*4bdc9457SAndroid Build Coastguard Worker$assert not XOP or AVX 8*4bdc9457SAndroid Build Coastguard Worker$assert not AVX or SSE == 4 9*4bdc9457SAndroid Build Coastguard Worker$assert DATATYPE in ["S8", "U8"] 10*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE % 8 == 0 11*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE >= 8 12*4bdc9457SAndroid Build Coastguard Worker$assert PIXEL_TILE == 1 13*4bdc9457SAndroid Build Coastguard Worker$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 14*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 15*4bdc9457SAndroid Build Coastguard Worker 16*4bdc9457SAndroid Build Coastguard Worker$if XOP: 17*4bdc9457SAndroid Build Coastguard Worker #if defined(__GNUC__) || defined(__clang__) 18*4bdc9457SAndroid Build Coastguard Worker #include <x86intrin.h> 19*4bdc9457SAndroid Build Coastguard Worker #else 20*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h> 21*4bdc9457SAndroid Build Coastguard Worker #include <ammintrin.h> 22*4bdc9457SAndroid Build Coastguard Worker #endif 23*4bdc9457SAndroid Build Coastguard Worker$else: 24*4bdc9457SAndroid Build Coastguard Worker $SSE_HEADER = {2: "emmintrin.h", 4: "smmintrin.h"}[SSE] 25*4bdc9457SAndroid Build Coastguard Worker #include <${SSE_HEADER}> 26*4bdc9457SAndroid Build Coastguard Worker 27*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h> 28*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/ibilinear.h> 29*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/unaligned.h> 30*4bdc9457SAndroid Build Coastguard Worker 31*4bdc9457SAndroid Build Coastguard Worker 32*4bdc9457SAndroid Build Coastguard Worker$XINT8_T = {"S8": "int8_t", "U8": "uint8_t"}[DATATYPE] 33*4bdc9457SAndroid Build Coastguard Worker$_MM_CVTEPX8_EPI16 = {"S8": "_mm_cvtepi8_epi16", "U8": "_mm_cvtepu8_epi16"}[DATATYPE] 34*4bdc9457SAndroid Build Coastguard Worker$_MM_SRXI_EPI32 = {"S8": "_mm_srai_epi32", "U8": "_mm_srli_epi32"}[DATATYPE] 35*4bdc9457SAndroid Build Coastguard Worker$_MM_SRXI_EPI16 = {"S8": "_mm_srai_epi16", "U8": "_mm_srli_epi16"}[DATATYPE] 36*4bdc9457SAndroid Build Coastguard Worker$_MM_PACKXS_EPI16 = {"S8": "_mm_packs_epi16", "U8": "_mm_packus_epi16"}[DATATYPE] 37*4bdc9457SAndroid Build Coastguard Worker$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE] 38*4bdc9457SAndroid Build Coastguard Workervoid xnn_${DATATYPE.lower()}_ibilinear_ukernel__${ISA}_c${CHANNEL_TILE}${"" if PIXEL_TILE == 1 else "x%d" % PIXEL_TILE}( 39*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels, 40*4bdc9457SAndroid Build Coastguard Worker size_t channels, 41*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}**restrict input, 42*4bdc9457SAndroid Build Coastguard Worker size_t input_offset, 43*4bdc9457SAndroid Build Coastguard Worker const int16_t*restrict weights, 44*4bdc9457SAndroid Build Coastguard Worker ${XINT8_T}*restrict output, 45*4bdc9457SAndroid Build Coastguard Worker size_t output_increment) XNN_OOB_READS 46*4bdc9457SAndroid Build Coastguard Worker{ 47*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0); 48*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0); 49*4bdc9457SAndroid Build Coastguard Worker 50*4bdc9457SAndroid Build Coastguard Worker do { 51*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i0 = (const ${XINT8_T}*) ((uintptr_t) input[0] + input_offset); 52*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i1 = (const ${XINT8_T}*) ((uintptr_t) input[1] + input_offset); 53*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i2 = (const ${XINT8_T}*) ((uintptr_t) input[2] + input_offset); 54*4bdc9457SAndroid Build Coastguard Worker const ${XINT8_T}* i3 = (const ${XINT8_T}*) ((uintptr_t) input[3] + input_offset); 55*4bdc9457SAndroid Build Coastguard Worker input += 4; 56*4bdc9457SAndroid Build Coastguard Worker 57*4bdc9457SAndroid Build Coastguard Worker const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights)); 58*4bdc9457SAndroid Build Coastguard Worker weights += 2; 59*4bdc9457SAndroid Build Coastguard Worker __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0)); 60*4bdc9457SAndroid Build Coastguard Worker valphah = _mm_unpacklo_epi64(valphah, valphah); 61*4bdc9457SAndroid Build Coastguard Worker $if SSE == 2: 62*4bdc9457SAndroid Build Coastguard Worker __m128i valphav = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(1, 1, 1, 1)); 63*4bdc9457SAndroid Build Coastguard Worker valphav = _mm_unpacklo_epi64(valphav, valphav); 64*4bdc9457SAndroid Build Coastguard Worker $else: 65*4bdc9457SAndroid Build Coastguard Worker __m128i valphav = _mm_srli_epi32(valpha, 16); 66*4bdc9457SAndroid Build Coastguard Worker valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0)); 67*4bdc9457SAndroid Build Coastguard Worker 68*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 69*4bdc9457SAndroid Build Coastguard Worker valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA); 70*4bdc9457SAndroid Build Coastguard Worker $else: 71*4bdc9457SAndroid Build Coastguard Worker valphah = _mm_xor_si128(valphah, _mm_set1_epi32(0xFFFF0000)); 72*4bdc9457SAndroid Build Coastguard Worker valphah = _mm_add_epi16(valphah, _mm_set1_epi32(0x08010000)); 73*4bdc9457SAndroid Build Coastguard Worker 74*4bdc9457SAndroid Build Coastguard Worker const __m128i vrounding = _mm_set1_epi32(0x00200000); 75*4bdc9457SAndroid Build Coastguard Worker 76*4bdc9457SAndroid Build Coastguard Worker size_t c = channels; 77*4bdc9457SAndroid Build Coastguard Worker $if CHANNEL_TILE > 8: 78*4bdc9457SAndroid Build Coastguard Worker for (; c >= ${CHANNEL_TILE} * sizeof(${XINT8_T}); c -= ${CHANNEL_TILE} * sizeof(${XINT8_T})) { 79*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 80*4bdc9457SAndroid Build Coastguard Worker const __m128i vtl${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0)); 81*4bdc9457SAndroid Build Coastguard Worker const __m128i vtr${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1)); 82*4bdc9457SAndroid Build Coastguard Worker const __m128i vbl${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 83*4bdc9457SAndroid Build Coastguard Worker const __m128i vbr${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3)); 84*4bdc9457SAndroid Build Coastguard Worker $for C in range(8, CHANNEL_TILE, 8): 85*4bdc9457SAndroid Build Coastguard Worker const __m128i vtl${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i0 + ${C}))); 86*4bdc9457SAndroid Build Coastguard Worker const __m128i vtr${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i1 + ${C}))); 87*4bdc9457SAndroid Build Coastguard Worker const __m128i vbl${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); 88*4bdc9457SAndroid Build Coastguard Worker const __m128i vbr${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i3 + ${C}))); 89*4bdc9457SAndroid Build Coastguard Worker $else: 90*4bdc9457SAndroid Build Coastguard Worker __m128i vtl${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i0); 91*4bdc9457SAndroid Build Coastguard Worker __m128i vtr${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i1); 92*4bdc9457SAndroid Build Coastguard Worker __m128i vbl${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i2); 93*4bdc9457SAndroid Build Coastguard Worker __m128i vbr${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i3); 94*4bdc9457SAndroid Build Coastguard Worker $for C in range(8, CHANNEL_TILE, 8): 95*4bdc9457SAndroid Build Coastguard Worker __m128i vtl${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i0 + ${C})); 96*4bdc9457SAndroid Build Coastguard Worker __m128i vtr${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i1 + ${C})); 97*4bdc9457SAndroid Build Coastguard Worker __m128i vbl${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i2 + ${C})); 98*4bdc9457SAndroid Build Coastguard Worker __m128i vbr${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i3 + ${C})); 99*4bdc9457SAndroid Build Coastguard Worker i0 += ${CHANNEL_TILE}; 100*4bdc9457SAndroid Build Coastguard Worker i1 += ${CHANNEL_TILE}; 101*4bdc9457SAndroid Build Coastguard Worker i2 += ${CHANNEL_TILE}; 102*4bdc9457SAndroid Build Coastguard Worker i3 += ${CHANNEL_TILE}; 103*4bdc9457SAndroid Build Coastguard Worker 104*4bdc9457SAndroid Build Coastguard Worker $if SSE != 4: 105*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "U8": 106*4bdc9457SAndroid Build Coastguard Worker __m128i vzero = _mm_setzero_si128(); 107*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 8): 108*4bdc9457SAndroid Build Coastguard Worker vtl${ABC[C:C+8]} = _mm_unpacklo_epi8(vtl${ABC[C:C+8]}, vzero); 109*4bdc9457SAndroid Build Coastguard Worker vtr${ABC[C:C+8]} = _mm_unpacklo_epi8(vtr${ABC[C:C+8]}, vzero); 110*4bdc9457SAndroid Build Coastguard Worker vbl${ABC[C:C+8]} = _mm_unpacklo_epi8(vbl${ABC[C:C+8]}, vzero); 111*4bdc9457SAndroid Build Coastguard Worker vbr${ABC[C:C+8]} = _mm_unpacklo_epi8(vbr${ABC[C:C+8]}, vzero); 112*4bdc9457SAndroid Build Coastguard Worker $else: 113*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 8): 114*4bdc9457SAndroid Build Coastguard Worker vtl${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vtl${ABC[C:C+8]}, vtl${ABC[C:C+8]}), 8); 115*4bdc9457SAndroid Build Coastguard Worker vtr${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vtr${ABC[C:C+8]}, vtr${ABC[C:C+8]}), 8); 116*4bdc9457SAndroid Build Coastguard Worker vbl${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vbl${ABC[C:C+8]}, vbl${ABC[C:C+8]}), 8); 117*4bdc9457SAndroid Build Coastguard Worker vbr${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vbr${ABC[C:C+8]}, vbr${ABC[C:C+8]}), 8); 118*4bdc9457SAndroid Build Coastguard Worker 119*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 8): 120*4bdc9457SAndroid Build Coastguard Worker const __m128i vdr${ABC[C:C+8]} = _mm_sub_epi16(vbr${ABC[C:C+8]}, vtr${ABC[C:C+8]}); 121*4bdc9457SAndroid Build Coastguard Worker const __m128i vt${ABC[C:C+4]} = _mm_madd_epi16(_mm_unpacklo_epi16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]}), valphah); 122*4bdc9457SAndroid Build Coastguard Worker const __m128i vdl${ABC[C:C+8]} = _mm_sub_epi16(vbl${ABC[C:C+8]}, vtl${ABC[C:C+8]}); 123*4bdc9457SAndroid Build Coastguard Worker const __m128i vt${ABC[C+4:C+8]} = _mm_madd_epi16(_mm_unpackhi_epi16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]}), valphah); 124*4bdc9457SAndroid Build Coastguard Worker 125*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 8): 126*4bdc9457SAndroid Build Coastguard Worker const __m128i vd${ABC[C:C+4]} = _mm_madd_epi16(_mm_unpacklo_epi16(vdr${ABC[C:C+8]}, vdl${ABC[C:C+8]}), valphah); 127*4bdc9457SAndroid Build Coastguard Worker const __m128i vd${ABC[C+4:C+8]} = _mm_madd_epi16(_mm_unpackhi_epi16(vdr${ABC[C:C+8]}, vdl${ABC[C:C+8]}), valphah); 128*4bdc9457SAndroid Build Coastguard Worker 129*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 130*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 4): 131*4bdc9457SAndroid Build Coastguard Worker __m128i vacc${ABC[C:C+4]} = _mm_mullo_epi32(vd${ABC[C:C+4]}, valphav); 132*4bdc9457SAndroid Build Coastguard Worker $else: 133*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 4): 134*4bdc9457SAndroid Build Coastguard Worker __m128i vacc${ABC[C:C+4]} = _mm_slli_epi32(_mm_mulhi_epu16(vd${ABC[C:C+4]}, valphav), 16); 135*4bdc9457SAndroid Build Coastguard Worker 136*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 4): 137*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[C:C+4]} = _mm_add_epi16(_mm_mullo_epi16(vd${ABC[C:C+4]}, valphav), vacc${ABC[C:C+4]}); 138*4bdc9457SAndroid Build Coastguard Worker 139*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 4): 140*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_slli_epi32(vt${ABC[C:C+4]}, 11), vacc${ABC[C:C+4]}); 141*4bdc9457SAndroid Build Coastguard Worker 142*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 4): 143*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[C:C+4]} = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc${ABC[C:C+4]}, vrounding), 22); 144*4bdc9457SAndroid Build Coastguard Worker 145*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 8): 146*4bdc9457SAndroid Build Coastguard Worker const __m128i vacc${ABC[C:C+8]} = _mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}); 147*4bdc9457SAndroid Build Coastguard Worker 148*4bdc9457SAndroid Build Coastguard Worker $for C in range(0, CHANNEL_TILE, 16): 149*4bdc9457SAndroid Build Coastguard Worker $if C + 8 < CHANNEL_TILE: 150*4bdc9457SAndroid Build Coastguard Worker const __m128i vo${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vacc${ABC[C:C+8]}, vacc${ABC[C+8:C+16]}); 151*4bdc9457SAndroid Build Coastguard Worker $else: 152*4bdc9457SAndroid Build Coastguard Worker const __m128i vo${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}); 153*4bdc9457SAndroid Build Coastguard Worker 154*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) output, vo${ABC[0:16]}); 155*4bdc9457SAndroid Build Coastguard Worker $for C in range(16, CHANNEL_TILE, 16): 156*4bdc9457SAndroid Build Coastguard Worker $if C + 8 < CHANNEL_TILE: 157*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (output + ${C}), vo${ABC[C:C+16]}); 158*4bdc9457SAndroid Build Coastguard Worker $else: 159*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) (output + ${C}), vo${ABC[C:C+8]}); 160*4bdc9457SAndroid Build Coastguard Worker output += ${CHANNEL_TILE}; 161*4bdc9457SAndroid Build Coastguard Worker } 162*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8 * sizeof(${XINT8_T}); c -= 8 * sizeof(${XINT8_T})) { 163*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 164*4bdc9457SAndroid Build Coastguard Worker const __m128i vtl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0)); 165*4bdc9457SAndroid Build Coastguard Worker i0 += 8; 166*4bdc9457SAndroid Build Coastguard Worker const __m128i vtr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1)); 167*4bdc9457SAndroid Build Coastguard Worker i1 += 8; 168*4bdc9457SAndroid Build Coastguard Worker const __m128i vbl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 169*4bdc9457SAndroid Build Coastguard Worker i2 += 8; 170*4bdc9457SAndroid Build Coastguard Worker const __m128i vbr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3)); 171*4bdc9457SAndroid Build Coastguard Worker i3 += 8; 172*4bdc9457SAndroid Build Coastguard Worker $else: 173*4bdc9457SAndroid Build Coastguard Worker __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0); 174*4bdc9457SAndroid Build Coastguard Worker i0 += 8; 175*4bdc9457SAndroid Build Coastguard Worker __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1); 176*4bdc9457SAndroid Build Coastguard Worker i1 += 8; 177*4bdc9457SAndroid Build Coastguard Worker __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2); 178*4bdc9457SAndroid Build Coastguard Worker i2 += 8; 179*4bdc9457SAndroid Build Coastguard Worker __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3); 180*4bdc9457SAndroid Build Coastguard Worker i3 += 8; 181*4bdc9457SAndroid Build Coastguard Worker 182*4bdc9457SAndroid Build Coastguard Worker $if SSE != 4: 183*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "U8": 184*4bdc9457SAndroid Build Coastguard Worker __m128i vzero = _mm_setzero_si128(); 185*4bdc9457SAndroid Build Coastguard Worker vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero); 186*4bdc9457SAndroid Build Coastguard Worker vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero); 187*4bdc9457SAndroid Build Coastguard Worker vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero); 188*4bdc9457SAndroid Build Coastguard Worker vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero); 189*4bdc9457SAndroid Build Coastguard Worker $else: 190*4bdc9457SAndroid Build Coastguard Worker vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8); 191*4bdc9457SAndroid Build Coastguard Worker vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8); 192*4bdc9457SAndroid Build Coastguard Worker vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8); 193*4bdc9457SAndroid Build Coastguard Worker vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8); 194*4bdc9457SAndroid Build Coastguard Worker 195*4bdc9457SAndroid Build Coastguard Worker const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567); 196*4bdc9457SAndroid Build Coastguard Worker const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah); 197*4bdc9457SAndroid Build Coastguard Worker const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567); 198*4bdc9457SAndroid Build Coastguard Worker const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah); 199*4bdc9457SAndroid Build Coastguard Worker 200*4bdc9457SAndroid Build Coastguard Worker const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah); 201*4bdc9457SAndroid Build Coastguard Worker const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah); 202*4bdc9457SAndroid Build Coastguard Worker 203*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 204*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav); 205*4bdc9457SAndroid Build Coastguard Worker __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav); 206*4bdc9457SAndroid Build Coastguard Worker $else: 207*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16); 208*4bdc9457SAndroid Build Coastguard Worker __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16); 209*4bdc9457SAndroid Build Coastguard Worker 210*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123); 211*4bdc9457SAndroid Build Coastguard Worker vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567); 212*4bdc9457SAndroid Build Coastguard Worker 213*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123); 214*4bdc9457SAndroid Build Coastguard Worker vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567); 215*4bdc9457SAndroid Build Coastguard Worker 216*4bdc9457SAndroid Build Coastguard Worker vacc0123 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc0123, vrounding), 22); 217*4bdc9457SAndroid Build Coastguard Worker vacc4567 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc4567, vrounding), 22); 218*4bdc9457SAndroid Build Coastguard Worker 219*4bdc9457SAndroid Build Coastguard Worker const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567); 220*4bdc9457SAndroid Build Coastguard Worker 221*4bdc9457SAndroid Build Coastguard Worker const __m128i vo01234567 = ${_MM_PACKXS_EPI16}(vacc01234567, vacc01234567); 222*4bdc9457SAndroid Build Coastguard Worker 223*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) output, vo01234567); 224*4bdc9457SAndroid Build Coastguard Worker output += 8; 225*4bdc9457SAndroid Build Coastguard Worker } 226*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) { 227*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 228*4bdc9457SAndroid Build Coastguard Worker const __m128i vtl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0)); 229*4bdc9457SAndroid Build Coastguard Worker const __m128i vtr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1)); 230*4bdc9457SAndroid Build Coastguard Worker const __m128i vbl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 231*4bdc9457SAndroid Build Coastguard Worker const __m128i vbr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3)); 232*4bdc9457SAndroid Build Coastguard Worker $else: 233*4bdc9457SAndroid Build Coastguard Worker __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0); 234*4bdc9457SAndroid Build Coastguard Worker __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1); 235*4bdc9457SAndroid Build Coastguard Worker __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2); 236*4bdc9457SAndroid Build Coastguard Worker __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3); 237*4bdc9457SAndroid Build Coastguard Worker 238*4bdc9457SAndroid Build Coastguard Worker $if SSE != 4: 239*4bdc9457SAndroid Build Coastguard Worker $if DATATYPE == "U8": 240*4bdc9457SAndroid Build Coastguard Worker __m128i vzero = _mm_setzero_si128(); 241*4bdc9457SAndroid Build Coastguard Worker vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero); 242*4bdc9457SAndroid Build Coastguard Worker vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero); 243*4bdc9457SAndroid Build Coastguard Worker vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero); 244*4bdc9457SAndroid Build Coastguard Worker vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero); 245*4bdc9457SAndroid Build Coastguard Worker $else: 246*4bdc9457SAndroid Build Coastguard Worker vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8); 247*4bdc9457SAndroid Build Coastguard Worker vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8); 248*4bdc9457SAndroid Build Coastguard Worker vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8); 249*4bdc9457SAndroid Build Coastguard Worker vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8); 250*4bdc9457SAndroid Build Coastguard Worker 251*4bdc9457SAndroid Build Coastguard Worker const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567); 252*4bdc9457SAndroid Build Coastguard Worker const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah); 253*4bdc9457SAndroid Build Coastguard Worker const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567); 254*4bdc9457SAndroid Build Coastguard Worker const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah); 255*4bdc9457SAndroid Build Coastguard Worker 256*4bdc9457SAndroid Build Coastguard Worker const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah); 257*4bdc9457SAndroid Build Coastguard Worker const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah); 258*4bdc9457SAndroid Build Coastguard Worker 259*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 260*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav); 261*4bdc9457SAndroid Build Coastguard Worker __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav); 262*4bdc9457SAndroid Build Coastguard Worker $else: 263*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16); 264*4bdc9457SAndroid Build Coastguard Worker __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16); 265*4bdc9457SAndroid Build Coastguard Worker 266*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123); 267*4bdc9457SAndroid Build Coastguard Worker vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567); 268*4bdc9457SAndroid Build Coastguard Worker 269*4bdc9457SAndroid Build Coastguard Worker vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123); 270*4bdc9457SAndroid Build Coastguard Worker vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567); 271*4bdc9457SAndroid Build Coastguard Worker 272*4bdc9457SAndroid Build Coastguard Worker vacc0123 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc0123, vrounding), 22); 273*4bdc9457SAndroid Build Coastguard Worker vacc4567 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc4567, vrounding), 22); 274*4bdc9457SAndroid Build Coastguard Worker 275*4bdc9457SAndroid Build Coastguard Worker const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567); 276*4bdc9457SAndroid Build Coastguard Worker 277*4bdc9457SAndroid Build Coastguard Worker __m128i vo01234567 = ${_MM_PACKXS_EPI16}(vacc01234567, vacc01234567); 278*4bdc9457SAndroid Build Coastguard Worker 279*4bdc9457SAndroid Build Coastguard Worker if (c & (4 * sizeof(${XINT8_T}))) { 280*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567)); 281*4bdc9457SAndroid Build Coastguard Worker output += 4; 282*4bdc9457SAndroid Build Coastguard Worker vo01234567 = _mm_srli_epi64(vo01234567, 32); 283*4bdc9457SAndroid Build Coastguard Worker } 284*4bdc9457SAndroid Build Coastguard Worker $if SSE == 4: 285*4bdc9457SAndroid Build Coastguard Worker if (c & (2 * sizeof(${XINT8_T}))) { 286*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0)); 287*4bdc9457SAndroid Build Coastguard Worker output += 2; 288*4bdc9457SAndroid Build Coastguard Worker vo01234567 = _mm_srli_epi32(vo01234567, 16); 289*4bdc9457SAndroid Build Coastguard Worker } 290*4bdc9457SAndroid Build Coastguard Worker if (c & (1 * sizeof(${XINT8_T}))) { 291*4bdc9457SAndroid Build Coastguard Worker *output++ = (uint8_t) _mm_extract_epi8(vo01234567, 0); 292*4bdc9457SAndroid Build Coastguard Worker } 293*4bdc9457SAndroid Build Coastguard Worker $else: 294*4bdc9457SAndroid Build Coastguard Worker uint32_t vo0123 = (uint32_t) _mm_cvtsi128_si32(vo01234567); 295*4bdc9457SAndroid Build Coastguard Worker if (c & (2 * sizeof(${XINT8_T}))) { 296*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(output, (uint16_t) vo0123); 297*4bdc9457SAndroid Build Coastguard Worker output += 2; 298*4bdc9457SAndroid Build Coastguard Worker vo0123 >>= 16; 299*4bdc9457SAndroid Build Coastguard Worker } 300*4bdc9457SAndroid Build Coastguard Worker if (c & (1 * sizeof(${XINT8_T}))) { 301*4bdc9457SAndroid Build Coastguard Worker *output++ = (uint8_t) vo0123; 302*4bdc9457SAndroid Build Coastguard Worker } 303*4bdc9457SAndroid Build Coastguard Worker } 304*4bdc9457SAndroid Build Coastguard Worker 305*4bdc9457SAndroid Build Coastguard Worker output = (${XINT8_T}*) ((uintptr_t) output + output_increment); 306*4bdc9457SAndroid Build Coastguard Worker } while (--output_pixels != 0); 307*4bdc9457SAndroid Build Coastguard Worker} 308