xref: /aosp_15_r20/external/XNNPACK/src/s8-ibilinear/sse.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker//
3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
6*4bdc9457SAndroid Build Coastguard Worker$assert SSE in [2, 4]
7*4bdc9457SAndroid Build Coastguard Worker$assert not XOP or AVX
8*4bdc9457SAndroid Build Coastguard Worker$assert not AVX or SSE == 4
9*4bdc9457SAndroid Build Coastguard Worker$assert DATATYPE in ["S8", "U8"]
10*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE % 8 == 0
11*4bdc9457SAndroid Build Coastguard Worker$assert CHANNEL_TILE >= 8
12*4bdc9457SAndroid Build Coastguard Worker$assert PIXEL_TILE == 1
13*4bdc9457SAndroid Build Coastguard Worker$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
14*4bdc9457SAndroid Build Coastguard Worker#include <assert.h>
15*4bdc9457SAndroid Build Coastguard Worker
16*4bdc9457SAndroid Build Coastguard Worker$if XOP:
17*4bdc9457SAndroid Build Coastguard Worker  #if defined(__GNUC__) || defined(__clang__)
18*4bdc9457SAndroid Build Coastguard Worker    #include <x86intrin.h>
19*4bdc9457SAndroid Build Coastguard Worker  #else
20*4bdc9457SAndroid Build Coastguard Worker    #include <immintrin.h>
21*4bdc9457SAndroid Build Coastguard Worker    #include <ammintrin.h>
22*4bdc9457SAndroid Build Coastguard Worker  #endif
23*4bdc9457SAndroid Build Coastguard Worker$else:
24*4bdc9457SAndroid Build Coastguard Worker  $SSE_HEADER = {2: "emmintrin.h", 4: "smmintrin.h"}[SSE]
25*4bdc9457SAndroid Build Coastguard Worker  #include <${SSE_HEADER}>
26*4bdc9457SAndroid Build Coastguard Worker
27*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h>
28*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/ibilinear.h>
29*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/unaligned.h>
30*4bdc9457SAndroid Build Coastguard Worker
31*4bdc9457SAndroid Build Coastguard Worker
// Bilinear-interpolation micro-kernel template for 8-bit data.
//
// Template substitutions defined right below:
//   XINT8_T            - element type: int8_t for "S8", uint8_t for "U8".
//   _MM_CVTEPX8_EPI16  - 8->16 bit widening: sign-extending (S8) or
//                        zero-extending (U8).
//   _MM_SRXI_EPI32/16  - right shift: arithmetic (S8) or logical (U8). The
//                        16-bit variant is not referenced in this function.
//   _MM_PACKXS_EPI16   - 16->8 bit narrowing with signed (S8) or unsigned (U8)
//                        saturation.
//   ISA                - suffix used in the generated kernel name.
//
// For each output pixel the kernel consumes four row pointers from `input`
// (i0..i3, loaded into vtl/vtr/vbl/vbr; the t/b and l/r suffixes presumably
// denote top/bottom and left/right) and two int16_t weights, and writes
// `channels` interpolated elements to `output`.
//
// Arithmetic is fixed-point with 11 fractional bits per weight: the code uses
// 0x0800 - alphah as the complementary horizontal weight, and the final
// result is shifted right by 22 (11 + 11) after adding a 1 << 21 bias:
//   vt  = tr * alphah + tl * (0x0800 - alphah)
//   vd  = (br - tr) * alphah + (bl - tl) * (0x0800 - alphah)
//   out = saturate((vt << 11) + vd * alphav + (1 << 21)) >> 22)
//
// Parameters:
//   output_pixels    - number of output pixels; asserted non-zero.
//   channels         - number of 1-byte channels per pixel; asserted non-zero.
//   input            - 4 row pointers per output pixel.
//   input_offset     - byte offset added to each row pointer.
//   weights          - 2 int16_t values per output pixel (alphah, alphav).
//   output           - destination buffer.
//   output_increment - extra bytes added to `output` after each pixel.
32*4bdc9457SAndroid Build Coastguard Worker$XINT8_T = {"S8": "int8_t", "U8": "uint8_t"}[DATATYPE]
33*4bdc9457SAndroid Build Coastguard Worker$_MM_CVTEPX8_EPI16 = {"S8": "_mm_cvtepi8_epi16", "U8": "_mm_cvtepu8_epi16"}[DATATYPE]
34*4bdc9457SAndroid Build Coastguard Worker$_MM_SRXI_EPI32 = {"S8": "_mm_srai_epi32", "U8": "_mm_srli_epi32"}[DATATYPE]
35*4bdc9457SAndroid Build Coastguard Worker$_MM_SRXI_EPI16 = {"S8": "_mm_srai_epi16", "U8": "_mm_srli_epi16"}[DATATYPE]
36*4bdc9457SAndroid Build Coastguard Worker$_MM_PACKXS_EPI16 = {"S8": "_mm_packs_epi16", "U8": "_mm_packus_epi16"}[DATATYPE]
37*4bdc9457SAndroid Build Coastguard Worker$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
38*4bdc9457SAndroid Build Coastguard Workervoid xnn_${DATATYPE.lower()}_ibilinear_ukernel__${ISA}_c${CHANNEL_TILE}${"" if PIXEL_TILE == 1 else "x%d" % PIXEL_TILE}(
39*4bdc9457SAndroid Build Coastguard Worker    size_t output_pixels,
40*4bdc9457SAndroid Build Coastguard Worker    size_t channels,
41*4bdc9457SAndroid Build Coastguard Worker    const ${XINT8_T}**restrict input,
42*4bdc9457SAndroid Build Coastguard Worker    size_t input_offset,
43*4bdc9457SAndroid Build Coastguard Worker    const int16_t*restrict weights,
44*4bdc9457SAndroid Build Coastguard Worker    ${XINT8_T}*restrict output,
45*4bdc9457SAndroid Build Coastguard Worker    size_t output_increment) XNN_OOB_READS
46*4bdc9457SAndroid Build Coastguard Worker{
47*4bdc9457SAndroid Build Coastguard Worker  assert(output_pixels != 0);
48*4bdc9457SAndroid Build Coastguard Worker  assert(channels != 0);
49*4bdc9457SAndroid Build Coastguard Worker
50*4bdc9457SAndroid Build Coastguard Worker  do {
    // Four source rows for this output pixel, each displaced by input_offset bytes.
51*4bdc9457SAndroid Build Coastguard Worker    const ${XINT8_T}* i0 = (const ${XINT8_T}*) ((uintptr_t) input[0] + input_offset);
52*4bdc9457SAndroid Build Coastguard Worker    const ${XINT8_T}* i1 = (const ${XINT8_T}*) ((uintptr_t) input[1] + input_offset);
53*4bdc9457SAndroid Build Coastguard Worker    const ${XINT8_T}* i2 = (const ${XINT8_T}*) ((uintptr_t) input[2] + input_offset);
54*4bdc9457SAndroid Build Coastguard Worker    const ${XINT8_T}* i3 = (const ${XINT8_T}*) ((uintptr_t) input[3] + input_offset);
55*4bdc9457SAndroid Build Coastguard Worker    input += 4;
56*4bdc9457SAndroid Build Coastguard Worker
    // Fetch both int16_t weights with a single 32-bit read: on x86 the low half
    // is weights[0] (alphah) and the high half is weights[1] (alphav).
    // NOTE(review): this dereferences int16_t storage through a const int*;
    // confirm callers guarantee suitable alignment, or consider a helper from
    // xnnpack/unaligned.h if one exists for loads.
57*4bdc9457SAndroid Build Coastguard Worker    const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
58*4bdc9457SAndroid Build Coastguard Worker    weights += 2;
    // Broadcast alphah to all eight 16-bit lanes.
59*4bdc9457SAndroid Build Coastguard Worker    __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
60*4bdc9457SAndroid Build Coastguard Worker    valphah = _mm_unpacklo_epi64(valphah, valphah);
    // Broadcast alphav: into every 16-bit lane for SSE2 (used with 16-bit
    // multiplies below), otherwise zero-extended into every 32-bit lane (used
    // with _mm_mullo_epi32 below).
61*4bdc9457SAndroid Build Coastguard Worker    $if SSE == 2:
62*4bdc9457SAndroid Build Coastguard Worker      __m128i valphav = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(1, 1, 1, 1));
63*4bdc9457SAndroid Build Coastguard Worker      valphav = _mm_unpacklo_epi64(valphav, valphav);
64*4bdc9457SAndroid Build Coastguard Worker    $else:
65*4bdc9457SAndroid Build Coastguard Worker      __m128i valphav = _mm_srli_epi32(valphav, 16);
66*4bdc9457SAndroid Build Coastguard Worker      valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0));
67*4bdc9457SAndroid Build Coastguard Worker
    // Rewrite valphah as interleaved pairs (alphah, 0x0800 - alphah): even
    // 16-bit lanes keep alphah, odd lanes receive the complementary weight.
    // The non-SSE4 form relies on ~a + 0x0801 == 0x0800 - a in 16-bit arithmetic.
68*4bdc9457SAndroid Build Coastguard Worker    $if SSE == 4:
69*4bdc9457SAndroid Build Coastguard Worker      valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA);
70*4bdc9457SAndroid Build Coastguard Worker    $else:
71*4bdc9457SAndroid Build Coastguard Worker      valphah = _mm_xor_si128(valphah, _mm_set1_epi32(0xFFFF0000));
72*4bdc9457SAndroid Build Coastguard Worker      valphah = _mm_add_epi16(valphah, _mm_set1_epi32(0x08010000));
73*4bdc9457SAndroid Build Coastguard Worker
    // 0x00200000 == 1 << 21, i.e. half of the 1 << 22 scale removed at the end.
74*4bdc9457SAndroid Build Coastguard Worker    const __m128i vrounding = _mm_set1_epi32(0x00200000);
75*4bdc9457SAndroid Build Coastguard Worker
76*4bdc9457SAndroid Build Coastguard Worker    size_t c = channels;
    // Main loop, emitted only when CHANNEL_TILE > 8: the same computation as
    // the 8-channel loop further down, unrolled over CHANNEL_TILE channels.
77*4bdc9457SAndroid Build Coastguard Worker    $if CHANNEL_TILE > 8:
78*4bdc9457SAndroid Build Coastguard Worker      for (; c >= ${CHANNEL_TILE} * sizeof(${XINT8_T}); c -= ${CHANNEL_TILE} * sizeof(${XINT8_T})) {
79*4bdc9457SAndroid Build Coastguard Worker        $if SSE == 4:
80*4bdc9457SAndroid Build Coastguard Worker          const __m128i vtl${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0));
81*4bdc9457SAndroid Build Coastguard Worker          const __m128i vtr${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1));
82*4bdc9457SAndroid Build Coastguard Worker          const __m128i vbl${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
83*4bdc9457SAndroid Build Coastguard Worker          const __m128i vbr${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3));
84*4bdc9457SAndroid Build Coastguard Worker          $for C in range(8, CHANNEL_TILE, 8):
85*4bdc9457SAndroid Build Coastguard Worker            const __m128i vtl${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i0 + ${C})));
86*4bdc9457SAndroid Build Coastguard Worker            const __m128i vtr${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i1 + ${C})));
87*4bdc9457SAndroid Build Coastguard Worker            const __m128i vbl${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
88*4bdc9457SAndroid Build Coastguard Worker            const __m128i vbr${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i3 + ${C})));
89*4bdc9457SAndroid Build Coastguard Worker        $else:
90*4bdc9457SAndroid Build Coastguard Worker          __m128i vtl${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i0);
91*4bdc9457SAndroid Build Coastguard Worker          __m128i vtr${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i1);
92*4bdc9457SAndroid Build Coastguard Worker          __m128i vbl${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i2);
93*4bdc9457SAndroid Build Coastguard Worker          __m128i vbr${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i3);
94*4bdc9457SAndroid Build Coastguard Worker          $for C in range(8, CHANNEL_TILE, 8):
95*4bdc9457SAndroid Build Coastguard Worker            __m128i vtl${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i0 + ${C}));
96*4bdc9457SAndroid Build Coastguard Worker            __m128i vtr${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i1 + ${C}));
97*4bdc9457SAndroid Build Coastguard Worker            __m128i vbl${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i2 + ${C}));
98*4bdc9457SAndroid Build Coastguard Worker            __m128i vbr${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i3 + ${C}));
99*4bdc9457SAndroid Build Coastguard Worker        i0 += ${CHANNEL_TILE};
100*4bdc9457SAndroid Build Coastguard Worker        i1 += ${CHANNEL_TILE};
101*4bdc9457SAndroid Build Coastguard Worker        i2 += ${CHANNEL_TILE};
102*4bdc9457SAndroid Build Coastguard Worker        i3 += ${CHANNEL_TILE};
103*4bdc9457SAndroid Build Coastguard Worker
104*4bdc9457SAndroid Build Coastguard Worker        $if SSE != 4:
105*4bdc9457SAndroid Build Coastguard Worker          $if DATATYPE == "U8":
106*4bdc9457SAndroid Build Coastguard Worker            __m128i vzero = _mm_setzero_si128();
107*4bdc9457SAndroid Build Coastguard Worker            $for C in range(0, CHANNEL_TILE, 8):
108*4bdc9457SAndroid Build Coastguard Worker              vtl${ABC[C:C+8]} = _mm_unpacklo_epi8(vtl${ABC[C:C+8]}, vzero);
109*4bdc9457SAndroid Build Coastguard Worker              vtr${ABC[C:C+8]} = _mm_unpacklo_epi8(vtr${ABC[C:C+8]}, vzero);
110*4bdc9457SAndroid Build Coastguard Worker              vbl${ABC[C:C+8]} = _mm_unpacklo_epi8(vbl${ABC[C:C+8]}, vzero);
111*4bdc9457SAndroid Build Coastguard Worker              vbr${ABC[C:C+8]} = _mm_unpacklo_epi8(vbr${ABC[C:C+8]}, vzero);
112*4bdc9457SAndroid Build Coastguard Worker          $else:
113*4bdc9457SAndroid Build Coastguard Worker            $for C in range(0, CHANNEL_TILE, 8):
114*4bdc9457SAndroid Build Coastguard Worker              vtl${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vtl${ABC[C:C+8]}, vtl${ABC[C:C+8]}), 8);
115*4bdc9457SAndroid Build Coastguard Worker              vtr${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vtr${ABC[C:C+8]}, vtr${ABC[C:C+8]}), 8);
116*4bdc9457SAndroid Build Coastguard Worker              vbl${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vbl${ABC[C:C+8]}, vbl${ABC[C:C+8]}), 8);
117*4bdc9457SAndroid Build Coastguard Worker              vbr${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vbr${ABC[C:C+8]}, vbr${ABC[C:C+8]}), 8);
118*4bdc9457SAndroid Build Coastguard Worker
119*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 8):
120*4bdc9457SAndroid Build Coastguard Worker          const __m128i vdr${ABC[C:C+8]} = _mm_sub_epi16(vbr${ABC[C:C+8]}, vtr${ABC[C:C+8]});
121*4bdc9457SAndroid Build Coastguard Worker          const __m128i vt${ABC[C:C+4]} = _mm_madd_epi16(_mm_unpacklo_epi16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]}), valphah);
122*4bdc9457SAndroid Build Coastguard Worker          const __m128i vdl${ABC[C:C+8]} = _mm_sub_epi16(vbl${ABC[C:C+8]}, vtl${ABC[C:C+8]});
123*4bdc9457SAndroid Build Coastguard Worker          const __m128i vt${ABC[C+4:C+8]} = _mm_madd_epi16(_mm_unpackhi_epi16(vtr${ABC[C:C+8]}, vtl${ABC[C:C+8]}), valphah);
124*4bdc9457SAndroid Build Coastguard Worker
125*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 8):
126*4bdc9457SAndroid Build Coastguard Worker          const __m128i vd${ABC[C:C+4]} = _mm_madd_epi16(_mm_unpacklo_epi16(vdr${ABC[C:C+8]}, vdl${ABC[C:C+8]}), valphah);
127*4bdc9457SAndroid Build Coastguard Worker          const __m128i vd${ABC[C+4:C+8]} = _mm_madd_epi16(_mm_unpackhi_epi16(vdr${ABC[C:C+8]}, vdl${ABC[C:C+8]}), valphah);
128*4bdc9457SAndroid Build Coastguard Worker
129*4bdc9457SAndroid Build Coastguard Worker        $if SSE == 4:
130*4bdc9457SAndroid Build Coastguard Worker          $for C in range(0, CHANNEL_TILE, 4):
131*4bdc9457SAndroid Build Coastguard Worker            __m128i vacc${ABC[C:C+4]} = _mm_mullo_epi32(vd${ABC[C:C+4]}, valphav);
132*4bdc9457SAndroid Build Coastguard Worker        $else:
133*4bdc9457SAndroid Build Coastguard Worker          $for C in range(0, CHANNEL_TILE, 4):
134*4bdc9457SAndroid Build Coastguard Worker            __m128i vacc${ABC[C:C+4]} = _mm_slli_epi32(_mm_mulhi_epu16(vd${ABC[C:C+4]}, valphav), 16);
135*4bdc9457SAndroid Build Coastguard Worker
136*4bdc9457SAndroid Build Coastguard Worker          $for C in range(0, CHANNEL_TILE, 4):
137*4bdc9457SAndroid Build Coastguard Worker            vacc${ABC[C:C+4]} = _mm_add_epi16(_mm_mullo_epi16(vd${ABC[C:C+4]}, valphav), vacc${ABC[C:C+4]});
138*4bdc9457SAndroid Build Coastguard Worker
139*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 4):
140*4bdc9457SAndroid Build Coastguard Worker          vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_slli_epi32(vt${ABC[C:C+4]}, 11), vacc${ABC[C:C+4]});
141*4bdc9457SAndroid Build Coastguard Worker
142*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 4):
143*4bdc9457SAndroid Build Coastguard Worker          vacc${ABC[C:C+4]} = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc${ABC[C:C+4]}, vrounding), 22);
144*4bdc9457SAndroid Build Coastguard Worker
145*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 8):
146*4bdc9457SAndroid Build Coastguard Worker          const __m128i vacc${ABC[C:C+8]} = _mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]});
147*4bdc9457SAndroid Build Coastguard Worker
148*4bdc9457SAndroid Build Coastguard Worker        $for C in range(0, CHANNEL_TILE, 16):
149*4bdc9457SAndroid Build Coastguard Worker          $if C + 8 < CHANNEL_TILE:
150*4bdc9457SAndroid Build Coastguard Worker            const __m128i vo${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vacc${ABC[C:C+8]}, vacc${ABC[C+8:C+16]});
151*4bdc9457SAndroid Build Coastguard Worker          $else:
152*4bdc9457SAndroid Build Coastguard Worker            const __m128i vo${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]});
153*4bdc9457SAndroid Build Coastguard Worker
154*4bdc9457SAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) output, vo${ABC[0:16]});
155*4bdc9457SAndroid Build Coastguard Worker        $for C in range(16, CHANNEL_TILE, 16):
156*4bdc9457SAndroid Build Coastguard Worker          $if C + 8 < CHANNEL_TILE:
157*4bdc9457SAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) (output + ${C}), vo${ABC[C:C+16]});
158*4bdc9457SAndroid Build Coastguard Worker          $else:
159*4bdc9457SAndroid Build Coastguard Worker            _mm_storel_epi64((__m128i*) (output + ${C}), vo${ABC[C:C+8]});
160*4bdc9457SAndroid Build Coastguard Worker        output += ${CHANNEL_TILE};
161*4bdc9457SAndroid Build Coastguard Worker      }
    // Remaining full groups of 8 channels.
162*4bdc9457SAndroid Build Coastguard Worker    for (; c >= 8 * sizeof(${XINT8_T}); c -= 8 * sizeof(${XINT8_T})) {
163*4bdc9457SAndroid Build Coastguard Worker      $if SSE == 4:
164*4bdc9457SAndroid Build Coastguard Worker        const __m128i vtl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0));
165*4bdc9457SAndroid Build Coastguard Worker        i0 += 8;
166*4bdc9457SAndroid Build Coastguard Worker        const __m128i vtr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1));
167*4bdc9457SAndroid Build Coastguard Worker        i1 += 8;
168*4bdc9457SAndroid Build Coastguard Worker        const __m128i vbl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
169*4bdc9457SAndroid Build Coastguard Worker        i2 += 8;
170*4bdc9457SAndroid Build Coastguard Worker        const __m128i vbr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3));
171*4bdc9457SAndroid Build Coastguard Worker        i3 += 8;
172*4bdc9457SAndroid Build Coastguard Worker      $else:
173*4bdc9457SAndroid Build Coastguard Worker        __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
174*4bdc9457SAndroid Build Coastguard Worker        i0 += 8;
175*4bdc9457SAndroid Build Coastguard Worker        __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
176*4bdc9457SAndroid Build Coastguard Worker        i1 += 8;
177*4bdc9457SAndroid Build Coastguard Worker        __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
178*4bdc9457SAndroid Build Coastguard Worker        i2 += 8;
179*4bdc9457SAndroid Build Coastguard Worker        __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
180*4bdc9457SAndroid Build Coastguard Worker        i3 += 8;
181*4bdc9457SAndroid Build Coastguard Worker
      // When the widening load intrinsic is not used, extend the 8 loaded bytes
      // to 16 bits by hand: interleave with zero (U8), or duplicate each byte
      // and shift arithmetically right by 8 (S8).
182*4bdc9457SAndroid Build Coastguard Worker      $if SSE != 4:
183*4bdc9457SAndroid Build Coastguard Worker        $if DATATYPE == "U8":
184*4bdc9457SAndroid Build Coastguard Worker          __m128i vzero = _mm_setzero_si128();
185*4bdc9457SAndroid Build Coastguard Worker          vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero);
186*4bdc9457SAndroid Build Coastguard Worker          vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero);
187*4bdc9457SAndroid Build Coastguard Worker          vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero);
188*4bdc9457SAndroid Build Coastguard Worker          vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero);
189*4bdc9457SAndroid Build Coastguard Worker        $else:
190*4bdc9457SAndroid Build Coastguard Worker          vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8);
191*4bdc9457SAndroid Build Coastguard Worker          vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8);
192*4bdc9457SAndroid Build Coastguard Worker          vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8);
193*4bdc9457SAndroid Build Coastguard Worker          vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8);
194*4bdc9457SAndroid Build Coastguard Worker
      // Interleaving (tr, tl) and multiply-adding against the
      // (alphah, 0x0800 - alphah) pairs yields 32-bit
      //   vt = tr * alphah + tl * (0x0800 - alphah);
      // vd is the same blend applied to the differences (br - tr, bl - tl).
195*4bdc9457SAndroid Build Coastguard Worker      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
196*4bdc9457SAndroid Build Coastguard Worker      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
197*4bdc9457SAndroid Build Coastguard Worker      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
198*4bdc9457SAndroid Build Coastguard Worker      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
199*4bdc9457SAndroid Build Coastguard Worker
200*4bdc9457SAndroid Build Coastguard Worker      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
201*4bdc9457SAndroid Build Coastguard Worker      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
202*4bdc9457SAndroid Build Coastguard Worker
      // vacc = low 32 bits of vd * alphav. The non-SSE4 branch builds the same
      // value from 16-bit pieces: mulhi of the low half moved into the high
      // half, then a 16-bit-lane add of the mullo results (the low lane gets
      // lo16(lo * a); the high lane gets lo16(hi * a) + hi16(lo * a)).
203*4bdc9457SAndroid Build Coastguard Worker      $if SSE == 4:
204*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
205*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
206*4bdc9457SAndroid Build Coastguard Worker      $else:
207*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
208*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
209*4bdc9457SAndroid Build Coastguard Worker
210*4bdc9457SAndroid Build Coastguard Worker        vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
211*4bdc9457SAndroid Build Coastguard Worker        vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
212*4bdc9457SAndroid Build Coastguard Worker
      // Add the vt term, shifted left by 11 so it shares the scale of
      // vd * alphav.
213*4bdc9457SAndroid Build Coastguard Worker      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
214*4bdc9457SAndroid Build Coastguard Worker      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
215*4bdc9457SAndroid Build Coastguard Worker
      // Add the 1 << 21 bias and drop the 22 fractional bits. A 16-bit-lane
      // add is equivalent to a 32-bit add here because the bias has zero low
      // 16 bits, so no carry out of the low half can occur.
216*4bdc9457SAndroid Build Coastguard Worker      vacc0123 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc0123, vrounding), 22);
217*4bdc9457SAndroid Build Coastguard Worker      vacc4567 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc4567, vrounding), 22);
218*4bdc9457SAndroid Build Coastguard Worker
      // Narrow 32 -> 16 bits with signed saturation, then 16 -> 8 bits with
      // the saturation mode selected for DATATYPE; only the low 8 bytes are
      // stored.
219*4bdc9457SAndroid Build Coastguard Worker      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
220*4bdc9457SAndroid Build Coastguard Worker
221*4bdc9457SAndroid Build Coastguard Worker      const __m128i vo01234567 = ${_MM_PACKXS_EPI16}(vacc01234567, vacc01234567);
222*4bdc9457SAndroid Build Coastguard Worker
223*4bdc9457SAndroid Build Coastguard Worker      _mm_storel_epi64((__m128i*) output, vo01234567);
224*4bdc9457SAndroid Build Coastguard Worker      output += 8;
225*4bdc9457SAndroid Build Coastguard Worker    }
    // Tail: 1..7 channels remain. A full 8-byte group is still loaded from each
    // row (presumably the reason the function is marked XNN_OOB_READS), the
    // same arithmetic as above is applied, and only c bytes are stored.
226*4bdc9457SAndroid Build Coastguard Worker    if XNN_UNLIKELY(c != 0) {
227*4bdc9457SAndroid Build Coastguard Worker      $if SSE == 4:
228*4bdc9457SAndroid Build Coastguard Worker        const __m128i vtl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i0));
229*4bdc9457SAndroid Build Coastguard Worker        const __m128i vtr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i1));
230*4bdc9457SAndroid Build Coastguard Worker        const __m128i vbl01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
231*4bdc9457SAndroid Build Coastguard Worker        const __m128i vbr01234567 = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i3));
232*4bdc9457SAndroid Build Coastguard Worker      $else:
233*4bdc9457SAndroid Build Coastguard Worker        __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
234*4bdc9457SAndroid Build Coastguard Worker        __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
235*4bdc9457SAndroid Build Coastguard Worker        __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
236*4bdc9457SAndroid Build Coastguard Worker        __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
237*4bdc9457SAndroid Build Coastguard Worker
238*4bdc9457SAndroid Build Coastguard Worker      $if SSE != 4:
239*4bdc9457SAndroid Build Coastguard Worker        $if DATATYPE == "U8":
240*4bdc9457SAndroid Build Coastguard Worker          __m128i vzero = _mm_setzero_si128();
241*4bdc9457SAndroid Build Coastguard Worker          vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero);
242*4bdc9457SAndroid Build Coastguard Worker          vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero);
243*4bdc9457SAndroid Build Coastguard Worker          vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero);
244*4bdc9457SAndroid Build Coastguard Worker          vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero);
245*4bdc9457SAndroid Build Coastguard Worker        $else:
246*4bdc9457SAndroid Build Coastguard Worker          vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8);
247*4bdc9457SAndroid Build Coastguard Worker          vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8);
248*4bdc9457SAndroid Build Coastguard Worker          vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8);
249*4bdc9457SAndroid Build Coastguard Worker          vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8);
250*4bdc9457SAndroid Build Coastguard Worker
251*4bdc9457SAndroid Build Coastguard Worker      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
252*4bdc9457SAndroid Build Coastguard Worker      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
253*4bdc9457SAndroid Build Coastguard Worker      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
254*4bdc9457SAndroid Build Coastguard Worker      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
255*4bdc9457SAndroid Build Coastguard Worker
256*4bdc9457SAndroid Build Coastguard Worker      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
257*4bdc9457SAndroid Build Coastguard Worker      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
258*4bdc9457SAndroid Build Coastguard Worker
259*4bdc9457SAndroid Build Coastguard Worker      $if SSE == 4:
260*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
261*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
262*4bdc9457SAndroid Build Coastguard Worker      $else:
263*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
264*4bdc9457SAndroid Build Coastguard Worker        __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
265*4bdc9457SAndroid Build Coastguard Worker
266*4bdc9457SAndroid Build Coastguard Worker        vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
267*4bdc9457SAndroid Build Coastguard Worker        vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
268*4bdc9457SAndroid Build Coastguard Worker
269*4bdc9457SAndroid Build Coastguard Worker      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
270*4bdc9457SAndroid Build Coastguard Worker      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
271*4bdc9457SAndroid Build Coastguard Worker
272*4bdc9457SAndroid Build Coastguard Worker      vacc0123 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc0123, vrounding), 22);
273*4bdc9457SAndroid Build Coastguard Worker      vacc4567 = ${_MM_SRXI_EPI32}(_mm_add_epi16(vacc4567, vrounding), 22);
274*4bdc9457SAndroid Build Coastguard Worker
275*4bdc9457SAndroid Build Coastguard Worker      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
276*4bdc9457SAndroid Build Coastguard Worker
277*4bdc9457SAndroid Build Coastguard Worker      __m128i vo01234567 = ${_MM_PACKXS_EPI16}(vacc01234567, vacc01234567);
278*4bdc9457SAndroid Build Coastguard Worker
      // Emit 4, then 2, then 1 byte(s) according to the set bits of c, shifting
      // already-stored bytes out of the low end after each partial store.
279*4bdc9457SAndroid Build Coastguard Worker      if (c & (4 * sizeof(${XINT8_T}))) {
280*4bdc9457SAndroid Build Coastguard Worker        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
281*4bdc9457SAndroid Build Coastguard Worker        output += 4;
282*4bdc9457SAndroid Build Coastguard Worker        vo01234567 = _mm_srli_epi64(vo01234567, 32);
283*4bdc9457SAndroid Build Coastguard Worker      }
284*4bdc9457SAndroid Build Coastguard Worker      $if SSE == 4:
285*4bdc9457SAndroid Build Coastguard Worker        if (c & (2 * sizeof(${XINT8_T}))) {
286*4bdc9457SAndroid Build Coastguard Worker          unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0));
287*4bdc9457SAndroid Build Coastguard Worker          output += 2;
288*4bdc9457SAndroid Build Coastguard Worker          vo01234567 = _mm_srli_epi32(vo01234567, 16);
289*4bdc9457SAndroid Build Coastguard Worker        }
290*4bdc9457SAndroid Build Coastguard Worker        if (c & (1 * sizeof(${XINT8_T}))) {
291*4bdc9457SAndroid Build Coastguard Worker          *output++ = (uint8_t) _mm_extract_epi8(vo01234567, 0);
292*4bdc9457SAndroid Build Coastguard Worker        }
293*4bdc9457SAndroid Build Coastguard Worker      $else:
294*4bdc9457SAndroid Build Coastguard Worker        uint32_t vo0123 = (uint32_t) _mm_cvtsi128_si32(vo01234567);
295*4bdc9457SAndroid Build Coastguard Worker        if (c & (2 * sizeof(${XINT8_T}))) {
296*4bdc9457SAndroid Build Coastguard Worker          unaligned_store_u16(output, (uint16_t) vo0123);
297*4bdc9457SAndroid Build Coastguard Worker          output += 2;
298*4bdc9457SAndroid Build Coastguard Worker          vo0123 >>= 16;
299*4bdc9457SAndroid Build Coastguard Worker        }
300*4bdc9457SAndroid Build Coastguard Worker        if (c & (1 * sizeof(${XINT8_T}))) {
301*4bdc9457SAndroid Build Coastguard Worker          *output++ = (uint8_t) vo0123;
302*4bdc9457SAndroid Build Coastguard Worker        }
303*4bdc9457SAndroid Build Coastguard Worker    }
304*4bdc9457SAndroid Build Coastguard Worker
    // Step to the next output pixel: output_increment is a byte offset applied
    // on top of the `channels` bytes just written.
305*4bdc9457SAndroid Build Coastguard Worker    output = (${XINT8_T}*) ((uintptr_t) output + output_increment);
306*4bdc9457SAndroid Build Coastguard Worker  } while (--output_pixels != 0);
307*4bdc9457SAndroid Build Coastguard Worker}
308