xref: /aosp_15_r20/external/XNNPACK/src/qs8-dwconv/unipass-sse-mul16.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert SSE in [2, 4]
7$assert not XOP or AVX
8$assert not AVX or SSE == 4
9$assert REQUANTIZATION == "FP32"
10$assert DATATYPE in ["QC8", "QS8", "QU8"]
11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
12$SSE_HEADER = {2: "emmintrin.h", 4: "smmintrin.h"}[SSE]
13$assert CHANNEL_TILE % 8 == 0
14$assert CHANNEL_TILE >= 8
15$assert KERNEL_TILE >= 2
16#include <assert.h>
17
18$if XOP:
19  #if defined(__GNUC__) || defined(__clang__)
20    #include <x86intrin.h>
21  #else
22    #include <immintrin.h>
23    #include <ammintrin.h>
24  #endif
25$else:
26  #include <${SSE_HEADER}>
27
28#include <xnnpack/dwconv.h>
29#include <xnnpack/unaligned.h>
30
31
32$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("sse4" if SSE == 4 and DATATYPE != "QU8" else "sse2")
33$PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower()
34$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 4: "sse41"}[SSE]
35$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
// Microkernel template. For each of `output_width` iterations it reads
// KERNEL_TILE input row pointers, multiplies their 8-bit elements by packed
// 8-bit kernel elements using 16-bit multiplies, accumulates in 32 bits on top
// of per-channel int32 values taken from `weights`, rescales in float, clamps,
// and writes `channels` 8-bit results to `output`.
//
// Parameters:
//   channels         - number of channels per iteration; must be non-zero.
//   output_width     - number of outer iterations; must be non-zero.
//   input            - array of row pointers; KERNEL_TILE entries are read per
//                      iteration, then the array pointer moves forward by
//                      `input_stride` bytes.
//   weights          - packed buffer, re-read from its start every iteration.
//                      Per group of CHANNEL_TILE channels it holds
//                      CHANNEL_TILE int32 initial accumulators, then
//                      KERNEL_TILE * CHANNEL_TILE kernel elements, then
//                      (QC8 only) CHANNEL_TILE float scales.
//   output           - destination; after every iteration it has advanced by
//                      the bytes written plus `output_increment` bytes.
//   input_offset     - byte offset added to every row pointer that differs
//                      from `zero`.
//   zero             - row pointers equal to this value are used unmodified.
//   params           - requantization constants: scale (non-QC8), output zero
//                      point, output min/max and, for QU8, the kernel zero
//                      point.
//
// All input and kernel loads are 8 bytes wide, so the remainder path reads
// 8 input bytes per tap even when fewer than 8 channels remain.
36void xnn_${DATATYPE.lower()}_dwconv_minmax_${REQUANTIZATION.lower()}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul16${"_add16" if ADD16 else ""}(
37    size_t channels,
38    size_t output_width,
39    const ${XINT8_T}** input,
40    const void* weights,
41    ${XINT8_T}* output,
42    size_t input_stride,
43    size_t output_increment,
44    size_t input_offset,
45    const ${XINT8_T}* zero,
46    const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
47{
48  assert(channels != 0);
49  assert(output_width != 0);
50
51  do {
    // Fetch the row pointers for this iteration; every row other than `zero`
    // is rebased by input_offset bytes.
52    $for K in range(KERNEL_TILE):
53      const ${XINT8_T}* i${K} = input[${K}];
54      assert(i${K} != NULL);
55      if XNN_UNPREDICTABLE(i${K} != zero) {
56        i${K} = (const ${XINT8_T}*) ((uintptr_t) i${K} + input_offset);
57      }
58    input = (const ${XINT8_T}**) ((uintptr_t) input + input_stride);
59
60    size_t c = channels;
61    const void* w = weights;
62    $if DATATYPE == "QU8":
63      const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
    // Main loop: CHANNEL_TILE channels per pass.
64    for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
      // Start the accumulators from the int32 values at the head of the group.
65      __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w);
66      $for C in range(4, CHANNEL_TILE, 4):
67        __m128i vacc${ABC[C:C+4]} = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + ${C}));
68
      // For every tap: load 8 inputs and 8 kernel elements per 8-channel
      // slice, widen both to 16 bits (QU8 also subtracts the kernel zero point
      // from the kernel elements) and multiply.
      // QU8, and the signed types when SSE < 4 without ADD16, build full
      // 32-bit products from a mullo/mulhi pair and add every tap to the
      // accumulators. The other variants keep only the low 16 bits in vprod
      // and sign-extend them into the accumulators; with ADD16 an even tap and
      // the odd tap after it are summed in 16 bits first, and a trailing
      // unpaired tap is added on its own.
69      $for K in range(KERNEL_TILE):
70
71        $for C in range(0, CHANNEL_TILE, 8):
72          $if C == 0:
73            const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K});
74          $else:
75            const __m128i vi${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${K} + ${C}));
76          $if SSE == 4:
77            $if DATATYPE == "QU8":
78              const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepu8_epi16(vi${K}x${ABC[C:C+8]});
79            $else:
80              const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[C:C+8]});
81          const __m128i vk${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE + C} * sizeof(${XINT8_T})));
82          $if SSE == 4:
83            $if DATATYPE == "QU8":
84              const __m128i vxk${K}x${ABC[C:C+8]} = _mm_sub_epi16(_mm_cvtepu8_epi16(vk${K}x${ABC[C:C+8]}), vk_zero_point);
85            $else:
86              const __m128i vxk${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[C:C+8]});
87        i${K} += ${CHANNEL_TILE};
88
89        $if SSE < 4:
90          $if DATATYPE == "QU8":
91            $if K == 0:
92              const __m128i vzero = _mm_setzero_si128();
93            $for C in range(0, CHANNEL_TILE, 8):
94              const __m128i vxi${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, vzero);
95              const __m128i vxk${K}x${ABC[C:C+8]} = _mm_sub_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, vzero), vk_zero_point);
96          $else:
97            $for C in range(0, CHANNEL_TILE, 8):
98              const __m128i vxi${K}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, vi${K}x${ABC[C:C+8]}), 8);
99              const __m128i vxk${K}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}), 8);
100
101        $for C in range(0, CHANNEL_TILE, 8):
102          $if DATATYPE == "QU8" or SSE < 4 and not ADD16:
103            const __m128i vprod${K}x${ABC[C:C+8]}lo = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]});
104            const __m128i vprod${K}x${ABC[C:C+8]}hi = _mm_mulhi_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]});
105          $elif K == 0:
106            __m128i vprod${ABC[C:C+8]} = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]});
107          $elif K % 2 == 0 or not ADD16:
108            vprod${ABC[C:C+8]} = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]});
109          $elif XOP:
110            vprod${ABC[C:C+8]} = _mm_macc_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}, vprod${ABC[C:C+8]});
111          $else:
112            vprod${ABC[C:C+8]} = _mm_add_epi16(vprod${ABC[C:C+8]}, _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}));
113
114        $if DATATYPE == "QU8" or not ADD16 or K % 2 == 1 or K + 1 == KERNEL_TILE:
115          $for C in range(0, CHANNEL_TILE, 8):
116            $if DATATYPE == "QU8" or SSE < 4 and not ADD16:
117              vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_unpacklo_epi16(vprod${K}x${ABC[C:C+8]}lo, vprod${K}x${ABC[C:C+8]}hi));
118              vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_unpackhi_epi16(vprod${K}x${ABC[C:C+8]}lo, vprod${K}x${ABC[C:C+8]}hi));
119            $elif SSE < 4:
120              const __m128i vsignprod${K}x${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod${ABC[C:C+8]});
121              vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_unpacklo_epi16(vprod${ABC[C:C+8]}, vsignprod${K}x${ABC[C:C+8]}));
122              vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_unpackhi_epi16(vprod${ABC[C:C+8]}, vsignprod${K}x${ABC[C:C+8]}));
123            $else:
124              vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_cvtepi16_epi32(vprod${ABC[C:C+8]}));
125              vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_srai_epi32(_mm_unpackhi_epi16(vprod${ABC[C:C+8]}, vprod${ABC[C:C+8]}), 16));
126
      // Step past the int32 values and the kernel elements of this group.
127      w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}));
128
      // Requantize: convert to float, multiply by the scale (read per channel
      // from the weights for QC8, from params otherwise), clamp from above and
      // convert back to int32.
129      $for C in range(0, CHANNEL_TILE, 4):
130        __m128 vscaled${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]});
131
132      $if DATATYPE == "QC8":
133        const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) w);
134        $for C in range(4, CHANNEL_TILE, 4):
135          const __m128 vscale${ABC[C:C+4]} = _mm_loadu_ps((const float*) w + ${C});
136        w = (const void*) ((const float*) w + ${CHANNEL_TILE});
137        $for C in range(0, CHANNEL_TILE, 4):
138          vscaled${ABC[C:C+4]} = _mm_mul_ps(vscaled${ABC[C:C+4]}, vscale${ABC[C:C+4]});
139      $else:
140        const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
141        $for C in range(0, CHANNEL_TILE, 4):
142          vscaled${ABC[C:C+4]} = _mm_mul_ps(vscaled${ABC[C:C+4]}, vscale);
143
144      const __m128 voutput_max_less_zero_point = _mm_load_ps(params->${PARAMS_STRUCT}.output_max_less_zero_point);
145      $for C in range(0, CHANNEL_TILE, 4):
146        vscaled${ABC[C:C+4]} = _mm_min_ps(vscaled${ABC[C:C+4]}, voutput_max_less_zero_point);
147
148      $for C in range(0, CHANNEL_TILE, 4):
149        vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vscaled${ABC[C:C+4]});
150
      // Narrow to 16 bits with saturation and add the output zero point.
151      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
152      $for C in range(0, CHANNEL_TILE, 8):
153        __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);
154
      // Narrow to 8 bits and apply the lower clamp. QU8 packs with unsigned
      // saturation and clamps bytes; the signed types clamp 16-bit lanes before
      // packing when SSE < 4 and clamp bytes after packing when SSE == 4.
155      $if DATATYPE == "QU8":
156        $for C in range(0, CHANNEL_TILE, 16):
157          $if C + 8 < CHANNEL_TILE:
158            __m128i vout${ABC[C:C+16]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
159          $else:
160            __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
161
162        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
163        $for C in range(0, CHANNEL_TILE, 16):
164          $if C + 8 < CHANNEL_TILE:
165            vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min);
166          $else:
167            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
168      $else:
169        $if SSE < 4:
170          const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
171          $for C in range(0, CHANNEL_TILE, 8):
172            vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
173
174        $for C in range(0, CHANNEL_TILE, 16):
175          $if C + 8 < CHANNEL_TILE:
176            __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
177          $else:
178            __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
179
180        $if SSE == 4:
181          const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
182          $for C in range(0, CHANNEL_TILE, 16):
183            $if C + 8 < CHANNEL_TILE:
184              vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
185            $else:
186              vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
187
      // Store CHANNEL_TILE results, 16 bytes at a time with an 8-byte store for
      // a trailing half vector.
188      $if CHANNEL_TILE > 8:
189        _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
190      $else:
191        _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
192      $for C in range(16, CHANNEL_TILE, 16):
193        $if C + 8 < CHANNEL_TILE:
194          _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]});
195        $else:
196          _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]});
197      output += ${CHANNEL_TILE};
198    }
    // Remainder: fewer than CHANNEL_TILE channels are left. They are handled
    // 8 at a time with the same steps as the main loop. The last slice may
    // hold fewer than 8 valid channels, yet full 8-byte loads are still issued.
199    if XNN_UNLIKELY(c != 0) {
200      $if CHANNEL_TILE > 8:
201        const ${XINT8_T}* k = (const ${XINT8_T}*) ((const int32_t*) w + ${CHANNEL_TILE});
202      ${"do " if CHANNEL_TILE > 8 else ""}{
203        __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w);
204        __m128i vacc${ABC[4:8]} = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
205
        // Same per-tap multiply-accumulate as in the main loop, on a single
        // 8-channel slice.
206        $for K in range(KERNEL_TILE):
207
208          const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K});
209          $if SSE == 4:
210            $if DATATYPE == "QU8":
211              const __m128i vxi${K}x${ABC[0:8]} = _mm_cvtepu8_epi16(vi${K}x${ABC[0:8]});
212            $else:
213              const __m128i vxi${K}x${ABC[0:8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[0:8]});
214          $if CHANNEL_TILE > 8:
215            $if K == 0:
216              const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) k);
217            $else:
218              const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) (k + ${K * CHANNEL_TILE}));
219          $else:
220            const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE} * sizeof(${XINT8_T})));
221          $if SSE == 4:
222            $if DATATYPE == "QU8":
223              const __m128i vxk${K}x${ABC[0:8]} = _mm_sub_epi16(_mm_cvtepu8_epi16(vk${K}x${ABC[0:8]}), vk_zero_point);
224            $else:
225              const __m128i vxk${K}x${ABC[0:8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[0:8]});
226          $if CHANNEL_TILE > 8:
227            i${K} += 8;
228
229          $if SSE < 4:
230            $if DATATYPE == "QU8":
231              $if K == 0:
232                const __m128i vzero = _mm_setzero_si128();
233              const __m128i vxi${K}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${K}x${ABC[0:8]}, vzero);
234              const __m128i vxk${K}x${ABC[0:8]} = _mm_sub_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[0:8]}, vzero), vk_zero_point);
235            $else:
236              const __m128i vxi${K}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${K}x${ABC[0:8]}, vi${K}x${ABC[0:8]}), 8);
237              const __m128i vxk${K}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[0:8]}, vk${K}x${ABC[0:8]}), 8);
238
239          $if DATATYPE == "QU8" or SSE < 4 and not ADD16:
240            const __m128i vprod${K}x${ABC[0:8]}lo = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]});
241            const __m128i vprod${K}x${ABC[0:8]}hi = _mm_mulhi_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]});
242          $elif K == 0:
243            __m128i vprod${ABC[0:8]} = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]});
244          $elif K % 2 == 0 or not ADD16:
245            vprod${ABC[0:8]} = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]});
246          $elif XOP:
247            vprod${ABC[0:8]} = _mm_macc_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}, vprod${ABC[0:8]});
248          $else:
249            vprod${ABC[0:8]} = _mm_add_epi16(vprod${ABC[0:8]}, _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}));
250
251          $if DATATYPE == "QU8" or not ADD16 or K % 2 == 1 or K + 1 == KERNEL_TILE:
252            $if DATATYPE == "QU8" or SSE < 4 and not ADD16:
253              vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vprod${K}x${ABC[0:8]}lo, vprod${K}x${ABC[0:8]}hi));
254              vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vprod${K}x${ABC[0:8]}lo, vprod${K}x${ABC[0:8]}hi));
255            $elif SSE < 4:
256              const __m128i vsignprod${K}x${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod${ABC[0:8]});
257              vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vprod${ABC[0:8]}, vsignprod${K}x${ABC[0:8]}));
258              vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vprod${ABC[0:8]}, vsignprod${K}x${ABC[0:8]}));
259            $else:
260              vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_cvtepi16_epi32(vprod${ABC[0:8]}));
261              vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_srai_epi32(_mm_unpackhi_epi16(vprod${ABC[0:8]}, vprod${ABC[0:8]}), 16));
262
263        $if CHANNEL_TILE > 8:
264          k += 8;
265
266        __m128 vscaled${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
267        __m128 vscaled${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]});
268
269        $if DATATYPE == "QC8":
270          const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(${XINT8_T})));
271          const __m128 vscale${ABC[4:8]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(${XINT8_T}) + 4 * sizeof(float)));
272          vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, vscale${ABC[0:4]});
273          vscaled${ABC[4:8]} = _mm_mul_ps(vscaled${ABC[4:8]}, vscale${ABC[4:8]});
274        $else:
275          const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
276          vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, vscale);
277          vscaled${ABC[4:8]} = _mm_mul_ps(vscaled${ABC[4:8]}, vscale);
278
279        const __m128 voutput_max_less_zero_point = _mm_load_ps(params->${PARAMS_STRUCT}.output_max_less_zero_point);
280        vscaled${ABC[0:4]} = _mm_min_ps(vscaled${ABC[0:4]}, voutput_max_less_zero_point);
281        vscaled${ABC[4:8]} = _mm_min_ps(vscaled${ABC[4:8]}, voutput_max_less_zero_point);
282
283        vacc${ABC[0:4]} = _mm_cvtps_epi32(vscaled${ABC[0:4]});
284        vacc${ABC[4:8]} = _mm_cvtps_epi32(vscaled${ABC[4:8]});
285
286        $if CHANNEL_TILE > 8:
          // Move to the int32 values of the next 8-channel slice.
287          w = (const void*) ((const int32_t*) w + 8);
288
289        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
290        __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
291
292        $if DATATYPE == "QU8":
293          __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packus_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
294
295          vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
296        $else:
297          $if SSE < 4:
298            vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
299
300          __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
301
302          $if SSE == 4:
303            vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
304
        // Store the slice. A partial slice is written as 4, 2 and 1 bytes
        // selected by the bits of c, shifting the bytes already written out of
        // the vector after each store.
305        $if CHANNEL_TILE > 8:
306          if XNN_LIKELY(c >= 8) {
307            _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
308            output += 8;
309            c -= 8;
310          } else {
311            if (c & 4) {
312              unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}));
313              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
314              output += 4;
315            }
316            if (c & 2) {
317              unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0));
318              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
319              output += 2;
320            }
321            if (c & 1) {
322              $if SSE == 4:
323                *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
324              $else:
325                *output = (${XINT8_T}) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
326              output += 1;
327            }
328            c = 0;
329          }
330        $else:
331          if (c & 4) {
332            unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}));
333            vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
334            output += 4;
335          }
336          if (c & 2) {
337            unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0));
338            vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
339            output += 2;
340          }
341          if (c & 1) {
342            $if SSE == 4:
343              *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
344            $else:
345              *output = (${XINT8_T}) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
346            output += 1;
347          }
348      }${" while (c != 0);" if CHANNEL_TILE > 8 else ""}
349    }
350
    // Skip output_increment bytes to reach the next output location.
351    output = (${XINT8_T}*) ((uintptr_t) output + output_increment);
352  } while (--output_width != 0);
353}
354