1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert SSE in [2, 4] 7$assert not XOP or AVX 8$assert not AVX or SSE == 4 9$assert REQUANTIZATION == "FP32" 10$assert DATATYPE in ["QC8", "QS8", "QU8"] 11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 12$SSE_HEADER = {2: "emmintrin.h", 4: "smmintrin.h"}[SSE] 13$assert CHANNEL_TILE % 8 == 0 14$assert CHANNEL_TILE >= 8 15$assert KERNEL_TILE >= 2 16#include <assert.h> 17 18$if XOP: 19 #if defined(__GNUC__) || defined(__clang__) 20 #include <x86intrin.h> 21 #else 22 #include <immintrin.h> 23 #include <ammintrin.h> 24 #endif 25$else: 26 #include <${SSE_HEADER}> 27 28#include <xnnpack/dwconv.h> 29#include <xnnpack/unaligned.h> 30 31 32$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("sse4" if SSE == 4 and DATATYPE != "QU8" else "sse2") 33$PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower() 34$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 4: "sse41"}[SSE] 35$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" 36void xnn_${DATATYPE.lower()}_dwconv_minmax_${REQUANTIZATION.lower()}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul16${"_add16" if ADD16 else ""}( 37 size_t channels, 38 size_t output_width, 39 const ${XINT8_T}** input, 40 const void* weights, 41 ${XINT8_T}* output, 42 size_t input_stride, 43 size_t output_increment, 44 size_t input_offset, 45 const ${XINT8_T}* zero, 46 const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 47{ 48 assert(channels != 0); 49 assert(output_width != 0); 50 51 do { 52 $for K in range(KERNEL_TILE): 53 const ${XINT8_T}* i${K} = input[${K}]; 54 assert(i${K} != NULL); 55 if XNN_UNPREDICTABLE(i${K} != zero) { 56 i${K} = (const ${XINT8_T}*) ((uintptr_t) i${K} + input_offset); 57 } 58 input = (const ${XINT8_T}**) ((uintptr_t) input + input_stride); 59 60 size_t c = channels; 61 const void* w = weights; 62 $if DATATYPE == "QU8": 63 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point); 64 for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 65 __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w); 66 $for C in range(4, CHANNEL_TILE, 4): 67 __m128i vacc${ABC[C:C+4]} = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + ${C})); 68 69 $for K in range(KERNEL_TILE): 70 71 $for C in range(0, CHANNEL_TILE, 8): 72 $if C == 0: 73 const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K}); 74 $else: 75 const __m128i vi${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${K} + ${C})); 76 $if SSE == 4: 77 $if DATATYPE == "QU8": 78 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepu8_epi16(vi${K}x${ABC[C:C+8]}); 79 $else: 80 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[C:C+8]}); 81 const __m128i vk${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE + C} * sizeof(${XINT8_T}))); 82 $if SSE == 4: 83 $if DATATYPE == "QU8": 84 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_sub_epi16(_mm_cvtepu8_epi16(vk${K}x${ABC[C:C+8]}), vk_zero_point); 85 $else: 86 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[C:C+8]}); 87 i${K} += ${CHANNEL_TILE}; 88 89 $if SSE < 4: 90 $if DATATYPE == "QU8": 91 $if K == 0: 92 const __m128i vzero = _mm_setzero_si128(); 93 $for C in range(0, CHANNEL_TILE, 8): 94 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, vzero); 95 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_sub_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, vzero), vk_zero_point); 96 $else: 97 $for C in range(0, CHANNEL_TILE, 8): 98 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, vi${K}x${ABC[C:C+8]}), 8); 99 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}), 8); 100 101 $for C in range(0, CHANNEL_TILE, 8): 102 $if DATATYPE == "QU8" or SSE < 4 and not ADD16: 103 const __m128i vprod${K}x${ABC[C:C+8]}lo = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}); 104 const __m128i vprod${K}x${ABC[C:C+8]}hi = _mm_mulhi_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}); 105 $elif K == 0: 106 __m128i vprod${ABC[C:C+8]} = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}); 107 $elif K % 2 == 0 or K + 1 == KERNEL_TILE or not ADD16: 108 vprod${ABC[C:C+8]} = _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}); 109 $elif XOP: 110 vprod${ABC[C:C+8]} = _mm_macc_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]}, vprod${ABC[C:C+8]}); 111 $else: 112 vprod${ABC[C:C+8]} = _mm_add_epi16(vprod${ABC[C:C+8]}, _mm_mullo_epi16(vxi${K}x${ABC[C:C+8]}, vxk${K}x${ABC[C:C+8]})); 113 114 $if not ADD16 or K % 2 == 1 or K + 1 == KERNEL_TILE: 115 $for C in range(0, CHANNEL_TILE, 8): 116 $if DATATYPE == "QU8" or SSE < 4 and not ADD16: 117 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_unpacklo_epi16(vprod${K}x${ABC[C:C+8]}lo, vprod${K}x${ABC[C:C+8]}hi)); 118 vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_unpackhi_epi16(vprod${K}x${ABC[C:C+8]}lo, vprod${K}x${ABC[C:C+8]}hi)); 119 $elif SSE < 4: 120 const __m128i vsignprod${K}x${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod${ABC[C:C+8]}); 121 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_unpacklo_epi16(vprod${ABC[C:C+8]}, vsignprod${K}x${ABC[C:C+8]})); 122 vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_unpackhi_epi16(vprod${ABC[C:C+8]}, vsignprod${K}x${ABC[C:C+8]})); 123 $else: 124 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_cvtepi16_epi32(vprod${ABC[C:C+8]})); 125 vacc${ABC[C+4:C+8]} = _mm_add_epi32(vacc${ABC[C+4:C+8]}, _mm_srai_epi32(_mm_unpackhi_epi16(vprod${ABC[C:C+8]}, vprod${ABC[C:C+8]}), 16)); 126 127 w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T})); 128 129 $for C in range(0, CHANNEL_TILE, 4): 130 __m128 vscaled${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); 131 132 $if DATATYPE == "QC8": 133 const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) w); 134 $for C in range(4, CHANNEL_TILE, 4): 135 const __m128 vscale${ABC[C:C+4]} = _mm_loadu_ps((const float*) w + ${C}); 136 w = (const void*) ((const float*) w + ${CHANNEL_TILE}); 137 $for C in range(0, CHANNEL_TILE, 4): 138 vscaled${ABC[C:C+4]} = _mm_mul_ps(vscaled${ABC[C:C+4]}, vscale${ABC[C:C+4]}); 139 $else: 140 const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale); 141 $for C in range(0, CHANNEL_TILE, 4): 142 vscaled${ABC[C:C+4]} = _mm_mul_ps(vscaled${ABC[C:C+4]}, vscale); 143 144 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->${PARAMS_STRUCT}.output_max_less_zero_point); 145 $for C in range(0, CHANNEL_TILE, 4): 146 vscaled${ABC[C:C+4]} = _mm_min_ps(vscaled${ABC[C:C+4]}, voutput_max_less_zero_point); 147 148 $for C in range(0, CHANNEL_TILE, 4): 149 vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vscaled${ABC[C:C+4]}); 150 151 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point); 152 $for C in range(0, CHANNEL_TILE, 8): 153 __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); 154 155 $if DATATYPE == "QU8": 156 $for C in range(0, CHANNEL_TILE, 16): 157 $if C + 8 < CHANNEL_TILE: 158 __m128i vout${ABC[C:C+16]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); 159 $else: 160 __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); 161 162 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min); 163 $for C in range(0, CHANNEL_TILE, 16): 164 $if C + 8 < CHANNEL_TILE: 165 vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min); 166 $else: 167 vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); 168 $else: 169 $if SSE < 4: 170 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min); 171 $for C in range(0, CHANNEL_TILE, 8): 172 vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min); 173 174 $for C in range(0, CHANNEL_TILE, 16): 175 $if C + 8 < CHANNEL_TILE: 176 __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); 177 $else: 178 __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); 179 180 $if SSE == 4: 181 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min); 182 $for C in range(0, CHANNEL_TILE, 16): 183 $if C + 8 < CHANNEL_TILE: 184 vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min); 185 $else: 186 vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); 187 188 $if CHANNEL_TILE > 8: 189 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 190 $else: 191 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 192 $for C in range(16, CHANNEL_TILE, 16): 193 $if C + 8 < CHANNEL_TILE: 194 _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); 195 $else: 196 _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); 197 output += ${CHANNEL_TILE}; 198 } 199 if XNN_UNLIKELY(c != 0) { 200 $if CHANNEL_TILE > 8: 201 const ${XINT8_T}* k = (const ${XINT8_T}*) ((const int32_t*) w + ${CHANNEL_TILE}); 202 ${"do " if CHANNEL_TILE > 8 else ""}{ 203 __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w); 204 __m128i vacc${ABC[4:8]} = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4)); 205 206 $for K in range(KERNEL_TILE): 207 208 const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K}); 209 $if SSE == 4: 210 $if DATATYPE == "QU8": 211 const __m128i vxi${K}x${ABC[0:8]} = _mm_cvtepu8_epi16(vi${K}x${ABC[0:8]}); 212 $else: 213 const __m128i vxi${K}x${ABC[0:8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[0:8]}); 214 $if CHANNEL_TILE > 8: 215 $if K == 0: 216 const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) k); 217 $else: 218 const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) (k + ${K * CHANNEL_TILE})); 219 $else: 220 const __m128i vk${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE} * sizeof(${XINT8_T}))); 221 $if SSE == 4: 222 $if DATATYPE == "QU8": 223 const __m128i vxk${K}x${ABC[0:8]} = _mm_sub_epi16(_mm_cvtepu8_epi16(vk${K}x${ABC[0:8]}), vk_zero_point); 224 $else: 225 const __m128i vxk${K}x${ABC[0:8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[0:8]}); 226 $if CHANNEL_TILE > 8: 227 i${K} += 8; 228 229 $if SSE < 4: 230 $if DATATYPE == "QU8": 231 $if K == 0: 232 const __m128i vzero = _mm_setzero_si128(); 233 const __m128i vxi${K}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${K}x${ABC[0:8]}, vzero); 234 const __m128i vxk${K}x${ABC[0:8]} = _mm_sub_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[0:8]}, vzero), vk_zero_point); 235 $else: 236 const __m128i vxi${K}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${K}x${ABC[0:8]}, vi${K}x${ABC[0:8]}), 8); 237 const __m128i vxk${K}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vk${K}x${ABC[0:8]}, vk${K}x${ABC[0:8]}), 8); 238 239 $if DATATYPE == "QU8" or SSE < 4 and not ADD16: 240 const __m128i vprod${K}x${ABC[0:8]}lo = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}); 241 const __m128i vprod${K}x${ABC[0:8]}hi = _mm_mulhi_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}); 242 $elif K == 0: 243 __m128i vprod${ABC[0:8]} = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}); 244 $elif K % 2 == 0 or K + 1 == KERNEL_TILE or not ADD16: 245 vprod${ABC[0:8]} = _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}); 246 $elif XOP: 247 vprod${ABC[0:8]} = _mm_macc_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]}, vprod${ABC[0:8]}); 248 $else: 249 vprod${ABC[0:8]} = _mm_add_epi16(vprod${ABC[0:8]}, _mm_mullo_epi16(vxi${K}x${ABC[0:8]}, vxk${K}x${ABC[0:8]})); 250 251 $if not ADD16 or K % 2 == 1 or K + 1 == KERNEL_TILE: 252 $if DATATYPE == "QU8" or SSE < 4 and not ADD16: 253 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vprod${K}x${ABC[0:8]}lo, vprod${K}x${ABC[0:8]}hi)); 254 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vprod${K}x${ABC[0:8]}lo, vprod${K}x${ABC[0:8]}hi)); 255 $elif SSE < 4: 256 const __m128i vsignprod${K}x${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod${ABC[0:8]}); 257 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vprod${ABC[0:8]}, vsignprod${K}x${ABC[0:8]})); 258 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vprod${ABC[0:8]}, vsignprod${K}x${ABC[0:8]})); 259 $else: 260 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_cvtepi16_epi32(vprod${ABC[0:8]})); 261 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_srai_epi32(_mm_unpackhi_epi16(vprod${ABC[0:8]}, vprod${ABC[0:8]}), 16)); 262 263 $if CHANNEL_TILE > 8: 264 k += 8; 265 266 __m128 vscaled${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); 267 __m128 vscaled${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); 268 269 $if DATATYPE == "QC8": 270 const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(${XINT8_T}))); 271 const __m128 vscale${ABC[4:8]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(${XINT8_T}) + 4 * sizeof(float))); 272 vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, vscale${ABC[0:4]}); 273 vscaled${ABC[4:8]} = _mm_mul_ps(vscaled${ABC[4:8]}, vscale${ABC[4:8]}); 274 $else: 275 const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale); 276 vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, vscale); 277 vscaled${ABC[4:8]} = _mm_mul_ps(vscaled${ABC[4:8]}, vscale); 278 279 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->${PARAMS_STRUCT}.output_max_less_zero_point); 280 vscaled${ABC[0:4]} = _mm_min_ps(vscaled${ABC[0:4]}, voutput_max_less_zero_point); 281 vscaled${ABC[4:8]} = _mm_min_ps(vscaled${ABC[4:8]}, voutput_max_less_zero_point); 282 283 vacc${ABC[0:4]} = _mm_cvtps_epi32(vscaled${ABC[0:4]}); 284 vacc${ABC[4:8]} = _mm_cvtps_epi32(vscaled${ABC[4:8]}); 285 286 $if CHANNEL_TILE > 8: 287 w = (const void*) ((const int32_t*) w + 8); 288 289 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point); 290 __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); 291 292 $if DATATYPE == "QU8": 293 __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packus_epi16(vout${ABC[0:8]}, vout${ABC[0:8]}); 294 295 vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min)); 296 $else: 297 $if SSE < 4: 298 vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min)); 299 300 __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]}); 301 302 $if SSE == 4: 303 vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min)); 304 305 $if CHANNEL_TILE > 8: 306 if XNN_LIKELY(c >= 8) { 307 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 308 output += 8; 309 c -= 8; 310 } else { 311 if (c & 4) { 312 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); 313 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); 314 output += 4; 315 } 316 if (c & 2) { 317 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); 318 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); 319 output += 2; 320 } 321 if (c & 1) { 322 $if SSE == 4: 323 *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); 324 $else: 325 *output = (${XINT8_T}) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); 326 output += 1; 327 } 328 c = 0; 329 } 330 $else: 331 if (c & 4) { 332 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); 333 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); 334 output += 4; 335 } 336 if (c & 2) { 337 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); 338 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); 339 output += 2; 340 } 341 if (c & 1) { 342 $if SSE == 4: 343 *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); 344 $else: 345 *output = (${XINT8_T}) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); 346 output += 1; 347 } 348 }${" while (c != 0);" if CHANNEL_TILE > 8 else ""} 349 } 350 351 output = (${XINT8_T}*) ((uintptr_t) output + output_increment); 352 } while (--output_width != 0); 353} 354