1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker$import math 6*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"] 7*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV"] 8*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32, 64] 9*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(128/SIZE) 10*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE)) 11*4bdc9457SAndroid Build Coastguard Worker 12*4bdc9457SAndroid Build Coastguard Worker#include <immintrin.h> 13*4bdc9457SAndroid Build Coastguard Worker 14*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 15*4bdc9457SAndroid Build Coastguard Worker 16*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h> 17*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 18*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h> 19*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/unaligned.h> 20*4bdc9457SAndroid Build Coastguard Worker 21*4bdc9457SAndroid Build Coastguard Worker 22*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_sse2( 23*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* input, 24*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* output, 25*4bdc9457SAndroid Build Coastguard Worker size_t input_stride, 26*4bdc9457SAndroid Build Coastguard Worker size_t output_stride, 27*4bdc9457SAndroid Build Coastguard Worker size_t block_width, 28*4bdc9457SAndroid Build Coastguard Worker size_t block_height) XNN_OOB_READS 29*4bdc9457SAndroid Build Coastguard Worker{ 30*4bdc9457SAndroid Build Coastguard Worker assert(output_stride >= block_height * sizeof(uint${SIZE}_t)); 31*4bdc9457SAndroid Build Coastguard Worker assert(input_stride >= block_width * sizeof(uint${SIZE}_t)); 32*4bdc9457SAndroid Build Coastguard Worker 33*4bdc9457SAndroid Build Coastguard Worker const size_t tile_height = ${TILE_SIZE}; 34*4bdc9457SAndroid Build Coastguard Worker const size_t tile_width = ${TILE_SIZE}; 35*4bdc9457SAndroid Build Coastguard Worker const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t); 36*4bdc9457SAndroid Build Coastguard Worker const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t); 37*4bdc9457SAndroid Build Coastguard Worker const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; 38*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 39*4bdc9457SAndroid Build Coastguard Worker const size_t input_offset = tile_height * input_stride; 40*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 41*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes; 42*4bdc9457SAndroid Build Coastguard Worker $else: 43*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t); 44*4bdc9457SAndroid Build Coastguard Worker 45*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 46*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 47*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 48*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 49*4bdc9457SAndroid Build Coastguard Worker $else: 50*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 51*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 52*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o0 = (uint${SIZE}_t*) output; 53*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 54*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride); 55*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "SWITCH": 56*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) output; 57*4bdc9457SAndroid Build Coastguard Worker $else: 58*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes); 59*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 60*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride; 61*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "SWITCH" and SIZE != 64: 62*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride; 63*4bdc9457SAndroid Build Coastguard Worker 64*4bdc9457SAndroid Build Coastguard Worker do { 65*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 66*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < 2) { 67*4bdc9457SAndroid Build Coastguard Worker o1 = o0; 68*4bdc9457SAndroid Build Coastguard Worker } 69*4bdc9457SAndroid Build Coastguard Worker $for N in range(2, TILE_SIZE, 2): 70*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width <= ${N}) { 71*4bdc9457SAndroid Build Coastguard Worker o${N} = o0; 72*4bdc9457SAndroid Build Coastguard Worker } 73*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < ${N+2}) { 74*4bdc9457SAndroid Build Coastguard Worker o${N+1} = o0; 75*4bdc9457SAndroid Build Coastguard Worker } 76*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 77*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 78*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 79*4bdc9457SAndroid Build Coastguard Worker const size_t oN_offset = oN_stride + tile_hbytes; 80*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "SWITCH": 81*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 82*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 83*4bdc9457SAndroid Build Coastguard Worker size_t bh = block_height; 84*4bdc9457SAndroid Build Coastguard Worker for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) { 85*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 86*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 87*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i0); 88*4bdc9457SAndroid Build Coastguard Worker i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride); 89*4bdc9457SAndroid Build Coastguard Worker $else: 90*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N}); 91*4bdc9457SAndroid Build Coastguard Worker i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset); 92*4bdc9457SAndroid Build Coastguard Worker 93*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 94*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-1}_${N*2} = _mm_unpacklo_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1}); 95*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-1}_${N*2+1} = _mm_unpackhi_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1}); 96*4bdc9457SAndroid Build Coastguard Worker 97*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=2: 98*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, TILE_SIZE, 4): 99*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-2}_${N} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2}); 100*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-2}_${N+1} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2}); 101*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-2}_${N+2} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3}); 102*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-2}_${N+3} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3}); 103*4bdc9457SAndroid Build Coastguard Worker 104*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=3: 105*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, TILE_SIZE, 8): 106*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, 4): 107*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-3}_${M+2*N} = _mm_unpacklo_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4}); 108*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS-3}_${M+2*N+1} = _mm_unpackhi_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4}); 109*4bdc9457SAndroid Build Coastguard Worker 110*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=4: 111*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 112*4bdc9457SAndroid Build Coastguard Worker const __m128i v0_${N*2} = _mm_unpacklo_epi64(v1_${N}, v1_${N+8}); 113*4bdc9457SAndroid Build Coastguard Worker const __m128i v0_${N*2+1} = _mm_unpackhi_epi64(v1_${N}, v1_${N+8}); 114*4bdc9457SAndroid Build Coastguard Worker 115*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 116*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 117*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 118*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 119*4bdc9457SAndroid Build Coastguard Worker case ${N}: 120*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) oN, v0_${N}); 121*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 122*4bdc9457SAndroid Build Coastguard Worker case 1: 123*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) oN, v0_1); 124*4bdc9457SAndroid Build Coastguard Worker case 0: 125*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, v0_0); 126*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 127*4bdc9457SAndroid Build Coastguard Worker break; 128*4bdc9457SAndroid Build Coastguard Worker default: 129*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 130*4bdc9457SAndroid Build Coastguard Worker } 131*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 132*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset); 133*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, v0_${TILE_SIZE-1}); 134*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 135*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 136*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 137*4bdc9457SAndroid Build Coastguard Worker o = oN; 138*4bdc9457SAndroid Build Coastguard Worker } 139*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, v0_${N}); 140*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 141*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 142*4bdc9457SAndroid Build Coastguard Worker o = oN; 143*4bdc9457SAndroid Build Coastguard Worker } 144*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, v0_${N-1}); 145*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 146*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 147*4bdc9457SAndroid Build Coastguard Worker o = oN; 148*4bdc9457SAndroid Build Coastguard Worker } 149*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, v0_0); 150*4bdc9457SAndroid Build Coastguard Worker $else: 151*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 152*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o${N}, v0_${N}); 153*4bdc9457SAndroid Build Coastguard Worker o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes); 154*4bdc9457SAndroid Build Coastguard Worker } 155*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 156*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 157*4bdc9457SAndroid Build Coastguard Worker if (bh != 0) { 158*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 159*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_0 = _mm_loadu_si128((const __m128i*) i0); 160*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 161*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 162*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 163*4bdc9457SAndroid Build Coastguard Worker i${N} = i${N-1}; 164*4bdc9457SAndroid Build Coastguard Worker } 165*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N}); 166*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride); 167*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 168*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i${N}; 169*4bdc9457SAndroid Build Coastguard Worker } 170*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N+1} = _mm_loadu_si128((const __m128i*) i${N+1}); 171*4bdc9457SAndroid Build Coastguard Worker $else: 172*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_0 = _mm_loadu_si128((const __m128i*) i0); 173*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 174*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 175*4bdc9457SAndroid Build Coastguard Worker i${N} = i0; 176*4bdc9457SAndroid Build Coastguard Worker } 177*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N}); 178*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 179*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i0; 180*4bdc9457SAndroid Build Coastguard Worker } 181*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${N+1} = _mm_loadu_si128((const __m128i*) i${N+1}); 182*4bdc9457SAndroid Build Coastguard Worker const __m128i v${NUM_ITERS}_${TILE_SIZE-1} = _mm_undefined_si128(); 183*4bdc9457SAndroid Build Coastguard Worker 184*4bdc9457SAndroid Build Coastguard Worker $CONST = "const " 185*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS == 1: 186*4bdc9457SAndroid Build Coastguard Worker $CONST = "" 187*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 188*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-1}_${N*2} = _mm_unpacklo_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1}); 189*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-1}_${N*2+1} = _mm_unpackhi_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1}); 190*4bdc9457SAndroid Build Coastguard Worker 191*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS == 2: 192*4bdc9457SAndroid Build Coastguard Worker $CONST = "" 193*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=2: 194*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, TILE_SIZE, 4): 195*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-2}_${N} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2}); 196*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-2}_${N+1} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2}); 197*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-2}_${N+2} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3}); 198*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-2}_${N+3} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3}); 199*4bdc9457SAndroid Build Coastguard Worker 200*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS == 3: 201*4bdc9457SAndroid Build Coastguard Worker $CONST = "" 202*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=3: 203*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, TILE_SIZE, 8): 204*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, 4): 205*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-3}_${M+2*N} = _mm_unpacklo_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4}); 206*4bdc9457SAndroid Build Coastguard Worker ${CONST}__m128i v${NUM_ITERS-3}_${M+2*N+1} = _mm_unpackhi_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4}); 207*4bdc9457SAndroid Build Coastguard Worker 208*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=4: 209*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 210*4bdc9457SAndroid Build Coastguard Worker __m128i v0_${N*2} = _mm_unpacklo_epi64(v1_${N}, v1_${N+8}); 211*4bdc9457SAndroid Build Coastguard Worker __m128i v0_${N*2+1} = _mm_unpackhi_epi64(v1_${N}, v1_${N+8}); 212*4bdc9457SAndroid Build Coastguard Worker 213*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>1}) { 214*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 215*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 216*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 217*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 218*4bdc9457SAndroid Build Coastguard Worker case ${N}: 219*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) oN, v0_${N}); 220*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 221*4bdc9457SAndroid Build Coastguard Worker case 1: 222*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) oN, v0_1); 223*4bdc9457SAndroid Build Coastguard Worker case 0: 224*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, v0_0); 225*4bdc9457SAndroid Build Coastguard Worker break; 226*4bdc9457SAndroid Build Coastguard Worker default: 227*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 228*4bdc9457SAndroid Build Coastguard Worker } 229*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 230*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>1}; 231*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 232*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 233*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, v0_${TILE_SIZE-1}); 234*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 235*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 236*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 237*4bdc9457SAndroid Build Coastguard Worker o = oN; 238*4bdc9457SAndroid Build Coastguard Worker } 239*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, v0_${N}); 240*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 241*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 242*4bdc9457SAndroid Build Coastguard Worker o = oN; 243*4bdc9457SAndroid Build Coastguard Worker } 244*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, v0_${N-1}); 245*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 246*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 247*4bdc9457SAndroid Build Coastguard Worker o = oN; 248*4bdc9457SAndroid Build Coastguard Worker } 249*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, v0_0); 250*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 251*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>1}; 252*4bdc9457SAndroid Build Coastguard Worker $else: 253*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 254*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o${N}, v0_${N}); 255*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>1: 256*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>1}; 257*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 258*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 259*4bdc9457SAndroid Build Coastguard Worker v0_${N} = _mm_unpackhi_epi64(v0_${N}, v0_${N}); 260*4bdc9457SAndroid Build Coastguard Worker } 261*4bdc9457SAndroid Build Coastguard Worker 262*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>1: 263*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>2}) { 264*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 265*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 266*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 267*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 268*4bdc9457SAndroid Build Coastguard Worker case ${N}: 269*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_${N})); 270*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 271*4bdc9457SAndroid Build Coastguard Worker case 1: 272*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_1)); 273*4bdc9457SAndroid Build Coastguard Worker case 0: 274*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0)); 275*4bdc9457SAndroid Build Coastguard Worker break; 276*4bdc9457SAndroid Build Coastguard Worker default: 277*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 278*4bdc9457SAndroid Build Coastguard Worker } 279*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 2: 280*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>2}; 281*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 282*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 283*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1})); 284*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 285*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 286*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 287*4bdc9457SAndroid Build Coastguard Worker o = oN; 288*4bdc9457SAndroid Build Coastguard Worker } 289*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${N})); 290*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 291*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 292*4bdc9457SAndroid Build Coastguard Worker o = oN; 293*4bdc9457SAndroid Build Coastguard Worker } 294*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${N-1})); 295*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 296*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 297*4bdc9457SAndroid Build Coastguard Worker o = oN; 298*4bdc9457SAndroid Build Coastguard Worker } 299*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0)); 300*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 2: 301*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>2}; 302*4bdc9457SAndroid Build Coastguard Worker $else: 303*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 304*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(o${N}, (uint32_t) _mm_cvtsi128_si32(v0_${N})); 305*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>2: 306*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>2}; 307*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 2: 308*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 309*4bdc9457SAndroid Build Coastguard Worker v0_${N} = _mm_srli_epi64(v0_${N}, 32); 310*4bdc9457SAndroid Build Coastguard Worker } 311*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>2: 312*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>3}) { 313*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 314*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 315*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 316*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 317*4bdc9457SAndroid Build Coastguard Worker case ${N}: 318*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_${N})); 319*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 320*4bdc9457SAndroid Build Coastguard Worker case 1: 321*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_1)); 322*4bdc9457SAndroid Build Coastguard Worker case 0: 323*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0)); 324*4bdc9457SAndroid Build Coastguard Worker break; 325*4bdc9457SAndroid Build Coastguard Worker default: 326*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 327*4bdc9457SAndroid Build Coastguard Worker } 328*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>3: 329*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>3}; 330*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 331*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 332*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1})); 333*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 334*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 335*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 336*4bdc9457SAndroid Build Coastguard Worker o = oN; 337*4bdc9457SAndroid Build Coastguard Worker } 338*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N})); 339*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 340*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 341*4bdc9457SAndroid Build Coastguard Worker o = oN; 342*4bdc9457SAndroid Build Coastguard Worker } 343*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N-1})); 344*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 345*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 346*4bdc9457SAndroid Build Coastguard Worker o = oN; 347*4bdc9457SAndroid Build Coastguard Worker } 348*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0)); 349*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 3: 350*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>3}; 351*4bdc9457SAndroid Build Coastguard Worker $else: 352*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 353*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(o${N}, (uint16_t) _mm_cvtsi128_si32(v0_${N})); 354*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>3: 355*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>3}; 356*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>3: 357*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 358*4bdc9457SAndroid Build Coastguard Worker v0_${N} = _mm_srli_epi32(v0_${N}, 16); 359*4bdc9457SAndroid Build Coastguard Worker } 360*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 8: 361*4bdc9457SAndroid Build Coastguard Worker if (bh & 1) { 362*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 363*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 364*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 365*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 366*4bdc9457SAndroid Build Coastguard Worker case ${N}: 367*4bdc9457SAndroid Build Coastguard Worker *oN = (uint8_t) _mm_cvtsi128_si32(v0_${N}); 368*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 369*4bdc9457SAndroid Build Coastguard Worker case 1: 370*4bdc9457SAndroid Build Coastguard Worker *oN = (uint8_t) _mm_cvtsi128_si32(v0_1); 371*4bdc9457SAndroid Build Coastguard Worker case 0: 372*4bdc9457SAndroid Build Coastguard Worker *o = (uint8_t) _mm_cvtsi128_si32(v0_0); 373*4bdc9457SAndroid Build Coastguard Worker break; 374*4bdc9457SAndroid Build Coastguard Worker default: 375*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 376*4bdc9457SAndroid Build Coastguard Worker } 377*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "MOV": 378*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 379*4bdc9457SAndroid Build Coastguard Worker *o = (uint8_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1}); 380*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 381*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 382*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 383*4bdc9457SAndroid Build Coastguard Worker o = oN; 384*4bdc9457SAndroid Build Coastguard Worker } 385*4bdc9457SAndroid Build Coastguard Worker *o = (uint8_t) _mm_cvtsi128_si32(v0_${N}); 386*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 387*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 388*4bdc9457SAndroid Build Coastguard Worker o = oN; 389*4bdc9457SAndroid Build Coastguard Worker } 390*4bdc9457SAndroid Build Coastguard Worker *o = (uint8_t) _mm_cvtsi128_si32(v0_${N-1}); 391*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 392*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 393*4bdc9457SAndroid Build Coastguard Worker o = oN; 394*4bdc9457SAndroid Build Coastguard Worker } 395*4bdc9457SAndroid Build Coastguard Worker *o = (uint8_t) _mm_cvtsi128_si32(v0_0); 396*4bdc9457SAndroid Build Coastguard Worker } 397*4bdc9457SAndroid Build Coastguard Worker } 398*4bdc9457SAndroid Build Coastguard Worker 399*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 400*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 401*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 402*4bdc9457SAndroid Build Coastguard Worker i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 403*4bdc9457SAndroid Build Coastguard Worker $else: 404*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 405*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 406*4bdc9457SAndroid Build Coastguard Worker o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset); 407*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 408*4bdc9457SAndroid Build Coastguard Worker o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset); 409*4bdc9457SAndroid Build Coastguard Worker $else: 410*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset); 411*4bdc9457SAndroid Build Coastguard Worker block_width = doz(block_width, tile_width); 412*4bdc9457SAndroid Build Coastguard Worker } while (block_width != 0); 413*4bdc9457SAndroid Build Coastguard Worker} 414