1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker$import math 6*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"] 7*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"] 8*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32] 9*4bdc9457SAndroid Build Coastguard Worker$assert VECTOR_SIZE in [64, 128] 10*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(VECTOR_SIZE/SIZE) 11*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE)) 12*4bdc9457SAndroid Build Coastguard Worker$SUFFIX = '' 13*4bdc9457SAndroid Build Coastguard Worker$NUM_D_REGISTERS=int(VECTOR_SIZE/64) 14*4bdc9457SAndroid Build Coastguard Worker$if VECTOR_SIZE == 128: 15*4bdc9457SAndroid Build Coastguard Worker$ SUFFIX = 'q' 16*4bdc9457SAndroid Build Coastguard Worker 17*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 18*4bdc9457SAndroid Build Coastguard Worker 19*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 20*4bdc9457SAndroid Build Coastguard Worker 21*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h> 22*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 23*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h> 24*4bdc9457SAndroid Build Coastguard Worker 25*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon( 26*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* input, 27*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* output, 28*4bdc9457SAndroid Build Coastguard Worker size_t input_stride, 29*4bdc9457SAndroid Build Coastguard Worker size_t output_stride, 30*4bdc9457SAndroid Build Coastguard Worker size_t block_width, 31*4bdc9457SAndroid Build Coastguard Worker size_t block_height) XNN_OOB_READS 32*4bdc9457SAndroid Build Coastguard Worker{ 33*4bdc9457SAndroid Build Coastguard Worker assert(output_stride >= block_height * sizeof(uint${SIZE}_t)); 34*4bdc9457SAndroid Build Coastguard Worker assert(input_stride >= block_width * sizeof(uint${SIZE}_t)); 35*4bdc9457SAndroid Build Coastguard Worker 36*4bdc9457SAndroid Build Coastguard Worker const size_t tile_height = ${TILE_SIZE}; 37*4bdc9457SAndroid Build Coastguard Worker const size_t tile_width = ${TILE_SIZE}; 38*4bdc9457SAndroid Build Coastguard Worker const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t); 39*4bdc9457SAndroid Build Coastguard Worker const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t); 40*4bdc9457SAndroid Build Coastguard Worker const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; 41*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 42*4bdc9457SAndroid Build Coastguard Worker const size_t input_offset = tile_height * input_stride; 43*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS in ["MOV", "DEC"]: 44*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes; 45*4bdc9457SAndroid Build Coastguard Worker $else: 46*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t); 47*4bdc9457SAndroid Build Coastguard Worker 48*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 49*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 50*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 51*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 52*4bdc9457SAndroid Build Coastguard Worker $else: 53*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 54*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 55*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o0 = (uint${SIZE}_t*) output; 56*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 57*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride); 58*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "SWITCH": 59*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) output; 60*4bdc9457SAndroid Build Coastguard Worker $else: 61*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes); 62*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 63*4bdc9457SAndroid Build Coastguard Worker $if int(VECTOR_SIZE/SIZE) > 2: 64*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride; 65*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS != "MULTI": 66*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride; 67*4bdc9457SAndroid Build Coastguard Worker 68*4bdc9457SAndroid Build Coastguard Worker do { 69*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 70*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < 2) { 71*4bdc9457SAndroid Build Coastguard Worker o1 = o0; 72*4bdc9457SAndroid Build Coastguard Worker } 73*4bdc9457SAndroid Build Coastguard Worker $for N in range(2, TILE_SIZE, 2): 74*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width <= ${N}) { 75*4bdc9457SAndroid Build Coastguard Worker o${N} = o0; 76*4bdc9457SAndroid Build Coastguard Worker } 77*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < ${N+2}) { 78*4bdc9457SAndroid Build Coastguard Worker o${N+1} = o0; 79*4bdc9457SAndroid Build Coastguard Worker } 80*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 81*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 82*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 83*4bdc9457SAndroid Build Coastguard Worker const size_t oN_offset = oN_stride + tile_hbytes; 84*4bdc9457SAndroid Build Coastguard Worker $else: 85*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 86*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 87*4bdc9457SAndroid Build Coastguard Worker size_t bh = block_height; 88*4bdc9457SAndroid Build Coastguard Worker for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) { 89*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 90*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 91*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride); 92*4bdc9457SAndroid Build Coastguard Worker $else: 93*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset); 94*4bdc9457SAndroid Build Coastguard Worker 95*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 96*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)}); 97*4bdc9457SAndroid Build Coastguard Worker 98*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, NUM_ITERS): 99*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 100*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]); 101*4bdc9457SAndroid Build Coastguard Worker 102*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 103*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 104*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 105*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 106*4bdc9457SAndroid Build Coastguard Worker case ${N}: 107*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 108*4bdc9457SAndroid Build Coastguard Worker case 1: 109*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(oN, v0_0.val[1]); 110*4bdc9457SAndroid Build Coastguard Worker case 0: 111*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 112*4bdc9457SAndroid Build Coastguard Worker break; 113*4bdc9457SAndroid Build Coastguard Worker default: 114*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 115*4bdc9457SAndroid Build Coastguard Worker } 116*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 117*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset); 118*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]); 119*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 120*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 121*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 122*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 123*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 124*4bdc9457SAndroid Build Coastguard Worker o = oN; 125*4bdc9457SAndroid Build Coastguard Worker $else: 126*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 127*4bdc9457SAndroid Build Coastguard Worker } 128*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o, v0_${N>>1}.val[0]); 129*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 130*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 131*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 132*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 133*4bdc9457SAndroid Build Coastguard Worker o = oN; 134*4bdc9457SAndroid Build Coastguard Worker $else: 135*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 136*4bdc9457SAndroid Build Coastguard Worker } 137*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o, v0_${(N-1)>>1}.val[1]); 138*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 139*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 140*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 141*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 142*4bdc9457SAndroid Build Coastguard Worker o = oN; 143*4bdc9457SAndroid Build Coastguard Worker $else: 144*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 145*4bdc9457SAndroid Build Coastguard Worker } 146*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); 147*4bdc9457SAndroid Build Coastguard Worker $else: 148*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 149*4bdc9457SAndroid Build Coastguard Worker vst1${SUFFIX}_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes); 150*4bdc9457SAndroid Build Coastguard Worker } 151*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS in ["MOV", "DEC"]: 152*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 153*4bdc9457SAndroid Build Coastguard Worker 154*4bdc9457SAndroid Build Coastguard Worker if (bh != 0) { 155*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 156*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0); 157*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 158*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 159*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 160*4bdc9457SAndroid Build Coastguard Worker i${N} = i${N-1}; 161*4bdc9457SAndroid Build Coastguard Worker } 162*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); 163*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride); 164*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 165*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i${N}; 166*4bdc9457SAndroid Build Coastguard Worker } 167*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1}); 168*4bdc9457SAndroid Build Coastguard Worker $else: 169*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0); 170*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 171*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 172*4bdc9457SAndroid Build Coastguard Worker i${N} = i0; 173*4bdc9457SAndroid Build Coastguard Worker } 174*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); 175*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 176*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i0; 177*4bdc9457SAndroid Build Coastguard Worker } 178*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1}); 179*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmov${SUFFIX}_n_u${SIZE}(0); 180*4bdc9457SAndroid Build Coastguard Worker 181*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 182*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)}); 183*4bdc9457SAndroid Build Coastguard Worker 184*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, NUM_ITERS): 185*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 186*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]); 187*4bdc9457SAndroid Build Coastguard Worker 188*4bdc9457SAndroid Build Coastguard Worker $if VECTOR_SIZE == 128: 189*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 190*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]); 191*4bdc9457SAndroid Build Coastguard Worker 192*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>1}) { 193*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 194*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 195*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 196*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 197*4bdc9457SAndroid Build Coastguard Worker case ${N}: 198*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 199*4bdc9457SAndroid Build Coastguard Worker case 1: 200*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(oN, v1_low); 201*4bdc9457SAndroid Build Coastguard Worker case 0: 202*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 203*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1}; 204*4bdc9457SAndroid Build Coastguard Worker $else: 205*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v0_low); 206*4bdc9457SAndroid Build Coastguard Worker break; 207*4bdc9457SAndroid Build Coastguard Worker default: 208*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 209*4bdc9457SAndroid Build Coastguard Worker } 210*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 211*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 212*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v${TILE_SIZE-1}_low); 213*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 214*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 215*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 216*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 217*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 218*4bdc9457SAndroid Build Coastguard Worker o = oN; 219*4bdc9457SAndroid Build Coastguard Worker $else: 220*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 221*4bdc9457SAndroid Build Coastguard Worker } 222*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v${N}_low); 223*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 224*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 225*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 226*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 227*4bdc9457SAndroid Build Coastguard Worker o = oN; 228*4bdc9457SAndroid Build Coastguard Worker $else: 229*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 230*4bdc9457SAndroid Build Coastguard Worker } 231*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v${N-1}_low); 232*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 233*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 234*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 235*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 236*4bdc9457SAndroid Build Coastguard Worker o = oN; 237*4bdc9457SAndroid Build Coastguard Worker $else: 238*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 239*4bdc9457SAndroid Build Coastguard Worker } 240*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 241*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1}; 242*4bdc9457SAndroid Build Coastguard Worker $else: 243*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o, v0_low); 244*4bdc9457SAndroid Build Coastguard Worker $else: 245*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 246*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>1: 247*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1}; 248*4bdc9457SAndroid Build Coastguard Worker $else: 249*4bdc9457SAndroid Build Coastguard Worker vst1_u${SIZE}(o${N}, v${N}_low); 250*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 251*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 252*4bdc9457SAndroid Build Coastguard Worker v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]); 253*4bdc9457SAndroid Build Coastguard Worker } 254*4bdc9457SAndroid Build Coastguard Worker $else: 255*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 256*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}x${TILE_SIZE}_t v${N}_low = v0_${(N>>1)}.val[${N%2}]; 257*4bdc9457SAndroid Build Coastguard Worker 258*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>=NUM_D_REGISTERS: 259*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>NUM_D_REGISTERS}) { 260*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 261*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 262*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 263*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 264*4bdc9457SAndroid Build Coastguard Worker case ${N}: 265*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 266*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 267*4bdc9457SAndroid Build Coastguard Worker $else: 268*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 269*4bdc9457SAndroid Build Coastguard Worker case 1: 270*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 271*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(oN, v1_low, 0); 272*4bdc9457SAndroid Build Coastguard Worker $else: 273*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0); 274*4bdc9457SAndroid Build Coastguard Worker case 0: 275*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 276*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o, v0_low, 0); 277*4bdc9457SAndroid Build Coastguard Worker $else: 278*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS}; 279*4bdc9457SAndroid Build Coastguard Worker break; 280*4bdc9457SAndroid Build Coastguard Worker default: 281*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 282*4bdc9457SAndroid Build Coastguard Worker } 283*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 284*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 285*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 286*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0); 287*4bdc9457SAndroid Build Coastguard Worker $else: 288*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0); 289*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 290*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 291*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 292*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 293*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 294*4bdc9457SAndroid Build Coastguard Worker o = oN; 295*4bdc9457SAndroid Build Coastguard Worker $else: 296*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 297*4bdc9457SAndroid Build Coastguard Worker } 298*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 299*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o, v${N}_low, 0); 300*4bdc9457SAndroid Build Coastguard Worker $else: 301*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0); 302*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 303*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 304*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 305*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 306*4bdc9457SAndroid Build Coastguard Worker o = oN; 307*4bdc9457SAndroid Build Coastguard Worker $else: 308*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 309*4bdc9457SAndroid Build Coastguard Worker } 310*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 311*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o, v${N-1}_low, 0); 312*4bdc9457SAndroid Build Coastguard Worker $else: 313*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0); 314*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 315*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 316*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 317*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 318*4bdc9457SAndroid Build Coastguard Worker o = oN; 319*4bdc9457SAndroid Build Coastguard Worker $else: 320*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 321*4bdc9457SAndroid Build Coastguard Worker } 322*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 323*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o, v0_low, 0); 324*4bdc9457SAndroid Build Coastguard Worker $else: 325*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS}; 326*4bdc9457SAndroid Build Coastguard Worker $else: 327*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 328*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 32: 329*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32(o${N}, v${N}_low, 0); 330*4bdc9457SAndroid Build Coastguard Worker $else: 331*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>NUM_D_REGISTERS}; 332*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > NUM_D_REGISTERS: 333*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 334*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 335*4bdc9457SAndroid Build Coastguard Worker v${N}_low = vext_u16(v${N}_low, v${N}_low, 2); 336*4bdc9457SAndroid Build Coastguard Worker $else: 337*4bdc9457SAndroid Build Coastguard Worker v${N}_low = vext_u8(v${N}_low, v${N}_low, 4); 338*4bdc9457SAndroid Build Coastguard Worker } 339*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>NUM_D_REGISTERS: 340*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>(NUM_D_REGISTERS+1)}) { 341*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 342*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 343*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 344*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 345*4bdc9457SAndroid Build Coastguard Worker case ${N}: 346*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 347*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 348*4bdc9457SAndroid Build Coastguard Worker $else: 349*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 350*4bdc9457SAndroid Build Coastguard Worker case 1: 351*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 352*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(oN, v1_low, 0); 353*4bdc9457SAndroid Build Coastguard Worker $else: 354*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0); 355*4bdc9457SAndroid Build Coastguard Worker case 0: 356*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 357*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o, v0_low, 0); 358*4bdc9457SAndroid Build Coastguard Worker $else: 359*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>(NUM_D_REGISTERS+1): 360*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)}; 361*4bdc9457SAndroid Build Coastguard Worker $else: 362*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); 363*4bdc9457SAndroid Build Coastguard Worker break; 364*4bdc9457SAndroid Build Coastguard Worker default: 365*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 366*4bdc9457SAndroid Build Coastguard Worker } 367*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 368*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 369*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 370*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0); 371*4bdc9457SAndroid Build Coastguard Worker $else: 372*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0); 373*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 374*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 375*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 376*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 377*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 378*4bdc9457SAndroid Build Coastguard Worker o = oN; 379*4bdc9457SAndroid Build Coastguard Worker $else: 380*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 381*4bdc9457SAndroid Build Coastguard Worker } 382*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 383*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o, v${N}_low, 0); 384*4bdc9457SAndroid Build Coastguard Worker $else: 385*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0); 386*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 387*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 388*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 389*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 390*4bdc9457SAndroid Build Coastguard Worker o = oN; 391*4bdc9457SAndroid Build Coastguard Worker $else: 392*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 393*4bdc9457SAndroid Build Coastguard Worker } 394*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 395*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o, v${N-1}_low, 0); 396*4bdc9457SAndroid Build Coastguard Worker $else: 397*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0); 398*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 399*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 400*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 401*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 402*4bdc9457SAndroid Build Coastguard Worker o = oN; 403*4bdc9457SAndroid Build Coastguard Worker $else: 404*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 405*4bdc9457SAndroid Build Coastguard Worker } 406*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 407*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o, v0_low, 0); 408*4bdc9457SAndroid Build Coastguard Worker $else: 409*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)}; 410*4bdc9457SAndroid Build Coastguard Worker $else: 411*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 412*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 413*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(o${N}, v${N}_low, 0); 414*4bdc9457SAndroid Build Coastguard Worker $else: 415*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>(NUM_D_REGISTERS+1)}; 416*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>(NUM_D_REGISTERS+1): 417*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 418*4bdc9457SAndroid Build Coastguard Worker v${N}_low = vext_u8(v${N}_low, v${N}_low, 2); 419*4bdc9457SAndroid Build Coastguard Worker } 420*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 8: 421*4bdc9457SAndroid Build Coastguard Worker if (bh & 1) { 422*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 423*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 424*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 425*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 426*4bdc9457SAndroid Build Coastguard Worker case ${N}: 427*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 428*4bdc9457SAndroid Build Coastguard Worker case 1: 429*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(oN, v1_low, 0); 430*4bdc9457SAndroid Build Coastguard Worker case 0: 431*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v0_low, 0); 432*4bdc9457SAndroid Build Coastguard Worker break; 433*4bdc9457SAndroid Build Coastguard Worker default: 434*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 435*4bdc9457SAndroid Build Coastguard Worker } 436*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 437*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 438*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0); 439*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 440*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 441*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 442*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 443*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 444*4bdc9457SAndroid Build Coastguard Worker o = oN; 445*4bdc9457SAndroid Build Coastguard Worker $else: 446*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 447*4bdc9457SAndroid Build Coastguard Worker } 448*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v${N}_low, 0); 449*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 450*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 451*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 452*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 453*4bdc9457SAndroid Build Coastguard Worker o = oN; 454*4bdc9457SAndroid Build Coastguard Worker $else: 455*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 456*4bdc9457SAndroid Build Coastguard Worker } 457*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v${N-1}_low, 0); 458*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 459*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 460*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 461*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 462*4bdc9457SAndroid Build Coastguard Worker o = oN; 463*4bdc9457SAndroid Build Coastguard Worker $else: 464*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 465*4bdc9457SAndroid Build Coastguard Worker } 466*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v0_low, 0); 467*4bdc9457SAndroid Build Coastguard Worker $else: 468*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 469*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o${N}, v${N}_low, 0); 470*4bdc9457SAndroid Build Coastguard Worker } 471*4bdc9457SAndroid Build Coastguard Worker } 472*4bdc9457SAndroid Build Coastguard Worker 473*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 474*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 475*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 476*4bdc9457SAndroid Build Coastguard Worker i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 477*4bdc9457SAndroid Build Coastguard Worker $else: 478*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 479*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 480*4bdc9457SAndroid Build Coastguard Worker o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset); 481*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 482*4bdc9457SAndroid Build Coastguard Worker o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset); 483*4bdc9457SAndroid Build Coastguard Worker $else: 484*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset); 485*4bdc9457SAndroid Build Coastguard Worker block_width = doz(block_width, tile_width); 486*4bdc9457SAndroid Build Coastguard Worker } while (block_width != 0); 487*4bdc9457SAndroid Build Coastguard Worker} 488