1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker$from itertools import chain 6*4bdc9457SAndroid Build Coastguard Worker$import math 7*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"] 8*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"] 9*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32] 10*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(128/SIZE) 11*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE)) 12*4bdc9457SAndroid Build Coastguard Worker$LO_PERM=str(list(chain.from_iterable((i, i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1] 13*4bdc9457SAndroid Build Coastguard Worker$HI_PERM=str(list(chain.from_iterable(((TILE_SIZE>>1)+i, (TILE_SIZE>>1)+i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1] 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker#include <wasm_simd128.h> 16*4bdc9457SAndroid Build Coastguard Worker 17*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 18*4bdc9457SAndroid Build Coastguard Worker 19*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h> 20*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 21*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h> 22*4bdc9457SAndroid Build Coastguard Worker 23*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_wasmsimd( 24*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* input, 25*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* output, 26*4bdc9457SAndroid Build Coastguard Worker size_t input_stride, 27*4bdc9457SAndroid Build Coastguard Worker size_t output_stride, 28*4bdc9457SAndroid Build Coastguard Worker size_t block_width, 29*4bdc9457SAndroid Build Coastguard Worker size_t block_height) XNN_OOB_READS 30*4bdc9457SAndroid Build Coastguard Worker{ 31*4bdc9457SAndroid Build Coastguard Worker assert(output_stride >= block_height * sizeof(uint${SIZE}_t)); 32*4bdc9457SAndroid Build Coastguard Worker assert(input_stride >= block_width * sizeof(uint${SIZE}_t)); 33*4bdc9457SAndroid Build Coastguard Worker 34*4bdc9457SAndroid Build Coastguard Worker const size_t tile_height = ${TILE_SIZE}; 35*4bdc9457SAndroid Build Coastguard Worker const size_t tile_width = ${TILE_SIZE}; 36*4bdc9457SAndroid Build Coastguard Worker const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t); 37*4bdc9457SAndroid Build Coastguard Worker const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t); 38*4bdc9457SAndroid Build Coastguard Worker const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; 39*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 40*4bdc9457SAndroid Build Coastguard Worker const size_t input_offset = tile_height * input_stride; 41*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS in ["MOV", "DEC"]: 42*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes; 43*4bdc9457SAndroid Build Coastguard Worker $else: 44*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t); 45*4bdc9457SAndroid Build Coastguard Worker 46*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 47*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 48*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 49*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 50*4bdc9457SAndroid Build Coastguard Worker $else: 51*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t* i0 = input; 52*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 53*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o0 = (uint${SIZE}_t*) output; 54*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 55*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride); 56*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS == "SWITCH": 57*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) output; 58*4bdc9457SAndroid Build Coastguard Worker $else: 59*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes); 60*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS != "MULTI": 61*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride; 62*4bdc9457SAndroid Build Coastguard Worker 63*4bdc9457SAndroid Build Coastguard Worker do { 64*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 65*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < 2) { 66*4bdc9457SAndroid Build Coastguard Worker o1 = o0; 67*4bdc9457SAndroid Build Coastguard Worker } 68*4bdc9457SAndroid Build Coastguard Worker $for N in range(2, TILE_SIZE, 2): 69*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width <= ${N}) { 70*4bdc9457SAndroid Build Coastguard Worker o${N} = o0; 71*4bdc9457SAndroid Build Coastguard Worker } 72*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width < ${N+2}) { 73*4bdc9457SAndroid Build Coastguard Worker o${N+1} = o0; 74*4bdc9457SAndroid Build Coastguard Worker } 75*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 76*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 77*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 78*4bdc9457SAndroid Build Coastguard Worker const size_t oN_offset = oN_stride + tile_hbytes; 79*4bdc9457SAndroid Build Coastguard Worker $else: 80*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 81*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride; 82*4bdc9457SAndroid Build Coastguard Worker size_t bh = block_height; 83*4bdc9457SAndroid Build Coastguard Worker for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) { 84*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 85*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 86*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i0); 87*4bdc9457SAndroid Build Coastguard Worker i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride); 88*4bdc9457SAndroid Build Coastguard Worker $else: 89*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 90*4bdc9457SAndroid Build Coastguard Worker i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset); 91*4bdc9457SAndroid Build Coastguard Worker 92*4bdc9457SAndroid Build Coastguard Worker $for M in range(NUM_ITERS): 93*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 94*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 95*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 96*4bdc9457SAndroid Build Coastguard Worker 97*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 98*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 99*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 100*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 101*4bdc9457SAndroid Build Coastguard Worker case ${N}: 102*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(oN, v0_${N}); 103*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 104*4bdc9457SAndroid Build Coastguard Worker case 1: 105*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(oN, v0_1); 106*4bdc9457SAndroid Build Coastguard Worker case 0: 107*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o, v0_0); 108*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 109*4bdc9457SAndroid Build Coastguard Worker break; 110*4bdc9457SAndroid Build Coastguard Worker default: 111*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 112*4bdc9457SAndroid Build Coastguard Worker } 113*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 114*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset); 115*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o, v0_${TILE_SIZE-1}); 116*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 117*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 118*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE-1, 2)): 119*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 120*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 121*4bdc9457SAndroid Build Coastguard Worker o = oN; 122*4bdc9457SAndroid Build Coastguard Worker $else: 123*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 124*4bdc9457SAndroid Build Coastguard Worker } 125*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o, v0_${N}); 126*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 127*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 128*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 129*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 130*4bdc9457SAndroid Build Coastguard Worker o = oN; 131*4bdc9457SAndroid Build Coastguard Worker $else: 132*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 133*4bdc9457SAndroid Build Coastguard Worker } 134*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o, v0_${N-1}); 135*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 136*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 137*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 138*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 139*4bdc9457SAndroid Build Coastguard Worker o = oN; 140*4bdc9457SAndroid Build Coastguard Worker $else: 141*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 142*4bdc9457SAndroid Build Coastguard Worker } 143*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o, v0_0); 144*4bdc9457SAndroid Build Coastguard Worker $else: 145*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 146*4bdc9457SAndroid Build Coastguard Worker wasm_v128_store(o${N}, v0_${N}); 147*4bdc9457SAndroid Build Coastguard Worker o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes); 148*4bdc9457SAndroid Build Coastguard Worker } 149*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS in ["MOV", "DEC"]: 150*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 151*4bdc9457SAndroid Build Coastguard Worker 152*4bdc9457SAndroid Build Coastguard Worker if (bh != 0) { 153*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "REUSE": 154*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0); 155*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 156*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 157*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 158*4bdc9457SAndroid Build Coastguard Worker i${N} = i${N-1}; 159*4bdc9457SAndroid Build Coastguard Worker } 160*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 161*4bdc9457SAndroid Build Coastguard Worker const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride); 162*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 163*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i${N}; 164*4bdc9457SAndroid Build Coastguard Worker } 165*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1}); 166*4bdc9457SAndroid Build Coastguard Worker $else: 167*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0); 168*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE - 1, 2): 169*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < ${N+1}) { 170*4bdc9457SAndroid Build Coastguard Worker i${N} = i0; 171*4bdc9457SAndroid Build Coastguard Worker } 172*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 173*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= ${N+1}) { 174*4bdc9457SAndroid Build Coastguard Worker i${N+1} = i0; 175*4bdc9457SAndroid Build Coastguard Worker } 176*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1}); 177*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS}_${TILE_SIZE-1} = wasm_v128_xor(v${NUM_ITERS}_0, v${NUM_ITERS}_0); 178*4bdc9457SAndroid Build Coastguard Worker 179*4bdc9457SAndroid Build Coastguard Worker $for M in range(NUM_ITERS-1): 180*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 181*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 182*4bdc9457SAndroid Build Coastguard Worker const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 183*4bdc9457SAndroid Build Coastguard Worker 184*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE >> 1): 185*4bdc9457SAndroid Build Coastguard Worker v128_t v0_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 186*4bdc9457SAndroid Build Coastguard Worker v128_t v0_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 187*4bdc9457SAndroid Build Coastguard Worker 188*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>1}) { 189*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 190*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 191*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 192*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 193*4bdc9457SAndroid Build Coastguard Worker case ${N}: 194*4bdc9457SAndroid Build Coastguard Worker *((double*) oN) = wasm_f64x2_extract_lane(v0_${N}, 0); 195*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 196*4bdc9457SAndroid Build Coastguard Worker case 1: 197*4bdc9457SAndroid Build Coastguard Worker *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0); 198*4bdc9457SAndroid Build Coastguard Worker case 0: 199*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 200*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 201*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>1}; 202*4bdc9457SAndroid Build Coastguard Worker $else: 203*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 204*4bdc9457SAndroid Build Coastguard Worker break; 205*4bdc9457SAndroid Build Coastguard Worker default: 206*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 207*4bdc9457SAndroid Build Coastguard Worker } 208*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 209*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 210*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_${TILE_SIZE-1}, 0); 211*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 212*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 213*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 214*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 215*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 216*4bdc9457SAndroid Build Coastguard Worker o = oN; 217*4bdc9457SAndroid Build Coastguard Worker $else: 218*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 219*4bdc9457SAndroid Build Coastguard Worker } 220*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_${N}, 0); 221*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 222*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 223*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 224*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 225*4bdc9457SAndroid Build Coastguard Worker o = oN; 226*4bdc9457SAndroid Build Coastguard Worker $else: 227*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 228*4bdc9457SAndroid Build Coastguard Worker } 229*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_${N-1}, 0); 230*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 231*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 232*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 233*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 234*4bdc9457SAndroid Build Coastguard Worker o = oN; 235*4bdc9457SAndroid Build Coastguard Worker $else: 236*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 237*4bdc9457SAndroid Build Coastguard Worker } 238*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 239*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 240*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>1}; 241*4bdc9457SAndroid Build Coastguard Worker $else: 242*4bdc9457SAndroid Build Coastguard Worker *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 243*4bdc9457SAndroid Build Coastguard Worker $else: 244*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 245*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>1: 246*4bdc9457SAndroid Build Coastguard Worker *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0); 247*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>1}; 248*4bdc9457SAndroid Build Coastguard Worker $else: 249*4bdc9457SAndroid Build Coastguard Worker *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0); 250*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 1: 251*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 252*4bdc9457SAndroid Build Coastguard Worker v0_${N} = wasm_v64x2_shuffle(v0_${N}, v0_${N}, 1, 1); 253*4bdc9457SAndroid Build Coastguard Worker } 254*4bdc9457SAndroid Build Coastguard Worker 255*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>1: 256*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>2}) { 257*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 258*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 259*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 260*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 261*4bdc9457SAndroid Build Coastguard Worker case ${N}: 262*4bdc9457SAndroid Build Coastguard Worker *((float*) oN) = wasm_f32x4_extract_lane(v0_${N}, 0); 263*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 264*4bdc9457SAndroid Build Coastguard Worker case 1: 265*4bdc9457SAndroid Build Coastguard Worker *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0); 266*4bdc9457SAndroid Build Coastguard Worker case 0: 267*4bdc9457SAndroid Build Coastguard Worker *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0); 268*4bdc9457SAndroid Build Coastguard Worker $if SIZE < 32: 269*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>2}; 270*4bdc9457SAndroid Build Coastguard Worker break; 271*4bdc9457SAndroid Build Coastguard Worker default: 272*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 273*4bdc9457SAndroid Build Coastguard Worker } 274*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 275*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 276*4bdc9457SAndroid Build Coastguard Worker *((float*) o) = wasm_f32x4_extract_lane(v0_${TILE_SIZE-1}, 0); 277*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 278*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 279*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 280*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 281*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 282*4bdc9457SAndroid Build Coastguard Worker o = oN; 283*4bdc9457SAndroid Build Coastguard Worker $else: 284*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 285*4bdc9457SAndroid Build Coastguard Worker } 286*4bdc9457SAndroid Build Coastguard Worker *((float*) o) = wasm_f32x4_extract_lane(v0_${N}, 0); 287*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 288*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 289*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 290*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 291*4bdc9457SAndroid Build Coastguard Worker o = oN; 292*4bdc9457SAndroid Build Coastguard Worker $else: 293*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 294*4bdc9457SAndroid Build Coastguard Worker } 295*4bdc9457SAndroid Build Coastguard Worker *((float*) o) = wasm_f32x4_extract_lane(v0_${N-1}, 0); 296*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 297*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 298*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 299*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 300*4bdc9457SAndroid Build Coastguard Worker o = oN; 301*4bdc9457SAndroid Build Coastguard Worker $else: 302*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 303*4bdc9457SAndroid Build Coastguard Worker } 304*4bdc9457SAndroid Build Coastguard Worker *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0); 305*4bdc9457SAndroid Build Coastguard Worker $if SIZE < 32: 306*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>2}; 307*4bdc9457SAndroid Build Coastguard Worker $else: 308*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 309*4bdc9457SAndroid Build Coastguard Worker *((float*) o${N}) = wasm_f32x4_extract_lane(v0_${N}, 0); 310*4bdc9457SAndroid Build Coastguard Worker $if SIZE < 32: 311*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>2}; 312*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS > 2: 313*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 314*4bdc9457SAndroid Build Coastguard Worker v0_${N} = wasm_u64x2_shr(v0_${N}, 32); 315*4bdc9457SAndroid Build Coastguard Worker } 316*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>2: 317*4bdc9457SAndroid Build Coastguard Worker if (bh & ${TILE_SIZE>>3}) { 318*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 319*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 320*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 321*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 322*4bdc9457SAndroid Build Coastguard Worker case ${N}: 323*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 324*4bdc9457SAndroid Build Coastguard Worker *oN = wasm_i16x8_extract_lane(v0_${N}, 0); 325*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 326*4bdc9457SAndroid Build Coastguard Worker $else: 327*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_${N}, 0); 328*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 329*4bdc9457SAndroid Build Coastguard Worker case 1: 330*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 331*4bdc9457SAndroid Build Coastguard Worker *oN = wasm_i16x8_extract_lane(v0_1, 0); 332*4bdc9457SAndroid Build Coastguard Worker $else: 333*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0); 334*4bdc9457SAndroid Build Coastguard Worker case 0: 335*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 336*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i16x8_extract_lane(v0_0, 0); 337*4bdc9457SAndroid Build Coastguard Worker $else: 338*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0); 339*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>3}; 340*4bdc9457SAndroid Build Coastguard Worker break; 341*4bdc9457SAndroid Build Coastguard Worker default: 342*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 343*4bdc9457SAndroid Build Coastguard Worker } 344*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 345*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 346*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 347*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0); 348*4bdc9457SAndroid Build Coastguard Worker $else: 349*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0); 350*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 351*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 352*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 353*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 354*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 355*4bdc9457SAndroid Build Coastguard Worker o = oN; 356*4bdc9457SAndroid Build Coastguard Worker $else: 357*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 358*4bdc9457SAndroid Build Coastguard Worker } 359*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 360*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i16x8_extract_lane(v0_${N}, 0); 361*4bdc9457SAndroid Build Coastguard Worker $else: 362*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N}, 0); 363*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 364*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 365*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 366*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 367*4bdc9457SAndroid Build Coastguard Worker o = oN; 368*4bdc9457SAndroid Build Coastguard Worker $else: 369*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 370*4bdc9457SAndroid Build Coastguard Worker } 371*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 372*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i16x8_extract_lane(v0_${N-1}, 0); 373*4bdc9457SAndroid Build Coastguard Worker $else: 374*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N-1}, 0); 375*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 376*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 377*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 378*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 379*4bdc9457SAndroid Build Coastguard Worker o = oN; 380*4bdc9457SAndroid Build Coastguard Worker $else: 381*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 382*4bdc9457SAndroid Build Coastguard Worker } 383*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 384*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i16x8_extract_lane(v0_0, 0); 385*4bdc9457SAndroid Build Coastguard Worker $else: 386*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0); 387*4bdc9457SAndroid Build Coastguard Worker o += ${TILE_SIZE>>3}; 388*4bdc9457SAndroid Build Coastguard Worker $else: 389*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 390*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 16: 391*4bdc9457SAndroid Build Coastguard Worker *o${N} = wasm_i16x8_extract_lane(v0_${N}, 0); 392*4bdc9457SAndroid Build Coastguard Worker $else: 393*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o${N}) = wasm_i16x8_extract_lane(v0_${N}, 0); 394*4bdc9457SAndroid Build Coastguard Worker o${N} += ${TILE_SIZE>>3}; 395*4bdc9457SAndroid Build Coastguard Worker $if NUM_ITERS>3: 396*4bdc9457SAndroid Build Coastguard Worker $for N in range(TILE_SIZE): 397*4bdc9457SAndroid Build Coastguard Worker v0_${N} = wasm_u32x4_shr(v0_${N}, 16); 398*4bdc9457SAndroid Build Coastguard Worker } 399*4bdc9457SAndroid Build Coastguard Worker $if SIZE == 8: 400*4bdc9457SAndroid Build Coastguard Worker if (bh & 1) { 401*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "SWITCH": 402*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 403*4bdc9457SAndroid Build Coastguard Worker switch (rem) { 404*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE)): 405*4bdc9457SAndroid Build Coastguard Worker case ${N}: 406*4bdc9457SAndroid Build Coastguard Worker *oN = wasm_i8x16_extract_lane(v0_${N}, 0); 407*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 408*4bdc9457SAndroid Build Coastguard Worker case 1: 409*4bdc9457SAndroid Build Coastguard Worker *oN = wasm_i8x16_extract_lane(v0_1, 0); 410*4bdc9457SAndroid Build Coastguard Worker case 0: 411*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i8x16_extract_lane(v0_0, 0); 412*4bdc9457SAndroid Build Coastguard Worker break; 413*4bdc9457SAndroid Build Coastguard Worker default: 414*4bdc9457SAndroid Build Coastguard Worker XNN_UNREACHABLE; 415*4bdc9457SAndroid Build Coastguard Worker } 416*4bdc9457SAndroid Build Coastguard Worker $elif OUT_PTRS in ["MOV", "DEC"]: 417*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 418*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i8x16_extract_lane(v0_${TILE_SIZE-1}, 0); 419*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 420*4bdc9457SAndroid Build Coastguard Worker uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 421*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(2, TILE_SIZE, 2)): 422*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > ${N+1}) { 423*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 424*4bdc9457SAndroid Build Coastguard Worker o = oN; 425*4bdc9457SAndroid Build Coastguard Worker $else: 426*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 427*4bdc9457SAndroid Build Coastguard Worker } 428*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i8x16_extract_lane(v0_${N}, 0); 429*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 430*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 431*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 432*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 433*4bdc9457SAndroid Build Coastguard Worker o = oN; 434*4bdc9457SAndroid Build Coastguard Worker $else: 435*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 436*4bdc9457SAndroid Build Coastguard Worker } 437*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i8x16_extract_lane(v0_${N-1}, 0); 438*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 439*4bdc9457SAndroid Build Coastguard Worker oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 440*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) { 441*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MOV": 442*4bdc9457SAndroid Build Coastguard Worker o = oN; 443*4bdc9457SAndroid Build Coastguard Worker $else: 444*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 445*4bdc9457SAndroid Build Coastguard Worker } 446*4bdc9457SAndroid Build Coastguard Worker *o = wasm_i8x16_extract_lane(v0_0, 0); 447*4bdc9457SAndroid Build Coastguard Worker $else: 448*4bdc9457SAndroid Build Coastguard Worker $for N in reversed(range(TILE_SIZE)): 449*4bdc9457SAndroid Build Coastguard Worker *o${N} = wasm_i8x16_extract_lane(v0_${N}, 0); 450*4bdc9457SAndroid Build Coastguard Worker } 451*4bdc9457SAndroid Build Coastguard Worker } 452*4bdc9457SAndroid Build Coastguard Worker 453*4bdc9457SAndroid Build Coastguard Worker $if IN_PTRS == "MULTI": 454*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 455*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 456*4bdc9457SAndroid Build Coastguard Worker i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 457*4bdc9457SAndroid Build Coastguard Worker $else: 458*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 459*4bdc9457SAndroid Build Coastguard Worker $if OUT_PTRS == "MULTI": 460*4bdc9457SAndroid Build Coastguard Worker o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset); 461*4bdc9457SAndroid Build Coastguard Worker $for N in range(1, TILE_SIZE): 462*4bdc9457SAndroid Build Coastguard Worker o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset); 463*4bdc9457SAndroid Build Coastguard Worker $else: 464*4bdc9457SAndroid Build Coastguard Worker o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset); 465*4bdc9457SAndroid Build Coastguard Worker block_width = doz(block_width, tile_width); 466*4bdc9457SAndroid Build Coastguard Worker } while (block_width != 0); 467*4bdc9457SAndroid Build Coastguard Worker} 468