xref: /aosp_15_r20/external/XNNPACK/src/x32-transposec/wasmsimd.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker$from itertools import chain
6*4bdc9457SAndroid Build Coastguard Worker$import math
7*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"]
8*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
9*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32]
10*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(128/SIZE)
11*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE))
12*4bdc9457SAndroid Build Coastguard Worker$LO_PERM=str(list(chain.from_iterable((i, i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1]
13*4bdc9457SAndroid Build Coastguard Worker$HI_PERM=str(list(chain.from_iterable(((TILE_SIZE>>1)+i, (TILE_SIZE>>1)+i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1]
14*4bdc9457SAndroid Build Coastguard Worker
15*4bdc9457SAndroid Build Coastguard Worker#include <wasm_simd128.h>
16*4bdc9457SAndroid Build Coastguard Worker
17*4bdc9457SAndroid Build Coastguard Worker#include <assert.h>
18*4bdc9457SAndroid Build Coastguard Worker
19*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h>
20*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h>
21*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h>
22*4bdc9457SAndroid Build Coastguard Worker
23*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_wasmsimd(
24*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* input,
25*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* output,
26*4bdc9457SAndroid Build Coastguard Worker    size_t input_stride,
27*4bdc9457SAndroid Build Coastguard Worker    size_t output_stride,
28*4bdc9457SAndroid Build Coastguard Worker    size_t block_width,
29*4bdc9457SAndroid Build Coastguard Worker    size_t block_height) XNN_OOB_READS
30*4bdc9457SAndroid Build Coastguard Worker{
31*4bdc9457SAndroid Build Coastguard Worker  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
32*4bdc9457SAndroid Build Coastguard Worker  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
33*4bdc9457SAndroid Build Coastguard Worker
34*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_height = ${TILE_SIZE};
35*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_width = ${TILE_SIZE};
36*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
37*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
38*4bdc9457SAndroid Build Coastguard Worker  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
39*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
40*4bdc9457SAndroid Build Coastguard Worker    const size_t input_offset = tile_height * input_stride;
41*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS in ["MOV", "DEC"]:
42*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
43*4bdc9457SAndroid Build Coastguard Worker  $else:
44*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
45*4bdc9457SAndroid Build Coastguard Worker
46*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
47*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
48*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
49*4bdc9457SAndroid Build Coastguard Worker      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
50*4bdc9457SAndroid Build Coastguard Worker  $else:
51*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
52*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "MULTI":
53*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
54*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
55*4bdc9457SAndroid Build Coastguard Worker      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
56*4bdc9457SAndroid Build Coastguard Worker  $elif OUT_PTRS == "SWITCH":
57*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
58*4bdc9457SAndroid Build Coastguard Worker  $else:
59*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
60*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS != "MULTI":
61*4bdc9457SAndroid Build Coastguard Worker    const size_t minus_output_stride = -output_stride;
62*4bdc9457SAndroid Build Coastguard Worker
63*4bdc9457SAndroid Build Coastguard Worker  do {
64*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
65*4bdc9457SAndroid Build Coastguard Worker      if XNN_UNPREDICTABLE(block_width < 2) {
66*4bdc9457SAndroid Build Coastguard Worker        o1 = o0;
67*4bdc9457SAndroid Build Coastguard Worker      }
68*4bdc9457SAndroid Build Coastguard Worker      $for N in range(2, TILE_SIZE, 2):
69*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width <= ${N}) {
70*4bdc9457SAndroid Build Coastguard Worker          o${N} = o0;
71*4bdc9457SAndroid Build Coastguard Worker        }
72*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
73*4bdc9457SAndroid Build Coastguard Worker          o${N+1} = o0;
74*4bdc9457SAndroid Build Coastguard Worker        }
75*4bdc9457SAndroid Build Coastguard Worker    $elif OUT_PTRS in ["MOV", "DEC"]:
76*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
77*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
78*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_offset = oN_stride + tile_hbytes;
79*4bdc9457SAndroid Build Coastguard Worker    $else:
80*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
81*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
82*4bdc9457SAndroid Build Coastguard Worker    size_t bh = block_height;
83*4bdc9457SAndroid Build Coastguard Worker    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
84*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE):
85*4bdc9457SAndroid Build Coastguard Worker        $if IN_PTRS == "REUSE":
86*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i0);
87*4bdc9457SAndroid Build Coastguard Worker          i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
88*4bdc9457SAndroid Build Coastguard Worker        $else:
89*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
90*4bdc9457SAndroid Build Coastguard Worker          i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
91*4bdc9457SAndroid Build Coastguard Worker
92*4bdc9457SAndroid Build Coastguard Worker      $for M in range(NUM_ITERS):
93*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
94*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM});
95*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM});
96*4bdc9457SAndroid Build Coastguard Worker
97*4bdc9457SAndroid Build Coastguard Worker      $if OUT_PTRS == "SWITCH":
98*4bdc9457SAndroid Build Coastguard Worker        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
99*4bdc9457SAndroid Build Coastguard Worker        switch (rem) {
100*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(2, TILE_SIZE)):
101*4bdc9457SAndroid Build Coastguard Worker            case ${N}:
102*4bdc9457SAndroid Build Coastguard Worker              wasm_v128_store(oN, v0_${N});
103*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
104*4bdc9457SAndroid Build Coastguard Worker          case 1:
105*4bdc9457SAndroid Build Coastguard Worker            wasm_v128_store(oN, v0_1);
106*4bdc9457SAndroid Build Coastguard Worker          case 0:
107*4bdc9457SAndroid Build Coastguard Worker            wasm_v128_store(o, v0_0);
108*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
109*4bdc9457SAndroid Build Coastguard Worker            break;
110*4bdc9457SAndroid Build Coastguard Worker          default:
111*4bdc9457SAndroid Build Coastguard Worker            XNN_UNREACHABLE;
112*4bdc9457SAndroid Build Coastguard Worker        }
113*4bdc9457SAndroid Build Coastguard Worker      $elif OUT_PTRS in ["MOV", "DEC"]:
114*4bdc9457SAndroid Build Coastguard Worker        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
115*4bdc9457SAndroid Build Coastguard Worker        wasm_v128_store(o, v0_${TILE_SIZE-1});
116*4bdc9457SAndroid Build Coastguard Worker        $if OUT_PTRS == "MOV":
117*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
118*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(2, TILE_SIZE-1, 2)):
119*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
120*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
121*4bdc9457SAndroid Build Coastguard Worker              o = oN;
122*4bdc9457SAndroid Build Coastguard Worker            $else:
123*4bdc9457SAndroid Build Coastguard Worker              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
124*4bdc9457SAndroid Build Coastguard Worker          }
125*4bdc9457SAndroid Build Coastguard Worker          wasm_v128_store(o, v0_${N});
126*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
127*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
128*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
129*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
130*4bdc9457SAndroid Build Coastguard Worker              o = oN;
131*4bdc9457SAndroid Build Coastguard Worker            $else:
132*4bdc9457SAndroid Build Coastguard Worker              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
133*4bdc9457SAndroid Build Coastguard Worker          }
134*4bdc9457SAndroid Build Coastguard Worker          wasm_v128_store(o, v0_${N-1});
135*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
136*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
137*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width > 1) {
138*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
139*4bdc9457SAndroid Build Coastguard Worker            o = oN;
140*4bdc9457SAndroid Build Coastguard Worker          $else:
141*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
142*4bdc9457SAndroid Build Coastguard Worker        }
143*4bdc9457SAndroid Build Coastguard Worker        wasm_v128_store(o, v0_0);
144*4bdc9457SAndroid Build Coastguard Worker      $else:
145*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(TILE_SIZE)):
146*4bdc9457SAndroid Build Coastguard Worker          wasm_v128_store(o${N}, v0_${N});
147*4bdc9457SAndroid Build Coastguard Worker          o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
148*4bdc9457SAndroid Build Coastguard Worker    }
149*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS in ["MOV", "DEC"]:
150*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
151*4bdc9457SAndroid Build Coastguard Worker
152*4bdc9457SAndroid Build Coastguard Worker    if (bh != 0) {
153*4bdc9457SAndroid Build Coastguard Worker      $if IN_PTRS == "REUSE":
154*4bdc9457SAndroid Build Coastguard Worker        const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0);
155*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
156*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
157*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
158*4bdc9457SAndroid Build Coastguard Worker            i${N} = i${N-1};
159*4bdc9457SAndroid Build Coastguard Worker          }
160*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
161*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
162*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
163*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i${N};
164*4bdc9457SAndroid Build Coastguard Worker          }
165*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1});
166*4bdc9457SAndroid Build Coastguard Worker      $else:
167*4bdc9457SAndroid Build Coastguard Worker        const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0);
168*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
169*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
170*4bdc9457SAndroid Build Coastguard Worker            i${N} = i0;
171*4bdc9457SAndroid Build Coastguard Worker          }
172*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
173*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
174*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i0;
175*4bdc9457SAndroid Build Coastguard Worker          }
176*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1});
177*4bdc9457SAndroid Build Coastguard Worker      const v128_t v${NUM_ITERS}_${TILE_SIZE-1} = wasm_v128_xor(v${NUM_ITERS}_0, v${NUM_ITERS}_0);
178*4bdc9457SAndroid Build Coastguard Worker
179*4bdc9457SAndroid Build Coastguard Worker      $for M in range(NUM_ITERS-1):
180*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
181*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM});
182*4bdc9457SAndroid Build Coastguard Worker          const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM});
183*4bdc9457SAndroid Build Coastguard Worker
184*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE >> 1):
185*4bdc9457SAndroid Build Coastguard Worker        v128_t v0_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${LO_PERM});
186*4bdc9457SAndroid Build Coastguard Worker        v128_t v0_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${HI_PERM});
187*4bdc9457SAndroid Build Coastguard Worker
188*4bdc9457SAndroid Build Coastguard Worker      if (bh & ${TILE_SIZE>>1}) {
189*4bdc9457SAndroid Build Coastguard Worker        $if OUT_PTRS == "SWITCH":
190*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
191*4bdc9457SAndroid Build Coastguard Worker          switch (rem) {
192*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE)):
193*4bdc9457SAndroid Build Coastguard Worker              case ${N}:
194*4bdc9457SAndroid Build Coastguard Worker                *((double*) oN) = wasm_f64x2_extract_lane(v0_${N}, 0);
195*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
196*4bdc9457SAndroid Build Coastguard Worker            case 1:
197*4bdc9457SAndroid Build Coastguard Worker              *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
198*4bdc9457SAndroid Build Coastguard Worker            case 0:
199*4bdc9457SAndroid Build Coastguard Worker              $if NUM_ITERS > 1:
200*4bdc9457SAndroid Build Coastguard Worker                *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
201*4bdc9457SAndroid Build Coastguard Worker                o += ${TILE_SIZE>>1};
202*4bdc9457SAndroid Build Coastguard Worker              $else:
203*4bdc9457SAndroid Build Coastguard Worker                *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
204*4bdc9457SAndroid Build Coastguard Worker              break;
205*4bdc9457SAndroid Build Coastguard Worker            default:
206*4bdc9457SAndroid Build Coastguard Worker              XNN_UNREACHABLE;
207*4bdc9457SAndroid Build Coastguard Worker          }
208*4bdc9457SAndroid Build Coastguard Worker        $elif OUT_PTRS in ["MOV", "DEC"]:
209*4bdc9457SAndroid Build Coastguard Worker          o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
210*4bdc9457SAndroid Build Coastguard Worker          *((double*) o) = wasm_f64x2_extract_lane(v0_${TILE_SIZE-1}, 0);
211*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
212*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
213*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(2, TILE_SIZE, 2)):
214*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
215*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
216*4bdc9457SAndroid Build Coastguard Worker                o = oN;
217*4bdc9457SAndroid Build Coastguard Worker              $else:
218*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
219*4bdc9457SAndroid Build Coastguard Worker            }
220*4bdc9457SAndroid Build Coastguard Worker            *((double*) o) = wasm_f64x2_extract_lane(v0_${N}, 0);
221*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
222*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
223*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
224*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
225*4bdc9457SAndroid Build Coastguard Worker                o = oN;
226*4bdc9457SAndroid Build Coastguard Worker              $else:
227*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
228*4bdc9457SAndroid Build Coastguard Worker            }
229*4bdc9457SAndroid Build Coastguard Worker            *((double*) o) = wasm_f64x2_extract_lane(v0_${N-1}, 0);
230*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
231*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
232*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width > 1) {
233*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
234*4bdc9457SAndroid Build Coastguard Worker              o = oN;
235*4bdc9457SAndroid Build Coastguard Worker            $else:
236*4bdc9457SAndroid Build Coastguard Worker              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
237*4bdc9457SAndroid Build Coastguard Worker          }
238*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 1:
239*4bdc9457SAndroid Build Coastguard Worker            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
240*4bdc9457SAndroid Build Coastguard Worker            o += ${TILE_SIZE>>1};
241*4bdc9457SAndroid Build Coastguard Worker          $else:
242*4bdc9457SAndroid Build Coastguard Worker            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
243*4bdc9457SAndroid Build Coastguard Worker        $else:
244*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(TILE_SIZE)):
245*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS>1:
246*4bdc9457SAndroid Build Coastguard Worker              *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0);
247*4bdc9457SAndroid Build Coastguard Worker              o${N} += ${TILE_SIZE>>1};
248*4bdc9457SAndroid Build Coastguard Worker            $else:
249*4bdc9457SAndroid Build Coastguard Worker              *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0);
250*4bdc9457SAndroid Build Coastguard Worker        $if NUM_ITERS > 1:
251*4bdc9457SAndroid Build Coastguard Worker          $for N in range(TILE_SIZE):
252*4bdc9457SAndroid Build Coastguard Worker            v0_${N} = wasm_v64x2_shuffle(v0_${N}, v0_${N}, 1, 1);
253*4bdc9457SAndroid Build Coastguard Worker      }
254*4bdc9457SAndroid Build Coastguard Worker
255*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>1:
256*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>2}) {
257*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
258*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
259*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
260*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
261*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
262*4bdc9457SAndroid Build Coastguard Worker                  *((float*) oN) = wasm_f32x4_extract_lane(v0_${N}, 0);
263*4bdc9457SAndroid Build Coastguard Worker                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
264*4bdc9457SAndroid Build Coastguard Worker              case 1:
265*4bdc9457SAndroid Build Coastguard Worker                *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
266*4bdc9457SAndroid Build Coastguard Worker              case 0:
267*4bdc9457SAndroid Build Coastguard Worker                *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
268*4bdc9457SAndroid Build Coastguard Worker                $if SIZE < 32:
269*4bdc9457SAndroid Build Coastguard Worker                  o += ${TILE_SIZE>>2};
270*4bdc9457SAndroid Build Coastguard Worker                break;
271*4bdc9457SAndroid Build Coastguard Worker              default:
272*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
273*4bdc9457SAndroid Build Coastguard Worker            }
274*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
275*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
276*4bdc9457SAndroid Build Coastguard Worker            *((float*) o) = wasm_f32x4_extract_lane(v0_${TILE_SIZE-1}, 0);
277*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
278*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
279*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
280*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
281*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
282*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
283*4bdc9457SAndroid Build Coastguard Worker                $else:
284*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
285*4bdc9457SAndroid Build Coastguard Worker              }
286*4bdc9457SAndroid Build Coastguard Worker              *((float*) o) = wasm_f32x4_extract_lane(v0_${N}, 0);
287*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
288*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
289*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
290*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
291*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
292*4bdc9457SAndroid Build Coastguard Worker                $else:
293*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
294*4bdc9457SAndroid Build Coastguard Worker              }
295*4bdc9457SAndroid Build Coastguard Worker              *((float*) o) = wasm_f32x4_extract_lane(v0_${N-1}, 0);
296*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
297*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
298*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
299*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
300*4bdc9457SAndroid Build Coastguard Worker                o = oN;
301*4bdc9457SAndroid Build Coastguard Worker              $else:
302*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
303*4bdc9457SAndroid Build Coastguard Worker            }
304*4bdc9457SAndroid Build Coastguard Worker            *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
305*4bdc9457SAndroid Build Coastguard Worker            $if SIZE < 32:
306*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>2};
307*4bdc9457SAndroid Build Coastguard Worker          $else:
308*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
309*4bdc9457SAndroid Build Coastguard Worker              *((float*) o${N}) = wasm_f32x4_extract_lane(v0_${N}, 0);
310*4bdc9457SAndroid Build Coastguard Worker              $if SIZE < 32:
311*4bdc9457SAndroid Build Coastguard Worker                o${N} += ${TILE_SIZE>>2};
312*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 2:
313*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
314*4bdc9457SAndroid Build Coastguard Worker              v0_${N} = wasm_u64x2_shr(v0_${N}, 32);
315*4bdc9457SAndroid Build Coastguard Worker        }
316*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>2:
317*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>3}) {
318*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
319*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
320*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
321*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
322*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
323*4bdc9457SAndroid Build Coastguard Worker                  $if SIZE == 16:
324*4bdc9457SAndroid Build Coastguard Worker                    *oN = wasm_i16x8_extract_lane(v0_${N}, 0);
325*4bdc9457SAndroid Build Coastguard Worker                    oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
326*4bdc9457SAndroid Build Coastguard Worker                  $else:
327*4bdc9457SAndroid Build Coastguard Worker                    *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_${N}, 0);
328*4bdc9457SAndroid Build Coastguard Worker                    oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
329*4bdc9457SAndroid Build Coastguard Worker              case 1:
330*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 16:
331*4bdc9457SAndroid Build Coastguard Worker                  *oN = wasm_i16x8_extract_lane(v0_1, 0);
332*4bdc9457SAndroid Build Coastguard Worker                $else:
333*4bdc9457SAndroid Build Coastguard Worker                  *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0);
334*4bdc9457SAndroid Build Coastguard Worker              case 0:
335*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 16:
336*4bdc9457SAndroid Build Coastguard Worker                  *o = wasm_i16x8_extract_lane(v0_0, 0);
337*4bdc9457SAndroid Build Coastguard Worker                $else:
338*4bdc9457SAndroid Build Coastguard Worker                  *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
339*4bdc9457SAndroid Build Coastguard Worker                  o += ${TILE_SIZE>>3};
340*4bdc9457SAndroid Build Coastguard Worker                break;
341*4bdc9457SAndroid Build Coastguard Worker              default:
342*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
343*4bdc9457SAndroid Build Coastguard Worker            }
344*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
345*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
346*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 16:
347*4bdc9457SAndroid Build Coastguard Worker              *o = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0);
348*4bdc9457SAndroid Build Coastguard Worker            $else:
349*4bdc9457SAndroid Build Coastguard Worker              *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0);
350*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
351*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
352*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
353*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
354*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
355*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
356*4bdc9457SAndroid Build Coastguard Worker                $else:
357*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
358*4bdc9457SAndroid Build Coastguard Worker              }
359*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
360*4bdc9457SAndroid Build Coastguard Worker                *o = wasm_i16x8_extract_lane(v0_${N}, 0);
361*4bdc9457SAndroid Build Coastguard Worker              $else:
362*4bdc9457SAndroid Build Coastguard Worker                *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N}, 0);
363*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
364*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
365*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
366*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
367*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
368*4bdc9457SAndroid Build Coastguard Worker                $else:
369*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
370*4bdc9457SAndroid Build Coastguard Worker              }
371*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
372*4bdc9457SAndroid Build Coastguard Worker                *o = wasm_i16x8_extract_lane(v0_${N-1}, 0);
373*4bdc9457SAndroid Build Coastguard Worker              $else:
374*4bdc9457SAndroid Build Coastguard Worker                *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N-1}, 0);
375*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
376*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
377*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
378*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
379*4bdc9457SAndroid Build Coastguard Worker                o = oN;
380*4bdc9457SAndroid Build Coastguard Worker              $else:
381*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
382*4bdc9457SAndroid Build Coastguard Worker            }
383*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 16:
384*4bdc9457SAndroid Build Coastguard Worker              *o = wasm_i16x8_extract_lane(v0_0, 0);
385*4bdc9457SAndroid Build Coastguard Worker            $else:
386*4bdc9457SAndroid Build Coastguard Worker              *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
387*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>3};
388*4bdc9457SAndroid Build Coastguard Worker          $else:
389*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
390*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
391*4bdc9457SAndroid Build Coastguard Worker                *o${N} = wasm_i16x8_extract_lane(v0_${N}, 0);
392*4bdc9457SAndroid Build Coastguard Worker              $else:
393*4bdc9457SAndroid Build Coastguard Worker                *((uint16_t*) o${N}) = wasm_i16x8_extract_lane(v0_${N}, 0);
394*4bdc9457SAndroid Build Coastguard Worker                o${N} += ${TILE_SIZE>>3};
395*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS>3:
396*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
397*4bdc9457SAndroid Build Coastguard Worker              v0_${N} = wasm_u32x4_shr(v0_${N}, 16);
398*4bdc9457SAndroid Build Coastguard Worker        }
399*4bdc9457SAndroid Build Coastguard Worker      $if SIZE == 8:
400*4bdc9457SAndroid Build Coastguard Worker        if (bh & 1) {
401*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
402*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
403*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
404*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
405*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
406*4bdc9457SAndroid Build Coastguard Worker                  *oN = wasm_i8x16_extract_lane(v0_${N}, 0);
407*4bdc9457SAndroid Build Coastguard Worker                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
408*4bdc9457SAndroid Build Coastguard Worker              case 1:
409*4bdc9457SAndroid Build Coastguard Worker                *oN = wasm_i8x16_extract_lane(v0_1, 0);
410*4bdc9457SAndroid Build Coastguard Worker              case 0:
411*4bdc9457SAndroid Build Coastguard Worker                *o = wasm_i8x16_extract_lane(v0_0, 0);
412*4bdc9457SAndroid Build Coastguard Worker                break;
413*4bdc9457SAndroid Build Coastguard Worker              default:
414*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
415*4bdc9457SAndroid Build Coastguard Worker            }
416*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
417*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
418*4bdc9457SAndroid Build Coastguard Worker            *o = wasm_i8x16_extract_lane(v0_${TILE_SIZE-1}, 0);
419*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
420*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
421*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
422*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
423*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
424*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
425*4bdc9457SAndroid Build Coastguard Worker                $else:
426*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
427*4bdc9457SAndroid Build Coastguard Worker              }
428*4bdc9457SAndroid Build Coastguard Worker              *o = wasm_i8x16_extract_lane(v0_${N}, 0);
429*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
430*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
431*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
432*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
433*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
434*4bdc9457SAndroid Build Coastguard Worker                $else:
435*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
436*4bdc9457SAndroid Build Coastguard Worker              }
437*4bdc9457SAndroid Build Coastguard Worker              *o = wasm_i8x16_extract_lane(v0_${N-1}, 0);
438*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
439*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
440*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
441*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
442*4bdc9457SAndroid Build Coastguard Worker                o = oN;
443*4bdc9457SAndroid Build Coastguard Worker              $else:
444*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
445*4bdc9457SAndroid Build Coastguard Worker            }
446*4bdc9457SAndroid Build Coastguard Worker            *o = wasm_i8x16_extract_lane(v0_0, 0);
447*4bdc9457SAndroid Build Coastguard Worker          $else:
448*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
449*4bdc9457SAndroid Build Coastguard Worker              *o${N} = wasm_i8x16_extract_lane(v0_${N}, 0);
450*4bdc9457SAndroid Build Coastguard Worker        }
451*4bdc9457SAndroid Build Coastguard Worker    }
452*4bdc9457SAndroid Build Coastguard Worker
453*4bdc9457SAndroid Build Coastguard Worker    $if IN_PTRS == "MULTI":
454*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
455*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
456*4bdc9457SAndroid Build Coastguard Worker        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
457*4bdc9457SAndroid Build Coastguard Worker    $else:
458*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
459*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
460*4bdc9457SAndroid Build Coastguard Worker      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
461*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
462*4bdc9457SAndroid Build Coastguard Worker        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
463*4bdc9457SAndroid Build Coastguard Worker    $else:
464*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
465*4bdc9457SAndroid Build Coastguard Worker    block_width = doz(block_width, tile_width);
466*4bdc9457SAndroid Build Coastguard Worker  } while (block_width != 0);
467*4bdc9457SAndroid Build Coastguard Worker}
468