xref: /aosp_15_r20/external/XNNPACK/src/x32-transposec/neon-zip.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker//
3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker$import math
6*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"]
7*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
8*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32]
9*4bdc9457SAndroid Build Coastguard Worker$assert VECTOR_SIZE in [64, 128]
10*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(VECTOR_SIZE/SIZE)
11*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE))
12*4bdc9457SAndroid Build Coastguard Worker$SUFFIX = ''
13*4bdc9457SAndroid Build Coastguard Worker$NUM_D_REGISTERS=int(VECTOR_SIZE/64)
14*4bdc9457SAndroid Build Coastguard Worker$if VECTOR_SIZE == 128:
15*4bdc9457SAndroid Build Coastguard Worker$ SUFFIX = 'q'
16*4bdc9457SAndroid Build Coastguard Worker
17*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h>
18*4bdc9457SAndroid Build Coastguard Worker
19*4bdc9457SAndroid Build Coastguard Worker#include <assert.h>
20*4bdc9457SAndroid Build Coastguard Worker
21*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h>
22*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h>
23*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h>
24*4bdc9457SAndroid Build Coastguard Worker
25*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
26*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* input,
27*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* output,
28*4bdc9457SAndroid Build Coastguard Worker    size_t input_stride,
29*4bdc9457SAndroid Build Coastguard Worker    size_t output_stride,
30*4bdc9457SAndroid Build Coastguard Worker    size_t block_width,
31*4bdc9457SAndroid Build Coastguard Worker    size_t block_height) XNN_OOB_READS
32*4bdc9457SAndroid Build Coastguard Worker{
33*4bdc9457SAndroid Build Coastguard Worker  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
34*4bdc9457SAndroid Build Coastguard Worker  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
35*4bdc9457SAndroid Build Coastguard Worker
36*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_height = ${TILE_SIZE};
37*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_width = ${TILE_SIZE};
38*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
39*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
40*4bdc9457SAndroid Build Coastguard Worker  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
41*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
42*4bdc9457SAndroid Build Coastguard Worker    const size_t input_offset = tile_height * input_stride;
43*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS in ["MOV", "DEC"]:
44*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
45*4bdc9457SAndroid Build Coastguard Worker  $else:
46*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
47*4bdc9457SAndroid Build Coastguard Worker
48*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
49*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
50*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
51*4bdc9457SAndroid Build Coastguard Worker      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
52*4bdc9457SAndroid Build Coastguard Worker  $else:
53*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
54*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "MULTI":
55*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
56*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
57*4bdc9457SAndroid Build Coastguard Worker      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
58*4bdc9457SAndroid Build Coastguard Worker  $elif OUT_PTRS == "SWITCH":
59*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
60*4bdc9457SAndroid Build Coastguard Worker  $else:
61*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
62*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "SWITCH":
63*4bdc9457SAndroid Build Coastguard Worker    $if int(VECTOR_SIZE/SIZE) > 2:
64*4bdc9457SAndroid Build Coastguard Worker      const size_t minus_output_stride = -output_stride;
65*4bdc9457SAndroid Build Coastguard Worker  $elif OUT_PTRS != "MULTI":
66*4bdc9457SAndroid Build Coastguard Worker    const size_t minus_output_stride = -output_stride;
67*4bdc9457SAndroid Build Coastguard Worker
68*4bdc9457SAndroid Build Coastguard Worker  do {
69*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
70*4bdc9457SAndroid Build Coastguard Worker      if XNN_UNPREDICTABLE(block_width < 2) {
71*4bdc9457SAndroid Build Coastguard Worker        o1 = o0;
72*4bdc9457SAndroid Build Coastguard Worker      }
73*4bdc9457SAndroid Build Coastguard Worker      $for N in range(2, TILE_SIZE, 2):
74*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width <= ${N}) {
75*4bdc9457SAndroid Build Coastguard Worker          o${N} = o0;
76*4bdc9457SAndroid Build Coastguard Worker        }
77*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
78*4bdc9457SAndroid Build Coastguard Worker          o${N+1} = o0;
79*4bdc9457SAndroid Build Coastguard Worker        }
80*4bdc9457SAndroid Build Coastguard Worker    $elif OUT_PTRS in ["MOV", "DEC"]:
81*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
82*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
83*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_offset = oN_stride + tile_hbytes;
84*4bdc9457SAndroid Build Coastguard Worker    $else:
85*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
86*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
87*4bdc9457SAndroid Build Coastguard Worker    size_t bh = block_height;
88*4bdc9457SAndroid Build Coastguard Worker    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
89*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE):
90*4bdc9457SAndroid Build Coastguard Worker        $if IN_PTRS == "REUSE":
91*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
92*4bdc9457SAndroid Build Coastguard Worker        $else:
93*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
94*4bdc9457SAndroid Build Coastguard Worker
95*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE >> 1):
96*4bdc9457SAndroid Build Coastguard Worker        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
97*4bdc9457SAndroid Build Coastguard Worker
98*4bdc9457SAndroid Build Coastguard Worker      $for M in range(1, NUM_ITERS):
99*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
100*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
101*4bdc9457SAndroid Build Coastguard Worker
102*4bdc9457SAndroid Build Coastguard Worker      $if OUT_PTRS == "SWITCH":
103*4bdc9457SAndroid Build Coastguard Worker        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
104*4bdc9457SAndroid Build Coastguard Worker        switch (rem) {
105*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(2, TILE_SIZE)):
106*4bdc9457SAndroid Build Coastguard Worker            case ${N}:
107*4bdc9457SAndroid Build Coastguard Worker              vst1${SUFFIX}_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
108*4bdc9457SAndroid Build Coastguard Worker          case 1:
109*4bdc9457SAndroid Build Coastguard Worker            vst1${SUFFIX}_u${SIZE}(oN, v0_0.val[1]);
110*4bdc9457SAndroid Build Coastguard Worker          case 0:
111*4bdc9457SAndroid Build Coastguard Worker            vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
112*4bdc9457SAndroid Build Coastguard Worker            break;
113*4bdc9457SAndroid Build Coastguard Worker          default:
114*4bdc9457SAndroid Build Coastguard Worker            XNN_UNREACHABLE;
115*4bdc9457SAndroid Build Coastguard Worker        }
116*4bdc9457SAndroid Build Coastguard Worker      $elif OUT_PTRS in ["MOV", "DEC"]:
117*4bdc9457SAndroid Build Coastguard Worker        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
118*4bdc9457SAndroid Build Coastguard Worker        vst1${SUFFIX}_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
119*4bdc9457SAndroid Build Coastguard Worker        $if OUT_PTRS == "MOV":
120*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
121*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(2, TILE_SIZE, 2)):
122*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
123*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
124*4bdc9457SAndroid Build Coastguard Worker              o = oN;
125*4bdc9457SAndroid Build Coastguard Worker            $else:
126*4bdc9457SAndroid Build Coastguard Worker              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
127*4bdc9457SAndroid Build Coastguard Worker          }
128*4bdc9457SAndroid Build Coastguard Worker          vst1${SUFFIX}_u${SIZE}(o, v0_${N>>1}.val[0]);
129*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
130*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
131*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
132*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
133*4bdc9457SAndroid Build Coastguard Worker              o = oN;
134*4bdc9457SAndroid Build Coastguard Worker            $else:
135*4bdc9457SAndroid Build Coastguard Worker              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
136*4bdc9457SAndroid Build Coastguard Worker          }
137*4bdc9457SAndroid Build Coastguard Worker          vst1${SUFFIX}_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
138*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
139*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
140*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width > 1) {
141*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "MOV":
142*4bdc9457SAndroid Build Coastguard Worker            o = oN;
143*4bdc9457SAndroid Build Coastguard Worker          $else:
144*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
145*4bdc9457SAndroid Build Coastguard Worker        }
146*4bdc9457SAndroid Build Coastguard Worker        vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]);
147*4bdc9457SAndroid Build Coastguard Worker      $else:
148*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(TILE_SIZE)):
149*4bdc9457SAndroid Build Coastguard Worker          vst1${SUFFIX}_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
150*4bdc9457SAndroid Build Coastguard Worker    }
151*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS in ["MOV", "DEC"]:
152*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
153*4bdc9457SAndroid Build Coastguard Worker
154*4bdc9457SAndroid Build Coastguard Worker    if (bh != 0) {
155*4bdc9457SAndroid Build Coastguard Worker      $if IN_PTRS == "REUSE":
156*4bdc9457SAndroid Build Coastguard Worker        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
157*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
158*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
159*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
160*4bdc9457SAndroid Build Coastguard Worker            i${N} = i${N-1};
161*4bdc9457SAndroid Build Coastguard Worker          }
162*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
163*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
164*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
165*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i${N};
166*4bdc9457SAndroid Build Coastguard Worker          }
167*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
168*4bdc9457SAndroid Build Coastguard Worker      $else:
169*4bdc9457SAndroid Build Coastguard Worker        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
170*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
171*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
172*4bdc9457SAndroid Build Coastguard Worker            i${N} = i0;
173*4bdc9457SAndroid Build Coastguard Worker          }
174*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
175*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
176*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i0;
177*4bdc9457SAndroid Build Coastguard Worker          }
178*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
179*4bdc9457SAndroid Build Coastguard Worker      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmov${SUFFIX}_n_u${SIZE}(0);
180*4bdc9457SAndroid Build Coastguard Worker
181*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE >> 1):
182*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
183*4bdc9457SAndroid Build Coastguard Worker
184*4bdc9457SAndroid Build Coastguard Worker      $for M in range(1, NUM_ITERS):
185*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
186*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
187*4bdc9457SAndroid Build Coastguard Worker
188*4bdc9457SAndroid Build Coastguard Worker      $if VECTOR_SIZE == 128:
189*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE):
190*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);
191*4bdc9457SAndroid Build Coastguard Worker
192*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>1}) {
193*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
194*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
195*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
196*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
197*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
198*4bdc9457SAndroid Build Coastguard Worker                  vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
199*4bdc9457SAndroid Build Coastguard Worker              case 1:
200*4bdc9457SAndroid Build Coastguard Worker                vst1_u${SIZE}(oN, v1_low);
201*4bdc9457SAndroid Build Coastguard Worker              case 0:
202*4bdc9457SAndroid Build Coastguard Worker                $if NUM_ITERS > 1:
203*4bdc9457SAndroid Build Coastguard Worker                  vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
204*4bdc9457SAndroid Build Coastguard Worker                $else:
205*4bdc9457SAndroid Build Coastguard Worker                  vst1_u${SIZE}(o, v0_low);
206*4bdc9457SAndroid Build Coastguard Worker                break;
207*4bdc9457SAndroid Build Coastguard Worker              default:
208*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
209*4bdc9457SAndroid Build Coastguard Worker            }
210*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
211*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
212*4bdc9457SAndroid Build Coastguard Worker            vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
213*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
214*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
215*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
216*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
217*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
218*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
219*4bdc9457SAndroid Build Coastguard Worker                $else:
220*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
221*4bdc9457SAndroid Build Coastguard Worker              }
222*4bdc9457SAndroid Build Coastguard Worker              vst1_u${SIZE}(o, v${N}_low);
223*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
224*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
225*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
226*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
227*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
228*4bdc9457SAndroid Build Coastguard Worker                $else:
229*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
230*4bdc9457SAndroid Build Coastguard Worker              }
231*4bdc9457SAndroid Build Coastguard Worker              vst1_u${SIZE}(o, v${N-1}_low);
232*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
233*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
234*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
235*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
236*4bdc9457SAndroid Build Coastguard Worker                o = oN;
237*4bdc9457SAndroid Build Coastguard Worker              $else:
238*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
239*4bdc9457SAndroid Build Coastguard Worker            }
240*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS > 1:
241*4bdc9457SAndroid Build Coastguard Worker              vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
242*4bdc9457SAndroid Build Coastguard Worker            $else:
243*4bdc9457SAndroid Build Coastguard Worker              vst1_u${SIZE}(o, v0_low);
244*4bdc9457SAndroid Build Coastguard Worker          $else:
245*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
246*4bdc9457SAndroid Build Coastguard Worker              $if NUM_ITERS>1:
247*4bdc9457SAndroid Build Coastguard Worker                vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
248*4bdc9457SAndroid Build Coastguard Worker              $else:
249*4bdc9457SAndroid Build Coastguard Worker                vst1_u${SIZE}(o${N}, v${N}_low);
250*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 1:
251*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
252*4bdc9457SAndroid Build Coastguard Worker              v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
253*4bdc9457SAndroid Build Coastguard Worker        }
254*4bdc9457SAndroid Build Coastguard Worker      $else:
255*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE):
256*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}x${TILE_SIZE}_t v${N}_low = v0_${(N>>1)}.val[${N%2}];
257*4bdc9457SAndroid Build Coastguard Worker
258*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=NUM_D_REGISTERS:
259*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>NUM_D_REGISTERS}) {
260*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
261*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
262*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
263*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
264*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
265*4bdc9457SAndroid Build Coastguard Worker                  $if SIZE == 32:
266*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
267*4bdc9457SAndroid Build Coastguard Worker                  $else:
268*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
269*4bdc9457SAndroid Build Coastguard Worker              case 1:
270*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 32:
271*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u32(oN, v1_low, 0);
272*4bdc9457SAndroid Build Coastguard Worker                $else:
273*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
274*4bdc9457SAndroid Build Coastguard Worker              case 0:
275*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 32:
276*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u32(o, v0_low, 0);
277*4bdc9457SAndroid Build Coastguard Worker                $else:
278*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
279*4bdc9457SAndroid Build Coastguard Worker                break;
280*4bdc9457SAndroid Build Coastguard Worker              default:
281*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
282*4bdc9457SAndroid Build Coastguard Worker            }
283*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
284*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
285*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 32:
286*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
287*4bdc9457SAndroid Build Coastguard Worker            $else:
288*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
289*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
290*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
291*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
292*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
293*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
294*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
295*4bdc9457SAndroid Build Coastguard Worker                $else:
296*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
297*4bdc9457SAndroid Build Coastguard Worker              }
298*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 32:
299*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32(o, v${N}_low, 0);
300*4bdc9457SAndroid Build Coastguard Worker              $else:
301*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
302*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
303*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
304*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
305*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
306*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
307*4bdc9457SAndroid Build Coastguard Worker                $else:
308*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
309*4bdc9457SAndroid Build Coastguard Worker              }
310*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 32:
311*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32(o, v${N-1}_low, 0);
312*4bdc9457SAndroid Build Coastguard Worker              $else:
313*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
314*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
315*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
316*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
317*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
318*4bdc9457SAndroid Build Coastguard Worker                o = oN;
319*4bdc9457SAndroid Build Coastguard Worker              $else:
320*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
321*4bdc9457SAndroid Build Coastguard Worker            }
322*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 32:
323*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u32(o, v0_low, 0);
324*4bdc9457SAndroid Build Coastguard Worker            $else:
325*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
326*4bdc9457SAndroid Build Coastguard Worker          $else:
327*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
328*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 32:
329*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32(o${N}, v${N}_low, 0);
330*4bdc9457SAndroid Build Coastguard Worker              $else:
331*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>NUM_D_REGISTERS};
332*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > NUM_D_REGISTERS:
333*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
334*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
335*4bdc9457SAndroid Build Coastguard Worker                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
336*4bdc9457SAndroid Build Coastguard Worker              $else:
337*4bdc9457SAndroid Build Coastguard Worker                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
338*4bdc9457SAndroid Build Coastguard Worker        }
339*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>NUM_D_REGISTERS:
340*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>(NUM_D_REGISTERS+1)}) {
341*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
342*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
343*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
344*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
345*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
346*4bdc9457SAndroid Build Coastguard Worker                  $if SIZE == 16:
347*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
348*4bdc9457SAndroid Build Coastguard Worker                  $else:
349*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
350*4bdc9457SAndroid Build Coastguard Worker              case 1:
351*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 16:
352*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u16(oN, v1_low, 0);
353*4bdc9457SAndroid Build Coastguard Worker                $else:
354*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
355*4bdc9457SAndroid Build Coastguard Worker              case 0:
356*4bdc9457SAndroid Build Coastguard Worker                $if SIZE == 16:
357*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u16(o, v0_low, 0);
358*4bdc9457SAndroid Build Coastguard Worker                $else:
359*4bdc9457SAndroid Build Coastguard Worker                  $if NUM_ITERS>(NUM_D_REGISTERS+1):
360*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
361*4bdc9457SAndroid Build Coastguard Worker                  $else:
362*4bdc9457SAndroid Build Coastguard Worker                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
363*4bdc9457SAndroid Build Coastguard Worker                break;
364*4bdc9457SAndroid Build Coastguard Worker              default:
365*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
366*4bdc9457SAndroid Build Coastguard Worker            }
367*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
368*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
369*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 16:
370*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
371*4bdc9457SAndroid Build Coastguard Worker            $else:
372*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
373*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
374*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
375*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
376*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
377*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
378*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
379*4bdc9457SAndroid Build Coastguard Worker                $else:
380*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
381*4bdc9457SAndroid Build Coastguard Worker              }
382*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
383*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16(o, v${N}_low, 0);
384*4bdc9457SAndroid Build Coastguard Worker              $else:
385*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
386*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
387*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
388*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
389*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
390*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
391*4bdc9457SAndroid Build Coastguard Worker                $else:
392*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
393*4bdc9457SAndroid Build Coastguard Worker              }
394*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
395*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16(o, v${N-1}_low, 0);
396*4bdc9457SAndroid Build Coastguard Worker              $else:
397*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
398*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
399*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
400*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
401*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
402*4bdc9457SAndroid Build Coastguard Worker                o = oN;
403*4bdc9457SAndroid Build Coastguard Worker              $else:
404*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
405*4bdc9457SAndroid Build Coastguard Worker            }
406*4bdc9457SAndroid Build Coastguard Worker            $if SIZE == 16:
407*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u16(o, v0_low, 0);
408*4bdc9457SAndroid Build Coastguard Worker            $else:
409*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
410*4bdc9457SAndroid Build Coastguard Worker          $else:
411*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
412*4bdc9457SAndroid Build Coastguard Worker              $if SIZE == 16:
413*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16(o${N}, v${N}_low, 0);
414*4bdc9457SAndroid Build Coastguard Worker              $else:
415*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
416*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS>(NUM_D_REGISTERS+1):
417*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
418*4bdc9457SAndroid Build Coastguard Worker              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
419*4bdc9457SAndroid Build Coastguard Worker        }
420*4bdc9457SAndroid Build Coastguard Worker      $if SIZE == 8:
421*4bdc9457SAndroid Build Coastguard Worker        if (bh & 1) {
422*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
423*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
424*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
425*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
426*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
427*4bdc9457SAndroid Build Coastguard Worker                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
428*4bdc9457SAndroid Build Coastguard Worker              case 1:
429*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u8(oN, v1_low, 0);
430*4bdc9457SAndroid Build Coastguard Worker              case 0:
431*4bdc9457SAndroid Build Coastguard Worker                vst1_lane_u8(o, v0_low, 0);
432*4bdc9457SAndroid Build Coastguard Worker                break;
433*4bdc9457SAndroid Build Coastguard Worker              default:
434*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
435*4bdc9457SAndroid Build Coastguard Worker            }
436*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS in ["MOV", "DEC"]:
437*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
438*4bdc9457SAndroid Build Coastguard Worker            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
439*4bdc9457SAndroid Build Coastguard Worker            $if OUT_PTRS == "MOV":
440*4bdc9457SAndroid Build Coastguard Worker              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
441*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
442*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
443*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
444*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
445*4bdc9457SAndroid Build Coastguard Worker                $else:
446*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
447*4bdc9457SAndroid Build Coastguard Worker              }
448*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u8(o, v${N}_low, 0);
449*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
450*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
451*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
452*4bdc9457SAndroid Build Coastguard Worker                $if OUT_PTRS == "MOV":
453*4bdc9457SAndroid Build Coastguard Worker                  o = oN;
454*4bdc9457SAndroid Build Coastguard Worker                $else:
455*4bdc9457SAndroid Build Coastguard Worker                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
456*4bdc9457SAndroid Build Coastguard Worker              }
457*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u8(o, v${N-1}_low, 0);
458*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
459*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
460*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
461*4bdc9457SAndroid Build Coastguard Worker              $if OUT_PTRS == "MOV":
462*4bdc9457SAndroid Build Coastguard Worker                o = oN;
463*4bdc9457SAndroid Build Coastguard Worker              $else:
464*4bdc9457SAndroid Build Coastguard Worker                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
465*4bdc9457SAndroid Build Coastguard Worker            }
466*4bdc9457SAndroid Build Coastguard Worker            vst1_lane_u8(o, v0_low, 0);
467*4bdc9457SAndroid Build Coastguard Worker          $else:
468*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
469*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_u8(o${N}, v${N}_low, 0);
470*4bdc9457SAndroid Build Coastguard Worker        }
471*4bdc9457SAndroid Build Coastguard Worker    }
472*4bdc9457SAndroid Build Coastguard Worker
473*4bdc9457SAndroid Build Coastguard Worker    $if IN_PTRS == "MULTI":
474*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
475*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
476*4bdc9457SAndroid Build Coastguard Worker        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
477*4bdc9457SAndroid Build Coastguard Worker    $else:
478*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
479*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
480*4bdc9457SAndroid Build Coastguard Worker      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
481*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
482*4bdc9457SAndroid Build Coastguard Worker        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
483*4bdc9457SAndroid Build Coastguard Worker    $else:
484*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
485*4bdc9457SAndroid Build Coastguard Worker    block_width = doz(block_width, tile_width);
486*4bdc9457SAndroid Build Coastguard Worker  } while (block_width != 0);
487*4bdc9457SAndroid Build Coastguard Worker}
488