xref: /aosp_15_r20/external/XNNPACK/src/x32-transposec/neon-zip.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5$import math
6$assert IN_PTRS in ["MULTI", "REUSE"]
7$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
8$assert SIZE in [8, 16, 32]
9$assert VECTOR_SIZE in [64, 128]
10$TILE_SIZE = int(VECTOR_SIZE/SIZE)
11$NUM_ITERS = int(math.log2(TILE_SIZE))
12$SUFFIX = ''
13$NUM_D_REGISTERS=int(VECTOR_SIZE/64)
14$if VECTOR_SIZE == 128:
15$ SUFFIX = 'q'
16
17#include <arm_neon.h>
18
19#include <assert.h>
20
21#include <xnnpack/common.h>
22#include <xnnpack/math.h>
23#include <xnnpack/transpose.h>
24
// Transposes a block_height x block_width block of SIZE-bit elements using
// NEON zip instructions, working on square tiles of TILE_SIZE x TILE_SIZE
// elements (TILE_SIZE = VECTOR_SIZE / SIZE).
//
// Parameters:
//   input         - first element of the source block.
//   output        - first element of the destination block.
//   input_stride  - distance between consecutive input rows, in bytes (it is
//                   added to pointers through uintptr_t and asserted to be at
//                   least block_width elements wide).
//   output_stride - distance between consecutive output rows, in bytes
//                   (asserted to be at least block_height elements wide).
//   block_width   - number of input columns (= output rows), in elements.
//   block_height  - number of input rows (= output columns), in elements.
//
// Template variants:
//   IN_PTRS  == MULTI  keeps one pointer per input row of the tile;
//   IN_PTRS  == REUSE  walks a single input pointer row by row.
//   OUT_PTRS == MULTI  keeps one pointer per output row of the tile;
//   OUT_PTRS == SWITCH uses one pointer plus a fall-through switch on the
//                      number of valid output rows;
//   OUT_PTRS == MOV / DEC use one pointer that starts at the last valid
//                      output row and steps backwards by output_stride.
//
// In every variant the stores are issued from the highest tile row down to
// row 0; when fewer than tile_width columns remain, the surplus rows alias a
// valid row and are overwritten by the later, valid stores.
//
// NOTE(review): the XNN_OOB_READS annotation presumably reflects that whole
// vectors are loaded even when fewer than tile_width columns remain - confirm
// against xnnpack/common.h.
25void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
26    const uint${SIZE}_t* input,
27    uint${SIZE}_t* output,
28    size_t input_stride,
29    size_t output_stride,
30    size_t block_width,
31    size_t block_height) XNN_OOB_READS
32{
33  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
34  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
35
36  const size_t tile_height = ${TILE_SIZE};
37  const size_t tile_width = ${TILE_SIZE};
38  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
39  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
  // input_reset is applied to i0 once per column strip: it undoes the rows
  // consumed by the full-tile loop and moves tile_wbytes to the next columns.
  // output_reset likewise moves the output down tile_width rows and back over
  // the elements already written (the final odd element never advances o).
40  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
41  $if IN_PTRS == "MULTI":
42    const size_t input_offset = tile_height * input_stride;
43  $if OUT_PTRS in ["MOV", "DEC"]:
44    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
45  $else:
46    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
47
48  $if IN_PTRS == "MULTI":
49    const uint${SIZE}_t* i0 = input;
50    $for N in range(1, TILE_SIZE):
51      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
52  $else:
53    const uint${SIZE}_t* i0 = input;
54  $if OUT_PTRS == "MULTI":
55    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
56    $for N in range(1, TILE_SIZE):
57      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
58  $elif OUT_PTRS == "SWITCH":
59    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
60  $else:
61    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
62  $if OUT_PTRS == "SWITCH":
63    $if int(VECTOR_SIZE/SIZE) > 2:
64      const size_t minus_output_stride = -output_stride;
65  $elif OUT_PTRS != "MULTI":
66    const size_t minus_output_stride = -output_stride;
67
  // Each pass of this loop handles one strip of up to tile_width input
  // columns; block_width is reduced by tile_width via doz() at the bottom.
  // NOTE(review): doz() comes from xnnpack/math.h; presumably
  // "difference or zero" - confirm.
68  do {
69    $if OUT_PTRS == "MULTI":
70      if XNN_UNPREDICTABLE(block_width < 2) {
71        o1 = o0;
72      }
73      $for N in range(2, TILE_SIZE, 2):
74        if XNN_UNPREDICTABLE(block_width <= ${N}) {
75          o${N} = o0;
76        }
77        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
78          o${N+1} = o0;
79        }
80    $elif OUT_PTRS in ["MOV", "DEC"]:
81      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
82      const size_t oN_stride = rem * output_stride;
83      const size_t oN_offset = oN_stride + tile_hbytes;
84    $else:
85      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
86      const size_t oN_stride = rem * output_stride;
87    size_t bh = block_height;
    // Full tiles: load tile_height input rows, interleave them with successive
    // rounds of zips, then store one full vector to each output row.
88    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
89      $for N in range(TILE_SIZE):
90        $if IN_PTRS == "REUSE":
91          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
92        $else:
93          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
94
95      $for N in range(TILE_SIZE >> 1):
96        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
97
98      $for M in range(1, NUM_ITERS):
99        $for N in range(TILE_SIZE >> 1):
100          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
101
102      $if OUT_PTRS == "SWITCH":
103        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
104        switch (rem) {
105          $for N in reversed(range(2, TILE_SIZE)):
106            case ${N}:
107              vst1${SUFFIX}_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
108          case 1:
109            vst1${SUFFIX}_u${SIZE}(oN, v0_0.val[1]);
110          case 0:
111            vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
112            break;
113          default:
114            XNN_UNREACHABLE;
115        }
116      $elif OUT_PTRS in ["MOV", "DEC"]:
117        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
118        vst1${SUFFIX}_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
119        $if OUT_PTRS == "MOV":
120          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
121        $for N in reversed(range(2, TILE_SIZE, 2)):
122          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
123            $if OUT_PTRS == "MOV":
124              o = oN;
125            $else:
126              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
127          }
128          vst1${SUFFIX}_u${SIZE}(o, v0_${N>>1}.val[0]);
129          $if OUT_PTRS == "MOV":
130            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
131          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
132            $if OUT_PTRS == "MOV":
133              o = oN;
134            $else:
135              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
136          }
137          vst1${SUFFIX}_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
138          $if OUT_PTRS == "MOV":
139            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
140        if XNN_UNPREDICTABLE(block_width > 1) {
141          $if OUT_PTRS == "MOV":
142            o = oN;
143          $else:
144            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
145        }
146        vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]);
147      $else:
148        $for N in reversed(range(TILE_SIZE)):
149          vst1${SUFFIX}_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
150    }
151    $if OUT_PTRS in ["MOV", "DEC"]:
152      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
153
    // Remainder: fewer than tile_height input rows are left. Row pointers that
    // would run past the remaining rows are redirected to an earlier valid row,
    // the last vector is zero-filled, and after the same zip network only bh
    // elements per output row are written, in progressively smaller pieces
    // selected by the bits of bh.
154    if (bh != 0) {
155      $if IN_PTRS == "REUSE":
156        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
157        $for N in range(1, TILE_SIZE - 1, 2):
158          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
159          if XNN_UNPREDICTABLE(bh < ${N+1}) {
160            i${N} = i${N-1};
161          }
162          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
163          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
164          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
165            i${N+1} = i${N};
166          }
167          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
168      $else:
169        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
170        $for N in range(1, TILE_SIZE - 1, 2):
171          if XNN_UNPREDICTABLE(bh < ${N+1}) {
172            i${N} = i0;
173          }
174          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
175          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
176            i${N+1} = i0;
177          }
178          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
179      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmov${SUFFIX}_n_u${SIZE}(0);
180
181      $for N in range(TILE_SIZE >> 1):
182          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});
183
184      $for M in range(1, NUM_ITERS):
185        $for N in range(TILE_SIZE >> 1):
186          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);
187
188      $if VECTOR_SIZE == 128:
189        $for N in range(TILE_SIZE):
190          uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);
191
192        if (bh & ${TILE_SIZE>>1}) {
193          $if OUT_PTRS == "SWITCH":
194            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
195            switch (rem) {
196              $for N in reversed(range(2, TILE_SIZE)):
197                case ${N}:
198                  vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
199              case 1:
200                vst1_u${SIZE}(oN, v1_low);
201              case 0:
202                $if NUM_ITERS > 1:
203                  vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
204                $else:
205                  vst1_u${SIZE}(o, v0_low);
206                break;
207              default:
208                XNN_UNREACHABLE;
209            }
210          $elif OUT_PTRS in ["MOV", "DEC"]:
211            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
212            vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
213            $if OUT_PTRS == "MOV":
214              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
215            $for N in reversed(range(2, TILE_SIZE, 2)):
216              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
217                $if OUT_PTRS == "MOV":
218                  o = oN;
219                $else:
220                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
221              }
222              vst1_u${SIZE}(o, v${N}_low);
223              $if OUT_PTRS == "MOV":
224                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
225              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
226                $if OUT_PTRS == "MOV":
227                  o = oN;
228                $else:
229                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
230              }
231              vst1_u${SIZE}(o, v${N-1}_low);
232              $if OUT_PTRS == "MOV":
233                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
234            if XNN_UNPREDICTABLE(block_width > 1) {
235              $if OUT_PTRS == "MOV":
236                o = oN;
237              $else:
238                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
239            }
240            $if NUM_ITERS > 1:
241              vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
242            $else:
243              vst1_u${SIZE}(o, v0_low);
244          $else:
245            $for N in reversed(range(TILE_SIZE)):
246              $if NUM_ITERS>1:
247                vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
248              $else:
249                vst1_u${SIZE}(o${N}, v${N}_low);
250          $if NUM_ITERS > 1:
251            $for N in range(TILE_SIZE):
252              v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
253        }
254      $else:
255        $for N in range(TILE_SIZE):
256          uint${SIZE}x${TILE_SIZE}_t v${N}_low = v0_${(N>>1)}.val[${N%2}];
257
258      $if NUM_ITERS>=NUM_D_REGISTERS:
259        if (bh & ${TILE_SIZE>>NUM_D_REGISTERS}) {
260          $if OUT_PTRS == "SWITCH":
261            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
262            switch (rem) {
263              $for N in reversed(range(2, TILE_SIZE)):
264                case ${N}:
265                  $if SIZE == 32:
266                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
267                  $else:
268                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
269              case 1:
270                $if SIZE == 32:
271                  vst1_lane_u32(oN, v1_low, 0);
272                $else:
273                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
274              case 0:
275                $if SIZE == 32:
276                  vst1_lane_u32(o, v0_low, 0);
277                $else:
278                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
279                break;
280              default:
281                XNN_UNREACHABLE;
282            }
283          $elif OUT_PTRS in ["MOV", "DEC"]:
284            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
285            $if SIZE == 32:
286              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
287            $else:
288              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
289            $if OUT_PTRS == "MOV":
290              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
291            $for N in reversed(range(2, TILE_SIZE, 2)):
292              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
293                $if OUT_PTRS == "MOV":
294                  o = oN;
295                $else:
296                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
297              }
298              $if SIZE == 32:
299                vst1_lane_u32(o, v${N}_low, 0);
300              $else:
301                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
302              $if OUT_PTRS == "MOV":
303                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
304              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
305                $if OUT_PTRS == "MOV":
306                  o = oN;
307                $else:
308                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
309              }
310              $if SIZE == 32:
311                vst1_lane_u32(o, v${N-1}_low, 0);
312              $else:
313                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
314              $if OUT_PTRS == "MOV":
315                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
316            if XNN_UNPREDICTABLE(block_width > 1) {
317              $if OUT_PTRS == "MOV":
318                o = oN;
319              $else:
320                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
321            }
322            $if SIZE == 32:
323              vst1_lane_u32(o, v0_low, 0);
324            $else:
325              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
326          $else:
327            $for N in reversed(range(TILE_SIZE)):
328              $if SIZE == 32:
329                vst1_lane_u32(o${N}, v${N}_low, 0);
330              $else:
331                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>NUM_D_REGISTERS};
332          $if NUM_ITERS > NUM_D_REGISTERS:
333            $for N in range(TILE_SIZE):
334              $if SIZE == 16:
335                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
336              $else:
337                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
338        }
339      $if NUM_ITERS>NUM_D_REGISTERS:
340        if (bh & ${TILE_SIZE>>(NUM_D_REGISTERS+1)}) {
341          $if OUT_PTRS == "SWITCH":
342            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
343            switch (rem) {
344              $for N in reversed(range(2, TILE_SIZE)):
345                case ${N}:
346                  $if SIZE == 16:
347                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
348                  $else:
349                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
350              case 1:
351                $if SIZE == 16:
352                  vst1_lane_u16(oN, v1_low, 0);
353                $else:
354                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
355              case 0:
356                $if SIZE == 16:
357                  vst1_lane_u16(o, v0_low, 0);
358                $else:
359                  $if NUM_ITERS>(NUM_D_REGISTERS+1):
360                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
361                  $else:
362                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
363                break;
364              default:
365                XNN_UNREACHABLE;
366            }
367          $elif OUT_PTRS in ["MOV", "DEC"]:
368            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
369            $if SIZE == 16:
370              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
371            $else:
372              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
373            $if OUT_PTRS == "MOV":
374              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
375            $for N in reversed(range(2, TILE_SIZE, 2)):
376              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
377                $if OUT_PTRS == "MOV":
378                  o = oN;
379                $else:
380                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
381              }
382              $if SIZE == 16:
383                vst1_lane_u16(o, v${N}_low, 0);
384              $else:
385                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
386              $if OUT_PTRS == "MOV":
387                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
388              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
389                $if OUT_PTRS == "MOV":
390                  o = oN;
391                $else:
392                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
393              }
394              $if SIZE == 16:
395                vst1_lane_u16(o, v${N-1}_low, 0);
396              $else:
397                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
398              $if OUT_PTRS == "MOV":
399                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
400            if XNN_UNPREDICTABLE(block_width > 1) {
401              $if OUT_PTRS == "MOV":
402                o = oN;
403              $else:
404                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
405            }
406            $if SIZE == 16:
407              vst1_lane_u16(o, v0_low, 0);
408            $else:
409              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
410          $else:
411            $for N in reversed(range(TILE_SIZE)):
412              $if SIZE == 16:
413                vst1_lane_u16(o${N}, v${N}_low, 0);
414              $else:
415                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
416          $if NUM_ITERS>(NUM_D_REGISTERS+1):
417            $for N in range(TILE_SIZE):
418              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
419        }
420      $if SIZE == 8:
421        if (bh & 1) {
422          $if OUT_PTRS == "SWITCH":
423            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
424            switch (rem) {
425              $for N in reversed(range(2, TILE_SIZE)):
426                case ${N}:
427                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
428              case 1:
429                vst1_lane_u8(oN, v1_low, 0);
430              case 0:
431                vst1_lane_u8(o, v0_low, 0);
432                break;
433              default:
434                XNN_UNREACHABLE;
435            }
436          $elif OUT_PTRS in ["MOV", "DEC"]:
437            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
438            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
439            $if OUT_PTRS == "MOV":
440              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
441            $for N in reversed(range(2, TILE_SIZE, 2)):
442              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
443                $if OUT_PTRS == "MOV":
444                  o = oN;
445                $else:
446                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
447              }
448              vst1_lane_u8(o, v${N}_low, 0);
449              $if OUT_PTRS == "MOV":
450                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
451              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
452                $if OUT_PTRS == "MOV":
453                  o = oN;
454                $else:
455                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
456              }
457              vst1_lane_u8(o, v${N-1}_low, 0);
458              $if OUT_PTRS == "MOV":
459                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
460            if XNN_UNPREDICTABLE(block_width > 1) {
461              $if OUT_PTRS == "MOV":
462                o = oN;
463              $else:
464                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
465            }
466            vst1_lane_u8(o, v0_low, 0);
467          $else:
468            $for N in reversed(range(TILE_SIZE)):
469              vst1_lane_u8(o${N}, v${N}_low, 0);
470        }
471    }
472
    // Move the input and output pointers to the next strip of columns.
473    $if IN_PTRS == "MULTI":
474      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
475      $for N in range(1, TILE_SIZE):
476        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
477    $else:
478      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
479    $if OUT_PTRS == "MULTI":
480      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
481      $for N in range(1, TILE_SIZE):
482        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
483    $else:
484      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
485    block_width = doz(block_width, tile_width);
486  } while (block_width != 0);
487}
488