// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
$import math
$assert IN_PTRS in ["MULTI", "REUSE"]
$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
$assert SIZE in [8, 16, 32]
$assert VECTOR_SIZE in [64, 128]
$TILE_SIZE = int(VECTOR_SIZE/SIZE)
$NUM_ITERS = int(math.log2(TILE_SIZE))
$SUFFIX = ''
$NUM_D_REGISTERS = int(VECTOR_SIZE/64)
$if VECTOR_SIZE == 128:
$  SUFFIX = 'q'

#include <arm_neon.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
    const uint${SIZE}_t* input,
    uint${SIZE}_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));

  const size_t tile_height = ${TILE_SIZE};
  const size_t tile_width = ${TILE_SIZE};
  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  $if IN_PTRS == "MULTI":
    const size_t input_offset = tile_height * input_stride;
  $if OUT_PTRS in ["MOV", "DEC"]:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
  $else:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);

  $if IN_PTRS == "MULTI":
    const uint${SIZE}_t* i0 = input;
    $for N in range(1, TILE_SIZE):
      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
  $else:
    const uint${SIZE}_t* i0 = input;
  $if OUT_PTRS == "MULTI":
    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
    $for N in range(1, TILE_SIZE):
      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
  $elif OUT_PTRS == "SWITCH":
    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
  $else:
    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
  $if OUT_PTRS == "SWITCH":
    $if int(VECTOR_SIZE/SIZE) > 2:
      const size_t minus_output_stride = -output_stride;
  $elif OUT_PTRS != "MULTI":
    const size_t minus_output_stride = -output_stride;

  do {
    $if OUT_PTRS == "MULTI":
      if XNN_UNPREDICTABLE(block_width < 2) {
        o1 = o0;
      }
      $for N in range(2, TILE_SIZE, 2):
        if XNN_UNPREDICTABLE(block_width <= ${N}) {
          o${N} = o0;
        }
        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
          o${N+1} = o0;
        }
    $elif OUT_PTRS in ["MOV", "DEC"]:
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
      const size_t oN_offset = oN_stride + tile_hbytes;
    $else:
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
      $for N in range(TILE_SIZE):
        $if IN_PTRS == "REUSE":
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
        $else:
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);

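      // ${NUM_ITERS} round(s) of pairwise vzip interleave the ${TILE_SIZE} loaded rows; after the
      // final round v0_*.val[0]/val[1] each hold one row of the transposed tile.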
      $for N in range(TILE_SIZE >> 1):
        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      $if OUT_PTRS == "SWITCH":
        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          $for N in reversed(range(2, TILE_SIZE)):
            case ${N}:
              vst1${SUFFIX}_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1${SUFFIX}_u${SIZE}(oN, v0_0.val[1]);
          case 0:
            vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
            break;
          default:
            XNN_UNREACHABLE;
        }
      $elif OUT_PTRS in ["MOV", "DEC"]:
        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
        vst1${SUFFIX}_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
        $if OUT_PTRS == "MOV":
          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        $for N in reversed(range(2, TILE_SIZE, 2)):
          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1${SUFFIX}_u${SIZE}(o, v0_${N>>1}.val[0]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1${SUFFIX}_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          $if OUT_PTRS == "MOV":
            o = oN;
          $else:
            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        }
        vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]);
      $else:
        $for N in reversed(range(TILE_SIZE)):
          vst1${SUFFIX}_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
    }
    $if OUT_PTRS in ["MOV", "DEC"]:
      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);

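    // Tail: bh (0 < bh < ${TILE_SIZE}) rows remain. Row pointers past bh are clamped and the last
    // row vector is zeroed before the same vzip transpose; each output row then receives exactly
    // bh elements, written in power-of-two sized pieces selected by the bits of bh.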
    if (bh != 0) {
      $if IN_PTRS == "REUSE":
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i${N-1};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i${N};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
      $else:
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmov${SUFFIX}_n_u${SIZE}(0);

      $for N in range(TILE_SIZE >> 1):
        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      $if VECTOR_SIZE == 128:
        $for N in range(TILE_SIZE):
          uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);

        if (bh & ${TILE_SIZE>>1}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                vst1_u${SIZE}(oN, v1_low);
              case 0:
                $if NUM_ITERS > 1:
                  vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
                $else:
                  vst1_u${SIZE}(o, v0_low);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_u${SIZE}(o, v${N}_low);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_u${SIZE}(o, v${N-1}_low);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if NUM_ITERS > 1:
              vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
            $else:
              vst1_u${SIZE}(o, v0_low);
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if NUM_ITERS > 1:
                vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
              $else:
                vst1_u${SIZE}(o${N}, v${N}_low);
          $if NUM_ITERS > 1:
            $for N in range(TILE_SIZE):
              v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
        }
      $else:
        $for N in range(TILE_SIZE):
          uint${SIZE}x${TILE_SIZE}_t v${N}_low = v0_${(N>>1)}.val[${N%2}];

      $if NUM_ITERS >= NUM_D_REGISTERS:
        if (bh & ${TILE_SIZE>>NUM_D_REGISTERS}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 32:
                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 32:
                  vst1_lane_u32(oN, v1_low, 0);
                $else:
                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 32:
                  vst1_lane_u32(o, v0_low, 0);
                $else:
                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 32:
              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 32:
              vst1_lane_u32(o, v0_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 32:
                vst1_lane_u32(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>NUM_D_REGISTERS};
          $if NUM_ITERS > NUM_D_REGISTERS:
            $for N in range(TILE_SIZE):
              $if SIZE == 16:
                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
              $else:
                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
        }
      $if NUM_ITERS > NUM_D_REGISTERS:
        if (bh & ${TILE_SIZE>>(NUM_D_REGISTERS+1)}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 16:
                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 16:
                  vst1_lane_u16(oN, v1_low, 0);
                $else:
                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 16:
                  vst1_lane_u16(o, v0_low, 0);
                $else:
                  $if NUM_ITERS > (NUM_D_REGISTERS+1):
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
                  $else:
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 16:
              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 16:
              vst1_lane_u16(o, v0_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 16:
                vst1_lane_u16(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
          $if NUM_ITERS > (NUM_D_REGISTERS+1):
            $for N in range(TILE_SIZE):
              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
        }
      $if SIZE == 8:
        if (bh & 1) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                vst1_lane_u8(oN, v1_low, 0);
              case 0:
                vst1_lane_u8(o, v0_low, 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N-1}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            vst1_lane_u8(o, v0_low, 0);
          $else:
            $for N in reversed(range(TILE_SIZE)):
              vst1_lane_u8(o${N}, v${N}_low, 0);
        }
    }

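    // This column of tiles is done: step the input pointer(s) to the next ${TILE_SIZE} source
    // columns and the output pointer(s) to the next ${TILE_SIZE} output rows.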
    $if IN_PTRS == "MULTI":
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
      $for N in range(1, TILE_SIZE):
        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
    $else:
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
    $if OUT_PTRS == "MULTI":
      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
      $for N in range(1, TILE_SIZE):
        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
    $else:
      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}