xref: /aosp_15_r20/external/XNNPACK/src/x8-zip/x3-neon.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <arm_neon.h>
10 
11 #include <xnnpack/zip.h>
12 
13 
// Interleaves three consecutive n-byte planes into a stream of 3-byte groups.
//
// The input is read as three back-to-back arrays of n bytes each:
//   x = input[0 .. n), y = input[n .. 2n), z = input[2n .. 3n).
// The output receives 3 * n bytes laid out as
//   x[0], y[0], z[0], x[1], y[1], z[1], ...
//
// Parameters:
//   n      - number of bytes in each of the three input planes.
//   input  - start of the first plane; the other two follow at +n and +2n.
//   output - destination for the 3 * n interleaved bytes.
//
// A call with n == 0 reads and writes nothing.
void xnn_x8_zip_x3_ukernel__neon(
    size_t n,
    const uint8_t* input,
    uint8_t* output)
{
  const uint8_t* x = input;
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
  uint8_t* o = output;

  if (n >= 8) {
    // Main loop: 8 bytes from each plane in, 24 interleaved bytes out.
    do {
      uint8x8x3_t vxyz;
      vxyz.val[0] = vld1_u8(x); x += 8;
      vxyz.val[1] = vld1_u8(y); y += 8;
      vxyz.val[2] = vld1_u8(z); z += 8;
      vst3_u8(o, vxyz); o += 24;
      n -= 8;
    } while (n >= 8);
    if (n != 0) {
      // 1..7 elements remain. Instead of a scalar tail, step back so that one
      // more full 8-element vector ends exactly at the end of each plane; the
      // overlapping part of the output is simply rewritten with equal values.
      // n - 8 wraps around as an unsigned value, so adding it through
      // uintptr_t moves the addresses backwards by (8 - n) bytes.
      const size_t address_increment = n - 8;
      uint8x8x3_t vxyz;
      vxyz.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment));
      vxyz.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment));
      vxyz.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment));
      vst3_u8((uint8_t*) ((uintptr_t) o + address_increment * 3), vxyz);
    }
  } else {
    // Fewer than 8 elements in total: the overlapping-vector trick above would
    // touch memory before the start of the buffers, so copy byte by byte.
    // The loop is pre-tested so that n == 0 performs no access at all (a
    // post-tested loop would wrap n around to SIZE_MAX and run off the buffers).
    for (; n != 0; n--) {
      const uint8_t vx = *x++;
      const uint8_t vy = *y++;
      const uint8_t vz = *z++;
      o[0] = vx;
      o[1] = vy;
      o[2] = vz;
      o += 3;
    }
  }
}
53