xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/u8rmax/neon.c (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <assert.h>
10 
11 #include <arm_neon.h>
12 
13 #include <qnnpack/u8rmax.h>
14 
pytorch_u8rmax_ukernel__neon(size_t n,const uint8_t * x)15 uint8_t pytorch_u8rmax_ukernel__neon(size_t n, const uint8_t* x) {
16   assert(n != 0);
17 
18   if
19     PYTORCH_QNNP_LIKELY(n >= 16) {
20       uint8x16_t vmax = vmovq_n_u8(0);
21       do {
22         const uint8x16_t vx = vld1q_u8(x);
23         x += 16;
24         vmax = vmaxq_u8(vmax, vx);
25         n -= 16;
26       } while (n >= 16);
27       if (n != 0) {
28         const size_t x_increment = n - 16;
29         x = (const uint8_t*)((uintptr_t)x + x_increment);
30         const uint8x16_t vx = vld1q_u8(x);
31         vmax = vmaxq_u8(vmax, vx);
32       }
33       uint8x8_t vmax8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax));
34       const uint8x8_t vmax4 = vpmax_u8(vmax8, vmax8);
35       const uint8x8_t vmax2 = vpmax_u8(vmax4, vmax4);
36       const uint8x8_t vmax1 = vpmax_u8(vmax2, vmax2);
37       return vget_lane_u8(vmax1, 0);
38     }
39   else {
40     uint8x8_t vmax = vmov_n_u8(0);
41     do {
42       const uint8x8_t vx = vld1_dup_u8(x);
43       x += 1;
44       vmax = vmax_u8(vmax, vx);
45     } while (--n != 0);
46     return vget_lane_u8(vmax, 0);
47   }
48 }
49