1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <assert.h>
10
11 #include <arm_neon.h>
12
13 #include <qnnpack/u8rmax.h>
14
pytorch_u8rmax_ukernel__neon(size_t n,const uint8_t * x)15 uint8_t pytorch_u8rmax_ukernel__neon(size_t n, const uint8_t* x) {
16 assert(n != 0);
17
18 if
19 PYTORCH_QNNP_LIKELY(n >= 16) {
20 uint8x16_t vmax = vmovq_n_u8(0);
21 do {
22 const uint8x16_t vx = vld1q_u8(x);
23 x += 16;
24 vmax = vmaxq_u8(vmax, vx);
25 n -= 16;
26 } while (n >= 16);
27 if (n != 0) {
28 const size_t x_increment = n - 16;
29 x = (const uint8_t*)((uintptr_t)x + x_increment);
30 const uint8x16_t vx = vld1q_u8(x);
31 vmax = vmaxq_u8(vmax, vx);
32 }
33 uint8x8_t vmax8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax));
34 const uint8x8_t vmax4 = vpmax_u8(vmax8, vmax8);
35 const uint8x8_t vmax2 = vpmax_u8(vmax4, vmax4);
36 const uint8x8_t vmax1 = vpmax_u8(vmax2, vmax2);
37 return vget_lane_u8(vmax1, 0);
38 }
39 else {
40 uint8x8_t vmax = vmov_n_u8(0);
41 do {
42 const uint8x8_t vx = vld1_dup_u8(x);
43 x += 1;
44 vmax = vmax_u8(vmax, vx);
45 } while (--n != 0);
46 return vget_lane_u8(vmax, 0);
47 }
48 }
49