xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/u8rmax/sse2.c (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <assert.h>
10 
11 #include <emmintrin.h>
12 
13 #include <qnnpack/u8rmax.h>
14 
/*
 * Returns the largest value among the n unsigned bytes starting at x.
 *
 * n must be non-zero (enforced with assert). Inputs of at least 16 bytes are
 * processed 16 bytes at a time with SSE2; shorter inputs use a scalar loop.
 * Only bytes in [x, x + n) are read.
 */
uint8_t pytorch_u8rmax_ukernel__sse2(size_t n, const uint8_t* x) {
  assert(n != 0);

  if (PYTORCH_QNNP_LIKELY(n >= 16)) {
    /* Zero is the identity element for an unsigned byte maximum. */
    __m128i vmax = _mm_setzero_si128();
    do {
      const __m128i vx = _mm_loadu_si128((const __m128i*)x);
      x += 16;
      vmax = _mm_max_epu8(vmax, vx);
      n -= 16;
    } while (n >= 16);
    if (n != 0) {
      /*
       * 1..15 bytes remain. Step back so the final 16-byte load ends exactly
       * at the end of the input; it overlaps bytes already folded into vmax,
       * which is harmless for a maximum. At least one full chunk has been
       * consumed, so the rewound pointer still lies inside the input.
       */
      x -= 16 - n;
      const __m128i vx = _mm_loadu_si128((const __m128i*)x);
      vmax = _mm_max_epu8(vmax, vx);
    }
    /*
     * Horizontal reduction: repeatedly fold the upper half onto the lower
     * half (8, 4, 2, then 1 byte) until byte 0 holds the overall maximum.
     */
    vmax = _mm_max_epu8(vmax, _mm_unpackhi_epi64(vmax, vmax));
    vmax = _mm_max_epu8(vmax, _mm_srli_epi64(vmax, 32));
    vmax = _mm_max_epu8(vmax, _mm_srli_epi32(vmax, 16));
    vmax = _mm_max_epu8(vmax, _mm_srli_epi16(vmax, 8));
    /* The cast keeps only the lowest byte of the extracted 32-bit lane. */
    return (uint8_t)_mm_cvtsi128_si32(vmax);
  } else {
    /* Fewer than 16 bytes: a full vector load would read out of bounds. */
    uint8_t vmax = 0;
    do {
      const uint8_t vx = *x++;
      vmax = vx > vmax ? vx : vmax;
    } while (--n != 0);
    return vmax;
  }
}
48