1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <assert.h>
10
11 #include <emmintrin.h>
12
13 #include <qnnpack/u8rmax.h>
14
pytorch_u8rmax_ukernel__sse2(size_t n,const uint8_t * x)15 uint8_t pytorch_u8rmax_ukernel__sse2(size_t n, const uint8_t* x) {
16 assert(n != 0);
17
18 if
19 PYTORCH_QNNP_LIKELY(n >= 16) {
20 __m128i vmax = _mm_setzero_si128();
21 do {
22 const __m128i vx = _mm_loadu_si128((const __m128i*)x);
23 x += 16;
24 vmax = _mm_max_epu8(vmax, vx);
25 n -= 16;
26 } while (n >= 16);
27 if (n != 0) {
28 const size_t x_increment = n - 16;
29 x = (const uint8_t*)((uintptr_t)x + x_increment);
30 const __m128i vx = _mm_loadu_si128((const __m128i*)x);
31 vmax = _mm_max_epu8(vmax, vx);
32 }
33 vmax = _mm_max_epu8(vmax, _mm_unpackhi_epi64(vmax, vmax));
34 vmax = _mm_max_epu8(vmax, _mm_srli_epi64(vmax, 32));
35 vmax = _mm_max_epu8(vmax, _mm_srli_epi32(vmax, 16));
36 vmax = _mm_max_epu8(vmax, _mm_srli_epi16(vmax, 8));
37 return (uint8_t)_mm_cvtsi128_si32(vmax);
38 }
39 else {
40 uint8_t vmax = 0;
41 do {
42 const uint8_t vx = *x++;
43 vmax = vx > vmax ? vx : vmax;
44 } while (--n != 0);
45 return vmax;
46 }
47 }
48