xref: /aosp_15_r20/external/webp/src/dsp/neon.h (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//  NEON common code.
11*b2055c35SXin Li 
12*b2055c35SXin Li #ifndef WEBP_DSP_NEON_H_
13*b2055c35SXin Li #define WEBP_DSP_NEON_H_
14*b2055c35SXin Li 
15*b2055c35SXin Li #include "src/dsp/dsp.h"
16*b2055c35SXin Li 
17*b2055c35SXin Li #if defined(WEBP_USE_NEON)
18*b2055c35SXin Li 
19*b2055c35SXin Li #include <arm_neon.h>
20*b2055c35SXin Li 
21*b2055c35SXin Li // Right now, some intrinsics functions seem slower, so we disable them
22*b2055c35SXin Li // everywhere except newer clang/gcc or aarch64 where the inline assembly is
23*b2055c35SXin Li // incompatible.
24*b2055c35SXin Li #if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64
25*b2055c35SXin Li #define WEBP_USE_INTRINSICS   // use intrinsics when possible
26*b2055c35SXin Li #endif
27*b2055c35SXin Li 
// Assigns 'a' and 'b' to elements 0 and 1 of the 'val' array of 'v'.
// 'v' is any lvalue expression whose type has a 'val' array member (such as
// the uint64x2x2_t / int32x4x2_t structs used in this file).
// Every argument is parenthesized so that expressions such as '*ptr' expand
// correctly. 'v' is evaluated once per element, so it must not have side
// effects.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
32*b2055c35SXin Li 
// Assigns 'a', 'b' and 'c' to elements 0, 1 and 2 of the 'val' array of 'v'.
// 'v' is any lvalue expression whose type has a 'val' array member with at
// least three elements.
// Every argument is parenthesized so that expressions such as '*ptr' expand
// correctly. 'v' is evaluated once per element, so it must not have side
// effects.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
38*b2055c35SXin Li 
// Assigns 'a', 'b', 'c' and 'd' to elements 0..3 of the 'val' array of 'v'.
// 'v' is any lvalue expression whose type has a 'val' array member with at
// least four elements (such as the int32x4x4_t struct used in this file).
// Every argument is parenthesized so that expressions such as '*ptr' expand
// correctly. 'v' is evaluated once per element, so it must not have side
// effects.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
45*b2055c35SXin Li 
46*b2055c35SXin Li // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
47*b2055c35SXin Li // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
48*b2055c35SXin Li // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
49*b2055c35SXin Li #if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
50*b2055c35SXin Li #define WORK_AROUND_GCC
51*b2055c35SXin Li #endif
52*b2055c35SXin Li 
Transpose4x4_NEON(const int32x4x4_t rows)53*b2055c35SXin Li static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
54*b2055c35SXin Li   uint64x2x2_t row01, row23;
55*b2055c35SXin Li 
56*b2055c35SXin Li   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
57*b2055c35SXin Li   row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
58*b2055c35SXin Li   row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
59*b2055c35SXin Li   row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
60*b2055c35SXin Li   // Transpose 64-bit values (there's no vswp equivalent)
61*b2055c35SXin Li   {
62*b2055c35SXin Li     const uint64x1_t row0h = vget_high_u64(row01.val[0]);
63*b2055c35SXin Li     const uint64x1_t row2l = vget_low_u64(row23.val[0]);
64*b2055c35SXin Li     const uint64x1_t row1h = vget_high_u64(row01.val[1]);
65*b2055c35SXin Li     const uint64x1_t row3l = vget_low_u64(row23.val[1]);
66*b2055c35SXin Li     row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
67*b2055c35SXin Li     row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
68*b2055c35SXin Li     row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
69*b2055c35SXin Li     row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
70*b2055c35SXin Li   }
71*b2055c35SXin Li   {
72*b2055c35SXin Li     const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
73*b2055c35SXin Li                                         vreinterpretq_s32_u64(row01.val[1]));
74*b2055c35SXin Li     const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
75*b2055c35SXin Li                                         vreinterpretq_s32_u64(row23.val[1]));
76*b2055c35SXin Li     int32x4x4_t out;
77*b2055c35SXin Li     out.val[0] = out01.val[0];
78*b2055c35SXin Li     out.val[1] = out01.val[1];
79*b2055c35SXin Li     out.val[2] = out23.val[0];
80*b2055c35SXin Li     out.val[3] = out23.val[1];
81*b2055c35SXin Li     return out;
82*b2055c35SXin Li   }
83*b2055c35SXin Li }
84*b2055c35SXin Li 
#if 0     // Useful debug macro.
#include <stdio.h>
// Prints the spelling of REG, the value of SIZE, and then the contents of REG
// in hexadecimal, followed by a newline.
//   SIZE == 8:  REG is stored with vst1_u8() and printed as 8 bytes.
//   SIZE == 16: REG is stored with vst1_u16() and printed as 4 16-bit values.
// Any other SIZE prints only the header and the newline.
// NOTE(review): both branches are compiled regardless of SIZE, so REG must be
// accepted by both vst1_u8() and vst1_u16() -- this may require lax vector
// conversions; confirm before enabling.
#define PRINT_REG(REG, SIZE) do {                       \
  int i;                                                \
  printf("%s \t[%d]: 0x", #REG, (SIZE));                \
  if ((SIZE) == 8) {                                    \
    uint8_t _tmp[8];                                    \
    vst1_u8(_tmp, (REG));                               \
    for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \
  } else if ((SIZE) == 16) {                            \
    uint16_t _tmp[4];                                   \
    vst1_u16(_tmp, (REG));                              \
    for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \
  }                                                     \
  printf("\n");                                         \
} while (0)
#endif
102*b2055c35SXin Li 
103*b2055c35SXin Li #endif  // WEBP_USE_NEON
104*b2055c35SXin Li #endif  // WEBP_DSP_NEON_H_
105