// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON common code.
11*b2055c35SXin Li
12*b2055c35SXin Li #ifndef WEBP_DSP_NEON_H_
13*b2055c35SXin Li #define WEBP_DSP_NEON_H_
14*b2055c35SXin Li
15*b2055c35SXin Li #include "src/dsp/dsp.h"
16*b2055c35SXin Li
17*b2055c35SXin Li #if defined(WEBP_USE_NEON)
18*b2055c35SXin Li
19*b2055c35SXin Li #include <arm_neon.h>
20*b2055c35SXin Li
21*b2055c35SXin Li // Right now, some intrinsics functions seem slower, so we disable them
22*b2055c35SXin Li // everywhere except newer clang/gcc or aarch64 where the inline assembly is
23*b2055c35SXin Li // incompatible.
24*b2055c35SXin Li #if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64
25*b2055c35SXin Li #define WEBP_USE_INTRINSICS // use intrinsics when possible
26*b2055c35SXin Li #endif
27*b2055c35SXin Li
// Stores 'a' and 'b' into elements 0 and 1 of the 'val' array member of 'v'.
// 'v' can be any lvalue expression with a 'val' array member; it is evaluated
// once per element, so it should be free of side effects. All arguments are
// parenthesized so that expressions such as '*ptr' expand as intended.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
32*b2055c35SXin Li
// Stores 'a', 'b' and 'c' into elements 0..2 of the 'val' array member of 'v'.
// 'v' can be any lvalue expression with a 'val' array member; it is evaluated
// once per element, so it should be free of side effects. All arguments are
// parenthesized so that expressions such as '*ptr' expand as intended.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
38*b2055c35SXin Li
// Stores 'a', 'b', 'c' and 'd' into elements 0..3 of the 'val' array member of
// 'v'. 'v' can be any lvalue expression with a 'val' array member; it is
// evaluated once per element, so it should be free of side effects. All
// arguments are parenthesized so that expressions such as '*ptr' expand as
// intended.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
45*b2055c35SXin Li
46*b2055c35SXin Li // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
47*b2055c35SXin Li // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
48*b2055c35SXin Li // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
49*b2055c35SXin Li #if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
50*b2055c35SXin Li #define WORK_AROUND_GCC
51*b2055c35SXin Li #endif
52*b2055c35SXin Li
// Transposes a 4x4 matrix of 32-bit values held as four int32x4_t vectors:
// lane j of output vector i is lane i of input vector j.
static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
  // View every row as a pair of 64-bit halves so that two 32-bit lanes can be
  // moved at once.
  const uint64x2_t r0 = vreinterpretq_u64_s32(rows.val[0]);
  const uint64x2_t r1 = vreinterpretq_u64_s32(rows.val[1]);
  const uint64x2_t r2 = vreinterpretq_u64_s32(rows.val[2]);
  const uint64x2_t r3 = vreinterpretq_u64_s32(rows.val[3]);

  // Transpose the 64-bit halves (there's no vswp equivalent): pair the low
  // halves of rows 0/2 and 1/3, and likewise the high halves.
  const uint64x2_t low02 = vcombine_u64(vget_low_u64(r0), vget_low_u64(r2));
  const uint64x2_t high02 = vcombine_u64(vget_high_u64(r0), vget_high_u64(r2));
  const uint64x2_t low13 = vcombine_u64(vget_low_u64(r1), vget_low_u64(r3));
  const uint64x2_t high13 = vcombine_u64(vget_high_u64(r1), vget_high_u64(r3));

  // Interleave the 32-bit lanes of each pair to complete the transpose.
  const int32x4x2_t cols01 = vtrnq_s32(vreinterpretq_s32_u64(low02),
                                       vreinterpretq_s32_u64(low13));
  const int32x4x2_t cols23 = vtrnq_s32(vreinterpretq_s32_u64(high02),
                                       vreinterpretq_s32_u64(high13));

  int32x4x4_t transposed;
  transposed.val[0] = cols01.val[0];
  transposed.val[1] = cols01.val[1];
  transposed.val[2] = cols23.val[0];
  transposed.val[3] = cols23.val[1];
  return transposed;
}
84*b2055c35SXin Li
#if 0   // Useful debug macro.
#include <stdio.h>
// Prints the source text of REG and the value of SIZE, followed by the
// contents of REG in hexadecimal: eight 2-digit values when SIZE is 8, four
// 4-digit values when SIZE is 16, and only the prefix for any other SIZE.
// SIZE is parenthesized so that compound expressions (e.g. 'w ? 16 : 8')
// compare as a whole; it is evaluated up to three times.
// NOTE(review): both branches are compiled at every use site, so the same REG
// is passed to vst1_u8() and vst1_u16(); presumably this relies on lax vector
// conversions -- confirm before enabling.
#define PRINT_REG(REG, SIZE) do {                       \
  int i;                                                \
  printf("%s \t[%d]: 0x", #REG, (SIZE));                \
  if ((SIZE) == 8) {                                    \
    uint8_t _tmp[8];                                    \
    vst1_u8(_tmp, (REG));                               \
    for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \
  } else if ((SIZE) == 16) {                            \
    uint16_t _tmp[4];                                   \
    vst1_u16(_tmp, (REG));                              \
    for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \
  }                                                     \
  printf("\n");                                         \
} while (0)
#endif
102*b2055c35SXin Li
103*b2055c35SXin Li #endif // WEBP_USE_NEON
104*b2055c35SXin Li #endif // WEBP_DSP_NEON_H_
105