1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
12 #define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
13
14 #include <arm_neon.h>
15
// First pass of the 4x4 forward DCT, performed in place.
//
// in: array of four int16x4_t vectors. All four are read, run through the
//     butterfly helpers, handed to transpose_s16_4x4d(), and the result is
//     written back to in[0..3].
static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
  // Pair {in[0], in[1]} with {in[3], in[2]} so that a single 128-bit add and
  // a single 128-bit subtract yield both sums and both differences.
  const int16x8_t lhs = vcombine_s16(in[0], in[1]);
  const int16x8_t rhs = vcombine_s16(in[3], in[2]);
  const int16x8_t sums = vaddq_s16(lhs, rhs);   // {in0 + in3, in1 + in2}
  const int16x8_t diffs = vsubq_s16(lhs, rhs);  // {in0 - in3, in1 - in2}

  // Split the packed results back into 64-bit halves.
  const int16x4_t sum_03 = vget_low_s16(sums);
  const int16x4_t sum_12 = vget_high_s16(sums);
  const int16x4_t diff_03 = vget_low_s16(diffs);
  const int16x4_t diff_12 = vget_high_s16(diffs);

  int16x4_t t0, t1, t2, t3;

  // Even outputs:
  //   t0 = fdct_round_shift((sum_03 + sum_12) * cospi_16_64)
  //   t2 = fdct_round_shift((sum_03 - sum_12) * cospi_16_64)
  // (rounding and precision are whatever the helper implements)
  butterfly_one_coeff_s16_fast_half(sum_03, sum_12, cospi_16_64, &t0, &t2);

  // Odd outputs:
  //   t1 = diff_03 * cospi_8_64 + diff_12 * cospi_24_64
  //   t3 = diff_03 * cospi_24_64 - diff_12 * cospi_8_64
  butterfly_two_coeff_half(diff_03, diff_12, cospi_8_64, cospi_24_64, &t1,
                           &t3);

  transpose_s16_4x4d(&t0, &t1, &t2, &t3);

  in[0] = t0;
  in[1] = t1;
  in[2] = t2;
  in[3] = t3;
}
46
// Second pass of the 4x4 forward DCT, performed in place.
//
// Same structure as vpx_fdct4x4_pass1_neon(); the only difference is that the
// even outputs go through butterfly_one_coeff_s16_s32_fast_narrow_half()
// (presumably 32-bit intermediates narrowed back to 16 bits, judging by the
// helper's name -- confirm against its definition).
//
// in: array of four int16x4_t vectors. All four are read, run through the
//     butterfly helpers, handed to transpose_s16_4x4d(), and the result is
//     written back to in[0..3].
static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
  // Pair {in[0], in[1]} with {in[3], in[2]} so that a single 128-bit add and
  // a single 128-bit subtract yield both sums and both differences.
  const int16x8_t lhs = vcombine_s16(in[0], in[1]);
  const int16x8_t rhs = vcombine_s16(in[3], in[2]);
  const int16x8_t sums = vaddq_s16(lhs, rhs);   // {in0 + in3, in1 + in2}
  const int16x8_t diffs = vsubq_s16(lhs, rhs);  // {in0 - in3, in1 - in2}

  // Split the packed results back into 64-bit halves.
  const int16x4_t sum_03 = vget_low_s16(sums);
  const int16x4_t sum_12 = vget_high_s16(sums);
  const int16x4_t diff_03 = vget_low_s16(diffs);
  const int16x4_t diff_12 = vget_high_s16(diffs);

  int16x4_t t0, t1, t2, t3;

  // Even outputs:
  //   t0 = fdct_round_shift((sum_03 + sum_12) * cospi_16_64)
  //   t2 = fdct_round_shift((sum_03 - sum_12) * cospi_16_64)
  // (rounding and precision are whatever the helper implements)
  butterfly_one_coeff_s16_s32_fast_narrow_half(sum_03, sum_12, cospi_16_64,
                                               &t0, &t2);

  // Odd outputs:
  //   t1 = diff_03 * cospi_8_64 + diff_12 * cospi_24_64
  //   t3 = diff_03 * cospi_24_64 - diff_12 * cospi_8_64
  butterfly_two_coeff_half(diff_03, diff_12, cospi_8_64, cospi_24_64, &t1,
                           &t3);

  transpose_s16_4x4d(&t0, &t1, &t2, &t3);

  in[0] = t0;
  in[1] = t1;
  in[2] = t2;
  in[3] = t3;
}
78
79 #if CONFIG_VP9_HIGHBITDEPTH
80
// High-bitdepth first pass of the 4x4 forward DCT, performed in place.
//
// in: array of four int32x4_t vectors. All four are read, run through the
//     butterfly helpers, handed to transpose_s32_4x4(), and the result is
//     written back to in[0..3].
static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
  // Outer pair (0, 3) and inner pair (1, 2): sums feed the even outputs,
  // differences feed the odd outputs.
  const int32x4_t sum_03 = vaddq_s32(in[0], in[3]);
  const int32x4_t sum_12 = vaddq_s32(in[1], in[2]);
  const int32x4_t diff_12 = vsubq_s32(in[1], in[2]);
  const int32x4_t diff_03 = vsubq_s32(in[0], in[3]);

  int32x4_t t0, t1, t2, t3;

  // Even outputs: t0 / t2 come from (sum_03 +/- sum_12) combined with
  // cospi_16_64; rounding and precision are whatever the helper implements.
  butterfly_one_coeff_s32_fast_half(sum_03, sum_12, cospi_16_64, &t0, &t2);

  // Odd outputs:
  //   t1 = diff_03 * cospi_8_64 + diff_12 * cospi_24_64
  //   t3 = diff_03 * cospi_24_64 - diff_12 * cospi_8_64
  butterfly_two_coeff_s32_s64_narrow_half(diff_03, diff_12, cospi_8_64,
                                          cospi_24_64, &t1, &t3);

  transpose_s32_4x4(&t0, &t1, &t2, &t3);

  in[0] = t0;
  in[1] = t1;
  in[2] = t2;
  in[3] = t3;
}
103
104 #endif // CONFIG_VP9_HIGHBITDEPTH
105 #endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
106