/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
15*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct4x4_pass1_neon(int16x4_t * in)16*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
17*fb1b10abSAndroid Build Coastguard Worker int16x4_t out[4];
18*fb1b10abSAndroid Build Coastguard Worker
19*fb1b10abSAndroid Build Coastguard Worker const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
20*fb1b10abSAndroid Build Coastguard Worker const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
21*fb1b10abSAndroid Build Coastguard Worker
22*fb1b10abSAndroid Build Coastguard Worker // in_0 +/- in_3, in_1 +/- in_2
23*fb1b10abSAndroid Build Coastguard Worker const int16x8_t s_01 = vaddq_s16(input_01, input_32);
24*fb1b10abSAndroid Build Coastguard Worker const int16x8_t s_32 = vsubq_s16(input_01, input_32);
25*fb1b10abSAndroid Build Coastguard Worker
26*fb1b10abSAndroid Build Coastguard Worker // step_0 +/- step_1, step_2 +/- step_3
27*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_0 = vget_low_s16(s_01);
28*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_1 = vget_high_s16(s_01);
29*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_2 = vget_high_s16(s_32);
30*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_3 = vget_low_s16(s_32);
31*fb1b10abSAndroid Build Coastguard Worker
32*fb1b10abSAndroid Build Coastguard Worker // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
33*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
34*fb1b10abSAndroid Build Coastguard Worker
35*fb1b10abSAndroid Build Coastguard Worker // s_3 * cospi_8_64 + s_2 * cospi_24_64
36*fb1b10abSAndroid Build Coastguard Worker // s_3 * cospi_24_64 - s_2 * cospi_8_64
37*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
38*fb1b10abSAndroid Build Coastguard Worker
39*fb1b10abSAndroid Build Coastguard Worker transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
40*fb1b10abSAndroid Build Coastguard Worker
41*fb1b10abSAndroid Build Coastguard Worker in[0] = out[0];
42*fb1b10abSAndroid Build Coastguard Worker in[1] = out[1];
43*fb1b10abSAndroid Build Coastguard Worker in[2] = out[2];
44*fb1b10abSAndroid Build Coastguard Worker in[3] = out[3];
45*fb1b10abSAndroid Build Coastguard Worker }
46*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct4x4_pass2_neon(int16x4_t * in)47*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
48*fb1b10abSAndroid Build Coastguard Worker int16x4_t out[4];
49*fb1b10abSAndroid Build Coastguard Worker
50*fb1b10abSAndroid Build Coastguard Worker const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
51*fb1b10abSAndroid Build Coastguard Worker const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
52*fb1b10abSAndroid Build Coastguard Worker
53*fb1b10abSAndroid Build Coastguard Worker // in_0 +/- in_3, in_1 +/- in_2
54*fb1b10abSAndroid Build Coastguard Worker const int16x8_t s_01 = vaddq_s16(input_01, input_32);
55*fb1b10abSAndroid Build Coastguard Worker const int16x8_t s_32 = vsubq_s16(input_01, input_32);
56*fb1b10abSAndroid Build Coastguard Worker
57*fb1b10abSAndroid Build Coastguard Worker // step_0 +/- step_1, step_2 +/- step_3
58*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_0 = vget_low_s16(s_01);
59*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_1 = vget_high_s16(s_01);
60*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_2 = vget_high_s16(s_32);
61*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s_3 = vget_low_s16(s_32);
62*fb1b10abSAndroid Build Coastguard Worker
63*fb1b10abSAndroid Build Coastguard Worker // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
64*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
65*fb1b10abSAndroid Build Coastguard Worker &out[2]);
66*fb1b10abSAndroid Build Coastguard Worker
67*fb1b10abSAndroid Build Coastguard Worker // s_3 * cospi_8_64 + s_2 * cospi_24_64
68*fb1b10abSAndroid Build Coastguard Worker // s_3 * cospi_24_64 - s_2 * cospi_8_64
69*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
70*fb1b10abSAndroid Build Coastguard Worker
71*fb1b10abSAndroid Build Coastguard Worker transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
72*fb1b10abSAndroid Build Coastguard Worker
73*fb1b10abSAndroid Build Coastguard Worker in[0] = out[0];
74*fb1b10abSAndroid Build Coastguard Worker in[1] = out[1];
75*fb1b10abSAndroid Build Coastguard Worker in[2] = out[2];
76*fb1b10abSAndroid Build Coastguard Worker in[3] = out[3];
77*fb1b10abSAndroid Build Coastguard Worker }
78*fb1b10abSAndroid Build Coastguard Worker
79*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
80*fb1b10abSAndroid Build Coastguard Worker
vpx_highbd_fdct4x4_pass1_neon(int32x4_t * in)81*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
82*fb1b10abSAndroid Build Coastguard Worker int32x4_t out[4];
83*fb1b10abSAndroid Build Coastguard Worker // in_0 +/- in_3, in_1 +/- in_2
84*fb1b10abSAndroid Build Coastguard Worker const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
85*fb1b10abSAndroid Build Coastguard Worker const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
86*fb1b10abSAndroid Build Coastguard Worker const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
87*fb1b10abSAndroid Build Coastguard Worker const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
88*fb1b10abSAndroid Build Coastguard Worker
89*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
90*fb1b10abSAndroid Build Coastguard Worker
91*fb1b10abSAndroid Build Coastguard Worker // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
92*fb1b10abSAndroid Build Coastguard Worker // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
93*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
94*fb1b10abSAndroid Build Coastguard Worker &out[1], &out[3]);
95*fb1b10abSAndroid Build Coastguard Worker
96*fb1b10abSAndroid Build Coastguard Worker transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
97*fb1b10abSAndroid Build Coastguard Worker
98*fb1b10abSAndroid Build Coastguard Worker in[0] = out[0];
99*fb1b10abSAndroid Build Coastguard Worker in[1] = out[1];
100*fb1b10abSAndroid Build Coastguard Worker in[2] = out[2];
101*fb1b10abSAndroid Build Coastguard Worker in[3] = out[3];
102*fb1b10abSAndroid Build Coastguard Worker }
103*fb1b10abSAndroid Build Coastguard Worker
104*fb1b10abSAndroid Build Coastguard Worker #endif // CONFIG_VP9_HIGHBITDEPTH
105*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
106