/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _OAPV_TQ_NEON_H_
#define _OAPV_TQ_NEON_H_

///////////////////////////////////////////////////////////////////////////////
// start of encoder code
#if ENABLE_ENCODER
///////////////////////////////////////////////////////////////////////////////

#if ARM_NEON

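/*
 * Function-pointer tables exposing the NEON implementations of the forward
 * transform (txb), quantization, dequantization, and inverse transform (itx).
 * Presumably the generic dispatcher selects an entry from each two-element
 * table at initialization time; the exact indexing is defined by the caller,
 * not this header.
 */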
extern const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2];
extern const oapv_fn_quant_t oapv_tbl_fn_quant_neon[2];
extern const oapv_fn_dquant_t oapv_tbl_fn_dquant_neon[2];
extern const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2];

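/*
 * NOTE: despite living in the NEON header, the helper macros below are
 * written with AVX2 (_mm256_*) intrinsics; they appear to have been carried
 * over from the x86 version of this file. Each operates on 256-bit registers
 * holding sixteen 16-bit (or eight 32-bit) lanes.
 */

/*
 * CALCU_2x8: multiplies the packed 16-bit source rows s0..s3 (expected in
 * scope at the call site) by the coefficient pairs c0 and c1 with
 * _mm256_madd_epi16, then folds the partial sums with _mm256_hadd_epi32 into
 * the two 32-bit result rows d0 and d1. A minimal usage sketch (the data
 * names coef_a..coef_d, rounding, and shift are illustrative, not from this
 * file; v0..v7 are the scratch registers the macro expects):
 *
 *   __m256i s0, s1, s2, s3;                  // packed 16-bit input rows
 *   __m256i v0, v1, v2, v3, v4, v5, v6, v7;  // scratch used by the macro
 *   __m256i d0, d1, d2, d3;
 *   CALCU_2x8(coef_a, coef_b, d0, d1);       // first two output rows
 *   CALCU_2x8(coef_c, coef_d, d2, d3);       // next two output rows
 *   CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, rounding, shift);
 */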
#define CALCU_2x8(c0, c1, d0, d1)  \
   v0 = _mm256_madd_epi16(s0, c0); \
   v1 = _mm256_madd_epi16(s1, c0); \
   v2 = _mm256_madd_epi16(s2, c0); \
   v3 = _mm256_madd_epi16(s3, c0); \
   v4 = _mm256_madd_epi16(s0, c1); \
   v5 = _mm256_madd_epi16(s1, c1); \
   v6 = _mm256_madd_epi16(s2, c1); \
   v7 = _mm256_madd_epi16(s3, c1); \
   v0 = _mm256_hadd_epi32(v0, v1); \
   v2 = _mm256_hadd_epi32(v2, v3); \
   v4 = _mm256_hadd_epi32(v4, v5); \
   v6 = _mm256_hadd_epi32(v6, v7); \
   d0 = _mm256_hadd_epi32(v0, v2); \
   d1 = _mm256_hadd_epi32(v4, v6)

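/*
 * CALCU_2x8_ADD_SHIFT: rounding stage for the CALCU_2x8 results; adds the
 * rounding constant 'add' to d0..d3 and arithmetic-right-shifts every 32-bit
 * lane by 'shift'.
 */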
#define CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift) \
   d0 = _mm256_add_epi32(d0, add);                      \
   d1 = _mm256_add_epi32(d1, add);                      \
   d2 = _mm256_add_epi32(d2, add);                      \
   d3 = _mm256_add_epi32(d3, add);                      \
   d0 = _mm256_srai_epi32(d0, shift);                   \
   d1 = _mm256_srai_epi32(d1, shift);                   \
   d2 = _mm256_srai_epi32(d2, shift);                   \
   d3 = _mm256_srai_epi32(d3, shift)

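/*
 * CALCU_2x4: same madd/horizontal-add pattern as CALCU_2x8, but applies four
 * coefficient rows c0..c3 to the two source rows s0 and s1; the trailing
 * _mm256_permute4x64_epi64(..., 0xd8) swaps the middle 64-bit lanes so the
 * packed results come out in row order.
 */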
#define CALCU_2x4(c0, c1, c2, c3, d0, d1)   \
   v0 = _mm256_madd_epi16(s0, c0);          \
   v1 = _mm256_madd_epi16(s1, c0);          \
   v2 = _mm256_madd_epi16(s0, c1);          \
   v3 = _mm256_madd_epi16(s1, c1);          \
   v4 = _mm256_madd_epi16(s0, c2);          \
   v5 = _mm256_madd_epi16(s1, c2);          \
   v6 = _mm256_madd_epi16(s0, c3);          \
   v7 = _mm256_madd_epi16(s1, c3);          \
   v0 = _mm256_hadd_epi32(v0, v1);          \
   v2 = _mm256_hadd_epi32(v2, v3);          \
   v4 = _mm256_hadd_epi32(v4, v5);          \
   v6 = _mm256_hadd_epi32(v6, v7);          \
   d0 = _mm256_hadd_epi32(v0, v2);          \
   d1 = _mm256_hadd_epi32(v4, v6);          \
   d0 = _mm256_permute4x64_epi64(d0, 0xd8); \
   d1 = _mm256_permute4x64_epi64(d1, 0xd8)

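/*
 * CALCU_LINE_1x8: dot product of the eight source rows s00..s07 with a single
 * coefficient row. The horizontal adds reduce the eight madd results, and the
 * two _mm256_permute2x128_si256 calls gather the low halves (0x20) and high
 * halves (0x31) of the partial sums so the final add combines matching lanes
 * into one full output line in 'dst'.
 */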
#define CALCU_LINE_1x8(coeff0, dst)              \
   v0 = _mm256_madd_epi16(s00, coeff0);          \
   v1 = _mm256_madd_epi16(s01, coeff0);          \
   v2 = _mm256_madd_epi16(s02, coeff0);          \
   v3 = _mm256_madd_epi16(s03, coeff0);          \
   v4 = _mm256_madd_epi16(s04, coeff0);          \
   v5 = _mm256_madd_epi16(s05, coeff0);          \
   v6 = _mm256_madd_epi16(s06, coeff0);          \
   v7 = _mm256_madd_epi16(s07, coeff0);          \
   v0 = _mm256_hadd_epi32(v0, v1);               \
   v2 = _mm256_hadd_epi32(v2, v3);               \
   v4 = _mm256_hadd_epi32(v4, v5);               \
   v6 = _mm256_hadd_epi32(v6, v7);               \
   v0 = _mm256_hadd_epi32(v0, v2);               \
   v4 = _mm256_hadd_epi32(v4, v6);               \
   v1 = _mm256_permute2x128_si256(v0, v4, 0x20); \
   v2 = _mm256_permute2x128_si256(v0, v4, 0x31); \
   dst = _mm256_add_epi32(v1, v2)

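/*
 * CALCU_LINE_1x8_ADD_SHIFT: rounding stage for eight accumulated lines
 * d0..d7; adds 'add' and arithmetic-right-shifts each 32-bit lane by 'shift',
 * mirroring CALCU_2x8_ADD_SHIFT.
 */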
#define CALCU_LINE_1x8_ADD_SHIFT(d0, d1, d2, d3, d4, d5, d6, d7, add, shift) \
   d0 = _mm256_add_epi32(d0, add);                                           \
   d1 = _mm256_add_epi32(d1, add);                                           \
   d2 = _mm256_add_epi32(d2, add);                                           \
   d3 = _mm256_add_epi32(d3, add);                                           \
   d4 = _mm256_add_epi32(d4, add);                                           \
   d5 = _mm256_add_epi32(d5, add);                                           \
   d6 = _mm256_add_epi32(d6, add);                                           \
   d7 = _mm256_add_epi32(d7, add);                                           \
   d0 = _mm256_srai_epi32(d0, shift);                                        \
   d1 = _mm256_srai_epi32(d1, shift);                                        \
   d2 = _mm256_srai_epi32(d2, shift);                                        \
   d3 = _mm256_srai_epi32(d3, shift);                                        \
   d4 = _mm256_srai_epi32(d4, shift);                                        \
   d5 = _mm256_srai_epi32(d5, shift);                                        \
   d6 = _mm256_srai_epi32(d6, shift);                                        \
   d7 = _mm256_srai_epi32(d7, shift)

#endif // ARM_NEON

///////////////////////////////////////////////////////////////////////////////
// end of encoder code
#endif // ENABLE_ENCODER
///////////////////////////////////////////////////////////////////////////////

#endif /* _OAPV_TQ_NEON_H_ */