xref: /aosp_15_r20/external/libaom/aom_dsp/x86/fwd_txfm_sse2.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
13 #define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
14 
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
18 
// Multiply-add over pairs of 32-bit elements. Viewing `a` and `b` as four
// 32-bit elements each, every 64-bit lane i of the result holds
//   a[2i] * b[2i] + a[2i + 1] * b[2i + 1]
// The products come from _mm_mul_epu32, so the elements are multiplied as
// unsigned values, and the final 64-bit addition wraps on overflow.
static inline __m128i k_madd_epi32(__m128i a, __m128i b) {
  // _mm_mul_epu32 reads only the low 32 bits of each 64-bit lane, so the odd
  // elements are shifted down into that position for the second multiply.
  const __m128i a_odd = _mm_srli_epi64(a, 32);
  const __m128i b_odd = _mm_srli_epi64(b, 32);
  const __m128i prod_even = _mm_mul_epu32(a, b);
  const __m128i prod_odd = _mm_mul_epu32(a_odd, b_odd);
  return _mm_add_epi64(prod_even, prod_odd);
}
27 
// Narrows the four 64-bit lanes of `a` and `b` to 32 bits by keeping only the
// low half of each lane. With 32-bit elements a = {a0, a1, a2, a3} and
// b = {b0, b1, b2, b3}, the result is {a0, a2, b0, b2}. The upper halves are
// simply dropped; no saturation is applied.
static inline __m128i k_packs_epi64(__m128i a, __m128i b) {
  const __m128i lo_pairs = _mm_unpacklo_epi32(a, b);  // a0 b0 a1 b1
  const __m128i hi_pairs = _mm_unpackhi_epi32(a, b);  // a2 b2 a3 b3
  return _mm_unpacklo_epi32(lo_pairs, hi_pairs);      // a0 a2 b0 b2
}
33 
// Scans two registers of 16-bit lanes for values sitting at either end of the
// int16 range (0x7fff or 0x8000).
// Returns the _mm_movemask_epi8 byte mask of the combined comparison: bits 2k
// and 2k+1 are set when lane k of either register hit a limit, so the result
// is zero only when no lane did.
static inline int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);
  // Collect matches per limit value first, then merge the two masks.
  const __m128i at_max = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_max),
                                      _mm_cmpeq_epi16(*preg1, int16_max));
  const __m128i at_min = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_min),
                                      _mm_cmpeq_epi16(*preg1, int16_min));
  return _mm_movemask_epi8(_mm_or_si128(at_max, at_min));
}
45 
// Scans four registers of 16-bit lanes for values sitting at either end of the
// int16 range (0x7fff or 0x8000).
// Returns the _mm_movemask_epi8 byte mask of the combined comparison: bits 2k
// and 2k+1 are set when lane k of any of the four registers hit a limit, so
// the result is zero only when no lane did.
static inline int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);

  // Lanes equal to the positive limit in any register.
  const __m128i at_max =
      _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_max),
                                _mm_cmpeq_epi16(*preg1, int16_max)),
                   _mm_or_si128(_mm_cmpeq_epi16(*preg2, int16_max),
                                _mm_cmpeq_epi16(*preg3, int16_max)));
  // Lanes equal to the negative limit in any register.
  const __m128i at_min =
      _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_min),
                                _mm_cmpeq_epi16(*preg1, int16_min)),
                   _mm_or_si128(_mm_cmpeq_epi16(*preg2, int16_min),
                                _mm_cmpeq_epi16(*preg3, int16_min)));

  return _mm_movemask_epi8(_mm_or_si128(at_max, at_min));
}
63 
// Overflow scan across eight registers, performed as two groups of four via
// check_epi16_overflow_x4.
// Returns the arithmetic sum of the two group masks. The value is nonzero when
// either group reported a lane at an int16 limit, but it is a sum rather than
// a per-lane bit mask.
static inline int check_epi16_overflow_x8(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7) {
  const int first_half = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int second_half = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  return first_half + second_half;
}
73 
// Overflow scan across twelve registers in groups of four, using
// check_epi16_overflow_x4.
// Registers 0-3 and 4-7 are always examined. Registers 8-11 are examined only
// when registers 0-3 came back clean; their mask then takes the place of the
// first group's.
// Returns the sum of the two retained group masks: nonzero when an examined
// group reported a lane at an int16 limit. The value is a sum, not a per-lane
// bit mask.
static inline int check_epi16_overflow_x12(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
  int mask_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int mask_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (mask_a == 0) {
    // First group was clean, so its slot is free to carry the last group.
    mask_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  }
  return mask_a + mask_b;
}
85 
// Overflow scan across sixteen registers in groups of four, using
// check_epi16_overflow_x4.
// Registers 0-3 and 4-7 are always examined. Registers 8-11 are examined only
// when registers 0-3 were clean, and registers 12-15 only when, in addition,
// registers 4-7 were clean.
// Returns the sum of the two retained group masks: nonzero when an examined
// group reported a lane at an int16 limit. The value is a sum, not a per-lane
// bit mask.
static inline int check_epi16_overflow_x16(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15) {
  int mask_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int mask_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);

  // An overflow in the first group already makes the result nonzero.
  if (mask_a != 0) return mask_a + mask_b;

  mask_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  if (mask_b == 0) {
    mask_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  }
  return mask_a + mask_b;
}
102 
// Overflow scan across thirty-two registers in groups of four, using
// check_epi16_overflow_x4.
// Two running masks are kept. Groups are examined in register order and
// alternate between the two masks (0-3 -> mask_a, 4-7 -> mask_b,
// 8-11 -> mask_a, ...). After the first two groups, scanning stops as soon as
// the mask about to be overwritten is already nonzero.
// Returns the sum of the two masks at the point scanning stopped: nonzero when
// an examined group reported a lane at an int16 limit. The value is a sum, not
// a per-lane bit mask.
static inline int check_epi16_overflow_x32(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    const __m128i *preg30, const __m128i *preg31) {
  int mask_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int mask_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);

  if (mask_a != 0) return mask_a + mask_b;
  mask_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);

  if (mask_b != 0) return mask_a + mask_b;
  mask_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);

  if (mask_a != 0) return mask_a + mask_b;
  mask_a = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);

  if (mask_b != 0) return mask_a + mask_b;
  mask_b = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);

  if (mask_a != 0) return mask_a + mask_b;
  mask_a = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);

  if (mask_b != 0) return mask_a + mask_b;
  mask_b = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);

  return mask_a + mask_b;
}
137 
// Sign-extends the eight 16-bit lanes of *poutput to 32 bits and writes them
// to dst_ptr with two aligned 128-bit stores (lanes 0-3, then lanes 4-7).
// _mm_store_si128 requires 16-byte aligned addresses, so both dst_ptr and
// dst_ptr + 4 must be aligned; use storeu_output when that cannot be
// guaranteed.
// NOTE(review): the `dst_ptr + 4` offset for the second store assumes
// tran_low_t is 32 bits wide - confirm against its typedef.
static inline void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  const __m128i coeffs = *poutput;
  // 0xffff in every lane holding a negative value, 0 elsewhere: exactly the
  // upper 16 bits each lane needs once widened to 32 bits.
  const __m128i sign_ext = _mm_cmpgt_epi16(_mm_setzero_si128(), coeffs);
  _mm_store_si128((__m128i *)(dst_ptr), _mm_unpacklo_epi16(coeffs, sign_ext));
  _mm_store_si128((__m128i *)(dst_ptr + 4),
                  _mm_unpackhi_epi16(coeffs, sign_ext));
}
146 
// Sign-extends the eight 16-bit lanes of *poutput to 32 bits and writes them
// to dst_ptr with two unaligned 128-bit stores (lanes 0-3, then lanes 4-7).
// The stores use _mm_storeu_si128, so dst_ptr has no 16-byte alignment
// requirement.
// NOTE(review): the `dst_ptr + 4` offset for the second store assumes
// tran_low_t is 32 bits wide - confirm against its typedef.
static inline void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  const __m128i coeffs = *poutput;
  // 0xffff in every lane holding a negative value, 0 elsewhere: exactly the
  // upper 16 bits each lane needs once widened to 32 bits.
  const __m128i sign_ext = _mm_cmpgt_epi16(_mm_setzero_si128(), coeffs);
  _mm_storeu_si128((__m128i *)(dst_ptr), _mm_unpacklo_epi16(coeffs, sign_ext));
  _mm_storeu_si128((__m128i *)(dst_ptr + 4),
                   _mm_unpackhi_epi16(coeffs, sign_ext));
}
155 
156 #ifdef __cplusplus
157 }  // extern "C"
158 #endif
159 
160 #endif  // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
161