xref: /aosp_15_r20/external/libgav1/src/dsp/x86/motion_vector_search_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/motion_vector_search.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <smmintrin.h>
21 
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/x86/common_sse4.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31 #include "src/utils/types.h"
32 
33 namespace libgav1 {
34 namespace dsp {
35 namespace {
36 
37 constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
38     0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
39     1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
40     744,  712,   682,  655,  630,  606,  585,  564,  546,  528};
41 
// Computes, for each 32-bit lane,
//   (mv * denominator * numerator + rounding) >> 14.
// _mm_madd_epi16() multiplies the signed 16-bit halves of every lane pairwise
// and adds the two products. When the upper 16 bits of each |denominator|
// lane are zero this reduces to a signed 16x16 -> 32-bit multiply of the low
// halves.
inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
                            const __m128i numerator) {
  const __m128i half = _mm_set1_epi32(1 << 13);
  const __m128i product =
      _mm_mullo_epi32(_mm_madd_epi16(mv, denominator), numerator);
  // The arithmetic shift yields 0 for non-negative lanes and -1 for negative
  // ones. Adding it ahead of the rounding bias makes a negative product round
  // to the exact negation of what its absolute value would round to.
  const __m128i negative = _mm_srai_epi32(product, 31);
  const __m128i adjusted = _mm_add_epi32(product, negative);
  return _mm_srai_epi32(_mm_add_epi32(adjusted, half), 14);
}
52 
MvProjectionClip(const __m128i mvs[2],const __m128i denominators[2],const __m128i numerator)53 inline __m128i MvProjectionClip(const __m128i mvs[2],
54                                 const __m128i denominators[2],
55                                 const __m128i numerator) {
56   const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
57   const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
58   const __m128i mv = _mm_packs_epi32(s0, s1);
59   const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
60   const __m128i projection_mv_clamp_negative =
61       _mm_set1_epi16(-kProjectionMvClamp);
62   const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
63   return _mm_max_epi16(clamp, projection_mv_clamp_negative);
64 }
65 
MvProjectionCompoundClip(const MotionVector * LIBGAV1_RESTRICT const temporal_mvs,const int8_t temporal_reference_offsets[2],const int reference_offsets[2])66 inline __m128i MvProjectionCompoundClip(
67     const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
68     const int8_t temporal_reference_offsets[2],
69     const int reference_offsets[2]) {
70   const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
71   const __m128i temporal_mv = LoadLo8(tmvs);
72   const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
73   __m128i mvs[2], denominators[2];
74   mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
75   mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
76   denominators[0] = _mm_set1_epi32(
77       kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
78   denominators[1] = _mm_set1_epi32(
79       kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
80   const __m128i offsets = LoadLo8(reference_offsets);
81   const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
82   return MvProjectionClip(mvs, denominators, numerator);
83 }
84 
MvProjectionSingleClip(const MotionVector * LIBGAV1_RESTRICT const temporal_mvs,const int8_t * LIBGAV1_RESTRICT const temporal_reference_offsets,const int reference_offset)85 inline __m128i MvProjectionSingleClip(
86     const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
87     const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
88     const int reference_offset) {
89   const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
90   const __m128i temporal_mv = LoadAligned16(tmvs);
91   __m128i lookup = _mm_cvtsi32_si128(
92       kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
93   lookup = _mm_insert_epi32(
94       lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
95       1);
96   lookup = _mm_insert_epi32(
97       lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
98       2);
99   lookup = _mm_insert_epi32(
100       lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
101       3);
102   __m128i mvs[2], denominators[2];
103   mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
104   mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
105   denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
106   denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
107   const __m128i numerator = _mm_set1_epi32(reference_offset);
108   return MvProjectionClip(mvs, denominators, numerator);
109 }
110 
LowPrecision(const __m128i mv,void * const candidate_mvs)111 inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
112   const __m128i kRoundDownMask = _mm_set1_epi16(~1);
113   const __m128i sign = _mm_srai_epi16(mv, 15);
114   const __m128i sub_sign = _mm_sub_epi16(mv, sign);
115   const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
116   StoreAligned16(candidate_mvs, d);
117 }
118 
ForceInteger(const __m128i mv,void * const candidate_mvs)119 inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
120   const __m128i kRoundDownMask = _mm_set1_epi16(~7);
121   const __m128i sign = _mm_srai_epi16(mv, 15);
122   const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
123   const __m128i mv2 = _mm_sub_epi16(mv1, sign);
124   const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
125   StoreAligned16(candidate_mvs, mv3);
126 }
127 
MvProjectionCompoundLowPrecision_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)128 void MvProjectionCompoundLowPrecision_SSE4_1(
129     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
130     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
131     const int reference_offsets[2], const int count,
132     CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
133   // |reference_offsets| non-zero check usually equals true and is ignored.
134   // To facilitate the compilers, make a local copy of |reference_offsets|.
135   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
136   // One more element could be calculated.
137   int i = 0;
138   do {
139     const __m128i mv = MvProjectionCompoundClip(
140         temporal_mvs + i, temporal_reference_offsets + i, offsets);
141     LowPrecision(mv, candidate_mvs + i);
142     i += 2;
143   } while (i < count);
144 }
145 
MvProjectionCompoundForceInteger_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)146 void MvProjectionCompoundForceInteger_SSE4_1(
147     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
148     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
149     const int reference_offsets[2], const int count,
150     CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
151   // |reference_offsets| non-zero check usually equals true and is ignored.
152   // To facilitate the compilers, make a local copy of |reference_offsets|.
153   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
154   // One more element could be calculated.
155   int i = 0;
156   do {
157     const __m128i mv = MvProjectionCompoundClip(
158         temporal_mvs + i, temporal_reference_offsets + i, offsets);
159     ForceInteger(mv, candidate_mvs + i);
160     i += 2;
161   } while (i < count);
162 }
163 
MvProjectionCompoundHighPrecision_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)164 void MvProjectionCompoundHighPrecision_SSE4_1(
165     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
166     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
167     const int reference_offsets[2], const int count,
168     CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
169   // |reference_offsets| non-zero check usually equals true and is ignored.
170   // To facilitate the compilers, make a local copy of |reference_offsets|.
171   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
172   // One more element could be calculated.
173   int i = 0;
174   do {
175     const __m128i mv = MvProjectionCompoundClip(
176         temporal_mvs + i, temporal_reference_offsets + i, offsets);
177     StoreAligned16(candidate_mvs + i, mv);
178     i += 2;
179   } while (i < count);
180 }
181 
MvProjectionSingleLowPrecision_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)182 void MvProjectionSingleLowPrecision_SSE4_1(
183     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
184     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
185     const int reference_offset, const int count,
186     MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
187   // Up to three more elements could be calculated.
188   int i = 0;
189   do {
190     const __m128i mv = MvProjectionSingleClip(
191         temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
192     LowPrecision(mv, candidate_mvs + i);
193     i += 4;
194   } while (i < count);
195 }
196 
MvProjectionSingleForceInteger_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)197 void MvProjectionSingleForceInteger_SSE4_1(
198     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
199     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
200     const int reference_offset, const int count,
201     MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
202   // Up to three more elements could be calculated.
203   int i = 0;
204   do {
205     const __m128i mv = MvProjectionSingleClip(
206         temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
207     ForceInteger(mv, candidate_mvs + i);
208     i += 4;
209   } while (i < count);
210 }
211 
MvProjectionSingleHighPrecision_SSE4_1(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)212 void MvProjectionSingleHighPrecision_SSE4_1(
213     const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
214     const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
215     const int reference_offset, const int count,
216     MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
217   // Up to three more elements could be calculated.
218   int i = 0;
219   do {
220     const __m128i mv = MvProjectionSingleClip(
221         temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
222     StoreAligned16(candidate_mvs + i, mv);
223     i += 4;
224   } while (i < count);
225 }
226 
227 }  // namespace
228 
MotionVectorSearchInit_SSE4_1()229 void MotionVectorSearchInit_SSE4_1() {
230   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
231   assert(dsp != nullptr);
232   dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
233   dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
234   dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
235   dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
236   dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
237   dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
238 }
239 
240 }  // namespace dsp
241 }  // namespace libgav1
242 
243 #else   // !LIBGAV1_TARGETING_SSE4_1
244 namespace libgav1 {
245 namespace dsp {
246 
// Variant compiled when LIBGAV1_TARGETING_SSE4_1 is false: it keeps the
// symbol defined and registers nothing.
void MotionVectorSearchInit_SSE4_1() {
  // Intentionally empty.
}
248 
249 }  // namespace dsp
250 }  // namespace libgav1
251 #endif  // LIBGAV1_TARGETING_SSE4_1
252