// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/motion_field_projection.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/types.h"

namespace libgav1 {
namespace dsp {
namespace {

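// Looks up, for each of the low 8 byte lanes of |reference_offset|, the
// 16-bit entry of |division_table| at that index, using a single pshufb. The
// shuffle indices are built as the byte pairs (2 * lane, 2 * lane + 1), i.e.
// the low and high bytes of the 16-bit entry. Roughly, in scalar form:
//   for (int i = 0; i < 8; ++i) result[i] = division_table[offset[i]];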
inline __m128i LoadDivision(const __m128i division_table,
                            const __m128i reference_offset) {
  const __m128i kOne = _mm_set1_epi16(0x0100);
  const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
  const __m128i tt = _mm_unpacklo_epi8(t, t);
  const __m128i idx = _mm_add_epi8(tt, kOne);
  return _mm_shuffle_epi8(division_table, idx);
}

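// Each 32-bit lane of |mv| and |denominator| holds one 16-bit value in its
// low half; pmaddwd forms their signed 16x16 product, pmulld scales it by
// |numerator|, and the sum is rounded at bit 14. Adding the sign (0 or -1)
// first makes negative products round the same way as positive ones,
// matching the scalar RightShiftWithRoundingSigned().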
inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
                            const int numerator) {
  const __m128i m0 = _mm_madd_epi16(mv, denominator);
  const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
  // Add the sign (0 or -1) to round towards zero.
  const __m128i sign = _mm_srai_epi32(m, 31);
  const __m128i add_sign = _mm_add_epi32(m, sign);
  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
  return _mm_srai_epi32(sum, 14);
}

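// Projects 8 packed 16-bit mv components: places each component and its
// division factor in a 32-bit lane, projects both halves, then packs back to
// 16 bits with signed saturation and clamps to
// [-kProjectionMvClamp, kProjectionMvClamp].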
inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
                                const int numerator) {
  const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
  const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
  const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
  const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
  const __m128i s0 = MvProjection(mv0, denorm0, numerator);
  const __m128i s1 = MvProjection(mv1, denorm1, numerator);
  const __m128i projection = _mm_packs_epi32(s0, s1);
  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
  const __m128i projection_mv_clamp_negative =
      _mm_set1_epi16(-kProjectionMvClamp);
  const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
}

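// Vector form of Project(): computes delta / 64 with truncation towards zero,
// then applies |dst_sign| (0 or -1 in every lane) via the branchless
// conditional negation (x ^ sign) - sign.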
inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
  // Add 63 to negative delta so that it shifts towards zero.
  const __m128i delta_sign = _mm_srai_epi16(delta, 15);
  const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
  const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
  const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
  const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
  return _mm_sub_epi16(offset1, dst_sign);
}

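// Projects 8 motion vectors at |mv + x8|. Outputs the gathered reference
// offsets in |r|, the packed x (low 8 bytes) and y (high 8 bytes) positions
// relative to the current 8-column group in |position_xy|, and a byte mask in
// |skip_64| that is -1 in every lane whose reference is skipped or whose
// projection falls outside the valid row/column window. |delta| is 0 in the
// main loop and 8 - leftover in the leftover pass, shifting the valid column
// window accordingly.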
inline void GetPosition(
    const __m128i division_table, const MotionVector* const mv,
    const int numerator, const int x8_start, const int x8_end, const int x8,
    const __m128i& r_offsets, const __m128i& source_reference_type8,
    const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
    const __m128i& d_sign, const int delta, __m128i* const r,
    __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
  *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
  const __m128i denorm = LoadDivision(division_table, source_reference_type8);
  __m128i projection_mv[2];
  mvs[0] = LoadUnaligned16(mv_int + 0);
  mvs[1] = LoadUnaligned16(mv_int + 4);
  // Deinterleave x and y components.
  const __m128i kShuffle =
      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
  const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
  const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
  const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
  const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
  // numerator could be 0.
  projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
  projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
  // Do not update the motion vector if the block position is not valid or
  // if position_x8 is outside the current range of x8_start and x8_end.
  // Note that position_y8 will always be within the range of y8_start and
  // y8_end.
  // After subtracting the base, valid projections are within 8-bit.
  const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
  const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
  const __m128i positions = _mm_packs_epi16(position_x, position_y);
  const __m128i k01234567 =
      _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
  *position_xy = _mm_add_epi8(positions, k01234567);
  const int x8_floor = std::max(
      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
  const int x8_ceiling =
      std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
      1;  // [-1, 15]
  const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
  const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
  const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
  const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
  const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
  const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
  const __m128i out = _mm_or_si128(underflow, overflow);
  const __m128i skip_low = _mm_or_si128(skip_r, out);
  const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
  StoreLo8(skip_64, skip);
}

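// Scatters the result for lane |idx|: the int16 lane of |position| is the
// flattened offset (y * stride + x) from the row base pointers. For lane 0 of
// each 4-lane mv group the cheaper _mm_cvtsi128_si32() replaces
// _mm_extract_epi32().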
template <int idx>
inline void Store(const __m128i position, const __m128i reference_offset,
                  const __m128i mv, int8_t* dst_reference_offset,
                  MotionVector* dst_mv) {
  const ptrdiff_t offset =
      static_cast<int16_t>(_mm_extract_epi16(position, idx));
  if ((idx & 3) == 0) {
    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
  } else {
    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
  }
  dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
}

template <int idx>
inline void CheckStore(const int8_t* skips, const __m128i position,
                       const __m128i reference_offset, const __m128i mv,
                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
  if (skips[idx] == 0) {
    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
  }
}

// 7.9.2.
void MotionFieldProjectionKernel_SSE4_1(
    const ReferenceInfo& reference_info,
    const int reference_to_current_with_sign, const int dst_sign,
    const int y8_start, const int y8_end, const int x8_start, const int x8_end,
    TemporalMotionField* const motion_field) {
  const ptrdiff_t stride = motion_field->mv.columns();
  // The column range has to be offset by kProjectionMvMaxHorizontalOffset
  // since coordinates in that range could end up being position_x8 because of
  // projection.
  const int adjusted_x8_start =
      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
  const int adjusted_x8_end = std::min(
      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
  const int leftover = adjusted_x8_end - adjusted_x8_end8;
  const int8_t* const reference_offsets =
      reference_info.relative_distance_to.data();
  const bool* const skip_references = reference_info.skip_references.data();
  const int16_t* const projection_divisions =
      reference_info.projection_divisions.data();
  const ReferenceFrameType* source_reference_types =
      &reference_info.motion_field_reference_frame[y8_start][0];
  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
  MotionVector* dst_mv = motion_field->mv[y8_start];
  const __m128i d_sign = _mm_set1_epi16(dst_sign);

  static_assert(sizeof(int8_t) == sizeof(bool), "");
  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
  assert(dst_sign == 0 || dst_sign == -1);
  assert(stride == motion_field->reference_offset.columns());
  assert((y8_start & 7) == 0);
  assert((adjusted_x8_start & 7) == 0);
  // The final position calculation is represented with int16_t. Valid
  // position_y8 from its base is at most 7. After considering the horizontal
  // offset, which is at most |stride - 1|, we have the following assertion,
  // which means this optimization works for frame widths up to 32K (each
  // position is an 8x8 block).
  assert(8 * stride <= 32768);
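  // For example, a 32768-pixel-wide frame gives stride = 4096 positions per
  // row, and the largest offset, 7 * 4096 + 4095 = 32767, still fits in
  // int16_t.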
  const __m128i skip_reference = LoadLo8(skip_references);
  const __m128i r_offsets = LoadLo8(reference_offsets);
  const __m128i division_table = LoadUnaligned16(projection_divisions);

  int y8 = y8_start;
  do {
    const int y8_floor = (y8 & ~7) - y8;  // [-7, 0]
    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1;  // [0, 7]
    const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
    const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
    int x8;

    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
      const __m128i source_reference_type8 =
          LoadLo8(source_reference_types + x8);
      const __m128i skip_r =
          _mm_shuffle_epi8(skip_reference, source_reference_type8);
      int64_t early_skip;
      StoreLo8(&early_skip, skip_r);
      // Early termination #1 if all are skips. Chance is typically ~30-40%.
      if (early_skip == -1) continue;
      int64_t skip_64;
      __m128i r, position_xy, mvs[2];
      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
                  mvs);
      // Early termination #2 if all are skips.
      // Chance is typically ~15-25% after Early termination #1.
      if (skip_64 == -1) continue;
      const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
      const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
      const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
      const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
      const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
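      // Each int16 lane of |position| is now the flattened offset
      // position_y8 * stride + position_x8 of that lane's destination,
      // relative to the row base pointers dst_mv and dst_reference_offset.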
      if (skip_64 == 0) {
        // Store all. Chance is typically ~70-85% after Early termination #2.
        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
      } else {
        // Check and store each.
        // Chance is typically ~15-30% after Early termination #2.
        // The compiler is smart enough to not create the local buffer skips[].
        int8_t skips[8];
        memcpy(skips, &skip_64, sizeof(skips));
        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
      }
    }

    // The following leftover processing cannot be moved out of the do...while
    // loop, since doing so may change the order in which results are stored
    // to the same position.
    if (leftover > 0) {
      // Use SIMD only when leftover is at least 4, and there are at least 8
      // elements in a row.
      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
        // Process the last 8 elements to avoid loading invalid memory. Some
        // elements may have been processed in the above loop, which is OK.
        const int delta = 8 - leftover;
        x8 = adjusted_x8_end - 8;
        const __m128i source_reference_type8 =
            LoadLo8(source_reference_types + x8);
        const __m128i skip_r =
            _mm_shuffle_epi8(skip_reference, source_reference_type8);
        int64_t early_skip;
        StoreLo8(&early_skip, skip_r);
        // Early termination #1 if all are skips.
        if (early_skip != -1) {
          int64_t skip_64;
          __m128i r, position_xy, mvs[2];
          GetPosition(division_table, mv, reference_to_current_with_sign,
                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
                      &position_xy, &skip_64, mvs);
          // Early termination #2 if all are skips.
          if (skip_64 != -1) {
            const __m128i p_y =
                _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
            const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
            const __m128i p_y_offset =
                _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
            const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
            const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
            // Store up to 7 elements since leftover is at most 7.
            if (skip_64 == 0) {
              // Store all.
              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
            } else {
              // Check and store each.
              // The compiler is smart enough to not create the local buffer
              // skips[].
              int8_t skips[8];
              memcpy(skips, &skip_64, sizeof(skips));
              CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
            }
          }
        }
      } else {
        for (; x8 < adjusted_x8_end; ++x8) {
          const int source_reference_type = source_reference_types[x8];
          if (skip_references[source_reference_type]) continue;
          MotionVector projection_mv;
          // reference_to_current_with_sign could be 0.
          GetMvProjection(mv[x8], reference_to_current_with_sign,
                          projection_divisions[source_reference_type],
                          &projection_mv);
          // Do not update the motion vector if the block position is not
          // valid or if position_x8 is outside the current range of x8_start
          // and x8_end. Note that position_y8 will always be within the range
          // of y8_start and y8_end.
          const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
          if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
          const int x8_base = x8 & ~7;
          const int x8_floor =
              std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
          const int x8_ceiling =
              std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
          const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
          if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
          dst_mv[position_y8 * stride + position_x8] = mv[x8];
          dst_reference_offset[position_y8 * stride + position_x8] =
              reference_offsets[source_reference_type];
        }
      }
    }

    source_reference_types += stride;
    mv += stride;
    dst_reference_offset += stride;
    dst_mv += stride;
  } while (++y8 < y8_end);
}

}  // namespace

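// Registers the SSE4.1 kernel in the bitdepth-8 Dsp table.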
void MotionFieldProjectionInit_SSE4_1() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
}

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void MotionFieldProjectionInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1