1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/warp.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19
20 #include <smmintrin.h>
21
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstring>
26 #include <type_traits>
27
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/dsp/x86/transpose_sse4.h"
32 #include "src/utils/common.h"
33 #include "src/utils/constants.h"
34
35 namespace libgav1 {
36 namespace dsp {
37 namespace low_bitdepth {
38 namespace {
39
// Number of extra bits of precision in warped filtering. Filter positions are
// shifted right (with rounding) by this amount before being used to index the
// warped filter tables.
constexpr int kWarpedDiffPrecisionBits = 10;
42
// Adds the contribution of two filter taps to |sum| for eight output pixels.
//
// This assumes the two filters contain filter[x] and filter[x+2]: the low 8
// bytes of |filter_0| and |filter_1| hold one signed tap per output pixel, and
// the source bytes |src_window[x]| and |src_window[x + 2]| are interleaved to
// match. Each 16-bit lane x of the result is
//   sum[x] + src_window[x] * filter_0[x] + src_window[x + 2] * filter_1[x]
// where source bytes are treated as unsigned and taps as signed. The pair sum
// is saturated to int16_t by _mm_maddubs_epi16; the final add wraps.
inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
                                const __m128i filter_1,
                                const __m128i& src_window) {
  const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
  const __m128i src =
      _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
  return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
}
52
53 constexpr int kFirstPassOffset = 1 << 14;
54 constexpr int kOffsetRemoval =
55 (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
56
57 // Applies the horizontal filter to one source row and stores the result in
58 // |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
59 // |intermediate_result| two-dimensional array.
HorizontalFilter(const int sx4,const int16_t alpha,const __m128i src_row,int16_t intermediate_result_row[8])60 inline void HorizontalFilter(const int sx4, const int16_t alpha,
61 const __m128i src_row,
62 int16_t intermediate_result_row[8]) {
63 int sx = sx4 - MultiplyBy4(alpha);
64 __m128i filter[8];
65 for (__m128i& f : filter) {
66 const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
67 kWarpedPixelPrecisionShifts;
68 f = LoadLo8(kWarpedFilters8[offset]);
69 sx += alpha;
70 }
71 Transpose8x8To4x16_U8(filter, filter);
72 // |filter| now contains two filters per register.
73 // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
74 // without overflowing the sign bit. The sign bit is hit only where two taps
75 // paired in a single madd add up to more than 128. This is only possible with
76 // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
77 // even guarantees safety. |sum| is given a negative offset to allow for large
78 // intermediate values.
79 // k = 0, 2.
80 __m128i src_row_window = src_row;
81 __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
82 sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
83
84 // k = 1, 3.
85 src_row_window = _mm_srli_si128(src_row_window, 1);
86 sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
87 _mm_srli_si128(filter[1], 8), src_row_window);
88 // k = 4, 6.
89 src_row_window = _mm_srli_si128(src_row_window, 3);
90 sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
91
92 // k = 5, 7.
93 src_row_window = _mm_srli_si128(src_row_window, 1);
94 sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
95 _mm_srli_si128(filter[3], 8), src_row_window);
96
97 sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
98 StoreUnaligned16(intermediate_result_row, sum);
99 }
100
101 template <bool is_compound>
WriteVerticalFilter(const __m128i filter[8],const int16_t intermediate_result[15][8],int y,void * LIBGAV1_RESTRICT dst_row)102 inline void WriteVerticalFilter(const __m128i filter[8],
103 const int16_t intermediate_result[15][8], int y,
104 void* LIBGAV1_RESTRICT dst_row) {
105 constexpr int kRoundBitsVertical =
106 is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
107 __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
108 __m128i sum_high = sum_low;
109 for (int k = 0; k < 8; k += 2) {
110 const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
111 const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
112 const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
113 const __m128i intermediate_1 =
114 LoadUnaligned16(intermediate_result[y + k + 1]);
115 const __m128i intermediate_low =
116 _mm_unpacklo_epi16(intermediate_0, intermediate_1);
117 const __m128i intermediate_high =
118 _mm_unpackhi_epi16(intermediate_0, intermediate_1);
119
120 const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
121 const __m128i product_high =
122 _mm_madd_epi16(filters_high, intermediate_high);
123 sum_low = _mm_add_epi32(sum_low, product_low);
124 sum_high = _mm_add_epi32(sum_high, product_high);
125 }
126 sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
127 sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
128 if (is_compound) {
129 const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
130 StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
131 } else {
132 const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
133 StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
134 }
135 }
136
137 template <bool is_compound>
WriteVerticalFilter(const __m128i filter[8],const int16_t * LIBGAV1_RESTRICT intermediate_result_column,void * LIBGAV1_RESTRICT dst_row)138 inline void WriteVerticalFilter(const __m128i filter[8],
139 const int16_t* LIBGAV1_RESTRICT
140 intermediate_result_column,
141 void* LIBGAV1_RESTRICT dst_row) {
142 constexpr int kRoundBitsVertical =
143 is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
144 __m128i sum_low = _mm_setzero_si128();
145 __m128i sum_high = _mm_setzero_si128();
146 for (int k = 0; k < 8; k += 2) {
147 const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
148 const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
149 // Equivalent to unpacking two vectors made by duplicating int16_t values.
150 const __m128i intermediate =
151 _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
152 intermediate_result_column[k]);
153 const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
154 const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
155 sum_low = _mm_add_epi32(sum_low, product_low);
156 sum_high = _mm_add_epi32(sum_high, product_high);
157 }
158 sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
159 sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
160 if (is_compound) {
161 const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
162 StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
163 } else {
164 const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
165 StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
166 }
167 }
168
169 template <bool is_compound, typename DestType>
VerticalFilter(const int16_t source[15][8],int64_t y4,int gamma,int delta,DestType * LIBGAV1_RESTRICT dest_row,ptrdiff_t dest_stride)170 inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
171 int delta, DestType* LIBGAV1_RESTRICT dest_row,
172 ptrdiff_t dest_stride) {
173 int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
174 for (int y = 0; y < 8; ++y) {
175 int sy = sy4 - MultiplyBy4(gamma);
176 __m128i filter[8];
177 for (__m128i& f : filter) {
178 const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
179 kWarpedPixelPrecisionShifts;
180 f = LoadUnaligned16(kWarpedFilters[offset]);
181 sy += gamma;
182 }
183 Transpose8x8_U16(filter, filter);
184 WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
185 dest_row += dest_stride;
186 sy4 += delta;
187 }
188 }
189
190 template <bool is_compound, typename DestType>
VerticalFilter(const int16_t * LIBGAV1_RESTRICT source_cols,int64_t y4,int gamma,int delta,DestType * LIBGAV1_RESTRICT dest_row,ptrdiff_t dest_stride)191 inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
192 int64_t y4, int gamma, int delta,
193 DestType* LIBGAV1_RESTRICT dest_row,
194 ptrdiff_t dest_stride) {
195 int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
196 for (int y = 0; y < 8; ++y) {
197 int sy = sy4 - MultiplyBy4(gamma);
198 __m128i filter[8];
199 for (__m128i& f : filter) {
200 const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
201 kWarpedPixelPrecisionShifts;
202 f = LoadUnaligned16(kWarpedFilters[offset]);
203 sy += gamma;
204 }
205 Transpose8x8_U16(filter, filter);
206 WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
207 dest_row += dest_stride;
208 sy4 += delta;
209 }
210 }
211
212 template <bool is_compound, typename DestType>
WarpRegion1(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int source_height,int ix4,int iy4,DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)213 inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
214 ptrdiff_t source_stride, int source_width,
215 int source_height, int ix4, int iy4,
216 DestType* LIBGAV1_RESTRICT dst_row,
217 ptrdiff_t dest_stride) {
218 // Region 1
219 // Points to the left or right border of the first row of |src|.
220 const uint8_t* first_row_border =
221 (ix4 + 7 <= 0) ? src : src + source_width - 1;
222 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
223 // const int row = Clip3(iy4 + y, 0, source_height - 1);
224 // In two special cases, iy4 + y is clipped to either 0 or
225 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
226 // bounded and we can avoid clipping iy4 + y by relying on a reference
227 // frame's boundary extension on the top and bottom.
228 // Region 1.
229 // Every sample used to calculate the prediction block has the same
230 // value. So the whole prediction block has the same value.
231 const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
232 const uint8_t row_border_pixel = first_row_border[row * source_stride];
233
234 if (is_compound) {
235 const __m128i sum =
236 _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
237 kInterRoundBitsCompoundVertical));
238 StoreUnaligned16(dst_row, sum);
239 } else {
240 memset(dst_row, row_border_pixel, 8);
241 }
242 const DestType* const first_dst_row = dst_row;
243 dst_row += dest_stride;
244 for (int y = 1; y < 8; ++y) {
245 memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
246 dst_row += dest_stride;
247 }
248 }
249
250 template <bool is_compound, typename DestType>
WarpRegion2(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int64_t y4,int ix4,int iy4,int gamma,int delta,int16_t intermediate_result_column[15],DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)251 inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
252 ptrdiff_t source_stride, int source_width, int64_t y4,
253 int ix4, int iy4, int gamma, int delta,
254 int16_t intermediate_result_column[15],
255 DestType* LIBGAV1_RESTRICT dst_row,
256 ptrdiff_t dest_stride) {
257 // Region 2.
258 // Points to the left or right border of the first row of |src|.
259 const uint8_t* first_row_border =
260 (ix4 + 7 <= 0) ? src : src + source_width - 1;
261 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
262 // const int row = Clip3(iy4 + y, 0, source_height - 1);
263 // In two special cases, iy4 + y is clipped to either 0 or
264 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
265 // bounded and we can avoid clipping iy4 + y by relying on a reference
266 // frame's boundary extension on the top and bottom.
267
268 // Region 2.
269 // Horizontal filter.
270 // The input values in this region are generated by extending the border
271 // which makes them identical in the horizontal direction. This
272 // computation could be inlined in the vertical pass but most
273 // implementations will need a transpose of some sort.
274 // It is not necessary to use the offset values here because the
275 // horizontal pass is a simple shift and the vertical pass will always
276 // require using 32 bits.
277 for (int y = -7; y < 8; ++y) {
278 // We may over-read up to 13 pixels above the top source row, or up
279 // to 13 pixels below the bottom source row. This is proved in
280 // warp.cc.
281 const int row = iy4 + y;
282 int sum = first_row_border[row * source_stride];
283 sum <<= (kFilterBits - kInterRoundBitsHorizontal);
284 intermediate_result_column[y + 7] = sum;
285 }
286 // Region 2 vertical filter.
287 VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
288 delta, dst_row, dest_stride);
289 }
290
291 template <bool is_compound, typename DestType>
WarpRegion3(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_height,int alpha,int beta,int64_t x4,int ix4,int iy4,int16_t intermediate_result[15][8])292 inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
293 ptrdiff_t source_stride, int source_height, int alpha,
294 int beta, int64_t x4, int ix4, int iy4,
295 int16_t intermediate_result[15][8]) {
296 // Region 3
297 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
298
299 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
300 // const int row = Clip3(iy4 + y, 0, source_height - 1);
301 // In two special cases, iy4 + y is clipped to either 0 or
302 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
303 // bounded and we can avoid clipping iy4 + y by relying on a reference
304 // frame's boundary extension on the top and bottom.
305 // Horizontal filter.
306 const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
307 const uint8_t* const src_row = src + row * source_stride;
308 // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
309 // read but is ignored.
310 //
311 // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
312 // bytes after src_row[source_width - 1]. We assume the source frame
313 // has left and right borders of at least 13 bytes that extend the
314 // frame boundary pixels. We also assume there is at least one extra
315 // padding byte after the right border of the last source row.
316 const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
317 int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
318 for (int y = -7; y < 8; ++y) {
319 HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
320 sx4 += beta;
321 }
322 }
323
324 template <bool is_compound, typename DestType>
WarpRegion4(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int alpha,int beta,int64_t x4,int ix4,int iy4,int16_t intermediate_result[15][8])325 inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
326 ptrdiff_t source_stride, int alpha, int beta,
327 int64_t x4, int ix4, int iy4,
328 int16_t intermediate_result[15][8]) {
329 // Region 4.
330 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
331
332 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
333 // const int row = Clip3(iy4 + y, 0, source_height - 1);
334 // In two special cases, iy4 + y is clipped to either 0 or
335 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
336 // bounded and we can avoid clipping iy4 + y by relying on a reference
337 // frame's boundary extension on the top and bottom.
338 // Horizontal filter.
339 int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
340 for (int y = -7; y < 8; ++y) {
341 // We may over-read up to 13 pixels above the top source row, or up
342 // to 13 pixels below the bottom source row. This is proved in
343 // warp.cc.
344 const int row = iy4 + y;
345 const uint8_t* const src_row = src + row * source_stride;
346 // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
347 // read but is ignored.
348 //
349 // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
350 // bytes after src_row[source_width - 1]. We assume the source frame
351 // has left and right borders of at least 13 bytes that extend the
352 // frame boundary pixels. We also assume there is at least one extra
353 // padding byte after the right border of the last source row.
354 const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
355 // Convert src_row_v to int8 (subtract 128).
356 HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
357 sx4 += beta;
358 }
359 }
360
361 template <bool is_compound, typename DestType>
HandleWarpBlock(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int source_height,const int * LIBGAV1_RESTRICT warp_params,int subsampling_x,int subsampling_y,int src_x,int src_y,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta,DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)362 inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
363 ptrdiff_t source_stride, int source_width,
364 int source_height,
365 const int* LIBGAV1_RESTRICT warp_params,
366 int subsampling_x, int subsampling_y, int src_x,
367 int src_y, int16_t alpha, int16_t beta,
368 int16_t gamma, int16_t delta,
369 DestType* LIBGAV1_RESTRICT dst_row,
370 ptrdiff_t dest_stride) {
371 union {
372 // Intermediate_result is the output of the horizontal filtering and
373 // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
374 // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
375 // type so that we can start with a negative offset and restore it on the
376 // final filter sum.
377 int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
378 // In the simple special cases where the samples in each row are all the
379 // same, store one sample per row in a column vector.
380 int16_t intermediate_result_column[15];
381 };
382
383 const WarpFilterParams filter_params = GetWarpFilterParams(
384 src_x, src_y, subsampling_x, subsampling_y, warp_params);
385 // A prediction block may fall outside the frame's boundaries. If a
386 // prediction block is calculated using only samples outside the frame's
387 // boundary, the filtering can be simplified. We can divide the plane
388 // into several regions and handle them differently.
389 //
390 // | |
391 // 1 | 3 | 1
392 // | |
393 // -------+-----------+-------
394 // |***********|
395 // 2 |*****4*****| 2
396 // |***********|
397 // -------+-----------+-------
398 // | |
399 // 1 | 3 | 1
400 // | |
401 //
402 // At the center, region 4 represents the frame and is the general case.
403 //
404 // In regions 1 and 2, the prediction block is outside the frame's
405 // boundary horizontally. Therefore the horizontal filtering can be
406 // simplified. Furthermore, in the region 1 (at the four corners), the
407 // prediction is outside the frame's boundary both horizontally and
408 // vertically, so we get a constant prediction block.
409 //
410 // In region 3, the prediction block is outside the frame's boundary
411 // vertically. Unfortunately because we apply the horizontal filters
412 // first, by the time we apply the vertical filters, they no longer see
413 // simple inputs. So the only simplification is that all the rows are
414 // the same, but we still need to apply all the horizontal and vertical
415 // filters.
416
417 // Check for two simple special cases, where the horizontal filter can
418 // be significantly simplified.
419 //
420 // In general, for each row, the horizontal filter is calculated as
421 // follows:
422 // for (int x = -4; x < 4; ++x) {
423 // const int offset = ...;
424 // int sum = first_pass_offset;
425 // for (int k = 0; k < 8; ++k) {
426 // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
427 // sum += kWarpedFilters[offset][k] * src_row[column];
428 // }
429 // ...
430 // }
431 // The column index before clipping, ix4 + x + k - 3, varies in the range
432 // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
433 // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
434 // border index (source_width - 1 or 0, respectively). Then for each x,
435 // the inner for loop of the horizontal filter is reduced to multiplying
436 // the border pixel by the sum of the filter coefficients.
437 if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
438 if ((filter_params.iy4 - 7 >= source_height - 1 ||
439 filter_params.iy4 + 7 <= 0)) {
440 // Outside the frame in both directions. One repeated value.
441 WarpRegion1<is_compound, DestType>(
442 src, source_stride, source_width, source_height, filter_params.ix4,
443 filter_params.iy4, dst_row, dest_stride);
444 return;
445 }
446 // Outside the frame horizontally. Rows repeated.
447 WarpRegion2<is_compound, DestType>(
448 src, source_stride, source_width, filter_params.y4, filter_params.ix4,
449 filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
450 dest_stride);
451 return;
452 }
453
454 if ((filter_params.iy4 - 7 >= source_height - 1 ||
455 filter_params.iy4 + 7 <= 0)) {
456 // Outside the frame vertically.
457 WarpRegion3<is_compound, DestType>(
458 src, source_stride, source_height, alpha, beta, filter_params.x4,
459 filter_params.ix4, filter_params.iy4, intermediate_result);
460 } else {
461 // Inside the frame.
462 WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
463 filter_params.x4, filter_params.ix4,
464 filter_params.iy4, intermediate_result);
465 }
466 // Region 3 and 4 vertical filter.
467 VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
468 gamma, delta, dst_row, dest_stride);
469 }
470
471 template <bool is_compound>
Warp_SSE4_1(const void * LIBGAV1_RESTRICT source,ptrdiff_t source_stride,int source_width,int source_height,const int * LIBGAV1_RESTRICT warp_params,int subsampling_x,int subsampling_y,int block_start_x,int block_start_y,int block_width,int block_height,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta,void * LIBGAV1_RESTRICT dest,ptrdiff_t dest_stride)472 void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride,
473 int source_width, int source_height,
474 const int* LIBGAV1_RESTRICT warp_params, int subsampling_x,
475 int subsampling_y, int block_start_x, int block_start_y,
476 int block_width, int block_height, int16_t alpha, int16_t beta,
477 int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest,
478 ptrdiff_t dest_stride) {
479 const auto* const src = static_cast<const uint8_t*>(source);
480 using DestType =
481 typename std::conditional<is_compound, int16_t, uint8_t>::type;
482 auto* dst = static_cast<DestType*>(dest);
483
484 // Warp process applies for each 8x8 block.
485 assert(block_width >= 8);
486 assert(block_height >= 8);
487 const int block_end_x = block_start_x + block_width;
488 const int block_end_y = block_start_y + block_height;
489
490 const int start_x = block_start_x;
491 const int start_y = block_start_y;
492 int src_x = (start_x + 4) << subsampling_x;
493 int src_y = (start_y + 4) << subsampling_y;
494 const int end_x = (block_end_x + 4) << subsampling_x;
495 const int end_y = (block_end_y + 4) << subsampling_y;
496 do {
497 DestType* dst_row = dst;
498 src_x = (start_x + 4) << subsampling_x;
499 do {
500 HandleWarpBlock<is_compound, DestType>(
501 src, source_stride, source_width, source_height, warp_params,
502 subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
503 dst_row, dest_stride);
504 src_x += (8 << subsampling_x);
505 dst_row += 8;
506 } while (src_x < end_x);
507 dst += 8 * dest_stride;
508 src_y += (8 << subsampling_y);
509 } while (src_y < end_y);
510 }
511
Init8bpp()512 void Init8bpp() {
513 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
514 assert(dsp != nullptr);
515 dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
516 dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
517 }
518
519 } // namespace
520 } // namespace low_bitdepth
521
WarpInit_SSE4_1()522 void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
523
524 } // namespace dsp
525 } // namespace libgav1
526 #else // !LIBGAV1_TARGETING_SSE4_1
527
528 namespace libgav1 {
529 namespace dsp {
530
// No-op when SSE4.1 is not targeted; nothing is registered.
void WarpInit_SSE4_1() {}
532
533 } // namespace dsp
534 } // namespace libgav1
535 #endif // LIBGAV1_TARGETING_SSE4_1
536