// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/warp.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_SSE4_1
19*09537850SAkhilesh Sanikop
20*09537850SAkhilesh Sanikop #include <smmintrin.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <cassert>
23*09537850SAkhilesh Sanikop #include <cstddef>
24*09537850SAkhilesh Sanikop #include <cstdint>
25*09537850SAkhilesh Sanikop #include <cstring>
26*09537850SAkhilesh Sanikop #include <type_traits>
27*09537850SAkhilesh Sanikop
28*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
30*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_sse4.h"
31*09537850SAkhilesh Sanikop #include "src/dsp/x86/transpose_sse4.h"
32*09537850SAkhilesh Sanikop #include "src/utils/common.h"
33*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
34*09537850SAkhilesh Sanikop
35*09537850SAkhilesh Sanikop namespace libgav1 {
36*09537850SAkhilesh Sanikop namespace dsp {
37*09537850SAkhilesh Sanikop namespace low_bitdepth {
38*09537850SAkhilesh Sanikop namespace {
39*09537850SAkhilesh Sanikop
// Number of extra bits of precision in warped filtering. The filter position
// accumulators in this file are rounded down by this many bits before being
// used as an index into the warped filter tables.
constexpr int kWarpedDiffPrecisionBits = 10;
42*09537850SAkhilesh Sanikop
43*09537850SAkhilesh Sanikop // This assumes the two filters contain filter[x] and filter[x+2].
AccumulateFilter(const __m128i sum,const __m128i filter_0,const __m128i filter_1,const __m128i & src_window)44*09537850SAkhilesh Sanikop inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
45*09537850SAkhilesh Sanikop const __m128i filter_1,
46*09537850SAkhilesh Sanikop const __m128i& src_window) {
47*09537850SAkhilesh Sanikop const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
48*09537850SAkhilesh Sanikop const __m128i src =
49*09537850SAkhilesh Sanikop _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
50*09537850SAkhilesh Sanikop return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
51*09537850SAkhilesh Sanikop }
52*09537850SAkhilesh Sanikop
53*09537850SAkhilesh Sanikop constexpr int kFirstPassOffset = 1 << 14;
54*09537850SAkhilesh Sanikop constexpr int kOffsetRemoval =
55*09537850SAkhilesh Sanikop (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
56*09537850SAkhilesh Sanikop
57*09537850SAkhilesh Sanikop // Applies the horizontal filter to one source row and stores the result in
58*09537850SAkhilesh Sanikop // |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
59*09537850SAkhilesh Sanikop // |intermediate_result| two-dimensional array.
HorizontalFilter(const int sx4,const int16_t alpha,const __m128i src_row,int16_t intermediate_result_row[8])60*09537850SAkhilesh Sanikop inline void HorizontalFilter(const int sx4, const int16_t alpha,
61*09537850SAkhilesh Sanikop const __m128i src_row,
62*09537850SAkhilesh Sanikop int16_t intermediate_result_row[8]) {
63*09537850SAkhilesh Sanikop int sx = sx4 - MultiplyBy4(alpha);
64*09537850SAkhilesh Sanikop __m128i filter[8];
65*09537850SAkhilesh Sanikop for (__m128i& f : filter) {
66*09537850SAkhilesh Sanikop const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
67*09537850SAkhilesh Sanikop kWarpedPixelPrecisionShifts;
68*09537850SAkhilesh Sanikop f = LoadLo8(kWarpedFilters8[offset]);
69*09537850SAkhilesh Sanikop sx += alpha;
70*09537850SAkhilesh Sanikop }
71*09537850SAkhilesh Sanikop Transpose8x8To4x16_U8(filter, filter);
72*09537850SAkhilesh Sanikop // |filter| now contains two filters per register.
73*09537850SAkhilesh Sanikop // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
74*09537850SAkhilesh Sanikop // without overflowing the sign bit. The sign bit is hit only where two taps
75*09537850SAkhilesh Sanikop // paired in a single madd add up to more than 128. This is only possible with
76*09537850SAkhilesh Sanikop // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
77*09537850SAkhilesh Sanikop // even guarantees safety. |sum| is given a negative offset to allow for large
78*09537850SAkhilesh Sanikop // intermediate values.
79*09537850SAkhilesh Sanikop // k = 0, 2.
80*09537850SAkhilesh Sanikop __m128i src_row_window = src_row;
81*09537850SAkhilesh Sanikop __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
82*09537850SAkhilesh Sanikop sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
83*09537850SAkhilesh Sanikop
84*09537850SAkhilesh Sanikop // k = 1, 3.
85*09537850SAkhilesh Sanikop src_row_window = _mm_srli_si128(src_row_window, 1);
86*09537850SAkhilesh Sanikop sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
87*09537850SAkhilesh Sanikop _mm_srli_si128(filter[1], 8), src_row_window);
88*09537850SAkhilesh Sanikop // k = 4, 6.
89*09537850SAkhilesh Sanikop src_row_window = _mm_srli_si128(src_row_window, 3);
90*09537850SAkhilesh Sanikop sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
91*09537850SAkhilesh Sanikop
92*09537850SAkhilesh Sanikop // k = 5, 7.
93*09537850SAkhilesh Sanikop src_row_window = _mm_srli_si128(src_row_window, 1);
94*09537850SAkhilesh Sanikop sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
95*09537850SAkhilesh Sanikop _mm_srli_si128(filter[3], 8), src_row_window);
96*09537850SAkhilesh Sanikop
97*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
98*09537850SAkhilesh Sanikop StoreUnaligned16(intermediate_result_row, sum);
99*09537850SAkhilesh Sanikop }
100*09537850SAkhilesh Sanikop
101*09537850SAkhilesh Sanikop template <bool is_compound>
WriteVerticalFilter(const __m128i filter[8],const int16_t intermediate_result[15][8],int y,void * LIBGAV1_RESTRICT dst_row)102*09537850SAkhilesh Sanikop inline void WriteVerticalFilter(const __m128i filter[8],
103*09537850SAkhilesh Sanikop const int16_t intermediate_result[15][8], int y,
104*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT dst_row) {
105*09537850SAkhilesh Sanikop constexpr int kRoundBitsVertical =
106*09537850SAkhilesh Sanikop is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
107*09537850SAkhilesh Sanikop __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
108*09537850SAkhilesh Sanikop __m128i sum_high = sum_low;
109*09537850SAkhilesh Sanikop for (int k = 0; k < 8; k += 2) {
110*09537850SAkhilesh Sanikop const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
111*09537850SAkhilesh Sanikop const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
112*09537850SAkhilesh Sanikop const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
113*09537850SAkhilesh Sanikop const __m128i intermediate_1 =
114*09537850SAkhilesh Sanikop LoadUnaligned16(intermediate_result[y + k + 1]);
115*09537850SAkhilesh Sanikop const __m128i intermediate_low =
116*09537850SAkhilesh Sanikop _mm_unpacklo_epi16(intermediate_0, intermediate_1);
117*09537850SAkhilesh Sanikop const __m128i intermediate_high =
118*09537850SAkhilesh Sanikop _mm_unpackhi_epi16(intermediate_0, intermediate_1);
119*09537850SAkhilesh Sanikop
120*09537850SAkhilesh Sanikop const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
121*09537850SAkhilesh Sanikop const __m128i product_high =
122*09537850SAkhilesh Sanikop _mm_madd_epi16(filters_high, intermediate_high);
123*09537850SAkhilesh Sanikop sum_low = _mm_add_epi32(sum_low, product_low);
124*09537850SAkhilesh Sanikop sum_high = _mm_add_epi32(sum_high, product_high);
125*09537850SAkhilesh Sanikop }
126*09537850SAkhilesh Sanikop sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
127*09537850SAkhilesh Sanikop sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
128*09537850SAkhilesh Sanikop if (is_compound) {
129*09537850SAkhilesh Sanikop const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
130*09537850SAkhilesh Sanikop StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
131*09537850SAkhilesh Sanikop } else {
132*09537850SAkhilesh Sanikop const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
133*09537850SAkhilesh Sanikop StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
134*09537850SAkhilesh Sanikop }
135*09537850SAkhilesh Sanikop }
136*09537850SAkhilesh Sanikop
137*09537850SAkhilesh Sanikop template <bool is_compound>
WriteVerticalFilter(const __m128i filter[8],const int16_t * LIBGAV1_RESTRICT intermediate_result_column,void * LIBGAV1_RESTRICT dst_row)138*09537850SAkhilesh Sanikop inline void WriteVerticalFilter(const __m128i filter[8],
139*09537850SAkhilesh Sanikop const int16_t* LIBGAV1_RESTRICT
140*09537850SAkhilesh Sanikop intermediate_result_column,
141*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT dst_row) {
142*09537850SAkhilesh Sanikop constexpr int kRoundBitsVertical =
143*09537850SAkhilesh Sanikop is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
144*09537850SAkhilesh Sanikop __m128i sum_low = _mm_setzero_si128();
145*09537850SAkhilesh Sanikop __m128i sum_high = _mm_setzero_si128();
146*09537850SAkhilesh Sanikop for (int k = 0; k < 8; k += 2) {
147*09537850SAkhilesh Sanikop const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
148*09537850SAkhilesh Sanikop const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
149*09537850SAkhilesh Sanikop // Equivalent to unpacking two vectors made by duplicating int16_t values.
150*09537850SAkhilesh Sanikop const __m128i intermediate =
151*09537850SAkhilesh Sanikop _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
152*09537850SAkhilesh Sanikop intermediate_result_column[k]);
153*09537850SAkhilesh Sanikop const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
154*09537850SAkhilesh Sanikop const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
155*09537850SAkhilesh Sanikop sum_low = _mm_add_epi32(sum_low, product_low);
156*09537850SAkhilesh Sanikop sum_high = _mm_add_epi32(sum_high, product_high);
157*09537850SAkhilesh Sanikop }
158*09537850SAkhilesh Sanikop sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
159*09537850SAkhilesh Sanikop sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
160*09537850SAkhilesh Sanikop if (is_compound) {
161*09537850SAkhilesh Sanikop const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
162*09537850SAkhilesh Sanikop StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
163*09537850SAkhilesh Sanikop } else {
164*09537850SAkhilesh Sanikop const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
165*09537850SAkhilesh Sanikop StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
166*09537850SAkhilesh Sanikop }
167*09537850SAkhilesh Sanikop }
168*09537850SAkhilesh Sanikop
169*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
VerticalFilter(const int16_t source[15][8],int64_t y4,int gamma,int delta,DestType * LIBGAV1_RESTRICT dest_row,ptrdiff_t dest_stride)170*09537850SAkhilesh Sanikop inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
171*09537850SAkhilesh Sanikop int delta, DestType* LIBGAV1_RESTRICT dest_row,
172*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
173*09537850SAkhilesh Sanikop int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
174*09537850SAkhilesh Sanikop for (int y = 0; y < 8; ++y) {
175*09537850SAkhilesh Sanikop int sy = sy4 - MultiplyBy4(gamma);
176*09537850SAkhilesh Sanikop __m128i filter[8];
177*09537850SAkhilesh Sanikop for (__m128i& f : filter) {
178*09537850SAkhilesh Sanikop const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
179*09537850SAkhilesh Sanikop kWarpedPixelPrecisionShifts;
180*09537850SAkhilesh Sanikop f = LoadUnaligned16(kWarpedFilters[offset]);
181*09537850SAkhilesh Sanikop sy += gamma;
182*09537850SAkhilesh Sanikop }
183*09537850SAkhilesh Sanikop Transpose8x8_U16(filter, filter);
184*09537850SAkhilesh Sanikop WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
185*09537850SAkhilesh Sanikop dest_row += dest_stride;
186*09537850SAkhilesh Sanikop sy4 += delta;
187*09537850SAkhilesh Sanikop }
188*09537850SAkhilesh Sanikop }
189*09537850SAkhilesh Sanikop
190*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
VerticalFilter(const int16_t * LIBGAV1_RESTRICT source_cols,int64_t y4,int gamma,int delta,DestType * LIBGAV1_RESTRICT dest_row,ptrdiff_t dest_stride)191*09537850SAkhilesh Sanikop inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
192*09537850SAkhilesh Sanikop int64_t y4, int gamma, int delta,
193*09537850SAkhilesh Sanikop DestType* LIBGAV1_RESTRICT dest_row,
194*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
195*09537850SAkhilesh Sanikop int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
196*09537850SAkhilesh Sanikop for (int y = 0; y < 8; ++y) {
197*09537850SAkhilesh Sanikop int sy = sy4 - MultiplyBy4(gamma);
198*09537850SAkhilesh Sanikop __m128i filter[8];
199*09537850SAkhilesh Sanikop for (__m128i& f : filter) {
200*09537850SAkhilesh Sanikop const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
201*09537850SAkhilesh Sanikop kWarpedPixelPrecisionShifts;
202*09537850SAkhilesh Sanikop f = LoadUnaligned16(kWarpedFilters[offset]);
203*09537850SAkhilesh Sanikop sy += gamma;
204*09537850SAkhilesh Sanikop }
205*09537850SAkhilesh Sanikop Transpose8x8_U16(filter, filter);
206*09537850SAkhilesh Sanikop WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
207*09537850SAkhilesh Sanikop dest_row += dest_stride;
208*09537850SAkhilesh Sanikop sy4 += delta;
209*09537850SAkhilesh Sanikop }
210*09537850SAkhilesh Sanikop }
211*09537850SAkhilesh Sanikop
212*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
WarpRegion1(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int source_height,int ix4,int iy4,DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)213*09537850SAkhilesh Sanikop inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
214*09537850SAkhilesh Sanikop ptrdiff_t source_stride, int source_width,
215*09537850SAkhilesh Sanikop int source_height, int ix4, int iy4,
216*09537850SAkhilesh Sanikop DestType* LIBGAV1_RESTRICT dst_row,
217*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
218*09537850SAkhilesh Sanikop // Region 1
219*09537850SAkhilesh Sanikop // Points to the left or right border of the first row of |src|.
220*09537850SAkhilesh Sanikop const uint8_t* first_row_border =
221*09537850SAkhilesh Sanikop (ix4 + 7 <= 0) ? src : src + source_width - 1;
222*09537850SAkhilesh Sanikop // In general, for y in [-7, 8), the row number iy4 + y is clipped:
223*09537850SAkhilesh Sanikop // const int row = Clip3(iy4 + y, 0, source_height - 1);
224*09537850SAkhilesh Sanikop // In two special cases, iy4 + y is clipped to either 0 or
225*09537850SAkhilesh Sanikop // source_height - 1 for all y. In the rest of the cases, iy4 + y is
226*09537850SAkhilesh Sanikop // bounded and we can avoid clipping iy4 + y by relying on a reference
227*09537850SAkhilesh Sanikop // frame's boundary extension on the top and bottom.
228*09537850SAkhilesh Sanikop // Region 1.
229*09537850SAkhilesh Sanikop // Every sample used to calculate the prediction block has the same
230*09537850SAkhilesh Sanikop // value. So the whole prediction block has the same value.
231*09537850SAkhilesh Sanikop const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
232*09537850SAkhilesh Sanikop const uint8_t row_border_pixel = first_row_border[row * source_stride];
233*09537850SAkhilesh Sanikop
234*09537850SAkhilesh Sanikop if (is_compound) {
235*09537850SAkhilesh Sanikop const __m128i sum =
236*09537850SAkhilesh Sanikop _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
237*09537850SAkhilesh Sanikop kInterRoundBitsCompoundVertical));
238*09537850SAkhilesh Sanikop StoreUnaligned16(dst_row, sum);
239*09537850SAkhilesh Sanikop } else {
240*09537850SAkhilesh Sanikop memset(dst_row, row_border_pixel, 8);
241*09537850SAkhilesh Sanikop }
242*09537850SAkhilesh Sanikop const DestType* const first_dst_row = dst_row;
243*09537850SAkhilesh Sanikop dst_row += dest_stride;
244*09537850SAkhilesh Sanikop for (int y = 1; y < 8; ++y) {
245*09537850SAkhilesh Sanikop memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
246*09537850SAkhilesh Sanikop dst_row += dest_stride;
247*09537850SAkhilesh Sanikop }
248*09537850SAkhilesh Sanikop }
249*09537850SAkhilesh Sanikop
250*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
WarpRegion2(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int64_t y4,int ix4,int iy4,int gamma,int delta,int16_t intermediate_result_column[15],DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)251*09537850SAkhilesh Sanikop inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
252*09537850SAkhilesh Sanikop ptrdiff_t source_stride, int source_width, int64_t y4,
253*09537850SAkhilesh Sanikop int ix4, int iy4, int gamma, int delta,
254*09537850SAkhilesh Sanikop int16_t intermediate_result_column[15],
255*09537850SAkhilesh Sanikop DestType* LIBGAV1_RESTRICT dst_row,
256*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
257*09537850SAkhilesh Sanikop // Region 2.
258*09537850SAkhilesh Sanikop // Points to the left or right border of the first row of |src|.
259*09537850SAkhilesh Sanikop const uint8_t* first_row_border =
260*09537850SAkhilesh Sanikop (ix4 + 7 <= 0) ? src : src + source_width - 1;
261*09537850SAkhilesh Sanikop // In general, for y in [-7, 8), the row number iy4 + y is clipped:
262*09537850SAkhilesh Sanikop // const int row = Clip3(iy4 + y, 0, source_height - 1);
263*09537850SAkhilesh Sanikop // In two special cases, iy4 + y is clipped to either 0 or
264*09537850SAkhilesh Sanikop // source_height - 1 for all y. In the rest of the cases, iy4 + y is
265*09537850SAkhilesh Sanikop // bounded and we can avoid clipping iy4 + y by relying on a reference
266*09537850SAkhilesh Sanikop // frame's boundary extension on the top and bottom.
267*09537850SAkhilesh Sanikop
268*09537850SAkhilesh Sanikop // Region 2.
269*09537850SAkhilesh Sanikop // Horizontal filter.
270*09537850SAkhilesh Sanikop // The input values in this region are generated by extending the border
271*09537850SAkhilesh Sanikop // which makes them identical in the horizontal direction. This
272*09537850SAkhilesh Sanikop // computation could be inlined in the vertical pass but most
273*09537850SAkhilesh Sanikop // implementations will need a transpose of some sort.
274*09537850SAkhilesh Sanikop // It is not necessary to use the offset values here because the
275*09537850SAkhilesh Sanikop // horizontal pass is a simple shift and the vertical pass will always
276*09537850SAkhilesh Sanikop // require using 32 bits.
277*09537850SAkhilesh Sanikop for (int y = -7; y < 8; ++y) {
278*09537850SAkhilesh Sanikop // We may over-read up to 13 pixels above the top source row, or up
279*09537850SAkhilesh Sanikop // to 13 pixels below the bottom source row. This is proved in
280*09537850SAkhilesh Sanikop // warp.cc.
281*09537850SAkhilesh Sanikop const int row = iy4 + y;
282*09537850SAkhilesh Sanikop int sum = first_row_border[row * source_stride];
283*09537850SAkhilesh Sanikop sum <<= (kFilterBits - kInterRoundBitsHorizontal);
284*09537850SAkhilesh Sanikop intermediate_result_column[y + 7] = sum;
285*09537850SAkhilesh Sanikop }
286*09537850SAkhilesh Sanikop // Region 2 vertical filter.
287*09537850SAkhilesh Sanikop VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
288*09537850SAkhilesh Sanikop delta, dst_row, dest_stride);
289*09537850SAkhilesh Sanikop }
290*09537850SAkhilesh Sanikop
291*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
WarpRegion3(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_height,int alpha,int beta,int64_t x4,int ix4,int iy4,int16_t intermediate_result[15][8])292*09537850SAkhilesh Sanikop inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
293*09537850SAkhilesh Sanikop ptrdiff_t source_stride, int source_height, int alpha,
294*09537850SAkhilesh Sanikop int beta, int64_t x4, int ix4, int iy4,
295*09537850SAkhilesh Sanikop int16_t intermediate_result[15][8]) {
296*09537850SAkhilesh Sanikop // Region 3
297*09537850SAkhilesh Sanikop // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
298*09537850SAkhilesh Sanikop
299*09537850SAkhilesh Sanikop // In general, for y in [-7, 8), the row number iy4 + y is clipped:
300*09537850SAkhilesh Sanikop // const int row = Clip3(iy4 + y, 0, source_height - 1);
301*09537850SAkhilesh Sanikop // In two special cases, iy4 + y is clipped to either 0 or
302*09537850SAkhilesh Sanikop // source_height - 1 for all y. In the rest of the cases, iy4 + y is
303*09537850SAkhilesh Sanikop // bounded and we can avoid clipping iy4 + y by relying on a reference
304*09537850SAkhilesh Sanikop // frame's boundary extension on the top and bottom.
305*09537850SAkhilesh Sanikop // Horizontal filter.
306*09537850SAkhilesh Sanikop const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
307*09537850SAkhilesh Sanikop const uint8_t* const src_row = src + row * source_stride;
308*09537850SAkhilesh Sanikop // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
309*09537850SAkhilesh Sanikop // read but is ignored.
310*09537850SAkhilesh Sanikop //
311*09537850SAkhilesh Sanikop // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
312*09537850SAkhilesh Sanikop // bytes after src_row[source_width - 1]. We assume the source frame
313*09537850SAkhilesh Sanikop // has left and right borders of at least 13 bytes that extend the
314*09537850SAkhilesh Sanikop // frame boundary pixels. We also assume there is at least one extra
315*09537850SAkhilesh Sanikop // padding byte after the right border of the last source row.
316*09537850SAkhilesh Sanikop const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
317*09537850SAkhilesh Sanikop int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
318*09537850SAkhilesh Sanikop for (int y = -7; y < 8; ++y) {
319*09537850SAkhilesh Sanikop HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
320*09537850SAkhilesh Sanikop sx4 += beta;
321*09537850SAkhilesh Sanikop }
322*09537850SAkhilesh Sanikop }
323*09537850SAkhilesh Sanikop
324*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
WarpRegion4(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int alpha,int beta,int64_t x4,int ix4,int iy4,int16_t intermediate_result[15][8])325*09537850SAkhilesh Sanikop inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
326*09537850SAkhilesh Sanikop ptrdiff_t source_stride, int alpha, int beta,
327*09537850SAkhilesh Sanikop int64_t x4, int ix4, int iy4,
328*09537850SAkhilesh Sanikop int16_t intermediate_result[15][8]) {
329*09537850SAkhilesh Sanikop // Region 4.
330*09537850SAkhilesh Sanikop // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
331*09537850SAkhilesh Sanikop
332*09537850SAkhilesh Sanikop // In general, for y in [-7, 8), the row number iy4 + y is clipped:
333*09537850SAkhilesh Sanikop // const int row = Clip3(iy4 + y, 0, source_height - 1);
334*09537850SAkhilesh Sanikop // In two special cases, iy4 + y is clipped to either 0 or
335*09537850SAkhilesh Sanikop // source_height - 1 for all y. In the rest of the cases, iy4 + y is
336*09537850SAkhilesh Sanikop // bounded and we can avoid clipping iy4 + y by relying on a reference
337*09537850SAkhilesh Sanikop // frame's boundary extension on the top and bottom.
338*09537850SAkhilesh Sanikop // Horizontal filter.
339*09537850SAkhilesh Sanikop int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
340*09537850SAkhilesh Sanikop for (int y = -7; y < 8; ++y) {
341*09537850SAkhilesh Sanikop // We may over-read up to 13 pixels above the top source row, or up
342*09537850SAkhilesh Sanikop // to 13 pixels below the bottom source row. This is proved in
343*09537850SAkhilesh Sanikop // warp.cc.
344*09537850SAkhilesh Sanikop const int row = iy4 + y;
345*09537850SAkhilesh Sanikop const uint8_t* const src_row = src + row * source_stride;
346*09537850SAkhilesh Sanikop // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
347*09537850SAkhilesh Sanikop // read but is ignored.
348*09537850SAkhilesh Sanikop //
349*09537850SAkhilesh Sanikop // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
350*09537850SAkhilesh Sanikop // bytes after src_row[source_width - 1]. We assume the source frame
351*09537850SAkhilesh Sanikop // has left and right borders of at least 13 bytes that extend the
352*09537850SAkhilesh Sanikop // frame boundary pixels. We also assume there is at least one extra
353*09537850SAkhilesh Sanikop // padding byte after the right border of the last source row.
354*09537850SAkhilesh Sanikop const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
355*09537850SAkhilesh Sanikop // Convert src_row_v to int8 (subtract 128).
356*09537850SAkhilesh Sanikop HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
357*09537850SAkhilesh Sanikop sx4 += beta;
358*09537850SAkhilesh Sanikop }
359*09537850SAkhilesh Sanikop }
360*09537850SAkhilesh Sanikop
361*09537850SAkhilesh Sanikop template <bool is_compound, typename DestType>
HandleWarpBlock(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t source_stride,int source_width,int source_height,const int * LIBGAV1_RESTRICT warp_params,int subsampling_x,int subsampling_y,int src_x,int src_y,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta,DestType * LIBGAV1_RESTRICT dst_row,ptrdiff_t dest_stride)362*09537850SAkhilesh Sanikop inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
363*09537850SAkhilesh Sanikop ptrdiff_t source_stride, int source_width,
364*09537850SAkhilesh Sanikop int source_height,
365*09537850SAkhilesh Sanikop const int* LIBGAV1_RESTRICT warp_params,
366*09537850SAkhilesh Sanikop int subsampling_x, int subsampling_y, int src_x,
367*09537850SAkhilesh Sanikop int src_y, int16_t alpha, int16_t beta,
368*09537850SAkhilesh Sanikop int16_t gamma, int16_t delta,
369*09537850SAkhilesh Sanikop DestType* LIBGAV1_RESTRICT dst_row,
370*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
371*09537850SAkhilesh Sanikop union {
372*09537850SAkhilesh Sanikop // Intermediate_result is the output of the horizontal filtering and
373*09537850SAkhilesh Sanikop // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
374*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
375*09537850SAkhilesh Sanikop // type so that we can start with a negative offset and restore it on the
376*09537850SAkhilesh Sanikop // final filter sum.
377*09537850SAkhilesh Sanikop int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
378*09537850SAkhilesh Sanikop // In the simple special cases where the samples in each row are all the
379*09537850SAkhilesh Sanikop // same, store one sample per row in a column vector.
380*09537850SAkhilesh Sanikop int16_t intermediate_result_column[15];
381*09537850SAkhilesh Sanikop };
382*09537850SAkhilesh Sanikop
383*09537850SAkhilesh Sanikop const WarpFilterParams filter_params = GetWarpFilterParams(
384*09537850SAkhilesh Sanikop src_x, src_y, subsampling_x, subsampling_y, warp_params);
385*09537850SAkhilesh Sanikop // A prediction block may fall outside the frame's boundaries. If a
386*09537850SAkhilesh Sanikop // prediction block is calculated using only samples outside the frame's
387*09537850SAkhilesh Sanikop // boundary, the filtering can be simplified. We can divide the plane
388*09537850SAkhilesh Sanikop // into several regions and handle them differently.
389*09537850SAkhilesh Sanikop //
390*09537850SAkhilesh Sanikop // | |
391*09537850SAkhilesh Sanikop // 1 | 3 | 1
392*09537850SAkhilesh Sanikop // | |
393*09537850SAkhilesh Sanikop // -------+-----------+-------
394*09537850SAkhilesh Sanikop // |***********|
395*09537850SAkhilesh Sanikop // 2 |*****4*****| 2
396*09537850SAkhilesh Sanikop // |***********|
397*09537850SAkhilesh Sanikop // -------+-----------+-------
398*09537850SAkhilesh Sanikop // | |
399*09537850SAkhilesh Sanikop // 1 | 3 | 1
400*09537850SAkhilesh Sanikop // | |
401*09537850SAkhilesh Sanikop //
402*09537850SAkhilesh Sanikop // At the center, region 4 represents the frame and is the general case.
403*09537850SAkhilesh Sanikop //
404*09537850SAkhilesh Sanikop // In regions 1 and 2, the prediction block is outside the frame's
405*09537850SAkhilesh Sanikop // boundary horizontally. Therefore the horizontal filtering can be
406*09537850SAkhilesh Sanikop // simplified. Furthermore, in the region 1 (at the four corners), the
407*09537850SAkhilesh Sanikop // prediction is outside the frame's boundary both horizontally and
408*09537850SAkhilesh Sanikop // vertically, so we get a constant prediction block.
409*09537850SAkhilesh Sanikop //
410*09537850SAkhilesh Sanikop // In region 3, the prediction block is outside the frame's boundary
411*09537850SAkhilesh Sanikop // vertically. Unfortunately because we apply the horizontal filters
412*09537850SAkhilesh Sanikop // first, by the time we apply the vertical filters, they no longer see
413*09537850SAkhilesh Sanikop // simple inputs. So the only simplification is that all the rows are
414*09537850SAkhilesh Sanikop // the same, but we still need to apply all the horizontal and vertical
415*09537850SAkhilesh Sanikop // filters.
416*09537850SAkhilesh Sanikop
417*09537850SAkhilesh Sanikop // Check for two simple special cases, where the horizontal filter can
418*09537850SAkhilesh Sanikop // be significantly simplified.
419*09537850SAkhilesh Sanikop //
420*09537850SAkhilesh Sanikop // In general, for each row, the horizontal filter is calculated as
421*09537850SAkhilesh Sanikop // follows:
422*09537850SAkhilesh Sanikop // for (int x = -4; x < 4; ++x) {
423*09537850SAkhilesh Sanikop // const int offset = ...;
424*09537850SAkhilesh Sanikop // int sum = first_pass_offset;
425*09537850SAkhilesh Sanikop // for (int k = 0; k < 8; ++k) {
426*09537850SAkhilesh Sanikop // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
427*09537850SAkhilesh Sanikop // sum += kWarpedFilters[offset][k] * src_row[column];
428*09537850SAkhilesh Sanikop // }
429*09537850SAkhilesh Sanikop // ...
430*09537850SAkhilesh Sanikop // }
431*09537850SAkhilesh Sanikop // The column index before clipping, ix4 + x + k - 3, varies in the range
432*09537850SAkhilesh Sanikop // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
433*09537850SAkhilesh Sanikop // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
434*09537850SAkhilesh Sanikop // border index (source_width - 1 or 0, respectively). Then for each x,
435*09537850SAkhilesh Sanikop // the inner for loop of the horizontal filter is reduced to multiplying
436*09537850SAkhilesh Sanikop // the border pixel by the sum of the filter coefficients.
437*09537850SAkhilesh Sanikop if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
438*09537850SAkhilesh Sanikop if ((filter_params.iy4 - 7 >= source_height - 1 ||
439*09537850SAkhilesh Sanikop filter_params.iy4 + 7 <= 0)) {
440*09537850SAkhilesh Sanikop // Outside the frame in both directions. One repeated value.
441*09537850SAkhilesh Sanikop WarpRegion1<is_compound, DestType>(
442*09537850SAkhilesh Sanikop src, source_stride, source_width, source_height, filter_params.ix4,
443*09537850SAkhilesh Sanikop filter_params.iy4, dst_row, dest_stride);
444*09537850SAkhilesh Sanikop return;
445*09537850SAkhilesh Sanikop }
446*09537850SAkhilesh Sanikop // Outside the frame horizontally. Rows repeated.
447*09537850SAkhilesh Sanikop WarpRegion2<is_compound, DestType>(
448*09537850SAkhilesh Sanikop src, source_stride, source_width, filter_params.y4, filter_params.ix4,
449*09537850SAkhilesh Sanikop filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
450*09537850SAkhilesh Sanikop dest_stride);
451*09537850SAkhilesh Sanikop return;
452*09537850SAkhilesh Sanikop }
453*09537850SAkhilesh Sanikop
454*09537850SAkhilesh Sanikop if ((filter_params.iy4 - 7 >= source_height - 1 ||
455*09537850SAkhilesh Sanikop filter_params.iy4 + 7 <= 0)) {
456*09537850SAkhilesh Sanikop // Outside the frame vertically.
457*09537850SAkhilesh Sanikop WarpRegion3<is_compound, DestType>(
458*09537850SAkhilesh Sanikop src, source_stride, source_height, alpha, beta, filter_params.x4,
459*09537850SAkhilesh Sanikop filter_params.ix4, filter_params.iy4, intermediate_result);
460*09537850SAkhilesh Sanikop } else {
461*09537850SAkhilesh Sanikop // Inside the frame.
462*09537850SAkhilesh Sanikop WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
463*09537850SAkhilesh Sanikop filter_params.x4, filter_params.ix4,
464*09537850SAkhilesh Sanikop filter_params.iy4, intermediate_result);
465*09537850SAkhilesh Sanikop }
466*09537850SAkhilesh Sanikop // Region 3 and 4 vertical filter.
467*09537850SAkhilesh Sanikop VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
468*09537850SAkhilesh Sanikop gamma, delta, dst_row, dest_stride);
469*09537850SAkhilesh Sanikop }
470*09537850SAkhilesh Sanikop
471*09537850SAkhilesh Sanikop template <bool is_compound>
Warp_SSE4_1(const void * LIBGAV1_RESTRICT source,ptrdiff_t source_stride,int source_width,int source_height,const int * LIBGAV1_RESTRICT warp_params,int subsampling_x,int subsampling_y,int block_start_x,int block_start_y,int block_width,int block_height,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta,void * LIBGAV1_RESTRICT dest,ptrdiff_t dest_stride)472*09537850SAkhilesh Sanikop void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride,
473*09537850SAkhilesh Sanikop int source_width, int source_height,
474*09537850SAkhilesh Sanikop const int* LIBGAV1_RESTRICT warp_params, int subsampling_x,
475*09537850SAkhilesh Sanikop int subsampling_y, int block_start_x, int block_start_y,
476*09537850SAkhilesh Sanikop int block_width, int block_height, int16_t alpha, int16_t beta,
477*09537850SAkhilesh Sanikop int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest,
478*09537850SAkhilesh Sanikop ptrdiff_t dest_stride) {
479*09537850SAkhilesh Sanikop const auto* const src = static_cast<const uint8_t*>(source);
480*09537850SAkhilesh Sanikop using DestType =
481*09537850SAkhilesh Sanikop typename std::conditional<is_compound, int16_t, uint8_t>::type;
482*09537850SAkhilesh Sanikop auto* dst = static_cast<DestType*>(dest);
483*09537850SAkhilesh Sanikop
484*09537850SAkhilesh Sanikop // Warp process applies for each 8x8 block.
485*09537850SAkhilesh Sanikop assert(block_width >= 8);
486*09537850SAkhilesh Sanikop assert(block_height >= 8);
487*09537850SAkhilesh Sanikop const int block_end_x = block_start_x + block_width;
488*09537850SAkhilesh Sanikop const int block_end_y = block_start_y + block_height;
489*09537850SAkhilesh Sanikop
490*09537850SAkhilesh Sanikop const int start_x = block_start_x;
491*09537850SAkhilesh Sanikop const int start_y = block_start_y;
492*09537850SAkhilesh Sanikop int src_x = (start_x + 4) << subsampling_x;
493*09537850SAkhilesh Sanikop int src_y = (start_y + 4) << subsampling_y;
494*09537850SAkhilesh Sanikop const int end_x = (block_end_x + 4) << subsampling_x;
495*09537850SAkhilesh Sanikop const int end_y = (block_end_y + 4) << subsampling_y;
496*09537850SAkhilesh Sanikop do {
497*09537850SAkhilesh Sanikop DestType* dst_row = dst;
498*09537850SAkhilesh Sanikop src_x = (start_x + 4) << subsampling_x;
499*09537850SAkhilesh Sanikop do {
500*09537850SAkhilesh Sanikop HandleWarpBlock<is_compound, DestType>(
501*09537850SAkhilesh Sanikop src, source_stride, source_width, source_height, warp_params,
502*09537850SAkhilesh Sanikop subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
503*09537850SAkhilesh Sanikop dst_row, dest_stride);
504*09537850SAkhilesh Sanikop src_x += (8 << subsampling_x);
505*09537850SAkhilesh Sanikop dst_row += 8;
506*09537850SAkhilesh Sanikop } while (src_x < end_x);
507*09537850SAkhilesh Sanikop dst += 8 * dest_stride;
508*09537850SAkhilesh Sanikop src_y += (8 << subsampling_y);
509*09537850SAkhilesh Sanikop } while (src_y < end_y);
510*09537850SAkhilesh Sanikop }
511*09537850SAkhilesh Sanikop
Init8bpp()512*09537850SAkhilesh Sanikop void Init8bpp() {
513*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
514*09537850SAkhilesh Sanikop assert(dsp != nullptr);
515*09537850SAkhilesh Sanikop dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
516*09537850SAkhilesh Sanikop dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
517*09537850SAkhilesh Sanikop }
518*09537850SAkhilesh Sanikop
519*09537850SAkhilesh Sanikop } // namespace
520*09537850SAkhilesh Sanikop } // namespace low_bitdepth
521*09537850SAkhilesh Sanikop
WarpInit_SSE4_1()522*09537850SAkhilesh Sanikop void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
523*09537850SAkhilesh Sanikop
524*09537850SAkhilesh Sanikop } // namespace dsp
525*09537850SAkhilesh Sanikop } // namespace libgav1
526*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_SSE4_1
527*09537850SAkhilesh Sanikop
528*09537850SAkhilesh Sanikop namespace libgav1 {
529*09537850SAkhilesh Sanikop namespace dsp {
530*09537850SAkhilesh Sanikop
// Stub used when LIBGAV1_TARGETING_SSE4_1 is not set: keeps the symbol
// available to callers but registers nothing.
void WarpInit_SSE4_1() {
  // Intentionally empty.
}
532*09537850SAkhilesh Sanikop
533*09537850SAkhilesh Sanikop } // namespace dsp
534*09537850SAkhilesh Sanikop } // namespace libgav1
535*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_SSE4_1
536