1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop // http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/convolve.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_NEON
19*09537850SAkhilesh Sanikop
20*09537850SAkhilesh Sanikop #include <arm_neon.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <algorithm>
23*09537850SAkhilesh Sanikop #include <cassert>
24*09537850SAkhilesh Sanikop #include <cstddef>
25*09537850SAkhilesh Sanikop #include <cstdint>
26*09537850SAkhilesh Sanikop
27*09537850SAkhilesh Sanikop #include "src/dsp/arm/common_neon.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
30*09537850SAkhilesh Sanikop #include "src/utils/common.h"
31*09537850SAkhilesh Sanikop #include "src/utils/compiler_attributes.h"
32*09537850SAkhilesh Sanikop
33*09537850SAkhilesh Sanikop namespace libgav1 {
34*09537850SAkhilesh Sanikop namespace dsp {
35*09537850SAkhilesh Sanikop namespace low_bitdepth {
36*09537850SAkhilesh Sanikop namespace {
37*09537850SAkhilesh Sanikop
38*09537850SAkhilesh Sanikop // Include the constants and utility functions inside the anonymous namespace.
39*09537850SAkhilesh Sanikop #include "src/dsp/convolve.inc"
40*09537850SAkhilesh Sanikop
41*09537850SAkhilesh Sanikop // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
42*09537850SAkhilesh Sanikop // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
43*09537850SAkhilesh Sanikop // sum from outranging int16_t.
44*09537850SAkhilesh Sanikop template <int filter_index, bool negative_outside_taps = false>
SumOnePassTaps(const uint8x8_t * const src,const uint8x8_t * const taps)45*09537850SAkhilesh Sanikop int16x8_t SumOnePassTaps(const uint8x8_t* const src,
46*09537850SAkhilesh Sanikop const uint8x8_t* const taps) {
47*09537850SAkhilesh Sanikop uint16x8_t sum;
48*09537850SAkhilesh Sanikop if (filter_index == 0) {
49*09537850SAkhilesh Sanikop // 6 taps. + - + + - +
50*09537850SAkhilesh Sanikop sum = vmull_u8(src[0], taps[0]);
51*09537850SAkhilesh Sanikop // Unsigned overflow will result in a valid int16_t value.
52*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[1], taps[1]);
53*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[2], taps[2]);
54*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[3], taps[3]);
55*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[4], taps[4]);
56*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[5], taps[5]);
57*09537850SAkhilesh Sanikop } else if (filter_index == 1 && negative_outside_taps) {
58*09537850SAkhilesh Sanikop // 6 taps. - + + + + -
59*09537850SAkhilesh Sanikop // Set a base we can subtract from.
60*09537850SAkhilesh Sanikop sum = vmull_u8(src[1], taps[1]);
61*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[0], taps[0]);
62*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[2], taps[2]);
63*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[3], taps[3]);
64*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[4], taps[4]);
65*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[5], taps[5]);
66*09537850SAkhilesh Sanikop } else if (filter_index == 1) {
67*09537850SAkhilesh Sanikop // 6 taps. All are positive.
68*09537850SAkhilesh Sanikop sum = vmull_u8(src[0], taps[0]);
69*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[1], taps[1]);
70*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[2], taps[2]);
71*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[3], taps[3]);
72*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[4], taps[4]);
73*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[5], taps[5]);
74*09537850SAkhilesh Sanikop } else if (filter_index == 2) {
75*09537850SAkhilesh Sanikop // 8 taps. - + - + + - + -
76*09537850SAkhilesh Sanikop sum = vmull_u8(src[1], taps[1]);
77*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[0], taps[0]);
78*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[2], taps[2]);
79*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[3], taps[3]);
80*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[4], taps[4]);
81*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[5], taps[5]);
82*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[6], taps[6]);
83*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[7], taps[7]);
84*09537850SAkhilesh Sanikop } else if (filter_index == 3) {
85*09537850SAkhilesh Sanikop // 2 taps. All are positive.
86*09537850SAkhilesh Sanikop sum = vmull_u8(src[0], taps[0]);
87*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[1], taps[1]);
88*09537850SAkhilesh Sanikop } else if (filter_index == 4) {
89*09537850SAkhilesh Sanikop // 4 taps. - + + -
90*09537850SAkhilesh Sanikop sum = vmull_u8(src[1], taps[1]);
91*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[0], taps[0]);
92*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[2], taps[2]);
93*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, src[3], taps[3]);
94*09537850SAkhilesh Sanikop } else if (filter_index == 5) {
95*09537850SAkhilesh Sanikop // 4 taps. All are positive.
96*09537850SAkhilesh Sanikop sum = vmull_u8(src[0], taps[0]);
97*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[1], taps[1]);
98*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[2], taps[2]);
99*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, src[3], taps[3]);
100*09537850SAkhilesh Sanikop }
101*09537850SAkhilesh Sanikop return vreinterpretq_s16_u16(sum);
102*09537850SAkhilesh Sanikop }
103*09537850SAkhilesh Sanikop
104*09537850SAkhilesh Sanikop template <int filter_index, bool negative_outside_taps, bool is_2d,
105*09537850SAkhilesh Sanikop bool is_compound>
FilterHorizontalWidth8AndUp(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int width,const int height,const uint8x8_t * const v_tap)106*09537850SAkhilesh Sanikop void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src,
107*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
108*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
109*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int width,
110*09537850SAkhilesh Sanikop const int height,
111*09537850SAkhilesh Sanikop const uint8x8_t* const v_tap) {
112*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
113*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
114*09537850SAkhilesh Sanikop if (!is_2d) {
115*09537850SAkhilesh Sanikop int y = height;
116*09537850SAkhilesh Sanikop do {
117*09537850SAkhilesh Sanikop int x = 0;
118*09537850SAkhilesh Sanikop do { // Increasing loop counter x is better.
119*09537850SAkhilesh Sanikop const uint8x16_t src_long = vld1q_u8(src + x);
120*09537850SAkhilesh Sanikop uint8x8_t v_src[8];
121*09537850SAkhilesh Sanikop int16x8_t sum;
122*09537850SAkhilesh Sanikop if (filter_index < 2) {
123*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
124*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
125*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
126*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
127*09537850SAkhilesh Sanikop v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
128*09537850SAkhilesh Sanikop v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
129*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
130*09537850SAkhilesh Sanikop v_tap + 1);
131*09537850SAkhilesh Sanikop } else if (filter_index == 2) {
132*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
133*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
134*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
135*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
136*09537850SAkhilesh Sanikop v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
137*09537850SAkhilesh Sanikop v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
138*09537850SAkhilesh Sanikop v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
139*09537850SAkhilesh Sanikop v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
140*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
141*09537850SAkhilesh Sanikop } else if (filter_index == 3) {
142*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
143*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
144*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
145*09537850SAkhilesh Sanikop } else if (filter_index > 3) {
146*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
147*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
148*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
149*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
150*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
151*09537850SAkhilesh Sanikop }
152*09537850SAkhilesh Sanikop if (is_compound) {
153*09537850SAkhilesh Sanikop const uint16x8_t v_sum = vreinterpretq_u16_s16(
154*09537850SAkhilesh Sanikop vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
155*09537850SAkhilesh Sanikop vst1q_u16(&dest16[x], v_sum);
156*09537850SAkhilesh Sanikop } else {
157*09537850SAkhilesh Sanikop // Normally the Horizontal pass does the downshift in two passes:
158*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
159*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal). Each one uses a rounding shift.
160*09537850SAkhilesh Sanikop // Combining them requires adding the rounding offset from the skipped
161*09537850SAkhilesh Sanikop // shift.
162*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit =
163*09537850SAkhilesh Sanikop 1 << (kInterRoundBitsHorizontal - 2);
164*09537850SAkhilesh Sanikop sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
165*09537850SAkhilesh Sanikop const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
166*09537850SAkhilesh Sanikop vst1_u8(&dest8[x], result);
167*09537850SAkhilesh Sanikop }
168*09537850SAkhilesh Sanikop x += 8;
169*09537850SAkhilesh Sanikop } while (x < width);
170*09537850SAkhilesh Sanikop src += src_stride;
171*09537850SAkhilesh Sanikop dest8 += pred_stride;
172*09537850SAkhilesh Sanikop dest16 += pred_stride;
173*09537850SAkhilesh Sanikop } while (--y != 0);
174*09537850SAkhilesh Sanikop } else {
175*09537850SAkhilesh Sanikop int x = 0;
176*09537850SAkhilesh Sanikop do {
177*09537850SAkhilesh Sanikop const uint8_t* s = src + x;
178*09537850SAkhilesh Sanikop int y = height;
179*09537850SAkhilesh Sanikop do { // Increasing loop counter x is better.
180*09537850SAkhilesh Sanikop const uint8x16_t src_long = vld1q_u8(s);
181*09537850SAkhilesh Sanikop uint8x8_t v_src[8];
182*09537850SAkhilesh Sanikop int16x8_t sum;
183*09537850SAkhilesh Sanikop if (filter_index < 2) {
184*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
185*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
186*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
187*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
188*09537850SAkhilesh Sanikop v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
189*09537850SAkhilesh Sanikop v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
190*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
191*09537850SAkhilesh Sanikop v_tap + 1);
192*09537850SAkhilesh Sanikop } else if (filter_index == 2) {
193*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
194*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
195*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
196*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
197*09537850SAkhilesh Sanikop v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
198*09537850SAkhilesh Sanikop v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
199*09537850SAkhilesh Sanikop v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
200*09537850SAkhilesh Sanikop v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
201*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
202*09537850SAkhilesh Sanikop } else if (filter_index == 3) {
203*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
204*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
205*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
206*09537850SAkhilesh Sanikop } else if (filter_index > 3) {
207*09537850SAkhilesh Sanikop v_src[0] = vget_low_u8(src_long);
208*09537850SAkhilesh Sanikop v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
209*09537850SAkhilesh Sanikop v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
210*09537850SAkhilesh Sanikop v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
211*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
212*09537850SAkhilesh Sanikop }
213*09537850SAkhilesh Sanikop const uint16x8_t v_sum = vreinterpretq_u16_s16(
214*09537850SAkhilesh Sanikop vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
215*09537850SAkhilesh Sanikop vst1q_u16(dest16, v_sum);
216*09537850SAkhilesh Sanikop s += src_stride;
217*09537850SAkhilesh Sanikop dest16 += 8;
218*09537850SAkhilesh Sanikop } while (--y != 0);
219*09537850SAkhilesh Sanikop x += 8;
220*09537850SAkhilesh Sanikop } while (x < width);
221*09537850SAkhilesh Sanikop }
222*09537850SAkhilesh Sanikop }
223*09537850SAkhilesh Sanikop
224*09537850SAkhilesh Sanikop template <int filter_index, bool is_2d, bool is_compound>
FilterHorizontalWidth4(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int height,const uint8x8_t * const v_tap)225*09537850SAkhilesh Sanikop void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src,
226*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
227*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
228*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int height,
229*09537850SAkhilesh Sanikop const uint8x8_t* const v_tap) {
230*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
231*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
232*09537850SAkhilesh Sanikop int y = height;
233*09537850SAkhilesh Sanikop do {
234*09537850SAkhilesh Sanikop uint8x8_t v_src[4];
235*09537850SAkhilesh Sanikop int16x8_t sum;
236*09537850SAkhilesh Sanikop v_src[0] = vld1_u8(src);
237*09537850SAkhilesh Sanikop if (filter_index == 3) {
238*09537850SAkhilesh Sanikop v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
239*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
240*09537850SAkhilesh Sanikop } else {
241*09537850SAkhilesh Sanikop v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
242*09537850SAkhilesh Sanikop v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
243*09537850SAkhilesh Sanikop v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
244*09537850SAkhilesh Sanikop sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
245*09537850SAkhilesh Sanikop }
246*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
247*09537850SAkhilesh Sanikop const uint16x4_t v_sum = vreinterpret_u16_s16(
248*09537850SAkhilesh Sanikop vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
249*09537850SAkhilesh Sanikop vst1_u16(dest16, v_sum);
250*09537850SAkhilesh Sanikop } else {
251*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit =
252*09537850SAkhilesh Sanikop 1 << (kInterRoundBitsHorizontal - 2);
253*09537850SAkhilesh Sanikop sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
254*09537850SAkhilesh Sanikop const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
255*09537850SAkhilesh Sanikop StoreLo4(&dest8[0], result);
256*09537850SAkhilesh Sanikop }
257*09537850SAkhilesh Sanikop src += src_stride;
258*09537850SAkhilesh Sanikop dest8 += pred_stride;
259*09537850SAkhilesh Sanikop dest16 += pred_stride;
260*09537850SAkhilesh Sanikop } while (--y != 0);
261*09537850SAkhilesh Sanikop }
262*09537850SAkhilesh Sanikop
263*09537850SAkhilesh Sanikop template <int filter_index, bool is_2d>
FilterHorizontalWidth2(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int height,const uint8x8_t * const v_tap)264*09537850SAkhilesh Sanikop void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src,
265*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
266*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
267*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int height,
268*09537850SAkhilesh Sanikop const uint8x8_t* const v_tap) {
269*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
270*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
271*09537850SAkhilesh Sanikop int y = height >> 1;
272*09537850SAkhilesh Sanikop do {
273*09537850SAkhilesh Sanikop const uint8x8_t input0 = vld1_u8(src);
274*09537850SAkhilesh Sanikop const uint8x8_t input1 = vld1_u8(src + src_stride);
275*09537850SAkhilesh Sanikop const uint8x8x2_t input = vzip_u8(input0, input1);
276*09537850SAkhilesh Sanikop uint16x8_t sum;
277*09537850SAkhilesh Sanikop if (filter_index == 3) {
278*09537850SAkhilesh Sanikop // tap signs : + +
279*09537850SAkhilesh Sanikop sum = vmull_u8(input.val[0], v_tap[3]);
280*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
281*09537850SAkhilesh Sanikop } else if (filter_index == 4) {
282*09537850SAkhilesh Sanikop // tap signs : - + + -
283*09537850SAkhilesh Sanikop sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
284*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
285*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
286*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
287*09537850SAkhilesh Sanikop } else {
288*09537850SAkhilesh Sanikop // tap signs : + + + +
289*09537850SAkhilesh Sanikop sum = vmull_u8(input.val[0], v_tap[2]);
290*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
291*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
292*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
293*09537850SAkhilesh Sanikop }
294*09537850SAkhilesh Sanikop int16x8_t s = vreinterpretq_s16_u16(sum);
295*09537850SAkhilesh Sanikop if (is_2d) {
296*09537850SAkhilesh Sanikop const uint16x8_t v_sum =
297*09537850SAkhilesh Sanikop vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
298*09537850SAkhilesh Sanikop dest16[0] = vgetq_lane_u16(v_sum, 0);
299*09537850SAkhilesh Sanikop dest16[1] = vgetq_lane_u16(v_sum, 2);
300*09537850SAkhilesh Sanikop dest16 += pred_stride;
301*09537850SAkhilesh Sanikop dest16[0] = vgetq_lane_u16(v_sum, 1);
302*09537850SAkhilesh Sanikop dest16[1] = vgetq_lane_u16(v_sum, 3);
303*09537850SAkhilesh Sanikop dest16 += pred_stride;
304*09537850SAkhilesh Sanikop } else {
305*09537850SAkhilesh Sanikop // Normally the Horizontal pass does the downshift in two passes:
306*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
307*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal). Each one uses a rounding shift.
308*09537850SAkhilesh Sanikop // Combining them requires adding the rounding offset from the skipped
309*09537850SAkhilesh Sanikop // shift.
310*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit =
311*09537850SAkhilesh Sanikop 1 << (kInterRoundBitsHorizontal - 2);
312*09537850SAkhilesh Sanikop s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
313*09537850SAkhilesh Sanikop const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
314*09537850SAkhilesh Sanikop dest8[0] = vget_lane_u8(result, 0);
315*09537850SAkhilesh Sanikop dest8[1] = vget_lane_u8(result, 2);
316*09537850SAkhilesh Sanikop dest8 += pred_stride;
317*09537850SAkhilesh Sanikop dest8[0] = vget_lane_u8(result, 1);
318*09537850SAkhilesh Sanikop dest8[1] = vget_lane_u8(result, 3);
319*09537850SAkhilesh Sanikop dest8 += pred_stride;
320*09537850SAkhilesh Sanikop }
321*09537850SAkhilesh Sanikop src += src_stride << 1;
322*09537850SAkhilesh Sanikop } while (--y != 0);
323*09537850SAkhilesh Sanikop
324*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| because the horizontal pass
325*09537850SAkhilesh Sanikop // generates context for the vertical pass.
326*09537850SAkhilesh Sanikop if (is_2d) {
327*09537850SAkhilesh Sanikop assert(height % 2 == 1);
328*09537850SAkhilesh Sanikop const uint8x8_t input = vld1_u8(src);
329*09537850SAkhilesh Sanikop uint16x8_t sum;
330*09537850SAkhilesh Sanikop if (filter_index == 3) {
331*09537850SAkhilesh Sanikop sum = vmull_u8(input, v_tap[3]);
332*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
333*09537850SAkhilesh Sanikop } else if (filter_index == 4) {
334*09537850SAkhilesh Sanikop sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
335*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, input, v_tap[2]);
336*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
337*09537850SAkhilesh Sanikop sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
338*09537850SAkhilesh Sanikop } else {
339*09537850SAkhilesh Sanikop assert(filter_index == 5);
340*09537850SAkhilesh Sanikop sum = vmull_u8(input, v_tap[2]);
341*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
342*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
343*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
344*09537850SAkhilesh Sanikop }
345*09537850SAkhilesh Sanikop // |sum| contains an int16_t value.
346*09537850SAkhilesh Sanikop sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
347*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
348*09537850SAkhilesh Sanikop Store2<0>(dest16, sum);
349*09537850SAkhilesh Sanikop }
350*09537850SAkhilesh Sanikop }
351*09537850SAkhilesh Sanikop
352*09537850SAkhilesh Sanikop template <int filter_index, bool negative_outside_taps, bool is_2d,
353*09537850SAkhilesh Sanikop bool is_compound>
FilterHorizontal(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int width,const int height,const uint8x8_t * const v_tap)354*09537850SAkhilesh Sanikop void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
355*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
356*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
357*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int width,
358*09537850SAkhilesh Sanikop const int height, const uint8x8_t* const v_tap) {
359*09537850SAkhilesh Sanikop assert(width < 8 || filter_index <= 3);
360*09537850SAkhilesh Sanikop // Don't simplify the redundant if conditions with the template parameters,
361*09537850SAkhilesh Sanikop // which helps the compiler generate compact code.
362*09537850SAkhilesh Sanikop if (width >= 8 && filter_index <= 3) {
363*09537850SAkhilesh Sanikop FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
364*09537850SAkhilesh Sanikop is_compound>(src, src_stride, dest, pred_stride,
365*09537850SAkhilesh Sanikop width, height, v_tap);
366*09537850SAkhilesh Sanikop return;
367*09537850SAkhilesh Sanikop }
368*09537850SAkhilesh Sanikop
369*09537850SAkhilesh Sanikop // Horizontal passes only needs to account for number of taps 2 and 4 when
370*09537850SAkhilesh Sanikop // |width| <= 4.
371*09537850SAkhilesh Sanikop assert(width <= 4);
372*09537850SAkhilesh Sanikop assert(filter_index >= 3 && filter_index <= 5);
373*09537850SAkhilesh Sanikop if (filter_index >= 3 && filter_index <= 5) {
374*09537850SAkhilesh Sanikop if (width == 2 && !is_compound) {
375*09537850SAkhilesh Sanikop FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
376*09537850SAkhilesh Sanikop pred_stride, height, v_tap);
377*09537850SAkhilesh Sanikop return;
378*09537850SAkhilesh Sanikop }
379*09537850SAkhilesh Sanikop assert(width == 4);
380*09537850SAkhilesh Sanikop FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
381*09537850SAkhilesh Sanikop src, src_stride, dest, pred_stride, height, v_tap);
382*09537850SAkhilesh Sanikop }
383*09537850SAkhilesh Sanikop }
384*09537850SAkhilesh Sanikop
385*09537850SAkhilesh Sanikop // Process 16 bit inputs and output 32 bits.
386*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound>
Sum2DVerticalTaps4(const int16x4_t * const src,const int16x8_t taps)387*09537850SAkhilesh Sanikop inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
388*09537850SAkhilesh Sanikop const int16x8_t taps) {
389*09537850SAkhilesh Sanikop const int16x4_t taps_lo = vget_low_s16(taps);
390*09537850SAkhilesh Sanikop const int16x4_t taps_hi = vget_high_s16(taps);
391*09537850SAkhilesh Sanikop int32x4_t sum;
392*09537850SAkhilesh Sanikop if (num_taps == 8) {
393*09537850SAkhilesh Sanikop sum = vmull_lane_s16(src[0], taps_lo, 0);
394*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
395*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
396*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
397*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
398*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
399*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
400*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
401*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
402*09537850SAkhilesh Sanikop sum = vmull_lane_s16(src[0], taps_lo, 1);
403*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
404*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
405*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
406*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
407*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
408*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
409*09537850SAkhilesh Sanikop sum = vmull_lane_s16(src[0], taps_lo, 2);
410*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
411*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
412*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
413*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
414*09537850SAkhilesh Sanikop sum = vmull_lane_s16(src[0], taps_lo, 3);
415*09537850SAkhilesh Sanikop sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
416*09537850SAkhilesh Sanikop }
417*09537850SAkhilesh Sanikop
418*09537850SAkhilesh Sanikop if (is_compound) {
419*09537850SAkhilesh Sanikop return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
420*09537850SAkhilesh Sanikop }
421*09537850SAkhilesh Sanikop
422*09537850SAkhilesh Sanikop return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
423*09537850SAkhilesh Sanikop }
424*09537850SAkhilesh Sanikop
425*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound>
SimpleSum2DVerticalTaps(const int16x8_t * const src,const int16x8_t taps)426*09537850SAkhilesh Sanikop int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
427*09537850SAkhilesh Sanikop const int16x8_t taps) {
428*09537850SAkhilesh Sanikop const int16x4_t taps_lo = vget_low_s16(taps);
429*09537850SAkhilesh Sanikop const int16x4_t taps_hi = vget_high_s16(taps);
430*09537850SAkhilesh Sanikop int32x4_t sum_lo, sum_hi;
431*09537850SAkhilesh Sanikop if (num_taps == 8) {
432*09537850SAkhilesh Sanikop sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
433*09537850SAkhilesh Sanikop sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
434*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
435*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
436*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
437*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
438*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
439*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
440*09537850SAkhilesh Sanikop
441*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
442*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
443*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
444*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
445*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
446*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
447*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
448*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
449*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
450*09537850SAkhilesh Sanikop sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
451*09537850SAkhilesh Sanikop sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
452*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
453*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
454*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
455*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
456*09537850SAkhilesh Sanikop
457*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
458*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
459*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
460*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
461*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
462*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
463*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
464*09537850SAkhilesh Sanikop sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
465*09537850SAkhilesh Sanikop sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
466*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
467*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
468*09537850SAkhilesh Sanikop
469*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
470*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
471*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
472*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
473*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
474*09537850SAkhilesh Sanikop sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
475*09537850SAkhilesh Sanikop sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
476*09537850SAkhilesh Sanikop
477*09537850SAkhilesh Sanikop sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
478*09537850SAkhilesh Sanikop sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
479*09537850SAkhilesh Sanikop }
480*09537850SAkhilesh Sanikop
481*09537850SAkhilesh Sanikop if (is_compound) {
482*09537850SAkhilesh Sanikop return vcombine_s16(
483*09537850SAkhilesh Sanikop vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
484*09537850SAkhilesh Sanikop vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
485*09537850SAkhilesh Sanikop }
486*09537850SAkhilesh Sanikop
487*09537850SAkhilesh Sanikop return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
488*09537850SAkhilesh Sanikop vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
489*09537850SAkhilesh Sanikop }
490*09537850SAkhilesh Sanikop
491*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
Filter2DVerticalWidth8AndUp(const uint16_t * LIBGAV1_RESTRICT src,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const int16x8_t taps)492*09537850SAkhilesh Sanikop void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
493*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
494*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width,
495*09537850SAkhilesh Sanikop const int height, const int16x8_t taps) {
496*09537850SAkhilesh Sanikop assert(width >= 8);
497*09537850SAkhilesh Sanikop constexpr int next_row = num_taps - 1;
498*09537850SAkhilesh Sanikop auto* const dst8 = static_cast<uint8_t*>(dst);
499*09537850SAkhilesh Sanikop auto* const dst16 = static_cast<uint16_t*>(dst);
500*09537850SAkhilesh Sanikop
501*09537850SAkhilesh Sanikop int x = 0;
502*09537850SAkhilesh Sanikop do {
503*09537850SAkhilesh Sanikop int16x8_t srcs[9];
504*09537850SAkhilesh Sanikop srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
505*09537850SAkhilesh Sanikop src += 8;
506*09537850SAkhilesh Sanikop if (num_taps >= 4) {
507*09537850SAkhilesh Sanikop srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
508*09537850SAkhilesh Sanikop src += 8;
509*09537850SAkhilesh Sanikop srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
510*09537850SAkhilesh Sanikop src += 8;
511*09537850SAkhilesh Sanikop if (num_taps >= 6) {
512*09537850SAkhilesh Sanikop srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
513*09537850SAkhilesh Sanikop src += 8;
514*09537850SAkhilesh Sanikop srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
515*09537850SAkhilesh Sanikop src += 8;
516*09537850SAkhilesh Sanikop if (num_taps == 8) {
517*09537850SAkhilesh Sanikop srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
518*09537850SAkhilesh Sanikop src += 8;
519*09537850SAkhilesh Sanikop srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
520*09537850SAkhilesh Sanikop src += 8;
521*09537850SAkhilesh Sanikop }
522*09537850SAkhilesh Sanikop }
523*09537850SAkhilesh Sanikop }
524*09537850SAkhilesh Sanikop
525*09537850SAkhilesh Sanikop uint8_t* d8 = dst8 + x;
526*09537850SAkhilesh Sanikop uint16_t* d16 = dst16 + x;
527*09537850SAkhilesh Sanikop int y = height;
528*09537850SAkhilesh Sanikop do {
529*09537850SAkhilesh Sanikop srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
530*09537850SAkhilesh Sanikop src += 8;
531*09537850SAkhilesh Sanikop srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
532*09537850SAkhilesh Sanikop src += 8;
533*09537850SAkhilesh Sanikop const int16x8_t sum0 =
534*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
535*09537850SAkhilesh Sanikop const int16x8_t sum1 =
536*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
537*09537850SAkhilesh Sanikop if (is_compound) {
538*09537850SAkhilesh Sanikop vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
539*09537850SAkhilesh Sanikop d16 += dst_stride;
540*09537850SAkhilesh Sanikop vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
541*09537850SAkhilesh Sanikop d16 += dst_stride;
542*09537850SAkhilesh Sanikop } else {
543*09537850SAkhilesh Sanikop vst1_u8(d8, vqmovun_s16(sum0));
544*09537850SAkhilesh Sanikop d8 += dst_stride;
545*09537850SAkhilesh Sanikop vst1_u8(d8, vqmovun_s16(sum1));
546*09537850SAkhilesh Sanikop d8 += dst_stride;
547*09537850SAkhilesh Sanikop }
548*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
549*09537850SAkhilesh Sanikop if (num_taps >= 4) {
550*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
551*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
552*09537850SAkhilesh Sanikop if (num_taps >= 6) {
553*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
554*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
555*09537850SAkhilesh Sanikop if (num_taps == 8) {
556*09537850SAkhilesh Sanikop srcs[5] = srcs[7];
557*09537850SAkhilesh Sanikop srcs[6] = srcs[8];
558*09537850SAkhilesh Sanikop }
559*09537850SAkhilesh Sanikop }
560*09537850SAkhilesh Sanikop }
561*09537850SAkhilesh Sanikop y -= 2;
562*09537850SAkhilesh Sanikop } while (y != 0);
563*09537850SAkhilesh Sanikop x += 8;
564*09537850SAkhilesh Sanikop } while (x < width);
565*09537850SAkhilesh Sanikop }
566*09537850SAkhilesh Sanikop
567*09537850SAkhilesh Sanikop // Take advantage of |src_stride| == |width| to process two rows at a time.
568*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
Filter2DVerticalWidth4(const uint16_t * LIBGAV1_RESTRICT src,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int height,const int16x8_t taps)569*09537850SAkhilesh Sanikop void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
570*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
571*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height,
572*09537850SAkhilesh Sanikop const int16x8_t taps) {
573*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
574*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
575*09537850SAkhilesh Sanikop
576*09537850SAkhilesh Sanikop int16x8_t srcs[9];
577*09537850SAkhilesh Sanikop srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
578*09537850SAkhilesh Sanikop src += 8;
579*09537850SAkhilesh Sanikop if (num_taps >= 4) {
580*09537850SAkhilesh Sanikop srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
581*09537850SAkhilesh Sanikop src += 8;
582*09537850SAkhilesh Sanikop srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
583*09537850SAkhilesh Sanikop if (num_taps >= 6) {
584*09537850SAkhilesh Sanikop srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
585*09537850SAkhilesh Sanikop src += 8;
586*09537850SAkhilesh Sanikop srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
587*09537850SAkhilesh Sanikop if (num_taps == 8) {
588*09537850SAkhilesh Sanikop srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
589*09537850SAkhilesh Sanikop src += 8;
590*09537850SAkhilesh Sanikop srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
591*09537850SAkhilesh Sanikop }
592*09537850SAkhilesh Sanikop }
593*09537850SAkhilesh Sanikop }
594*09537850SAkhilesh Sanikop
595*09537850SAkhilesh Sanikop int y = height;
596*09537850SAkhilesh Sanikop do {
597*09537850SAkhilesh Sanikop srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
598*09537850SAkhilesh Sanikop src += 8;
599*09537850SAkhilesh Sanikop srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
600*09537850SAkhilesh Sanikop vget_low_s16(srcs[num_taps]));
601*09537850SAkhilesh Sanikop
602*09537850SAkhilesh Sanikop const int16x8_t sum =
603*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
604*09537850SAkhilesh Sanikop if (is_compound) {
605*09537850SAkhilesh Sanikop const uint16x8_t results = vreinterpretq_u16_s16(sum);
606*09537850SAkhilesh Sanikop vst1q_u16(dst16, results);
607*09537850SAkhilesh Sanikop dst16 += 4 << 1;
608*09537850SAkhilesh Sanikop } else {
609*09537850SAkhilesh Sanikop const uint8x8_t results = vqmovun_s16(sum);
610*09537850SAkhilesh Sanikop
611*09537850SAkhilesh Sanikop StoreLo4(dst8, results);
612*09537850SAkhilesh Sanikop dst8 += dst_stride;
613*09537850SAkhilesh Sanikop StoreHi4(dst8, results);
614*09537850SAkhilesh Sanikop dst8 += dst_stride;
615*09537850SAkhilesh Sanikop }
616*09537850SAkhilesh Sanikop
617*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
618*09537850SAkhilesh Sanikop if (num_taps >= 4) {
619*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
620*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
621*09537850SAkhilesh Sanikop if (num_taps >= 6) {
622*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
623*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
624*09537850SAkhilesh Sanikop if (num_taps == 8) {
625*09537850SAkhilesh Sanikop srcs[5] = srcs[7];
626*09537850SAkhilesh Sanikop srcs[6] = srcs[8];
627*09537850SAkhilesh Sanikop }
628*09537850SAkhilesh Sanikop }
629*09537850SAkhilesh Sanikop }
630*09537850SAkhilesh Sanikop y -= 2;
631*09537850SAkhilesh Sanikop } while (y != 0);
632*09537850SAkhilesh Sanikop }
633*09537850SAkhilesh Sanikop
634*09537850SAkhilesh Sanikop // Take advantage of |src_stride| == |width| to process four rows at a time.
635*09537850SAkhilesh Sanikop template <int num_taps>
Filter2DVerticalWidth2(const uint16_t * LIBGAV1_RESTRICT src,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int height,const int16x8_t taps)636*09537850SAkhilesh Sanikop void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
637*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
638*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height,
639*09537850SAkhilesh Sanikop const int16x8_t taps) {
640*09537850SAkhilesh Sanikop constexpr int next_row = (num_taps < 6) ? 4 : 8;
641*09537850SAkhilesh Sanikop
642*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
643*09537850SAkhilesh Sanikop
644*09537850SAkhilesh Sanikop int16x8_t srcs[9];
645*09537850SAkhilesh Sanikop srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
646*09537850SAkhilesh Sanikop src += 8;
647*09537850SAkhilesh Sanikop if (num_taps >= 6) {
648*09537850SAkhilesh Sanikop srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
649*09537850SAkhilesh Sanikop src += 8;
650*09537850SAkhilesh Sanikop srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
651*09537850SAkhilesh Sanikop if (num_taps == 8) {
652*09537850SAkhilesh Sanikop srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
653*09537850SAkhilesh Sanikop srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
654*09537850SAkhilesh Sanikop }
655*09537850SAkhilesh Sanikop }
656*09537850SAkhilesh Sanikop
657*09537850SAkhilesh Sanikop int y = 0;
658*09537850SAkhilesh Sanikop do {
659*09537850SAkhilesh Sanikop srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
660*09537850SAkhilesh Sanikop src += 8;
661*09537850SAkhilesh Sanikop if (num_taps == 2) {
662*09537850SAkhilesh Sanikop srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
663*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
664*09537850SAkhilesh Sanikop srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
665*09537850SAkhilesh Sanikop srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
666*09537850SAkhilesh Sanikop srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
667*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
668*09537850SAkhilesh Sanikop srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
669*09537850SAkhilesh Sanikop srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
670*09537850SAkhilesh Sanikop srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
671*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
672*09537850SAkhilesh Sanikop srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
673*09537850SAkhilesh Sanikop srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
674*09537850SAkhilesh Sanikop srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
675*09537850SAkhilesh Sanikop }
676*09537850SAkhilesh Sanikop
677*09537850SAkhilesh Sanikop const int16x8_t sum =
678*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
679*09537850SAkhilesh Sanikop const uint8x8_t results = vqmovun_s16(sum);
680*09537850SAkhilesh Sanikop
681*09537850SAkhilesh Sanikop Store2<0>(dst8, results);
682*09537850SAkhilesh Sanikop dst8 += dst_stride;
683*09537850SAkhilesh Sanikop Store2<1>(dst8, results);
684*09537850SAkhilesh Sanikop // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
685*09537850SAkhilesh Sanikop // Therefore we don't need to check this condition when |height| > 4.
686*09537850SAkhilesh Sanikop if (num_taps <= 4 && height == 2) return;
687*09537850SAkhilesh Sanikop dst8 += dst_stride;
688*09537850SAkhilesh Sanikop Store2<2>(dst8, results);
689*09537850SAkhilesh Sanikop dst8 += dst_stride;
690*09537850SAkhilesh Sanikop Store2<3>(dst8, results);
691*09537850SAkhilesh Sanikop dst8 += dst_stride;
692*09537850SAkhilesh Sanikop
693*09537850SAkhilesh Sanikop srcs[0] = srcs[4];
694*09537850SAkhilesh Sanikop if (num_taps == 6) {
695*09537850SAkhilesh Sanikop srcs[1] = srcs[5];
696*09537850SAkhilesh Sanikop srcs[4] = srcs[8];
697*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
698*09537850SAkhilesh Sanikop srcs[1] = srcs[5];
699*09537850SAkhilesh Sanikop srcs[2] = srcs[6];
700*09537850SAkhilesh Sanikop srcs[3] = srcs[7];
701*09537850SAkhilesh Sanikop srcs[4] = srcs[8];
702*09537850SAkhilesh Sanikop }
703*09537850SAkhilesh Sanikop
704*09537850SAkhilesh Sanikop y += 4;
705*09537850SAkhilesh Sanikop } while (y < height);
706*09537850SAkhilesh Sanikop }
707*09537850SAkhilesh Sanikop
708*09537850SAkhilesh Sanikop template <bool is_2d = false, bool is_compound = false>
DoHorizontalPass(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const int filter_id,const int filter_index)709*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
710*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
711*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
712*09537850SAkhilesh Sanikop const int width, const int height, const int filter_id,
713*09537850SAkhilesh Sanikop const int filter_index) {
714*09537850SAkhilesh Sanikop // Duplicate the absolute value for each tap. Negative taps are corrected
715*09537850SAkhilesh Sanikop // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
716*09537850SAkhilesh Sanikop uint8x8_t v_tap[kSubPixelTaps];
717*09537850SAkhilesh Sanikop assert(filter_id != 0);
718*09537850SAkhilesh Sanikop
719*09537850SAkhilesh Sanikop for (int k = 0; k < kSubPixelTaps; ++k) {
720*09537850SAkhilesh Sanikop v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
721*09537850SAkhilesh Sanikop }
722*09537850SAkhilesh Sanikop
723*09537850SAkhilesh Sanikop if (filter_index == 2) { // 8 tap.
724*09537850SAkhilesh Sanikop FilterHorizontal<2, true, is_2d, is_compound>(
725*09537850SAkhilesh Sanikop src, src_stride, dst, dst_stride, width, height, v_tap);
726*09537850SAkhilesh Sanikop } else if (filter_index == 1) { // 6 tap.
727*09537850SAkhilesh Sanikop // Check if outside taps are positive.
728*09537850SAkhilesh Sanikop if ((filter_id == 1) | (filter_id == 15)) {
729*09537850SAkhilesh Sanikop FilterHorizontal<1, false, is_2d, is_compound>(
730*09537850SAkhilesh Sanikop src + 1, src_stride, dst, dst_stride, width, height, v_tap);
731*09537850SAkhilesh Sanikop } else {
732*09537850SAkhilesh Sanikop FilterHorizontal<1, true, is_2d, is_compound>(
733*09537850SAkhilesh Sanikop src + 1, src_stride, dst, dst_stride, width, height, v_tap);
734*09537850SAkhilesh Sanikop }
735*09537850SAkhilesh Sanikop } else if (filter_index == 0) { // 6 tap.
736*09537850SAkhilesh Sanikop FilterHorizontal<0, true, is_2d, is_compound>(
737*09537850SAkhilesh Sanikop src + 1, src_stride, dst, dst_stride, width, height, v_tap);
738*09537850SAkhilesh Sanikop } else if (filter_index == 4) { // 4 tap.
739*09537850SAkhilesh Sanikop FilterHorizontal<4, true, is_2d, is_compound>(
740*09537850SAkhilesh Sanikop src + 2, src_stride, dst, dst_stride, width, height, v_tap);
741*09537850SAkhilesh Sanikop } else if (filter_index == 5) { // 4 tap.
742*09537850SAkhilesh Sanikop FilterHorizontal<5, true, is_2d, is_compound>(
743*09537850SAkhilesh Sanikop src + 2, src_stride, dst, dst_stride, width, height, v_tap);
744*09537850SAkhilesh Sanikop } else { // 2 tap.
745*09537850SAkhilesh Sanikop FilterHorizontal<3, true, is_2d, is_compound>(
746*09537850SAkhilesh Sanikop src + 3, src_stride, dst, dst_stride, width, height, v_tap);
747*09537850SAkhilesh Sanikop }
748*09537850SAkhilesh Sanikop }
749*09537850SAkhilesh Sanikop
750*09537850SAkhilesh Sanikop template <int vertical_taps>
Filter2DVertical(const uint16_t * LIBGAV1_RESTRICT const intermediate_result,const int width,const int height,const int16x8_t taps,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)751*09537850SAkhilesh Sanikop void Filter2DVertical(
752*09537850SAkhilesh Sanikop const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
753*09537850SAkhilesh Sanikop const int height, const int16x8_t taps,
754*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
755*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint8_t*>(prediction);
756*09537850SAkhilesh Sanikop if (width >= 8) {
757*09537850SAkhilesh Sanikop Filter2DVerticalWidth8AndUp<vertical_taps>(
758*09537850SAkhilesh Sanikop intermediate_result, dest, pred_stride, width, height, taps);
759*09537850SAkhilesh Sanikop } else if (width == 4) {
760*09537850SAkhilesh Sanikop Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
761*09537850SAkhilesh Sanikop pred_stride, height, taps);
762*09537850SAkhilesh Sanikop } else {
763*09537850SAkhilesh Sanikop assert(width == 2);
764*09537850SAkhilesh Sanikop Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
765*09537850SAkhilesh Sanikop pred_stride, height, taps);
766*09537850SAkhilesh Sanikop }
767*09537850SAkhilesh Sanikop }
768*09537850SAkhilesh Sanikop
Convolve2D_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)769*09537850SAkhilesh Sanikop void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
770*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
771*09537850SAkhilesh Sanikop const int horizontal_filter_index,
772*09537850SAkhilesh Sanikop const int vertical_filter_index,
773*09537850SAkhilesh Sanikop const int horizontal_filter_id,
774*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width,
775*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT const prediction,
776*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
777*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
778*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
779*09537850SAkhilesh Sanikop const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
780*09537850SAkhilesh Sanikop
781*09537850SAkhilesh Sanikop // The output of the horizontal filter is guaranteed to fit in 16 bits.
782*09537850SAkhilesh Sanikop uint16_t
783*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
784*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
785*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
786*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
787*09537850SAkhilesh Sanikop memset(intermediate_result, 0x33, sizeof(intermediate_result));
788*09537850SAkhilesh Sanikop #endif
789*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
790*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
791*09537850SAkhilesh Sanikop const auto* const src = static_cast<const uint8_t*>(reference) -
792*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride -
793*09537850SAkhilesh Sanikop kHorizontalOffset;
794*09537850SAkhilesh Sanikop
795*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
796*09537850SAkhilesh Sanikop width, intermediate_height,
797*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
798*09537850SAkhilesh Sanikop
799*09537850SAkhilesh Sanikop // Vertical filter.
800*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
801*09537850SAkhilesh Sanikop const int16x8_t taps = vmovl_s8(
802*09537850SAkhilesh Sanikop vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
803*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
804*09537850SAkhilesh Sanikop Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
805*09537850SAkhilesh Sanikop pred_stride);
806*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
807*09537850SAkhilesh Sanikop Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
808*09537850SAkhilesh Sanikop pred_stride);
809*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
810*09537850SAkhilesh Sanikop Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
811*09537850SAkhilesh Sanikop pred_stride);
812*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
813*09537850SAkhilesh Sanikop Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
814*09537850SAkhilesh Sanikop pred_stride);
815*09537850SAkhilesh Sanikop }
816*09537850SAkhilesh Sanikop }
817*09537850SAkhilesh Sanikop
818*09537850SAkhilesh Sanikop // There are many opportunities for overreading in scaled convolve, because the
819*09537850SAkhilesh Sanikop // range of starting points for filter windows is anywhere from 0 to 16 for 8
820*09537850SAkhilesh Sanikop // destination pixels, and the window sizes range from 2 to 8. To accommodate
821*09537850SAkhilesh Sanikop // this range concisely, we use |grade_x| to mean the most steps in src that can
822*09537850SAkhilesh Sanikop // be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
823*09537850SAkhilesh Sanikop // we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
824*09537850SAkhilesh Sanikop // increments. The first load covers the initial elements of src_x, while the
825*09537850SAkhilesh Sanikop // final load covers the taps.
826*09537850SAkhilesh Sanikop template <int grade_x>
LoadSrcVals(const uint8_t * const src_x)827*09537850SAkhilesh Sanikop inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
828*09537850SAkhilesh Sanikop uint8x8x3_t ret;
829*09537850SAkhilesh Sanikop const uint8x16_t src_val = vld1q_u8(src_x);
830*09537850SAkhilesh Sanikop ret.val[0] = vget_low_u8(src_val);
831*09537850SAkhilesh Sanikop ret.val[1] = vget_high_u8(src_val);
832*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
833*09537850SAkhilesh Sanikop // Initialize to quiet msan warnings when grade_x <= 1.
834*09537850SAkhilesh Sanikop ret.val[2] = vdup_n_u8(0);
835*09537850SAkhilesh Sanikop #endif
836*09537850SAkhilesh Sanikop if (grade_x > 1) {
837*09537850SAkhilesh Sanikop ret.val[2] = vld1_u8(src_x + 16);
838*09537850SAkhilesh Sanikop }
839*09537850SAkhilesh Sanikop return ret;
840*09537850SAkhilesh Sanikop }
841*09537850SAkhilesh Sanikop
842*09537850SAkhilesh Sanikop // Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
GetPositive2TapFilter(const int tap_index)843*09537850SAkhilesh Sanikop inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
844*09537850SAkhilesh Sanikop assert(tap_index < 2);
845*09537850SAkhilesh Sanikop alignas(
846*09537850SAkhilesh Sanikop 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
847*09537850SAkhilesh Sanikop {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
848*09537850SAkhilesh Sanikop {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
849*09537850SAkhilesh Sanikop
850*09537850SAkhilesh Sanikop return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
851*09537850SAkhilesh Sanikop }
852*09537850SAkhilesh Sanikop
853*09537850SAkhilesh Sanikop template <int grade_x>
ConvolveKernelHorizontal2Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT intermediate)854*09537850SAkhilesh Sanikop inline void ConvolveKernelHorizontal2Tap(
855*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
856*09537850SAkhilesh Sanikop const int width, const int subpixel_x, const int step_x,
857*09537850SAkhilesh Sanikop const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
858*09537850SAkhilesh Sanikop // Account for the 0-taps that precede the 2 nonzero taps.
859*09537850SAkhilesh Sanikop const int kernel_offset = 3;
860*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
861*09537850SAkhilesh Sanikop const int step_x8 = step_x << 3;
862*09537850SAkhilesh Sanikop const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
863*09537850SAkhilesh Sanikop const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
864*09537850SAkhilesh Sanikop const uint16x8_t index_steps = vmulq_n_u16(
865*09537850SAkhilesh Sanikop vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
866*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
867*09537850SAkhilesh Sanikop
868*09537850SAkhilesh Sanikop int p = subpixel_x;
869*09537850SAkhilesh Sanikop if (width <= 4) {
870*09537850SAkhilesh Sanikop const uint8_t* src_x =
871*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
872*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
873*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
874*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
875*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
876*09537850SAkhilesh Sanikop vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
877*09537850SAkhilesh Sanikop // This is a special case. The 2-tap filter has no negative taps, so we
878*09537850SAkhilesh Sanikop // can use unsigned values.
879*09537850SAkhilesh Sanikop // For each x, a lane of tapsK has
880*09537850SAkhilesh Sanikop // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
881*09537850SAkhilesh Sanikop // on x.
882*09537850SAkhilesh Sanikop const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
883*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps1, filter_indices)};
884*09537850SAkhilesh Sanikop int y = intermediate_height;
885*09537850SAkhilesh Sanikop do {
886*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
887*09537850SAkhilesh Sanikop const uint8x16_t src_vals = vld1q_u8(src_x);
888*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
889*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
890*09537850SAkhilesh Sanikop
891*09537850SAkhilesh Sanikop // For each x, a lane of srcK contains src_x[k].
892*09537850SAkhilesh Sanikop const uint8x8_t src[2] = {
893*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, src_indices),
894*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
895*09537850SAkhilesh Sanikop
896*09537850SAkhilesh Sanikop vst1q_s16(intermediate,
897*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
898*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
899*09537850SAkhilesh Sanikop src_x += src_stride;
900*09537850SAkhilesh Sanikop intermediate += kIntermediateStride;
901*09537850SAkhilesh Sanikop } while (--y != 0);
902*09537850SAkhilesh Sanikop return;
903*09537850SAkhilesh Sanikop }
904*09537850SAkhilesh Sanikop
905*09537850SAkhilesh Sanikop // |width| >= 8
906*09537850SAkhilesh Sanikop int x = 0;
907*09537850SAkhilesh Sanikop do {
908*09537850SAkhilesh Sanikop const uint8_t* src_x =
909*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
910*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
911*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
912*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
913*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
914*09537850SAkhilesh Sanikop vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
915*09537850SAkhilesh Sanikop filter_index_mask);
916*09537850SAkhilesh Sanikop // This is a special case. The 2-tap filter has no negative taps, so we
917*09537850SAkhilesh Sanikop // can use unsigned values.
918*09537850SAkhilesh Sanikop // For each x, a lane of tapsK has
919*09537850SAkhilesh Sanikop // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
920*09537850SAkhilesh Sanikop // on x.
921*09537850SAkhilesh Sanikop const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
922*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps1, filter_indices)};
923*09537850SAkhilesh Sanikop int y = intermediate_height;
924*09537850SAkhilesh Sanikop do {
925*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
926*09537850SAkhilesh Sanikop const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
927*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
928*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
929*09537850SAkhilesh Sanikop
930*09537850SAkhilesh Sanikop // For each x, a lane of srcK contains src_x[k].
931*09537850SAkhilesh Sanikop const uint8x8_t src[2] = {
932*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_indices),
933*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
934*09537850SAkhilesh Sanikop
935*09537850SAkhilesh Sanikop vst1q_s16(intermediate,
936*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
937*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
938*09537850SAkhilesh Sanikop src_x += src_stride;
939*09537850SAkhilesh Sanikop intermediate += kIntermediateStride;
940*09537850SAkhilesh Sanikop } while (--y != 0);
941*09537850SAkhilesh Sanikop x += 8;
942*09537850SAkhilesh Sanikop p += step_x8;
943*09537850SAkhilesh Sanikop } while (x < width);
944*09537850SAkhilesh Sanikop }
945*09537850SAkhilesh Sanikop
946*09537850SAkhilesh Sanikop // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
GetPositive4TapFilter(const int tap_index)947*09537850SAkhilesh Sanikop inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
948*09537850SAkhilesh Sanikop assert(tap_index < 4);
949*09537850SAkhilesh Sanikop alignas(
950*09537850SAkhilesh Sanikop 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
951*09537850SAkhilesh Sanikop {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
952*09537850SAkhilesh Sanikop {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
953*09537850SAkhilesh Sanikop {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
954*09537850SAkhilesh Sanikop {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
955*09537850SAkhilesh Sanikop
956*09537850SAkhilesh Sanikop return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
957*09537850SAkhilesh Sanikop }
958*09537850SAkhilesh Sanikop
959*09537850SAkhilesh Sanikop // This filter is only possible when width <= 4.
ConvolveKernelHorizontalPositive4Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT intermediate)960*09537850SAkhilesh Sanikop void ConvolveKernelHorizontalPositive4Tap(
961*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
962*09537850SAkhilesh Sanikop const int subpixel_x, const int step_x, const int intermediate_height,
963*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT intermediate) {
964*09537850SAkhilesh Sanikop const int kernel_offset = 2;
965*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
966*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
967*09537850SAkhilesh Sanikop const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
968*09537850SAkhilesh Sanikop const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
969*09537850SAkhilesh Sanikop const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
970*09537850SAkhilesh Sanikop const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
971*09537850SAkhilesh Sanikop const uint16x8_t index_steps = vmulq_n_u16(
972*09537850SAkhilesh Sanikop vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
973*09537850SAkhilesh Sanikop const int p = subpixel_x;
974*09537850SAkhilesh Sanikop // First filter is special, just a 128 tap on the center.
975*09537850SAkhilesh Sanikop const uint8_t* src_x =
976*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
977*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
978*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
979*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
980*09537850SAkhilesh Sanikop const uint8x8_t filter_indices = vand_u8(
981*09537850SAkhilesh Sanikop vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
982*09537850SAkhilesh Sanikop // Note that filter_id depends on x.
983*09537850SAkhilesh Sanikop // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
984*09537850SAkhilesh Sanikop const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
985*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps1, filter_indices),
986*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps2, filter_indices),
987*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps3, filter_indices)};
988*09537850SAkhilesh Sanikop
989*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
990*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
991*09537850SAkhilesh Sanikop int y = intermediate_height;
992*09537850SAkhilesh Sanikop do {
993*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped index vectors.
994*09537850SAkhilesh Sanikop const uint8x16_t src_vals = vld1q_u8(src_x);
995*09537850SAkhilesh Sanikop
996*09537850SAkhilesh Sanikop // For each x, srcK contains src_x[k] where k=1.
997*09537850SAkhilesh Sanikop // Whereas taps come from different arrays, src pixels are drawn from the
998*09537850SAkhilesh Sanikop // same contiguous line.
999*09537850SAkhilesh Sanikop const uint8x8_t src[4] = {
1000*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, src_indices),
1001*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
1002*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
1003*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
1004*09537850SAkhilesh Sanikop
1005*09537850SAkhilesh Sanikop vst1q_s16(intermediate,
1006*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
1007*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
1008*09537850SAkhilesh Sanikop
1009*09537850SAkhilesh Sanikop src_x += src_stride;
1010*09537850SAkhilesh Sanikop intermediate += kIntermediateStride;
1011*09537850SAkhilesh Sanikop } while (--y != 0);
1012*09537850SAkhilesh Sanikop }
1013*09537850SAkhilesh Sanikop
1014*09537850SAkhilesh Sanikop // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
GetSigned4TapFilter(const int tap_index)1015*09537850SAkhilesh Sanikop inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
1016*09537850SAkhilesh Sanikop assert(tap_index < 4);
1017*09537850SAkhilesh Sanikop alignas(16) static constexpr uint8_t
1018*09537850SAkhilesh Sanikop kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
1019*09537850SAkhilesh Sanikop {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
1020*09537850SAkhilesh Sanikop {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
1021*09537850SAkhilesh Sanikop {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
1022*09537850SAkhilesh Sanikop {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
1023*09537850SAkhilesh Sanikop
1024*09537850SAkhilesh Sanikop return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
1025*09537850SAkhilesh Sanikop }
1026*09537850SAkhilesh Sanikop
1027*09537850SAkhilesh Sanikop // This filter is only possible when width <= 4.
ConvolveKernelHorizontalSigned4Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT intermediate)1028*09537850SAkhilesh Sanikop inline void ConvolveKernelHorizontalSigned4Tap(
1029*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
1030*09537850SAkhilesh Sanikop const int subpixel_x, const int step_x, const int intermediate_height,
1031*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT intermediate) {
1032*09537850SAkhilesh Sanikop const int kernel_offset = 2;
1033*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
1034*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1035*09537850SAkhilesh Sanikop const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
1036*09537850SAkhilesh Sanikop const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
1037*09537850SAkhilesh Sanikop const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
1038*09537850SAkhilesh Sanikop const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
1039*09537850SAkhilesh Sanikop const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
1040*09537850SAkhilesh Sanikop static_cast<uint16_t>(step_x));
1041*09537850SAkhilesh Sanikop
1042*09537850SAkhilesh Sanikop const int p = subpixel_x;
1043*09537850SAkhilesh Sanikop const uint8_t* src_x =
1044*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1045*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
1046*09537850SAkhilesh Sanikop const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
1047*09537850SAkhilesh Sanikop const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
1048*09537850SAkhilesh Sanikop const uint8x8_t filter_index_offsets = vshrn_n_u16(
1049*09537850SAkhilesh Sanikop vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
1050*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
1051*09537850SAkhilesh Sanikop vand_u8(filter_index_offsets, filter_index_mask);
1052*09537850SAkhilesh Sanikop // Note that filter_id depends on x.
1053*09537850SAkhilesh Sanikop // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
1054*09537850SAkhilesh Sanikop const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
1055*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps1, filter_indices),
1056*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps2, filter_indices),
1057*09537850SAkhilesh Sanikop VQTbl1U8(filter_taps3, filter_indices)};
1058*09537850SAkhilesh Sanikop
1059*09537850SAkhilesh Sanikop const uint8x8_t src_indices_base =
1060*09537850SAkhilesh Sanikop vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
1061*09537850SAkhilesh Sanikop
1062*09537850SAkhilesh Sanikop const uint8x8_t src_indices[4] = {src_indices_base,
1063*09537850SAkhilesh Sanikop vadd_u8(src_indices_base, vdup_n_u8(1)),
1064*09537850SAkhilesh Sanikop vadd_u8(src_indices_base, vdup_n_u8(2)),
1065*09537850SAkhilesh Sanikop vadd_u8(src_indices_base, vdup_n_u8(3))};
1066*09537850SAkhilesh Sanikop
1067*09537850SAkhilesh Sanikop int y = intermediate_height;
1068*09537850SAkhilesh Sanikop do {
1069*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
1070*09537850SAkhilesh Sanikop const uint8x16_t src_vals = vld1q_u8(src_x);
1071*09537850SAkhilesh Sanikop
1072*09537850SAkhilesh Sanikop // For each x, srcK contains src_x[k] where k=1.
1073*09537850SAkhilesh Sanikop // Whereas taps come from different arrays, src pixels are drawn from the
1074*09537850SAkhilesh Sanikop // same contiguous line.
1075*09537850SAkhilesh Sanikop const uint8x8_t src[4] = {
1076*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
1077*09537850SAkhilesh Sanikop VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
1078*09537850SAkhilesh Sanikop
1079*09537850SAkhilesh Sanikop vst1q_s16(intermediate,
1080*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
1081*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
1082*09537850SAkhilesh Sanikop src_x += src_stride;
1083*09537850SAkhilesh Sanikop intermediate += kIntermediateStride;
1084*09537850SAkhilesh Sanikop } while (--y != 0);
1085*09537850SAkhilesh Sanikop }
1086*09537850SAkhilesh Sanikop
1087*09537850SAkhilesh Sanikop // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
GetSigned6TapFilter(const int tap_index)1088*09537850SAkhilesh Sanikop inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
1089*09537850SAkhilesh Sanikop assert(tap_index < 6);
1090*09537850SAkhilesh Sanikop alignas(16) static constexpr uint8_t
1091*09537850SAkhilesh Sanikop kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
1092*09537850SAkhilesh Sanikop {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
1093*09537850SAkhilesh Sanikop {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
1094*09537850SAkhilesh Sanikop {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
1095*09537850SAkhilesh Sanikop {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
1096*09537850SAkhilesh Sanikop {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
1097*09537850SAkhilesh Sanikop {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
1098*09537850SAkhilesh Sanikop
1099*09537850SAkhilesh Sanikop return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
1100*09537850SAkhilesh Sanikop }
1101*09537850SAkhilesh Sanikop
1102*09537850SAkhilesh Sanikop // This filter is only possible when width >= 8.
1103*09537850SAkhilesh Sanikop template <int grade_x>
ConvolveKernelHorizontalSigned6Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT const intermediate)1104*09537850SAkhilesh Sanikop inline void ConvolveKernelHorizontalSigned6Tap(
1105*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
1106*09537850SAkhilesh Sanikop const int width, const int subpixel_x, const int step_x,
1107*09537850SAkhilesh Sanikop const int intermediate_height,
1108*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT const intermediate) {
1109*09537850SAkhilesh Sanikop const int kernel_offset = 1;
1110*09537850SAkhilesh Sanikop const uint8x8_t one = vdup_n_u8(1);
1111*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1112*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
1113*09537850SAkhilesh Sanikop const int step_x8 = step_x << 3;
1114*09537850SAkhilesh Sanikop uint8x16_t filter_taps[6];
1115*09537850SAkhilesh Sanikop for (int i = 0; i < 6; ++i) {
1116*09537850SAkhilesh Sanikop filter_taps[i] = GetSigned6TapFilter(i);
1117*09537850SAkhilesh Sanikop }
1118*09537850SAkhilesh Sanikop const uint16x8_t index_steps = vmulq_n_u16(
1119*09537850SAkhilesh Sanikop vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1120*09537850SAkhilesh Sanikop
1121*09537850SAkhilesh Sanikop int16_t* intermediate_x = intermediate;
1122*09537850SAkhilesh Sanikop int x = 0;
1123*09537850SAkhilesh Sanikop int p = subpixel_x;
1124*09537850SAkhilesh Sanikop do {
1125*09537850SAkhilesh Sanikop // Avoid overloading outside the reference boundaries. This means
1126*09537850SAkhilesh Sanikop // |trailing_width| can be up to 24.
1127*09537850SAkhilesh Sanikop const uint8_t* src_x =
1128*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1129*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
1130*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1131*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1132*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
1133*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1134*09537850SAkhilesh Sanikop uint8x8_t src_lookup[6];
1135*09537850SAkhilesh Sanikop src_lookup[0] = src_indices;
1136*09537850SAkhilesh Sanikop for (int i = 1; i < 6; ++i) {
1137*09537850SAkhilesh Sanikop src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1138*09537850SAkhilesh Sanikop }
1139*09537850SAkhilesh Sanikop
1140*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
1141*09537850SAkhilesh Sanikop vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1142*09537850SAkhilesh Sanikop filter_index_mask);
1143*09537850SAkhilesh Sanikop // For each x, a lane of taps[k] has
1144*09537850SAkhilesh Sanikop // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1145*09537850SAkhilesh Sanikop // on x.
1146*09537850SAkhilesh Sanikop uint8x8_t taps[6];
1147*09537850SAkhilesh Sanikop for (int i = 0; i < 6; ++i) {
1148*09537850SAkhilesh Sanikop taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
1149*09537850SAkhilesh Sanikop }
1150*09537850SAkhilesh Sanikop int y = intermediate_height;
1151*09537850SAkhilesh Sanikop do {
1152*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
1153*09537850SAkhilesh Sanikop const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1154*09537850SAkhilesh Sanikop
1155*09537850SAkhilesh Sanikop const uint8x8_t src[6] = {
1156*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
1157*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
1158*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
1159*09537850SAkhilesh Sanikop
1160*09537850SAkhilesh Sanikop vst1q_s16(intermediate_x,
1161*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
1162*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
1163*09537850SAkhilesh Sanikop src_x += src_stride;
1164*09537850SAkhilesh Sanikop intermediate_x += kIntermediateStride;
1165*09537850SAkhilesh Sanikop } while (--y != 0);
1166*09537850SAkhilesh Sanikop x += 8;
1167*09537850SAkhilesh Sanikop p += step_x8;
1168*09537850SAkhilesh Sanikop } while (x < width);
1169*09537850SAkhilesh Sanikop }
1170*09537850SAkhilesh Sanikop
1171*09537850SAkhilesh Sanikop // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
1172*09537850SAkhilesh Sanikop // has mixed positive and negative outer taps which are handled in
1173*09537850SAkhilesh Sanikop // GetMixed6TapFilter().
GetPositive6TapFilter(const int tap_index)1174*09537850SAkhilesh Sanikop inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
1175*09537850SAkhilesh Sanikop assert(tap_index < 6);
1176*09537850SAkhilesh Sanikop alignas(16) static constexpr uint8_t
1177*09537850SAkhilesh Sanikop kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
1178*09537850SAkhilesh Sanikop {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
1179*09537850SAkhilesh Sanikop {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
1180*09537850SAkhilesh Sanikop {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
1181*09537850SAkhilesh Sanikop {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
1182*09537850SAkhilesh Sanikop
1183*09537850SAkhilesh Sanikop return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
1184*09537850SAkhilesh Sanikop }
1185*09537850SAkhilesh Sanikop
GetMixed6TapFilter(const int tap_index)1186*09537850SAkhilesh Sanikop inline int8x16_t GetMixed6TapFilter(const int tap_index) {
1187*09537850SAkhilesh Sanikop assert(tap_index < 2);
1188*09537850SAkhilesh Sanikop alignas(
1189*09537850SAkhilesh Sanikop 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
1190*09537850SAkhilesh Sanikop {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
1191*09537850SAkhilesh Sanikop {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
1192*09537850SAkhilesh Sanikop
1193*09537850SAkhilesh Sanikop return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
1194*09537850SAkhilesh Sanikop }
1195*09537850SAkhilesh Sanikop
1196*09537850SAkhilesh Sanikop // This filter is only possible when width >= 8.
1197*09537850SAkhilesh Sanikop template <int grade_x>
ConvolveKernelHorizontalMixed6Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT const intermediate)1198*09537850SAkhilesh Sanikop inline void ConvolveKernelHorizontalMixed6Tap(
1199*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
1200*09537850SAkhilesh Sanikop const int width, const int subpixel_x, const int step_x,
1201*09537850SAkhilesh Sanikop const int intermediate_height,
1202*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT const intermediate) {
1203*09537850SAkhilesh Sanikop const int kernel_offset = 1;
1204*09537850SAkhilesh Sanikop const uint8x8_t one = vdup_n_u8(1);
1205*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1206*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
1207*09537850SAkhilesh Sanikop const int step_x8 = step_x << 3;
1208*09537850SAkhilesh Sanikop uint8x8_t taps[4];
1209*09537850SAkhilesh Sanikop int16x8_t mixed_taps[2];
1210*09537850SAkhilesh Sanikop uint8x16_t positive_filter_taps[4];
1211*09537850SAkhilesh Sanikop for (int i = 0; i < 4; ++i) {
1212*09537850SAkhilesh Sanikop positive_filter_taps[i] = GetPositive6TapFilter(i);
1213*09537850SAkhilesh Sanikop }
1214*09537850SAkhilesh Sanikop int8x16_t mixed_filter_taps[2];
1215*09537850SAkhilesh Sanikop mixed_filter_taps[0] = GetMixed6TapFilter(0);
1216*09537850SAkhilesh Sanikop mixed_filter_taps[1] = GetMixed6TapFilter(1);
1217*09537850SAkhilesh Sanikop const uint16x8_t index_steps = vmulq_n_u16(
1218*09537850SAkhilesh Sanikop vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1219*09537850SAkhilesh Sanikop
1220*09537850SAkhilesh Sanikop int16_t* intermediate_x = intermediate;
1221*09537850SAkhilesh Sanikop int x = 0;
1222*09537850SAkhilesh Sanikop int p = subpixel_x;
1223*09537850SAkhilesh Sanikop do {
1224*09537850SAkhilesh Sanikop const uint8_t* src_x =
1225*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1226*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
1227*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1228*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1229*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
1230*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1231*09537850SAkhilesh Sanikop uint8x8_t src_lookup[6];
1232*09537850SAkhilesh Sanikop src_lookup[0] = src_indices;
1233*09537850SAkhilesh Sanikop for (int i = 1; i < 6; ++i) {
1234*09537850SAkhilesh Sanikop src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1235*09537850SAkhilesh Sanikop }
1236*09537850SAkhilesh Sanikop
1237*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
1238*09537850SAkhilesh Sanikop vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1239*09537850SAkhilesh Sanikop filter_index_mask);
1240*09537850SAkhilesh Sanikop // For each x, a lane of taps[k] has
1241*09537850SAkhilesh Sanikop // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1242*09537850SAkhilesh Sanikop // on x.
1243*09537850SAkhilesh Sanikop for (int i = 0; i < 4; ++i) {
1244*09537850SAkhilesh Sanikop taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
1245*09537850SAkhilesh Sanikop }
1246*09537850SAkhilesh Sanikop mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
1247*09537850SAkhilesh Sanikop mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
1248*09537850SAkhilesh Sanikop
1249*09537850SAkhilesh Sanikop int y = intermediate_height;
1250*09537850SAkhilesh Sanikop do {
1251*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
1252*09537850SAkhilesh Sanikop const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1253*09537850SAkhilesh Sanikop
1254*09537850SAkhilesh Sanikop int16x8_t sum_mixed = vmulq_s16(
1255*09537850SAkhilesh Sanikop mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
1256*09537850SAkhilesh Sanikop sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
1257*09537850SAkhilesh Sanikop ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
1258*09537850SAkhilesh Sanikop uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
1259*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
1260*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
1261*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
1262*09537850SAkhilesh Sanikop sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
1263*09537850SAkhilesh Sanikop
1264*09537850SAkhilesh Sanikop vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
1265*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
1266*09537850SAkhilesh Sanikop src_x += src_stride;
1267*09537850SAkhilesh Sanikop intermediate_x += kIntermediateStride;
1268*09537850SAkhilesh Sanikop } while (--y != 0);
1269*09537850SAkhilesh Sanikop x += 8;
1270*09537850SAkhilesh Sanikop p += step_x8;
1271*09537850SAkhilesh Sanikop } while (x < width);
1272*09537850SAkhilesh Sanikop }
1273*09537850SAkhilesh Sanikop
1274*09537850SAkhilesh Sanikop // Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
GetSigned8TapFilter(const int tap_index)1275*09537850SAkhilesh Sanikop inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
1276*09537850SAkhilesh Sanikop assert(tap_index < 8);
1277*09537850SAkhilesh Sanikop alignas(16) static constexpr uint8_t
1278*09537850SAkhilesh Sanikop kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
1279*09537850SAkhilesh Sanikop {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
1280*09537850SAkhilesh Sanikop {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
1281*09537850SAkhilesh Sanikop {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
1282*09537850SAkhilesh Sanikop {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
1283*09537850SAkhilesh Sanikop {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
1284*09537850SAkhilesh Sanikop {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
1285*09537850SAkhilesh Sanikop {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
1286*09537850SAkhilesh Sanikop {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
1287*09537850SAkhilesh Sanikop
1288*09537850SAkhilesh Sanikop return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
1289*09537850SAkhilesh Sanikop }
1290*09537850SAkhilesh Sanikop
1291*09537850SAkhilesh Sanikop // This filter is only possible when width >= 8.
1292*09537850SAkhilesh Sanikop template <int grade_x>
ConvolveKernelHorizontalSigned8Tap(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * LIBGAV1_RESTRICT const intermediate)1293*09537850SAkhilesh Sanikop inline void ConvolveKernelHorizontalSigned8Tap(
1294*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
1295*09537850SAkhilesh Sanikop const int width, const int subpixel_x, const int step_x,
1296*09537850SAkhilesh Sanikop const int intermediate_height,
1297*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT const intermediate) {
1298*09537850SAkhilesh Sanikop const uint8x8_t one = vdup_n_u8(1);
1299*09537850SAkhilesh Sanikop const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1300*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
1301*09537850SAkhilesh Sanikop const int step_x8 = step_x << 3;
1302*09537850SAkhilesh Sanikop uint8x8_t taps[8];
1303*09537850SAkhilesh Sanikop uint8x16_t filter_taps[8];
1304*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
1305*09537850SAkhilesh Sanikop filter_taps[i] = GetSigned8TapFilter(i);
1306*09537850SAkhilesh Sanikop }
1307*09537850SAkhilesh Sanikop const uint16x8_t index_steps = vmulq_n_u16(
1308*09537850SAkhilesh Sanikop vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1309*09537850SAkhilesh Sanikop
1310*09537850SAkhilesh Sanikop int16_t* intermediate_x = intermediate;
1311*09537850SAkhilesh Sanikop int x = 0;
1312*09537850SAkhilesh Sanikop int p = subpixel_x;
1313*09537850SAkhilesh Sanikop do {
1314*09537850SAkhilesh Sanikop const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
1315*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
1316*09537850SAkhilesh Sanikop const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1317*09537850SAkhilesh Sanikop const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1318*09537850SAkhilesh Sanikop const uint8x8_t src_indices =
1319*09537850SAkhilesh Sanikop vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1320*09537850SAkhilesh Sanikop uint8x8_t src_lookup[8];
1321*09537850SAkhilesh Sanikop src_lookup[0] = src_indices;
1322*09537850SAkhilesh Sanikop for (int i = 1; i < 8; ++i) {
1323*09537850SAkhilesh Sanikop src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1324*09537850SAkhilesh Sanikop }
1325*09537850SAkhilesh Sanikop
1326*09537850SAkhilesh Sanikop const uint8x8_t filter_indices =
1327*09537850SAkhilesh Sanikop vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1328*09537850SAkhilesh Sanikop filter_index_mask);
1329*09537850SAkhilesh Sanikop // For each x, a lane of taps[k] has
1330*09537850SAkhilesh Sanikop // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1331*09537850SAkhilesh Sanikop // on x.
1332*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
1333*09537850SAkhilesh Sanikop taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
1334*09537850SAkhilesh Sanikop }
1335*09537850SAkhilesh Sanikop
1336*09537850SAkhilesh Sanikop int y = intermediate_height;
1337*09537850SAkhilesh Sanikop do {
1338*09537850SAkhilesh Sanikop // Load a pool of samples to select from using stepped indices.
1339*09537850SAkhilesh Sanikop const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1340*09537850SAkhilesh Sanikop
1341*09537850SAkhilesh Sanikop const uint8x8_t src[8] = {
1342*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
1343*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
1344*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
1345*09537850SAkhilesh Sanikop vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
1346*09537850SAkhilesh Sanikop
1347*09537850SAkhilesh Sanikop vst1q_s16(intermediate_x,
1348*09537850SAkhilesh Sanikop vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
1349*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
1350*09537850SAkhilesh Sanikop src_x += src_stride;
1351*09537850SAkhilesh Sanikop intermediate_x += kIntermediateStride;
1352*09537850SAkhilesh Sanikop } while (--y != 0);
1353*09537850SAkhilesh Sanikop x += 8;
1354*09537850SAkhilesh Sanikop p += step_x8;
1355*09537850SAkhilesh Sanikop } while (x < width);
1356*09537850SAkhilesh Sanikop }
1357*09537850SAkhilesh Sanikop
1358*09537850SAkhilesh Sanikop // This function handles blocks of width 2 or 4.
1359*09537850SAkhilesh Sanikop template <int num_taps, int grade_y, int width, bool is_compound>
ConvolveVerticalScale4xH(const int16_t * LIBGAV1_RESTRICT const src,const int subpixel_y,const int filter_index,const int step_y,const int height,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t dest_stride)1360*09537850SAkhilesh Sanikop void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src,
1361*09537850SAkhilesh Sanikop const int subpixel_y, const int filter_index,
1362*09537850SAkhilesh Sanikop const int step_y, const int height,
1363*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
1364*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride) {
1365*09537850SAkhilesh Sanikop constexpr ptrdiff_t src_stride = kIntermediateStride;
1366*09537850SAkhilesh Sanikop const int16_t* src_y = src;
1367*09537850SAkhilesh Sanikop // |dest| is 16-bit in compound mode, Pixel otherwise.
1368*09537850SAkhilesh Sanikop auto* dest16_y = static_cast<uint16_t*>(dest);
1369*09537850SAkhilesh Sanikop auto* dest_y = static_cast<uint8_t*>(dest);
1370*09537850SAkhilesh Sanikop int16x4_t s[num_taps + grade_y];
1371*09537850SAkhilesh Sanikop
1372*09537850SAkhilesh Sanikop int p = subpixel_y & 1023;
1373*09537850SAkhilesh Sanikop int prev_p = p;
1374*09537850SAkhilesh Sanikop int y = height;
1375*09537850SAkhilesh Sanikop do {
1376*09537850SAkhilesh Sanikop for (int i = 0; i < num_taps; ++i) {
1377*09537850SAkhilesh Sanikop s[i] = vld1_s16(src_y + i * src_stride);
1378*09537850SAkhilesh Sanikop }
1379*09537850SAkhilesh Sanikop int filter_id = (p >> 6) & kSubPixelMask;
1380*09537850SAkhilesh Sanikop int16x8_t filter =
1381*09537850SAkhilesh Sanikop vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1382*09537850SAkhilesh Sanikop int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
1383*09537850SAkhilesh Sanikop if (is_compound) {
1384*09537850SAkhilesh Sanikop assert(width != 2);
1385*09537850SAkhilesh Sanikop const uint16x4_t result = vreinterpret_u16_s16(sums);
1386*09537850SAkhilesh Sanikop vst1_u16(dest16_y, result);
1387*09537850SAkhilesh Sanikop } else {
1388*09537850SAkhilesh Sanikop const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1389*09537850SAkhilesh Sanikop if (width == 2) {
1390*09537850SAkhilesh Sanikop Store2<0>(dest_y, result);
1391*09537850SAkhilesh Sanikop } else {
1392*09537850SAkhilesh Sanikop StoreLo4(dest_y, result);
1393*09537850SAkhilesh Sanikop }
1394*09537850SAkhilesh Sanikop }
1395*09537850SAkhilesh Sanikop p += step_y;
1396*09537850SAkhilesh Sanikop const int p_diff =
1397*09537850SAkhilesh Sanikop (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1398*09537850SAkhilesh Sanikop prev_p = p;
1399*09537850SAkhilesh Sanikop // Here we load extra source in case it is needed. If |p_diff| == 0, these
1400*09537850SAkhilesh Sanikop // values will be unused, but it's faster to load than to branch.
1401*09537850SAkhilesh Sanikop s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
1402*09537850SAkhilesh Sanikop if (grade_y > 1) {
1403*09537850SAkhilesh Sanikop s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
1404*09537850SAkhilesh Sanikop }
1405*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1406*09537850SAkhilesh Sanikop dest_y += dest_stride;
1407*09537850SAkhilesh Sanikop
1408*09537850SAkhilesh Sanikop filter_id = (p >> 6) & kSubPixelMask;
1409*09537850SAkhilesh Sanikop filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1410*09537850SAkhilesh Sanikop sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
1411*09537850SAkhilesh Sanikop if (is_compound) {
1412*09537850SAkhilesh Sanikop assert(width != 2);
1413*09537850SAkhilesh Sanikop const uint16x4_t result = vreinterpret_u16_s16(sums);
1414*09537850SAkhilesh Sanikop vst1_u16(dest16_y, result);
1415*09537850SAkhilesh Sanikop } else {
1416*09537850SAkhilesh Sanikop const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1417*09537850SAkhilesh Sanikop if (width == 2) {
1418*09537850SAkhilesh Sanikop Store2<0>(dest_y, result);
1419*09537850SAkhilesh Sanikop } else {
1420*09537850SAkhilesh Sanikop StoreLo4(dest_y, result);
1421*09537850SAkhilesh Sanikop }
1422*09537850SAkhilesh Sanikop }
1423*09537850SAkhilesh Sanikop p += step_y;
1424*09537850SAkhilesh Sanikop src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1425*09537850SAkhilesh Sanikop prev_p = p;
1426*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1427*09537850SAkhilesh Sanikop dest_y += dest_stride;
1428*09537850SAkhilesh Sanikop y -= 2;
1429*09537850SAkhilesh Sanikop } while (y != 0);
1430*09537850SAkhilesh Sanikop }
1431*09537850SAkhilesh Sanikop
1432*09537850SAkhilesh Sanikop template <int num_taps, int grade_y, bool is_compound>
ConvolveVerticalScale(const int16_t * LIBGAV1_RESTRICT const source,const int intermediate_height,const int width,const int subpixel_y,const int filter_index,const int step_y,const int height,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t dest_stride)1433*09537850SAkhilesh Sanikop inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
1434*09537850SAkhilesh Sanikop const int intermediate_height,
1435*09537850SAkhilesh Sanikop const int width, const int subpixel_y,
1436*09537850SAkhilesh Sanikop const int filter_index, const int step_y,
1437*09537850SAkhilesh Sanikop const int height,
1438*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
1439*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride) {
1440*09537850SAkhilesh Sanikop constexpr ptrdiff_t src_stride = kIntermediateStride;
1441*09537850SAkhilesh Sanikop // A possible improvement is to use arithmetic to decide how many times to
1442*09537850SAkhilesh Sanikop // apply filters to same source before checking whether to load new srcs.
1443*09537850SAkhilesh Sanikop // However, this will only improve performance with very small step sizes.
1444*09537850SAkhilesh Sanikop int16x8_t s[num_taps + grade_y];
1445*09537850SAkhilesh Sanikop // |dest| is 16-bit in compound mode, Pixel otherwise.
1446*09537850SAkhilesh Sanikop uint16_t* dest16_y;
1447*09537850SAkhilesh Sanikop uint8_t* dest_y;
1448*09537850SAkhilesh Sanikop const int16_t* src = source;
1449*09537850SAkhilesh Sanikop
1450*09537850SAkhilesh Sanikop int x = 0;
1451*09537850SAkhilesh Sanikop do {
1452*09537850SAkhilesh Sanikop const int16_t* src_y = src;
1453*09537850SAkhilesh Sanikop dest16_y = static_cast<uint16_t*>(dest) + x;
1454*09537850SAkhilesh Sanikop dest_y = static_cast<uint8_t*>(dest) + x;
1455*09537850SAkhilesh Sanikop int p = subpixel_y & 1023;
1456*09537850SAkhilesh Sanikop int prev_p = p;
1457*09537850SAkhilesh Sanikop int y = height;
1458*09537850SAkhilesh Sanikop do {
1459*09537850SAkhilesh Sanikop for (int i = 0; i < num_taps; ++i) {
1460*09537850SAkhilesh Sanikop s[i] = vld1q_s16(src_y + i * src_stride);
1461*09537850SAkhilesh Sanikop }
1462*09537850SAkhilesh Sanikop int filter_id = (p >> 6) & kSubPixelMask;
1463*09537850SAkhilesh Sanikop int16x8_t filter =
1464*09537850SAkhilesh Sanikop vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1465*09537850SAkhilesh Sanikop int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
1466*09537850SAkhilesh Sanikop if (is_compound) {
1467*09537850SAkhilesh Sanikop vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1468*09537850SAkhilesh Sanikop } else {
1469*09537850SAkhilesh Sanikop vst1_u8(dest_y, vqmovun_s16(sum));
1470*09537850SAkhilesh Sanikop }
1471*09537850SAkhilesh Sanikop p += step_y;
1472*09537850SAkhilesh Sanikop const int p_diff =
1473*09537850SAkhilesh Sanikop (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1474*09537850SAkhilesh Sanikop // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
1475*09537850SAkhilesh Sanikop // needed. Otherwise, we only need to load one vector because |p_diff|
1476*09537850SAkhilesh Sanikop // can't exceed 1.
1477*09537850SAkhilesh Sanikop s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
1478*09537850SAkhilesh Sanikop if (grade_y > 1) {
1479*09537850SAkhilesh Sanikop s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
1480*09537850SAkhilesh Sanikop }
1481*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1482*09537850SAkhilesh Sanikop dest_y += dest_stride;
1483*09537850SAkhilesh Sanikop
1484*09537850SAkhilesh Sanikop filter_id = (p >> 6) & kSubPixelMask;
1485*09537850SAkhilesh Sanikop filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1486*09537850SAkhilesh Sanikop sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
1487*09537850SAkhilesh Sanikop if (is_compound) {
1488*09537850SAkhilesh Sanikop vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1489*09537850SAkhilesh Sanikop } else {
1490*09537850SAkhilesh Sanikop vst1_u8(dest_y, vqmovun_s16(sum));
1491*09537850SAkhilesh Sanikop }
1492*09537850SAkhilesh Sanikop p += step_y;
1493*09537850SAkhilesh Sanikop src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1494*09537850SAkhilesh Sanikop prev_p = p;
1495*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1496*09537850SAkhilesh Sanikop dest_y += dest_stride;
1497*09537850SAkhilesh Sanikop y -= 2;
1498*09537850SAkhilesh Sanikop } while (y != 0);
1499*09537850SAkhilesh Sanikop src += kIntermediateStride * intermediate_height;
1500*09537850SAkhilesh Sanikop x += 8;
1501*09537850SAkhilesh Sanikop } while (x < width);
1502*09537850SAkhilesh Sanikop }
1503*09537850SAkhilesh Sanikop
1504*09537850SAkhilesh Sanikop template <bool is_compound>
ConvolveScale2D_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int subpixel_x,const int subpixel_y,const int step_x,const int step_y,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)1505*09537850SAkhilesh Sanikop void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
1506*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
1507*09537850SAkhilesh Sanikop const int horizontal_filter_index,
1508*09537850SAkhilesh Sanikop const int vertical_filter_index, const int subpixel_x,
1509*09537850SAkhilesh Sanikop const int subpixel_y, const int step_x,
1510*09537850SAkhilesh Sanikop const int step_y, const int width, const int height,
1511*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction,
1512*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
1513*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1514*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1515*09537850SAkhilesh Sanikop assert(step_x <= 2048);
1516*09537850SAkhilesh Sanikop assert(step_y <= 2048);
1517*09537850SAkhilesh Sanikop const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
1518*09537850SAkhilesh Sanikop const int intermediate_height =
1519*09537850SAkhilesh Sanikop (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
1520*09537850SAkhilesh Sanikop kScaleSubPixelBits) +
1521*09537850SAkhilesh Sanikop num_vert_taps;
1522*09537850SAkhilesh Sanikop // The output of the horizontal filter, i.e. the intermediate_result, is
1523*09537850SAkhilesh Sanikop // guaranteed to fit in int16_t.
1524*09537850SAkhilesh Sanikop int16_t intermediate_result[kIntermediateAllocWidth *
1525*09537850SAkhilesh Sanikop (2 * kIntermediateAllocWidth + 8)];
1526*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
1527*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
1528*09537850SAkhilesh Sanikop memset(intermediate_result, 0x44, sizeof(intermediate_result));
1529*09537850SAkhilesh Sanikop #endif
1530*09537850SAkhilesh Sanikop // Horizontal filter.
1531*09537850SAkhilesh Sanikop // Filter types used for width <= 4 are different from those for width > 4.
1532*09537850SAkhilesh Sanikop // When width > 4, the valid filter index range is always [0, 3].
1533*09537850SAkhilesh Sanikop // When width <= 4, the valid filter index range is always [3, 5].
1534*09537850SAkhilesh Sanikop // Similarly for height.
1535*09537850SAkhilesh Sanikop int filter_index = GetFilterIndex(horizontal_filter_index, width);
1536*09537850SAkhilesh Sanikop int16_t* intermediate = intermediate_result;
1537*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
1538*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
1539*09537850SAkhilesh Sanikop const int vert_kernel_offset = (8 - num_vert_taps) / 2;
1540*09537850SAkhilesh Sanikop src += vert_kernel_offset * src_stride;
1541*09537850SAkhilesh Sanikop
1542*09537850SAkhilesh Sanikop // Derive the maximum value of |step_x| at which all source values fit in one
1543*09537850SAkhilesh Sanikop // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
1544*09537850SAkhilesh Sanikop // step_x*7 is the final base subpel index for the shuffle mask for filter
1545*09537850SAkhilesh Sanikop // inputs in each iteration on large blocks. When step_x is large, we need a
1546*09537850SAkhilesh Sanikop // larger structure and use a larger table lookup in order to gather all
1547*09537850SAkhilesh Sanikop // filter inputs.
1548*09537850SAkhilesh Sanikop // |num_taps| - 1 is the shuffle index of the final filter input.
1549*09537850SAkhilesh Sanikop const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
1550*09537850SAkhilesh Sanikop const int kernel_start_ceiling = 16 - num_horiz_taps;
1551*09537850SAkhilesh Sanikop // This truncated quotient |grade_x_threshold| selects |step_x| such that:
1552*09537850SAkhilesh Sanikop // (step_x * 7) >> kScaleSubPixelBits < single load limit
1553*09537850SAkhilesh Sanikop const int grade_x_threshold =
1554*09537850SAkhilesh Sanikop (kernel_start_ceiling << kScaleSubPixelBits) / 7;
1555*09537850SAkhilesh Sanikop switch (filter_index) {
1556*09537850SAkhilesh Sanikop case 0:
1557*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1558*09537850SAkhilesh Sanikop ConvolveKernelHorizontalSigned6Tap<2>(
1559*09537850SAkhilesh Sanikop src, src_stride, width, subpixel_x, step_x, intermediate_height,
1560*09537850SAkhilesh Sanikop intermediate);
1561*09537850SAkhilesh Sanikop } else {
1562*09537850SAkhilesh Sanikop ConvolveKernelHorizontalSigned6Tap<1>(
1563*09537850SAkhilesh Sanikop src, src_stride, width, subpixel_x, step_x, intermediate_height,
1564*09537850SAkhilesh Sanikop intermediate);
1565*09537850SAkhilesh Sanikop }
1566*09537850SAkhilesh Sanikop break;
1567*09537850SAkhilesh Sanikop case 1:
1568*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1569*09537850SAkhilesh Sanikop ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
1570*09537850SAkhilesh Sanikop step_x, intermediate_height,
1571*09537850SAkhilesh Sanikop intermediate);
1572*09537850SAkhilesh Sanikop
1573*09537850SAkhilesh Sanikop } else {
1574*09537850SAkhilesh Sanikop ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
1575*09537850SAkhilesh Sanikop step_x, intermediate_height,
1576*09537850SAkhilesh Sanikop intermediate);
1577*09537850SAkhilesh Sanikop }
1578*09537850SAkhilesh Sanikop break;
1579*09537850SAkhilesh Sanikop case 2:
1580*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1581*09537850SAkhilesh Sanikop ConvolveKernelHorizontalSigned8Tap<2>(
1582*09537850SAkhilesh Sanikop src, src_stride, width, subpixel_x, step_x, intermediate_height,
1583*09537850SAkhilesh Sanikop intermediate);
1584*09537850SAkhilesh Sanikop } else {
1585*09537850SAkhilesh Sanikop ConvolveKernelHorizontalSigned8Tap<1>(
1586*09537850SAkhilesh Sanikop src, src_stride, width, subpixel_x, step_x, intermediate_height,
1587*09537850SAkhilesh Sanikop intermediate);
1588*09537850SAkhilesh Sanikop }
1589*09537850SAkhilesh Sanikop break;
1590*09537850SAkhilesh Sanikop case 3:
1591*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1592*09537850SAkhilesh Sanikop ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
1593*09537850SAkhilesh Sanikop step_x, intermediate_height,
1594*09537850SAkhilesh Sanikop intermediate);
1595*09537850SAkhilesh Sanikop } else {
1596*09537850SAkhilesh Sanikop ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
1597*09537850SAkhilesh Sanikop step_x, intermediate_height,
1598*09537850SAkhilesh Sanikop intermediate);
1599*09537850SAkhilesh Sanikop }
1600*09537850SAkhilesh Sanikop break;
1601*09537850SAkhilesh Sanikop case 4:
1602*09537850SAkhilesh Sanikop assert(width <= 4);
1603*09537850SAkhilesh Sanikop ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
1604*09537850SAkhilesh Sanikop intermediate_height, intermediate);
1605*09537850SAkhilesh Sanikop break;
1606*09537850SAkhilesh Sanikop default:
1607*09537850SAkhilesh Sanikop assert(filter_index == 5);
1608*09537850SAkhilesh Sanikop ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
1609*09537850SAkhilesh Sanikop intermediate_height, intermediate);
1610*09537850SAkhilesh Sanikop }
1611*09537850SAkhilesh Sanikop // Vertical filter.
1612*09537850SAkhilesh Sanikop filter_index = GetFilterIndex(vertical_filter_index, height);
1613*09537850SAkhilesh Sanikop intermediate = intermediate_result;
1614*09537850SAkhilesh Sanikop
1615*09537850SAkhilesh Sanikop switch (filter_index) {
1616*09537850SAkhilesh Sanikop case 0:
1617*09537850SAkhilesh Sanikop case 1:
1618*09537850SAkhilesh Sanikop if (step_y <= 1024) {
1619*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1620*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
1621*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1622*09537850SAkhilesh Sanikop prediction, pred_stride);
1623*09537850SAkhilesh Sanikop } else if (width == 4) {
1624*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
1625*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1626*09537850SAkhilesh Sanikop prediction, pred_stride);
1627*09537850SAkhilesh Sanikop } else {
1628*09537850SAkhilesh Sanikop ConvolveVerticalScale<6, 1, is_compound>(
1629*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1630*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1631*09537850SAkhilesh Sanikop }
1632*09537850SAkhilesh Sanikop } else {
1633*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1634*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
1635*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1636*09537850SAkhilesh Sanikop prediction, pred_stride);
1637*09537850SAkhilesh Sanikop } else if (width == 4) {
1638*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
1639*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1640*09537850SAkhilesh Sanikop prediction, pred_stride);
1641*09537850SAkhilesh Sanikop } else {
1642*09537850SAkhilesh Sanikop ConvolveVerticalScale<6, 2, is_compound>(
1643*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1644*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1645*09537850SAkhilesh Sanikop }
1646*09537850SAkhilesh Sanikop }
1647*09537850SAkhilesh Sanikop break;
1648*09537850SAkhilesh Sanikop case 2:
1649*09537850SAkhilesh Sanikop if (step_y <= 1024) {
1650*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1651*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
1652*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1653*09537850SAkhilesh Sanikop prediction, pred_stride);
1654*09537850SAkhilesh Sanikop } else if (width == 4) {
1655*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
1656*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1657*09537850SAkhilesh Sanikop prediction, pred_stride);
1658*09537850SAkhilesh Sanikop } else {
1659*09537850SAkhilesh Sanikop ConvolveVerticalScale<8, 1, is_compound>(
1660*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1661*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1662*09537850SAkhilesh Sanikop }
1663*09537850SAkhilesh Sanikop } else {
1664*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1665*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
1666*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1667*09537850SAkhilesh Sanikop prediction, pred_stride);
1668*09537850SAkhilesh Sanikop } else if (width == 4) {
1669*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
1670*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1671*09537850SAkhilesh Sanikop prediction, pred_stride);
1672*09537850SAkhilesh Sanikop } else {
1673*09537850SAkhilesh Sanikop ConvolveVerticalScale<8, 2, is_compound>(
1674*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1675*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1676*09537850SAkhilesh Sanikop }
1677*09537850SAkhilesh Sanikop }
1678*09537850SAkhilesh Sanikop break;
1679*09537850SAkhilesh Sanikop case 3:
1680*09537850SAkhilesh Sanikop if (step_y <= 1024) {
1681*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1682*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
1683*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1684*09537850SAkhilesh Sanikop prediction, pred_stride);
1685*09537850SAkhilesh Sanikop } else if (width == 4) {
1686*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
1687*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1688*09537850SAkhilesh Sanikop prediction, pred_stride);
1689*09537850SAkhilesh Sanikop } else {
1690*09537850SAkhilesh Sanikop ConvolveVerticalScale<2, 1, is_compound>(
1691*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1692*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1693*09537850SAkhilesh Sanikop }
1694*09537850SAkhilesh Sanikop } else {
1695*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1696*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
1697*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1698*09537850SAkhilesh Sanikop prediction, pred_stride);
1699*09537850SAkhilesh Sanikop } else if (width == 4) {
1700*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
1701*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1702*09537850SAkhilesh Sanikop prediction, pred_stride);
1703*09537850SAkhilesh Sanikop } else {
1704*09537850SAkhilesh Sanikop ConvolveVerticalScale<2, 2, is_compound>(
1705*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1706*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1707*09537850SAkhilesh Sanikop }
1708*09537850SAkhilesh Sanikop }
1709*09537850SAkhilesh Sanikop break;
1710*09537850SAkhilesh Sanikop case 4:
1711*09537850SAkhilesh Sanikop default:
1712*09537850SAkhilesh Sanikop assert(filter_index == 4 || filter_index == 5);
1713*09537850SAkhilesh Sanikop assert(height <= 4);
1714*09537850SAkhilesh Sanikop if (step_y <= 1024) {
1715*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1716*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
1717*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1718*09537850SAkhilesh Sanikop prediction, pred_stride);
1719*09537850SAkhilesh Sanikop } else if (width == 4) {
1720*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
1721*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1722*09537850SAkhilesh Sanikop prediction, pred_stride);
1723*09537850SAkhilesh Sanikop } else {
1724*09537850SAkhilesh Sanikop ConvolveVerticalScale<4, 1, is_compound>(
1725*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1726*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1727*09537850SAkhilesh Sanikop }
1728*09537850SAkhilesh Sanikop } else {
1729*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1730*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
1731*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1732*09537850SAkhilesh Sanikop prediction, pred_stride);
1733*09537850SAkhilesh Sanikop } else if (width == 4) {
1734*09537850SAkhilesh Sanikop ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
1735*09537850SAkhilesh Sanikop intermediate, subpixel_y, filter_index, step_y, height,
1736*09537850SAkhilesh Sanikop prediction, pred_stride);
1737*09537850SAkhilesh Sanikop } else {
1738*09537850SAkhilesh Sanikop ConvolveVerticalScale<4, 2, is_compound>(
1739*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1740*09537850SAkhilesh Sanikop filter_index, step_y, height, prediction, pred_stride);
1741*09537850SAkhilesh Sanikop }
1742*09537850SAkhilesh Sanikop }
1743*09537850SAkhilesh Sanikop }
1744*09537850SAkhilesh Sanikop }
1745*09537850SAkhilesh Sanikop
ConvolveHorizontal_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)1746*09537850SAkhilesh Sanikop void ConvolveHorizontal_NEON(
1747*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1748*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
1749*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
1750*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
1751*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
1752*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1753*09537850SAkhilesh Sanikop // Set |src| to the outermost tap.
1754*09537850SAkhilesh Sanikop const auto* const src =
1755*09537850SAkhilesh Sanikop static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1756*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint8_t*>(prediction);
1757*09537850SAkhilesh Sanikop
1758*09537850SAkhilesh Sanikop DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
1759*09537850SAkhilesh Sanikop horizontal_filter_id, filter_index);
1760*09537850SAkhilesh Sanikop }
1761*09537850SAkhilesh Sanikop
1762*09537850SAkhilesh Sanikop // The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
1763*09537850SAkhilesh Sanikop // Vertical calculations.
Compound1DShift(const int16x8_t sum)1764*09537850SAkhilesh Sanikop uint16x8_t Compound1DShift(const int16x8_t sum) {
1765*09537850SAkhilesh Sanikop return vreinterpretq_u16_s16(
1766*09537850SAkhilesh Sanikop vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
1767*09537850SAkhilesh Sanikop }
1768*09537850SAkhilesh Sanikop
1769*09537850SAkhilesh Sanikop template <int filter_index, bool is_compound = false,
1770*09537850SAkhilesh Sanikop bool negative_outside_taps = false>
FilterVertical(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const uint8x8_t * const taps)1771*09537850SAkhilesh Sanikop void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
1772*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
1773*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
1774*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width,
1775*09537850SAkhilesh Sanikop const int height, const uint8x8_t* const taps) {
1776*09537850SAkhilesh Sanikop const int num_taps = GetNumTapsInFilter(filter_index);
1777*09537850SAkhilesh Sanikop const int next_row = num_taps - 1;
1778*09537850SAkhilesh Sanikop auto* const dst8 = static_cast<uint8_t*>(dst);
1779*09537850SAkhilesh Sanikop auto* const dst16 = static_cast<uint16_t*>(dst);
1780*09537850SAkhilesh Sanikop assert(width >= 8);
1781*09537850SAkhilesh Sanikop
1782*09537850SAkhilesh Sanikop int x = 0;
1783*09537850SAkhilesh Sanikop do {
1784*09537850SAkhilesh Sanikop const uint8_t* src_x = src + x;
1785*09537850SAkhilesh Sanikop uint8x8_t srcs[8];
1786*09537850SAkhilesh Sanikop srcs[0] = vld1_u8(src_x);
1787*09537850SAkhilesh Sanikop src_x += src_stride;
1788*09537850SAkhilesh Sanikop if (num_taps >= 4) {
1789*09537850SAkhilesh Sanikop srcs[1] = vld1_u8(src_x);
1790*09537850SAkhilesh Sanikop src_x += src_stride;
1791*09537850SAkhilesh Sanikop srcs[2] = vld1_u8(src_x);
1792*09537850SAkhilesh Sanikop src_x += src_stride;
1793*09537850SAkhilesh Sanikop if (num_taps >= 6) {
1794*09537850SAkhilesh Sanikop srcs[3] = vld1_u8(src_x);
1795*09537850SAkhilesh Sanikop src_x += src_stride;
1796*09537850SAkhilesh Sanikop srcs[4] = vld1_u8(src_x);
1797*09537850SAkhilesh Sanikop src_x += src_stride;
1798*09537850SAkhilesh Sanikop if (num_taps == 8) {
1799*09537850SAkhilesh Sanikop srcs[5] = vld1_u8(src_x);
1800*09537850SAkhilesh Sanikop src_x += src_stride;
1801*09537850SAkhilesh Sanikop srcs[6] = vld1_u8(src_x);
1802*09537850SAkhilesh Sanikop src_x += src_stride;
1803*09537850SAkhilesh Sanikop }
1804*09537850SAkhilesh Sanikop }
1805*09537850SAkhilesh Sanikop }
1806*09537850SAkhilesh Sanikop
1807*09537850SAkhilesh Sanikop // Decreasing the y loop counter produces worse code with clang.
1808*09537850SAkhilesh Sanikop // Don't unroll this loop since it generates too much code and the decoder
1809*09537850SAkhilesh Sanikop // is even slower.
1810*09537850SAkhilesh Sanikop int y = 0;
1811*09537850SAkhilesh Sanikop do {
1812*09537850SAkhilesh Sanikop srcs[next_row] = vld1_u8(src_x);
1813*09537850SAkhilesh Sanikop src_x += src_stride;
1814*09537850SAkhilesh Sanikop
1815*09537850SAkhilesh Sanikop const int16x8_t sums =
1816*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1817*09537850SAkhilesh Sanikop if (is_compound) {
1818*09537850SAkhilesh Sanikop const uint16x8_t results = Compound1DShift(sums);
1819*09537850SAkhilesh Sanikop vst1q_u16(dst16 + x + y * dst_stride, results);
1820*09537850SAkhilesh Sanikop } else {
1821*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1822*09537850SAkhilesh Sanikop vst1_u8(dst8 + x + y * dst_stride, results);
1823*09537850SAkhilesh Sanikop }
1824*09537850SAkhilesh Sanikop
1825*09537850SAkhilesh Sanikop srcs[0] = srcs[1];
1826*09537850SAkhilesh Sanikop if (num_taps >= 4) {
1827*09537850SAkhilesh Sanikop srcs[1] = srcs[2];
1828*09537850SAkhilesh Sanikop srcs[2] = srcs[3];
1829*09537850SAkhilesh Sanikop if (num_taps >= 6) {
1830*09537850SAkhilesh Sanikop srcs[3] = srcs[4];
1831*09537850SAkhilesh Sanikop srcs[4] = srcs[5];
1832*09537850SAkhilesh Sanikop if (num_taps == 8) {
1833*09537850SAkhilesh Sanikop srcs[5] = srcs[6];
1834*09537850SAkhilesh Sanikop srcs[6] = srcs[7];
1835*09537850SAkhilesh Sanikop }
1836*09537850SAkhilesh Sanikop }
1837*09537850SAkhilesh Sanikop }
1838*09537850SAkhilesh Sanikop } while (++y < height);
1839*09537850SAkhilesh Sanikop x += 8;
1840*09537850SAkhilesh Sanikop } while (x < width);
1841*09537850SAkhilesh Sanikop }
1842*09537850SAkhilesh Sanikop
1843*09537850SAkhilesh Sanikop template <int filter_index, bool is_compound = false,
1844*09537850SAkhilesh Sanikop bool negative_outside_taps = false>
FilterVertical4xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int height,const uint8x8_t * const taps)1845*09537850SAkhilesh Sanikop void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src,
1846*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
1847*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
1848*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height,
1849*09537850SAkhilesh Sanikop const uint8x8_t* const taps) {
1850*09537850SAkhilesh Sanikop const int num_taps = GetNumTapsInFilter(filter_index);
1851*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
1852*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
1853*09537850SAkhilesh Sanikop
1854*09537850SAkhilesh Sanikop uint8x8_t srcs[9];
1855*09537850SAkhilesh Sanikop
1856*09537850SAkhilesh Sanikop if (num_taps == 2) {
1857*09537850SAkhilesh Sanikop srcs[2] = vdup_n_u8(0);
1858*09537850SAkhilesh Sanikop
1859*09537850SAkhilesh Sanikop srcs[0] = Load4(src);
1860*09537850SAkhilesh Sanikop src += src_stride;
1861*09537850SAkhilesh Sanikop
1862*09537850SAkhilesh Sanikop int y = height;
1863*09537850SAkhilesh Sanikop do {
1864*09537850SAkhilesh Sanikop srcs[0] = Load4<1>(src, srcs[0]);
1865*09537850SAkhilesh Sanikop src += src_stride;
1866*09537850SAkhilesh Sanikop srcs[2] = Load4<0>(src, srcs[2]);
1867*09537850SAkhilesh Sanikop src += src_stride;
1868*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1869*09537850SAkhilesh Sanikop
1870*09537850SAkhilesh Sanikop const int16x8_t sums =
1871*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1872*09537850SAkhilesh Sanikop if (is_compound) {
1873*09537850SAkhilesh Sanikop const uint16x8_t results = Compound1DShift(sums);
1874*09537850SAkhilesh Sanikop
1875*09537850SAkhilesh Sanikop vst1q_u16(dst16, results);
1876*09537850SAkhilesh Sanikop dst16 += 4 << 1;
1877*09537850SAkhilesh Sanikop } else {
1878*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1879*09537850SAkhilesh Sanikop
1880*09537850SAkhilesh Sanikop StoreLo4(dst8, results);
1881*09537850SAkhilesh Sanikop dst8 += dst_stride;
1882*09537850SAkhilesh Sanikop StoreHi4(dst8, results);
1883*09537850SAkhilesh Sanikop dst8 += dst_stride;
1884*09537850SAkhilesh Sanikop }
1885*09537850SAkhilesh Sanikop
1886*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
1887*09537850SAkhilesh Sanikop y -= 2;
1888*09537850SAkhilesh Sanikop } while (y != 0);
1889*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
1890*09537850SAkhilesh Sanikop srcs[4] = vdup_n_u8(0);
1891*09537850SAkhilesh Sanikop
1892*09537850SAkhilesh Sanikop srcs[0] = Load4(src);
1893*09537850SAkhilesh Sanikop src += src_stride;
1894*09537850SAkhilesh Sanikop srcs[0] = Load4<1>(src, srcs[0]);
1895*09537850SAkhilesh Sanikop src += src_stride;
1896*09537850SAkhilesh Sanikop srcs[2] = Load4(src);
1897*09537850SAkhilesh Sanikop src += src_stride;
1898*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1899*09537850SAkhilesh Sanikop
1900*09537850SAkhilesh Sanikop int y = height;
1901*09537850SAkhilesh Sanikop do {
1902*09537850SAkhilesh Sanikop srcs[2] = Load4<1>(src, srcs[2]);
1903*09537850SAkhilesh Sanikop src += src_stride;
1904*09537850SAkhilesh Sanikop srcs[4] = Load4<0>(src, srcs[4]);
1905*09537850SAkhilesh Sanikop src += src_stride;
1906*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1907*09537850SAkhilesh Sanikop
1908*09537850SAkhilesh Sanikop const int16x8_t sums =
1909*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1910*09537850SAkhilesh Sanikop if (is_compound) {
1911*09537850SAkhilesh Sanikop const uint16x8_t results = Compound1DShift(sums);
1912*09537850SAkhilesh Sanikop
1913*09537850SAkhilesh Sanikop vst1q_u16(dst16, results);
1914*09537850SAkhilesh Sanikop dst16 += 4 << 1;
1915*09537850SAkhilesh Sanikop } else {
1916*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1917*09537850SAkhilesh Sanikop
1918*09537850SAkhilesh Sanikop StoreLo4(dst8, results);
1919*09537850SAkhilesh Sanikop dst8 += dst_stride;
1920*09537850SAkhilesh Sanikop StoreHi4(dst8, results);
1921*09537850SAkhilesh Sanikop dst8 += dst_stride;
1922*09537850SAkhilesh Sanikop }
1923*09537850SAkhilesh Sanikop
1924*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
1925*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
1926*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
1927*09537850SAkhilesh Sanikop y -= 2;
1928*09537850SAkhilesh Sanikop } while (y != 0);
1929*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
1930*09537850SAkhilesh Sanikop srcs[6] = vdup_n_u8(0);
1931*09537850SAkhilesh Sanikop
1932*09537850SAkhilesh Sanikop srcs[0] = Load4(src);
1933*09537850SAkhilesh Sanikop src += src_stride;
1934*09537850SAkhilesh Sanikop srcs[0] = Load4<1>(src, srcs[0]);
1935*09537850SAkhilesh Sanikop src += src_stride;
1936*09537850SAkhilesh Sanikop srcs[2] = Load4(src);
1937*09537850SAkhilesh Sanikop src += src_stride;
1938*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1939*09537850SAkhilesh Sanikop srcs[2] = Load4<1>(src, srcs[2]);
1940*09537850SAkhilesh Sanikop src += src_stride;
1941*09537850SAkhilesh Sanikop srcs[4] = Load4(src);
1942*09537850SAkhilesh Sanikop src += src_stride;
1943*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1944*09537850SAkhilesh Sanikop
1945*09537850SAkhilesh Sanikop int y = height;
1946*09537850SAkhilesh Sanikop do {
1947*09537850SAkhilesh Sanikop srcs[4] = Load4<1>(src, srcs[4]);
1948*09537850SAkhilesh Sanikop src += src_stride;
1949*09537850SAkhilesh Sanikop srcs[6] = Load4<0>(src, srcs[6]);
1950*09537850SAkhilesh Sanikop src += src_stride;
1951*09537850SAkhilesh Sanikop srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1952*09537850SAkhilesh Sanikop
1953*09537850SAkhilesh Sanikop const int16x8_t sums =
1954*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1955*09537850SAkhilesh Sanikop if (is_compound) {
1956*09537850SAkhilesh Sanikop const uint16x8_t results = Compound1DShift(sums);
1957*09537850SAkhilesh Sanikop
1958*09537850SAkhilesh Sanikop vst1q_u16(dst16, results);
1959*09537850SAkhilesh Sanikop dst16 += 4 << 1;
1960*09537850SAkhilesh Sanikop } else {
1961*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1962*09537850SAkhilesh Sanikop
1963*09537850SAkhilesh Sanikop StoreLo4(dst8, results);
1964*09537850SAkhilesh Sanikop dst8 += dst_stride;
1965*09537850SAkhilesh Sanikop StoreHi4(dst8, results);
1966*09537850SAkhilesh Sanikop dst8 += dst_stride;
1967*09537850SAkhilesh Sanikop }
1968*09537850SAkhilesh Sanikop
1969*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
1970*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
1971*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
1972*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
1973*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
1974*09537850SAkhilesh Sanikop y -= 2;
1975*09537850SAkhilesh Sanikop } while (y != 0);
1976*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
1977*09537850SAkhilesh Sanikop srcs[8] = vdup_n_u8(0);
1978*09537850SAkhilesh Sanikop
1979*09537850SAkhilesh Sanikop srcs[0] = Load4(src);
1980*09537850SAkhilesh Sanikop src += src_stride;
1981*09537850SAkhilesh Sanikop srcs[0] = Load4<1>(src, srcs[0]);
1982*09537850SAkhilesh Sanikop src += src_stride;
1983*09537850SAkhilesh Sanikop srcs[2] = Load4(src);
1984*09537850SAkhilesh Sanikop src += src_stride;
1985*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1986*09537850SAkhilesh Sanikop srcs[2] = Load4<1>(src, srcs[2]);
1987*09537850SAkhilesh Sanikop src += src_stride;
1988*09537850SAkhilesh Sanikop srcs[4] = Load4(src);
1989*09537850SAkhilesh Sanikop src += src_stride;
1990*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1991*09537850SAkhilesh Sanikop srcs[4] = Load4<1>(src, srcs[4]);
1992*09537850SAkhilesh Sanikop src += src_stride;
1993*09537850SAkhilesh Sanikop srcs[6] = Load4(src);
1994*09537850SAkhilesh Sanikop src += src_stride;
1995*09537850SAkhilesh Sanikop srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1996*09537850SAkhilesh Sanikop
1997*09537850SAkhilesh Sanikop int y = height;
1998*09537850SAkhilesh Sanikop do {
1999*09537850SAkhilesh Sanikop srcs[6] = Load4<1>(src, srcs[6]);
2000*09537850SAkhilesh Sanikop src += src_stride;
2001*09537850SAkhilesh Sanikop srcs[8] = Load4<0>(src, srcs[8]);
2002*09537850SAkhilesh Sanikop src += src_stride;
2003*09537850SAkhilesh Sanikop srcs[7] = vext_u8(srcs[6], srcs[8], 4);
2004*09537850SAkhilesh Sanikop
2005*09537850SAkhilesh Sanikop const int16x8_t sums =
2006*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2007*09537850SAkhilesh Sanikop if (is_compound) {
2008*09537850SAkhilesh Sanikop const uint16x8_t results = Compound1DShift(sums);
2009*09537850SAkhilesh Sanikop
2010*09537850SAkhilesh Sanikop vst1q_u16(dst16, results);
2011*09537850SAkhilesh Sanikop dst16 += 4 << 1;
2012*09537850SAkhilesh Sanikop } else {
2013*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2014*09537850SAkhilesh Sanikop
2015*09537850SAkhilesh Sanikop StoreLo4(dst8, results);
2016*09537850SAkhilesh Sanikop dst8 += dst_stride;
2017*09537850SAkhilesh Sanikop StoreHi4(dst8, results);
2018*09537850SAkhilesh Sanikop dst8 += dst_stride;
2019*09537850SAkhilesh Sanikop }
2020*09537850SAkhilesh Sanikop
2021*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
2022*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
2023*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
2024*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
2025*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
2026*09537850SAkhilesh Sanikop srcs[5] = srcs[7];
2027*09537850SAkhilesh Sanikop srcs[6] = srcs[8];
2028*09537850SAkhilesh Sanikop y -= 2;
2029*09537850SAkhilesh Sanikop } while (y != 0);
2030*09537850SAkhilesh Sanikop }
2031*09537850SAkhilesh Sanikop }
2032*09537850SAkhilesh Sanikop
2033*09537850SAkhilesh Sanikop template <int filter_index, bool negative_outside_taps = false>
FilterVertical2xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int height,const uint8x8_t * const taps)2034*09537850SAkhilesh Sanikop void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src,
2035*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
2036*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
2037*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height,
2038*09537850SAkhilesh Sanikop const uint8x8_t* const taps) {
2039*09537850SAkhilesh Sanikop const int num_taps = GetNumTapsInFilter(filter_index);
2040*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
2041*09537850SAkhilesh Sanikop
2042*09537850SAkhilesh Sanikop uint8x8_t srcs[9];
2043*09537850SAkhilesh Sanikop
2044*09537850SAkhilesh Sanikop if (num_taps == 2) {
2045*09537850SAkhilesh Sanikop srcs[2] = vdup_n_u8(0);
2046*09537850SAkhilesh Sanikop
2047*09537850SAkhilesh Sanikop srcs[0] = Load2(src);
2048*09537850SAkhilesh Sanikop src += src_stride;
2049*09537850SAkhilesh Sanikop
2050*09537850SAkhilesh Sanikop int y = 0;
2051*09537850SAkhilesh Sanikop do {
2052*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]);
2053*09537850SAkhilesh Sanikop src += src_stride;
2054*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]);
2055*09537850SAkhilesh Sanikop src += src_stride;
2056*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]);
2057*09537850SAkhilesh Sanikop src += src_stride;
2058*09537850SAkhilesh Sanikop srcs[2] = Load2<0>(src, srcs[2]);
2059*09537850SAkhilesh Sanikop src += src_stride;
2060*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[2], 2);
2061*09537850SAkhilesh Sanikop
2062*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[1].
2063*09537850SAkhilesh Sanikop const int16x8_t sums =
2064*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2065*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2066*09537850SAkhilesh Sanikop
2067*09537850SAkhilesh Sanikop Store2<0>(dst8, results);
2068*09537850SAkhilesh Sanikop dst8 += dst_stride;
2069*09537850SAkhilesh Sanikop Store2<1>(dst8, results);
2070*09537850SAkhilesh Sanikop if (height == 2) return;
2071*09537850SAkhilesh Sanikop dst8 += dst_stride;
2072*09537850SAkhilesh Sanikop Store2<2>(dst8, results);
2073*09537850SAkhilesh Sanikop dst8 += dst_stride;
2074*09537850SAkhilesh Sanikop Store2<3>(dst8, results);
2075*09537850SAkhilesh Sanikop dst8 += dst_stride;
2076*09537850SAkhilesh Sanikop
2077*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
2078*09537850SAkhilesh Sanikop y += 4;
2079*09537850SAkhilesh Sanikop } while (y < height);
2080*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
2081*09537850SAkhilesh Sanikop srcs[4] = vdup_n_u8(0);
2082*09537850SAkhilesh Sanikop
2083*09537850SAkhilesh Sanikop srcs[0] = Load2(src);
2084*09537850SAkhilesh Sanikop src += src_stride;
2085*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]);
2086*09537850SAkhilesh Sanikop src += src_stride;
2087*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]);
2088*09537850SAkhilesh Sanikop src += src_stride;
2089*09537850SAkhilesh Sanikop
2090*09537850SAkhilesh Sanikop int y = 0;
2091*09537850SAkhilesh Sanikop do {
2092*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]);
2093*09537850SAkhilesh Sanikop src += src_stride;
2094*09537850SAkhilesh Sanikop srcs[4] = Load2<0>(src, srcs[4]);
2095*09537850SAkhilesh Sanikop src += src_stride;
2096*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2097*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]);
2098*09537850SAkhilesh Sanikop src += src_stride;
2099*09537850SAkhilesh Sanikop srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2100*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]);
2101*09537850SAkhilesh Sanikop src += src_stride;
2102*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2103*09537850SAkhilesh Sanikop
2104*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[3].
2105*09537850SAkhilesh Sanikop const int16x8_t sums =
2106*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2107*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2108*09537850SAkhilesh Sanikop
2109*09537850SAkhilesh Sanikop Store2<0>(dst8, results);
2110*09537850SAkhilesh Sanikop dst8 += dst_stride;
2111*09537850SAkhilesh Sanikop Store2<1>(dst8, results);
2112*09537850SAkhilesh Sanikop if (height == 2) return;
2113*09537850SAkhilesh Sanikop dst8 += dst_stride;
2114*09537850SAkhilesh Sanikop Store2<2>(dst8, results);
2115*09537850SAkhilesh Sanikop dst8 += dst_stride;
2116*09537850SAkhilesh Sanikop Store2<3>(dst8, results);
2117*09537850SAkhilesh Sanikop dst8 += dst_stride;
2118*09537850SAkhilesh Sanikop
2119*09537850SAkhilesh Sanikop srcs[0] = srcs[4];
2120*09537850SAkhilesh Sanikop y += 4;
2121*09537850SAkhilesh Sanikop } while (y < height);
2122*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
2123*09537850SAkhilesh Sanikop // During the vertical pass the number of taps is restricted when
2124*09537850SAkhilesh Sanikop // |height| <= 4.
2125*09537850SAkhilesh Sanikop assert(height > 4);
2126*09537850SAkhilesh Sanikop srcs[8] = vdup_n_u8(0);
2127*09537850SAkhilesh Sanikop
2128*09537850SAkhilesh Sanikop srcs[0] = Load2(src);
2129*09537850SAkhilesh Sanikop src += src_stride;
2130*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]);
2131*09537850SAkhilesh Sanikop src += src_stride;
2132*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]);
2133*09537850SAkhilesh Sanikop src += src_stride;
2134*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]);
2135*09537850SAkhilesh Sanikop src += src_stride;
2136*09537850SAkhilesh Sanikop srcs[4] = Load2(src);
2137*09537850SAkhilesh Sanikop src += src_stride;
2138*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2139*09537850SAkhilesh Sanikop
2140*09537850SAkhilesh Sanikop int y = 0;
2141*09537850SAkhilesh Sanikop do {
2142*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]);
2143*09537850SAkhilesh Sanikop src += src_stride;
2144*09537850SAkhilesh Sanikop srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2145*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]);
2146*09537850SAkhilesh Sanikop src += src_stride;
2147*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2148*09537850SAkhilesh Sanikop srcs[4] = Load2<3>(src, srcs[4]);
2149*09537850SAkhilesh Sanikop src += src_stride;
2150*09537850SAkhilesh Sanikop srcs[8] = Load2<0>(src, srcs[8]);
2151*09537850SAkhilesh Sanikop src += src_stride;
2152*09537850SAkhilesh Sanikop srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2153*09537850SAkhilesh Sanikop
2154*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[5].
2155*09537850SAkhilesh Sanikop const int16x8_t sums =
2156*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2157*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2158*09537850SAkhilesh Sanikop
2159*09537850SAkhilesh Sanikop Store2<0>(dst8, results);
2160*09537850SAkhilesh Sanikop dst8 += dst_stride;
2161*09537850SAkhilesh Sanikop Store2<1>(dst8, results);
2162*09537850SAkhilesh Sanikop dst8 += dst_stride;
2163*09537850SAkhilesh Sanikop Store2<2>(dst8, results);
2164*09537850SAkhilesh Sanikop dst8 += dst_stride;
2165*09537850SAkhilesh Sanikop Store2<3>(dst8, results);
2166*09537850SAkhilesh Sanikop dst8 += dst_stride;
2167*09537850SAkhilesh Sanikop
2168*09537850SAkhilesh Sanikop srcs[0] = srcs[4];
2169*09537850SAkhilesh Sanikop srcs[1] = srcs[5];
2170*09537850SAkhilesh Sanikop srcs[4] = srcs[8];
2171*09537850SAkhilesh Sanikop y += 4;
2172*09537850SAkhilesh Sanikop } while (y < height);
2173*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
2174*09537850SAkhilesh Sanikop // During the vertical pass the number of taps is restricted when
2175*09537850SAkhilesh Sanikop // |height| <= 4.
2176*09537850SAkhilesh Sanikop assert(height > 4);
2177*09537850SAkhilesh Sanikop srcs[8] = vdup_n_u8(0);
2178*09537850SAkhilesh Sanikop
2179*09537850SAkhilesh Sanikop srcs[0] = Load2(src);
2180*09537850SAkhilesh Sanikop src += src_stride;
2181*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]);
2182*09537850SAkhilesh Sanikop src += src_stride;
2183*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]);
2184*09537850SAkhilesh Sanikop src += src_stride;
2185*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]);
2186*09537850SAkhilesh Sanikop src += src_stride;
2187*09537850SAkhilesh Sanikop srcs[4] = Load2(src);
2188*09537850SAkhilesh Sanikop src += src_stride;
2189*09537850SAkhilesh Sanikop srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2190*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]);
2191*09537850SAkhilesh Sanikop src += src_stride;
2192*09537850SAkhilesh Sanikop srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2193*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]);
2194*09537850SAkhilesh Sanikop src += src_stride;
2195*09537850SAkhilesh Sanikop srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2196*09537850SAkhilesh Sanikop
2197*09537850SAkhilesh Sanikop int y = 0;
2198*09537850SAkhilesh Sanikop do {
2199*09537850SAkhilesh Sanikop srcs[4] = Load2<3>(src, srcs[4]);
2200*09537850SAkhilesh Sanikop src += src_stride;
2201*09537850SAkhilesh Sanikop srcs[8] = Load2<0>(src, srcs[8]);
2202*09537850SAkhilesh Sanikop src += src_stride;
2203*09537850SAkhilesh Sanikop srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2204*09537850SAkhilesh Sanikop srcs[8] = Load2<1>(src, srcs[8]);
2205*09537850SAkhilesh Sanikop src += src_stride;
2206*09537850SAkhilesh Sanikop srcs[6] = vext_u8(srcs[4], srcs[8], 4);
2207*09537850SAkhilesh Sanikop srcs[8] = Load2<2>(src, srcs[8]);
2208*09537850SAkhilesh Sanikop src += src_stride;
2209*09537850SAkhilesh Sanikop srcs[7] = vext_u8(srcs[4], srcs[8], 6);
2210*09537850SAkhilesh Sanikop
2211*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[7].
2212*09537850SAkhilesh Sanikop const int16x8_t sums =
2213*09537850SAkhilesh Sanikop SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2214*09537850SAkhilesh Sanikop const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2215*09537850SAkhilesh Sanikop
2216*09537850SAkhilesh Sanikop Store2<0>(dst8, results);
2217*09537850SAkhilesh Sanikop dst8 += dst_stride;
2218*09537850SAkhilesh Sanikop Store2<1>(dst8, results);
2219*09537850SAkhilesh Sanikop dst8 += dst_stride;
2220*09537850SAkhilesh Sanikop Store2<2>(dst8, results);
2221*09537850SAkhilesh Sanikop dst8 += dst_stride;
2222*09537850SAkhilesh Sanikop Store2<3>(dst8, results);
2223*09537850SAkhilesh Sanikop dst8 += dst_stride;
2224*09537850SAkhilesh Sanikop
2225*09537850SAkhilesh Sanikop srcs[0] = srcs[4];
2226*09537850SAkhilesh Sanikop srcs[1] = srcs[5];
2227*09537850SAkhilesh Sanikop srcs[2] = srcs[6];
2228*09537850SAkhilesh Sanikop srcs[3] = srcs[7];
2229*09537850SAkhilesh Sanikop srcs[4] = srcs[8];
2230*09537850SAkhilesh Sanikop y += 4;
2231*09537850SAkhilesh Sanikop } while (y < height);
2232*09537850SAkhilesh Sanikop }
2233*09537850SAkhilesh Sanikop }
2234*09537850SAkhilesh Sanikop
2235*09537850SAkhilesh Sanikop // This function is a simplified version of Convolve2D_C.
2236*09537850SAkhilesh Sanikop // It is called when it is single prediction mode, where only vertical
2237*09537850SAkhilesh Sanikop // filtering is required.
2238*09537850SAkhilesh Sanikop // The output is the single prediction of the block, clipped to valid pixel
2239*09537850SAkhilesh Sanikop // range.
ConvolveVertical_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)2240*09537850SAkhilesh Sanikop void ConvolveVertical_NEON(
2241*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2242*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
2243*09537850SAkhilesh Sanikop const int vertical_filter_index, const int /*horizontal_filter_id*/,
2244*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
2245*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
2246*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
2247*09537850SAkhilesh Sanikop const int vertical_taps = GetNumTapsInFilter(filter_index);
2248*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
2249*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
2250*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
2251*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint8_t*>(prediction);
2252*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
2253*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
2254*09537850SAkhilesh Sanikop
2255*09537850SAkhilesh Sanikop uint8x8_t taps[8];
2256*09537850SAkhilesh Sanikop for (int k = 0; k < kSubPixelTaps; ++k) {
2257*09537850SAkhilesh Sanikop taps[k] =
2258*09537850SAkhilesh Sanikop vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
2259*09537850SAkhilesh Sanikop }
2260*09537850SAkhilesh Sanikop
2261*09537850SAkhilesh Sanikop if (filter_index == 0) { // 6 tap.
2262*09537850SAkhilesh Sanikop if (width == 2) {
2263*09537850SAkhilesh Sanikop FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
2264*09537850SAkhilesh Sanikop taps + 1);
2265*09537850SAkhilesh Sanikop } else if (width == 4) {
2266*09537850SAkhilesh Sanikop FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
2267*09537850SAkhilesh Sanikop taps + 1);
2268*09537850SAkhilesh Sanikop } else {
2269*09537850SAkhilesh Sanikop FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
2270*09537850SAkhilesh Sanikop taps + 1);
2271*09537850SAkhilesh Sanikop }
2272*09537850SAkhilesh Sanikop } else if ((static_cast<int>(filter_index == 1) &
2273*09537850SAkhilesh Sanikop (static_cast<int>(vertical_filter_id == 1) |
2274*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap.
2275*09537850SAkhilesh Sanikop if (width == 2) {
2276*09537850SAkhilesh Sanikop FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
2277*09537850SAkhilesh Sanikop taps + 1);
2278*09537850SAkhilesh Sanikop } else if (width == 4) {
2279*09537850SAkhilesh Sanikop FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
2280*09537850SAkhilesh Sanikop taps + 1);
2281*09537850SAkhilesh Sanikop } else {
2282*09537850SAkhilesh Sanikop FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
2283*09537850SAkhilesh Sanikop taps + 1);
2284*09537850SAkhilesh Sanikop }
2285*09537850SAkhilesh Sanikop } else if ((static_cast<int>(filter_index == 1) &
2286*09537850SAkhilesh Sanikop (static_cast<int>(vertical_filter_id == 7) |
2287*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 8) |
2288*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 9))) !=
2289*09537850SAkhilesh Sanikop 0) { // 6 tap with weird negative taps.
2290*09537850SAkhilesh Sanikop if (width == 2) {
2291*09537850SAkhilesh Sanikop FilterVertical2xH<1,
2292*09537850SAkhilesh Sanikop /*negative_outside_taps=*/true>(
2293*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, height, taps + 1);
2294*09537850SAkhilesh Sanikop } else if (width == 4) {
2295*09537850SAkhilesh Sanikop FilterVertical4xH<1, /*is_compound=*/false,
2296*09537850SAkhilesh Sanikop /*negative_outside_taps=*/true>(
2297*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, height, taps + 1);
2298*09537850SAkhilesh Sanikop } else {
2299*09537850SAkhilesh Sanikop FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
2300*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps + 1);
2301*09537850SAkhilesh Sanikop }
2302*09537850SAkhilesh Sanikop } else if (filter_index == 2) { // 8 tap.
2303*09537850SAkhilesh Sanikop if (width == 2) {
2304*09537850SAkhilesh Sanikop FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
2305*09537850SAkhilesh Sanikop } else if (width == 4) {
2306*09537850SAkhilesh Sanikop FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
2307*09537850SAkhilesh Sanikop } else {
2308*09537850SAkhilesh Sanikop FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
2309*09537850SAkhilesh Sanikop taps);
2310*09537850SAkhilesh Sanikop }
2311*09537850SAkhilesh Sanikop } else if (filter_index == 3) { // 2 tap.
2312*09537850SAkhilesh Sanikop if (width == 2) {
2313*09537850SAkhilesh Sanikop FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
2314*09537850SAkhilesh Sanikop taps + 3);
2315*09537850SAkhilesh Sanikop } else if (width == 4) {
2316*09537850SAkhilesh Sanikop FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
2317*09537850SAkhilesh Sanikop taps + 3);
2318*09537850SAkhilesh Sanikop } else {
2319*09537850SAkhilesh Sanikop FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
2320*09537850SAkhilesh Sanikop taps + 3);
2321*09537850SAkhilesh Sanikop }
2322*09537850SAkhilesh Sanikop } else if (filter_index == 4) { // 4 tap.
2323*09537850SAkhilesh Sanikop // Outside taps are negative.
2324*09537850SAkhilesh Sanikop if (width == 2) {
2325*09537850SAkhilesh Sanikop FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
2326*09537850SAkhilesh Sanikop taps + 2);
2327*09537850SAkhilesh Sanikop } else if (width == 4) {
2328*09537850SAkhilesh Sanikop FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
2329*09537850SAkhilesh Sanikop taps + 2);
2330*09537850SAkhilesh Sanikop } else {
2331*09537850SAkhilesh Sanikop FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
2332*09537850SAkhilesh Sanikop taps + 2);
2333*09537850SAkhilesh Sanikop }
2334*09537850SAkhilesh Sanikop } else {
2335*09537850SAkhilesh Sanikop // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
2336*09537850SAkhilesh Sanikop // below map to 4 tap filters.
2337*09537850SAkhilesh Sanikop assert(filter_index == 5 ||
2338*09537850SAkhilesh Sanikop (filter_index == 1 &&
2339*09537850SAkhilesh Sanikop (vertical_filter_id == 2 || vertical_filter_id == 3 ||
2340*09537850SAkhilesh Sanikop vertical_filter_id == 4 || vertical_filter_id == 5 ||
2341*09537850SAkhilesh Sanikop vertical_filter_id == 6 || vertical_filter_id == 10 ||
2342*09537850SAkhilesh Sanikop vertical_filter_id == 11 || vertical_filter_id == 12 ||
2343*09537850SAkhilesh Sanikop vertical_filter_id == 13 || vertical_filter_id == 14)));
2344*09537850SAkhilesh Sanikop // According to GetNumTapsInFilter() this has 6 taps but here we are
2345*09537850SAkhilesh Sanikop // treating it as though it has 4.
2346*09537850SAkhilesh Sanikop if (filter_index == 1) src += src_stride;
2347*09537850SAkhilesh Sanikop if (width == 2) {
2348*09537850SAkhilesh Sanikop FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
2349*09537850SAkhilesh Sanikop taps + 2);
2350*09537850SAkhilesh Sanikop } else if (width == 4) {
2351*09537850SAkhilesh Sanikop FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
2352*09537850SAkhilesh Sanikop taps + 2);
2353*09537850SAkhilesh Sanikop } else {
2354*09537850SAkhilesh Sanikop FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
2355*09537850SAkhilesh Sanikop taps + 2);
2356*09537850SAkhilesh Sanikop }
2357*09537850SAkhilesh Sanikop }
2358*09537850SAkhilesh Sanikop }
2359*09537850SAkhilesh Sanikop
ConvolveCompoundCopy_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t)2360*09537850SAkhilesh Sanikop void ConvolveCompoundCopy_NEON(
2361*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2362*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
2363*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
2364*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
2365*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
2366*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
2367*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
2368*09537850SAkhilesh Sanikop auto* dest = static_cast<uint16_t*>(prediction);
2369*09537850SAkhilesh Sanikop constexpr int final_shift =
2370*09537850SAkhilesh Sanikop kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
2371*09537850SAkhilesh Sanikop
2372*09537850SAkhilesh Sanikop if (width >= 16) {
2373*09537850SAkhilesh Sanikop int y = height;
2374*09537850SAkhilesh Sanikop do {
2375*09537850SAkhilesh Sanikop int x = 0;
2376*09537850SAkhilesh Sanikop do {
2377*09537850SAkhilesh Sanikop const uint8x16_t v_src = vld1q_u8(&src[x]);
2378*09537850SAkhilesh Sanikop const uint16x8_t v_dest_lo =
2379*09537850SAkhilesh Sanikop vshll_n_u8(vget_low_u8(v_src), final_shift);
2380*09537850SAkhilesh Sanikop const uint16x8_t v_dest_hi =
2381*09537850SAkhilesh Sanikop vshll_n_u8(vget_high_u8(v_src), final_shift);
2382*09537850SAkhilesh Sanikop vst1q_u16(&dest[x], v_dest_lo);
2383*09537850SAkhilesh Sanikop x += 8;
2384*09537850SAkhilesh Sanikop vst1q_u16(&dest[x], v_dest_hi);
2385*09537850SAkhilesh Sanikop x += 8;
2386*09537850SAkhilesh Sanikop } while (x < width);
2387*09537850SAkhilesh Sanikop src += src_stride;
2388*09537850SAkhilesh Sanikop dest += width;
2389*09537850SAkhilesh Sanikop } while (--y != 0);
2390*09537850SAkhilesh Sanikop } else if (width == 8) {
2391*09537850SAkhilesh Sanikop int y = height;
2392*09537850SAkhilesh Sanikop do {
2393*09537850SAkhilesh Sanikop const uint8x8_t v_src = vld1_u8(&src[0]);
2394*09537850SAkhilesh Sanikop const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2395*09537850SAkhilesh Sanikop vst1q_u16(&dest[0], v_dest);
2396*09537850SAkhilesh Sanikop src += src_stride;
2397*09537850SAkhilesh Sanikop dest += width;
2398*09537850SAkhilesh Sanikop } while (--y != 0);
2399*09537850SAkhilesh Sanikop } else { // width == 4
2400*09537850SAkhilesh Sanikop uint8x8_t v_src = vdup_n_u8(0);
2401*09537850SAkhilesh Sanikop
2402*09537850SAkhilesh Sanikop int y = height;
2403*09537850SAkhilesh Sanikop do {
2404*09537850SAkhilesh Sanikop v_src = Load4<0>(&src[0], v_src);
2405*09537850SAkhilesh Sanikop src += src_stride;
2406*09537850SAkhilesh Sanikop v_src = Load4<1>(&src[0], v_src);
2407*09537850SAkhilesh Sanikop src += src_stride;
2408*09537850SAkhilesh Sanikop const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2409*09537850SAkhilesh Sanikop vst1q_u16(&dest[0], v_dest);
2410*09537850SAkhilesh Sanikop dest += 4 << 1;
2411*09537850SAkhilesh Sanikop y -= 2;
2412*09537850SAkhilesh Sanikop } while (y != 0);
2413*09537850SAkhilesh Sanikop }
2414*09537850SAkhilesh Sanikop }
2415*09537850SAkhilesh Sanikop
ConvolveCompoundVertical_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t)2416*09537850SAkhilesh Sanikop void ConvolveCompoundVertical_NEON(
2417*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2418*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
2419*09537850SAkhilesh Sanikop const int vertical_filter_index, const int /*horizontal_filter_id*/,
2420*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
2421*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
2422*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
2423*09537850SAkhilesh Sanikop const int vertical_taps = GetNumTapsInFilter(filter_index);
2424*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
2425*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
2426*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
2427*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint16_t*>(prediction);
2428*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
2429*09537850SAkhilesh Sanikop
2430*09537850SAkhilesh Sanikop uint8x8_t taps[8];
2431*09537850SAkhilesh Sanikop for (int k = 0; k < kSubPixelTaps; ++k) {
2432*09537850SAkhilesh Sanikop taps[k] =
2433*09537850SAkhilesh Sanikop vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
2434*09537850SAkhilesh Sanikop }
2435*09537850SAkhilesh Sanikop
2436*09537850SAkhilesh Sanikop if (filter_index == 0) { // 6 tap.
2437*09537850SAkhilesh Sanikop if (width == 4) {
2438*09537850SAkhilesh Sanikop FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
2439*09537850SAkhilesh Sanikop height, taps + 1);
2440*09537850SAkhilesh Sanikop } else {
2441*09537850SAkhilesh Sanikop FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
2442*09537850SAkhilesh Sanikop width, height, taps + 1);
2443*09537850SAkhilesh Sanikop }
2444*09537850SAkhilesh Sanikop } else if ((static_cast<int>(filter_index == 1) &
2445*09537850SAkhilesh Sanikop (static_cast<int>(vertical_filter_id == 1) |
2446*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap.
2447*09537850SAkhilesh Sanikop if (width == 4) {
2448*09537850SAkhilesh Sanikop FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
2449*09537850SAkhilesh Sanikop height, taps + 1);
2450*09537850SAkhilesh Sanikop } else {
2451*09537850SAkhilesh Sanikop FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
2452*09537850SAkhilesh Sanikop width, height, taps + 1);
2453*09537850SAkhilesh Sanikop }
2454*09537850SAkhilesh Sanikop } else if ((static_cast<int>(filter_index == 1) &
2455*09537850SAkhilesh Sanikop (static_cast<int>(vertical_filter_id == 7) |
2456*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 8) |
2457*09537850SAkhilesh Sanikop static_cast<int>(vertical_filter_id == 9))) !=
2458*09537850SAkhilesh Sanikop 0) { // 6 tap with weird negative taps.
2459*09537850SAkhilesh Sanikop if (width == 4) {
2460*09537850SAkhilesh Sanikop FilterVertical4xH<1, /*is_compound=*/true,
2461*09537850SAkhilesh Sanikop /*negative_outside_taps=*/true>(src, src_stride, dest,
2462*09537850SAkhilesh Sanikop 4, height, taps + 1);
2463*09537850SAkhilesh Sanikop } else {
2464*09537850SAkhilesh Sanikop FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
2465*09537850SAkhilesh Sanikop src, src_stride, dest, width, width, height, taps + 1);
2466*09537850SAkhilesh Sanikop }
2467*09537850SAkhilesh Sanikop } else if (filter_index == 2) { // 8 tap.
2468*09537850SAkhilesh Sanikop if (width == 4) {
2469*09537850SAkhilesh Sanikop FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
2470*09537850SAkhilesh Sanikop height, taps);
2471*09537850SAkhilesh Sanikop } else {
2472*09537850SAkhilesh Sanikop FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
2473*09537850SAkhilesh Sanikop width, height, taps);
2474*09537850SAkhilesh Sanikop }
2475*09537850SAkhilesh Sanikop } else if (filter_index == 3) { // 2 tap.
2476*09537850SAkhilesh Sanikop if (width == 4) {
2477*09537850SAkhilesh Sanikop FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
2478*09537850SAkhilesh Sanikop height, taps + 3);
2479*09537850SAkhilesh Sanikop } else {
2480*09537850SAkhilesh Sanikop FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
2481*09537850SAkhilesh Sanikop width, height, taps + 3);
2482*09537850SAkhilesh Sanikop }
2483*09537850SAkhilesh Sanikop } else if (filter_index == 4) { // 4 tap.
2484*09537850SAkhilesh Sanikop if (width == 4) {
2485*09537850SAkhilesh Sanikop FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
2486*09537850SAkhilesh Sanikop height, taps + 2);
2487*09537850SAkhilesh Sanikop } else {
2488*09537850SAkhilesh Sanikop FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
2489*09537850SAkhilesh Sanikop width, height, taps + 2);
2490*09537850SAkhilesh Sanikop }
2491*09537850SAkhilesh Sanikop } else {
2492*09537850SAkhilesh Sanikop // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
2493*09537850SAkhilesh Sanikop // to 4 tap filters.
2494*09537850SAkhilesh Sanikop assert(filter_index == 5 ||
2495*09537850SAkhilesh Sanikop (filter_index == 1 &&
2496*09537850SAkhilesh Sanikop (vertical_filter_id == 2 || vertical_filter_id == 3 ||
2497*09537850SAkhilesh Sanikop vertical_filter_id == 4 || vertical_filter_id == 5 ||
2498*09537850SAkhilesh Sanikop vertical_filter_id == 6 || vertical_filter_id == 10 ||
2499*09537850SAkhilesh Sanikop vertical_filter_id == 11 || vertical_filter_id == 12 ||
2500*09537850SAkhilesh Sanikop vertical_filter_id == 13 || vertical_filter_id == 14)));
2501*09537850SAkhilesh Sanikop // According to GetNumTapsInFilter() this has 6 taps but here we are
2502*09537850SAkhilesh Sanikop // treating it as though it has 4.
2503*09537850SAkhilesh Sanikop if (filter_index == 1) src += src_stride;
2504*09537850SAkhilesh Sanikop if (width == 4) {
2505*09537850SAkhilesh Sanikop FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
2506*09537850SAkhilesh Sanikop height, taps + 2);
2507*09537850SAkhilesh Sanikop } else {
2508*09537850SAkhilesh Sanikop FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
2509*09537850SAkhilesh Sanikop width, height, taps + 2);
2510*09537850SAkhilesh Sanikop }
2511*09537850SAkhilesh Sanikop }
2512*09537850SAkhilesh Sanikop }
2513*09537850SAkhilesh Sanikop
ConvolveCompoundHorizontal_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t)2514*09537850SAkhilesh Sanikop void ConvolveCompoundHorizontal_NEON(
2515*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2516*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
2517*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
2518*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
2519*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
2520*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
2521*09537850SAkhilesh Sanikop const auto* const src =
2522*09537850SAkhilesh Sanikop static_cast<const uint8_t*>(reference) - kHorizontalOffset;
2523*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint16_t*>(prediction);
2524*09537850SAkhilesh Sanikop
2525*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
2526*09537850SAkhilesh Sanikop src, reference_stride, dest, width, width, height, horizontal_filter_id,
2527*09537850SAkhilesh Sanikop filter_index);
2528*09537850SAkhilesh Sanikop }
2529*09537850SAkhilesh Sanikop
2530*09537850SAkhilesh Sanikop template <int vertical_taps>
Compound2DVertical(const uint16_t * LIBGAV1_RESTRICT const intermediate_result,const int width,const int height,const int16x8_t taps,void * LIBGAV1_RESTRICT const prediction)2531*09537850SAkhilesh Sanikop void Compound2DVertical(
2532*09537850SAkhilesh Sanikop const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
2533*09537850SAkhilesh Sanikop const int height, const int16x8_t taps,
2534*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction) {
2535*09537850SAkhilesh Sanikop auto* const dest = static_cast<uint16_t*>(prediction);
2536*09537850SAkhilesh Sanikop if (width == 4) {
2537*09537850SAkhilesh Sanikop Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
2538*09537850SAkhilesh Sanikop intermediate_result, dest, width, height, taps);
2539*09537850SAkhilesh Sanikop } else {
2540*09537850SAkhilesh Sanikop Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
2541*09537850SAkhilesh Sanikop intermediate_result, dest, width, width, height, taps);
2542*09537850SAkhilesh Sanikop }
2543*09537850SAkhilesh Sanikop }
2544*09537850SAkhilesh Sanikop
ConvolveCompound2D_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t)2545*09537850SAkhilesh Sanikop void ConvolveCompound2D_NEON(
2546*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2547*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
2548*09537850SAkhilesh Sanikop const int vertical_filter_index, const int horizontal_filter_id,
2549*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
2550*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
2551*09537850SAkhilesh Sanikop // The output of the horizontal filter, i.e. the intermediate_result, is
2552*09537850SAkhilesh Sanikop // guaranteed to fit in int16_t.
2553*09537850SAkhilesh Sanikop uint16_t
2554*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
2555*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
2556*09537850SAkhilesh Sanikop
2557*09537850SAkhilesh Sanikop // Horizontal filter.
2558*09537850SAkhilesh Sanikop // Filter types used for width <= 4 are different from those for width > 4.
2559*09537850SAkhilesh Sanikop // When width > 4, the valid filter index range is always [0, 3].
2560*09537850SAkhilesh Sanikop // When width <= 4, the valid filter index range is always [4, 5].
2561*09537850SAkhilesh Sanikop // Similarly for height.
2562*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
2563*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
2564*09537850SAkhilesh Sanikop const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
2565*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
2566*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
2567*09537850SAkhilesh Sanikop const auto* const src = static_cast<const uint8_t*>(reference) -
2568*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride -
2569*09537850SAkhilesh Sanikop kHorizontalOffset;
2570*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
2571*09537850SAkhilesh Sanikop src, src_stride, intermediate_result, width, width, intermediate_height,
2572*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
2573*09537850SAkhilesh Sanikop
2574*09537850SAkhilesh Sanikop // Vertical filter.
2575*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
2576*09537850SAkhilesh Sanikop const int16x8_t taps = vmovl_s8(
2577*09537850SAkhilesh Sanikop vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
2578*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
2579*09537850SAkhilesh Sanikop Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
2580*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
2581*09537850SAkhilesh Sanikop Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
2582*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
2583*09537850SAkhilesh Sanikop Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
2584*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
2585*09537850SAkhilesh Sanikop Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
2586*09537850SAkhilesh Sanikop }
2587*09537850SAkhilesh Sanikop }
2588*09537850SAkhilesh Sanikop
HalfAddHorizontal(const uint8_t * LIBGAV1_RESTRICT const src,uint8_t * LIBGAV1_RESTRICT const dst)2589*09537850SAkhilesh Sanikop inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
2590*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const dst) {
2591*09537850SAkhilesh Sanikop const uint8x16_t left = vld1q_u8(src);
2592*09537850SAkhilesh Sanikop const uint8x16_t right = vld1q_u8(src + 1);
2593*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(left, right));
2594*09537850SAkhilesh Sanikop }
2595*09537850SAkhilesh Sanikop
2596*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopyHorizontal(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)2597*09537850SAkhilesh Sanikop inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
2598*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
2599*09537850SAkhilesh Sanikop const int height,
2600*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
2601*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
2602*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
2603*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
2604*09537850SAkhilesh Sanikop
2605*09537850SAkhilesh Sanikop int y = height;
2606*09537850SAkhilesh Sanikop do {
2607*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2608*09537850SAkhilesh Sanikop if (width >= 32) {
2609*09537850SAkhilesh Sanikop src += 16;
2610*09537850SAkhilesh Sanikop dst += 16;
2611*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2612*09537850SAkhilesh Sanikop if (width >= 64) {
2613*09537850SAkhilesh Sanikop src += 16;
2614*09537850SAkhilesh Sanikop dst += 16;
2615*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2616*09537850SAkhilesh Sanikop src += 16;
2617*09537850SAkhilesh Sanikop dst += 16;
2618*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2619*09537850SAkhilesh Sanikop if (width == 128) {
2620*09537850SAkhilesh Sanikop src += 16;
2621*09537850SAkhilesh Sanikop dst += 16;
2622*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2623*09537850SAkhilesh Sanikop src += 16;
2624*09537850SAkhilesh Sanikop dst += 16;
2625*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2626*09537850SAkhilesh Sanikop src += 16;
2627*09537850SAkhilesh Sanikop dst += 16;
2628*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2629*09537850SAkhilesh Sanikop src += 16;
2630*09537850SAkhilesh Sanikop dst += 16;
2631*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
2632*09537850SAkhilesh Sanikop }
2633*09537850SAkhilesh Sanikop }
2634*09537850SAkhilesh Sanikop }
2635*09537850SAkhilesh Sanikop src += src_remainder_stride;
2636*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
2637*09537850SAkhilesh Sanikop } while (--y != 0);
2638*09537850SAkhilesh Sanikop }
2639*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopyHorizontal_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)2640*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopyHorizontal_NEON(
2641*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2642*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
2643*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*subpixel_x*/,
2644*09537850SAkhilesh Sanikop const int /*subpixel_y*/, const int width, const int height,
2645*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
2646*09537850SAkhilesh Sanikop assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
2647*09537850SAkhilesh Sanikop assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
2648*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
2649*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
2650*09537850SAkhilesh Sanikop
2651*09537850SAkhilesh Sanikop if (width == 128) {
2652*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
2653*09537850SAkhilesh Sanikop pred_stride);
2654*09537850SAkhilesh Sanikop } else if (width == 64) {
2655*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
2656*09537850SAkhilesh Sanikop pred_stride);
2657*09537850SAkhilesh Sanikop } else if (width == 32) {
2658*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
2659*09537850SAkhilesh Sanikop pred_stride);
2660*09537850SAkhilesh Sanikop } else if (width == 16) {
2661*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
2662*09537850SAkhilesh Sanikop pred_stride);
2663*09537850SAkhilesh Sanikop } else if (width == 8) {
2664*09537850SAkhilesh Sanikop int y = height;
2665*09537850SAkhilesh Sanikop do {
2666*09537850SAkhilesh Sanikop const uint8x8_t left = vld1_u8(src);
2667*09537850SAkhilesh Sanikop const uint8x8_t right = vld1_u8(src + 1);
2668*09537850SAkhilesh Sanikop vst1_u8(dest, vrhadd_u8(left, right));
2669*09537850SAkhilesh Sanikop
2670*09537850SAkhilesh Sanikop src += reference_stride;
2671*09537850SAkhilesh Sanikop dest += pred_stride;
2672*09537850SAkhilesh Sanikop } while (--y != 0);
2673*09537850SAkhilesh Sanikop } else { // width == 4
2674*09537850SAkhilesh Sanikop uint8x8_t left = vdup_n_u8(0);
2675*09537850SAkhilesh Sanikop uint8x8_t right = vdup_n_u8(0);
2676*09537850SAkhilesh Sanikop int y = height;
2677*09537850SAkhilesh Sanikop do {
2678*09537850SAkhilesh Sanikop left = Load4<0>(src, left);
2679*09537850SAkhilesh Sanikop right = Load4<0>(src + 1, right);
2680*09537850SAkhilesh Sanikop src += reference_stride;
2681*09537850SAkhilesh Sanikop left = Load4<1>(src, left);
2682*09537850SAkhilesh Sanikop right = Load4<1>(src + 1, right);
2683*09537850SAkhilesh Sanikop src += reference_stride;
2684*09537850SAkhilesh Sanikop
2685*09537850SAkhilesh Sanikop const uint8x8_t result = vrhadd_u8(left, right);
2686*09537850SAkhilesh Sanikop
2687*09537850SAkhilesh Sanikop StoreLo4(dest, result);
2688*09537850SAkhilesh Sanikop dest += pred_stride;
2689*09537850SAkhilesh Sanikop StoreHi4(dest, result);
2690*09537850SAkhilesh Sanikop dest += pred_stride;
2691*09537850SAkhilesh Sanikop y -= 2;
2692*09537850SAkhilesh Sanikop } while (y != 0);
2693*09537850SAkhilesh Sanikop }
2694*09537850SAkhilesh Sanikop }
2695*09537850SAkhilesh Sanikop
2696*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopyVertical(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)2697*09537850SAkhilesh Sanikop inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
2698*09537850SAkhilesh Sanikop const ptrdiff_t src_stride, const int height,
2699*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
2700*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
2701*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
2702*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
2703*09537850SAkhilesh Sanikop uint8x16_t row[8], below[8];
2704*09537850SAkhilesh Sanikop
2705*09537850SAkhilesh Sanikop row[0] = vld1q_u8(src);
2706*09537850SAkhilesh Sanikop if (width >= 32) {
2707*09537850SAkhilesh Sanikop src += 16;
2708*09537850SAkhilesh Sanikop row[1] = vld1q_u8(src);
2709*09537850SAkhilesh Sanikop if (width >= 64) {
2710*09537850SAkhilesh Sanikop src += 16;
2711*09537850SAkhilesh Sanikop row[2] = vld1q_u8(src);
2712*09537850SAkhilesh Sanikop src += 16;
2713*09537850SAkhilesh Sanikop row[3] = vld1q_u8(src);
2714*09537850SAkhilesh Sanikop if (width == 128) {
2715*09537850SAkhilesh Sanikop src += 16;
2716*09537850SAkhilesh Sanikop row[4] = vld1q_u8(src);
2717*09537850SAkhilesh Sanikop src += 16;
2718*09537850SAkhilesh Sanikop row[5] = vld1q_u8(src);
2719*09537850SAkhilesh Sanikop src += 16;
2720*09537850SAkhilesh Sanikop row[6] = vld1q_u8(src);
2721*09537850SAkhilesh Sanikop src += 16;
2722*09537850SAkhilesh Sanikop row[7] = vld1q_u8(src);
2723*09537850SAkhilesh Sanikop }
2724*09537850SAkhilesh Sanikop }
2725*09537850SAkhilesh Sanikop }
2726*09537850SAkhilesh Sanikop src += src_remainder_stride;
2727*09537850SAkhilesh Sanikop
2728*09537850SAkhilesh Sanikop int y = height;
2729*09537850SAkhilesh Sanikop do {
2730*09537850SAkhilesh Sanikop below[0] = vld1q_u8(src);
2731*09537850SAkhilesh Sanikop if (width >= 32) {
2732*09537850SAkhilesh Sanikop src += 16;
2733*09537850SAkhilesh Sanikop below[1] = vld1q_u8(src);
2734*09537850SAkhilesh Sanikop if (width >= 64) {
2735*09537850SAkhilesh Sanikop src += 16;
2736*09537850SAkhilesh Sanikop below[2] = vld1q_u8(src);
2737*09537850SAkhilesh Sanikop src += 16;
2738*09537850SAkhilesh Sanikop below[3] = vld1q_u8(src);
2739*09537850SAkhilesh Sanikop if (width == 128) {
2740*09537850SAkhilesh Sanikop src += 16;
2741*09537850SAkhilesh Sanikop below[4] = vld1q_u8(src);
2742*09537850SAkhilesh Sanikop src += 16;
2743*09537850SAkhilesh Sanikop below[5] = vld1q_u8(src);
2744*09537850SAkhilesh Sanikop src += 16;
2745*09537850SAkhilesh Sanikop below[6] = vld1q_u8(src);
2746*09537850SAkhilesh Sanikop src += 16;
2747*09537850SAkhilesh Sanikop below[7] = vld1q_u8(src);
2748*09537850SAkhilesh Sanikop }
2749*09537850SAkhilesh Sanikop }
2750*09537850SAkhilesh Sanikop }
2751*09537850SAkhilesh Sanikop src += src_remainder_stride;
2752*09537850SAkhilesh Sanikop
2753*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
2754*09537850SAkhilesh Sanikop row[0] = below[0];
2755*09537850SAkhilesh Sanikop if (width >= 32) {
2756*09537850SAkhilesh Sanikop dst += 16;
2757*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
2758*09537850SAkhilesh Sanikop row[1] = below[1];
2759*09537850SAkhilesh Sanikop if (width >= 64) {
2760*09537850SAkhilesh Sanikop dst += 16;
2761*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
2762*09537850SAkhilesh Sanikop row[2] = below[2];
2763*09537850SAkhilesh Sanikop dst += 16;
2764*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
2765*09537850SAkhilesh Sanikop row[3] = below[3];
2766*09537850SAkhilesh Sanikop if (width >= 128) {
2767*09537850SAkhilesh Sanikop dst += 16;
2768*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
2769*09537850SAkhilesh Sanikop row[4] = below[4];
2770*09537850SAkhilesh Sanikop dst += 16;
2771*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
2772*09537850SAkhilesh Sanikop row[5] = below[5];
2773*09537850SAkhilesh Sanikop dst += 16;
2774*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
2775*09537850SAkhilesh Sanikop row[6] = below[6];
2776*09537850SAkhilesh Sanikop dst += 16;
2777*09537850SAkhilesh Sanikop vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
2778*09537850SAkhilesh Sanikop row[7] = below[7];
2779*09537850SAkhilesh Sanikop }
2780*09537850SAkhilesh Sanikop }
2781*09537850SAkhilesh Sanikop }
2782*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
2783*09537850SAkhilesh Sanikop } while (--y != 0);
2784*09537850SAkhilesh Sanikop }
2785*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopyVertical_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)2786*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopyVertical_NEON(
2787*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
2788*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
2789*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
2790*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
2791*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
2792*09537850SAkhilesh Sanikop assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
2793*09537850SAkhilesh Sanikop assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
2794*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
2795*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
2796*09537850SAkhilesh Sanikop
2797*09537850SAkhilesh Sanikop if (width == 128) {
2798*09537850SAkhilesh Sanikop IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
2799*09537850SAkhilesh Sanikop pred_stride);
2800*09537850SAkhilesh Sanikop } else if (width == 64) {
2801*09537850SAkhilesh Sanikop IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
2802*09537850SAkhilesh Sanikop pred_stride);
2803*09537850SAkhilesh Sanikop } else if (width == 32) {
2804*09537850SAkhilesh Sanikop IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
2805*09537850SAkhilesh Sanikop pred_stride);
2806*09537850SAkhilesh Sanikop } else if (width == 16) {
2807*09537850SAkhilesh Sanikop IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
2808*09537850SAkhilesh Sanikop pred_stride);
2809*09537850SAkhilesh Sanikop } else if (width == 8) {
2810*09537850SAkhilesh Sanikop uint8x8_t row, below;
2811*09537850SAkhilesh Sanikop row = vld1_u8(src);
2812*09537850SAkhilesh Sanikop src += reference_stride;
2813*09537850SAkhilesh Sanikop
2814*09537850SAkhilesh Sanikop int y = height;
2815*09537850SAkhilesh Sanikop do {
2816*09537850SAkhilesh Sanikop below = vld1_u8(src);
2817*09537850SAkhilesh Sanikop src += reference_stride;
2818*09537850SAkhilesh Sanikop
2819*09537850SAkhilesh Sanikop vst1_u8(dest, vrhadd_u8(row, below));
2820*09537850SAkhilesh Sanikop dest += pred_stride;
2821*09537850SAkhilesh Sanikop
2822*09537850SAkhilesh Sanikop row = below;
2823*09537850SAkhilesh Sanikop } while (--y != 0);
2824*09537850SAkhilesh Sanikop } else { // width == 4
2825*09537850SAkhilesh Sanikop uint8x8_t row = Load4(src);
2826*09537850SAkhilesh Sanikop uint8x8_t below = vdup_n_u8(0);
2827*09537850SAkhilesh Sanikop src += reference_stride;
2828*09537850SAkhilesh Sanikop
2829*09537850SAkhilesh Sanikop int y = height;
2830*09537850SAkhilesh Sanikop do {
2831*09537850SAkhilesh Sanikop below = Load4<0>(src, below);
2832*09537850SAkhilesh Sanikop src += reference_stride;
2833*09537850SAkhilesh Sanikop
2834*09537850SAkhilesh Sanikop StoreLo4(dest, vrhadd_u8(row, below));
2835*09537850SAkhilesh Sanikop dest += pred_stride;
2836*09537850SAkhilesh Sanikop
2837*09537850SAkhilesh Sanikop row = below;
2838*09537850SAkhilesh Sanikop } while (--y != 0);
2839*09537850SAkhilesh Sanikop }
2840*09537850SAkhilesh Sanikop }
2841*09537850SAkhilesh Sanikop
2842*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopy2D(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)2843*09537850SAkhilesh Sanikop inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
2844*09537850SAkhilesh Sanikop const ptrdiff_t src_stride, const int height,
2845*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
2846*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
2847*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
2848*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
2849*09537850SAkhilesh Sanikop uint16x8_t row[16];
2850*09537850SAkhilesh Sanikop row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2851*09537850SAkhilesh Sanikop if (width >= 16) {
2852*09537850SAkhilesh Sanikop src += 8;
2853*09537850SAkhilesh Sanikop row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2854*09537850SAkhilesh Sanikop if (width >= 32) {
2855*09537850SAkhilesh Sanikop src += 8;
2856*09537850SAkhilesh Sanikop row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2857*09537850SAkhilesh Sanikop src += 8;
2858*09537850SAkhilesh Sanikop row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2859*09537850SAkhilesh Sanikop if (width >= 64) {
2860*09537850SAkhilesh Sanikop src += 8;
2861*09537850SAkhilesh Sanikop row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2862*09537850SAkhilesh Sanikop src += 8;
2863*09537850SAkhilesh Sanikop row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2864*09537850SAkhilesh Sanikop src += 8;
2865*09537850SAkhilesh Sanikop row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2866*09537850SAkhilesh Sanikop src += 8;
2867*09537850SAkhilesh Sanikop row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2868*09537850SAkhilesh Sanikop if (width == 128) {
2869*09537850SAkhilesh Sanikop src += 8;
2870*09537850SAkhilesh Sanikop row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2871*09537850SAkhilesh Sanikop src += 8;
2872*09537850SAkhilesh Sanikop row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2873*09537850SAkhilesh Sanikop src += 8;
2874*09537850SAkhilesh Sanikop row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2875*09537850SAkhilesh Sanikop src += 8;
2876*09537850SAkhilesh Sanikop row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2877*09537850SAkhilesh Sanikop src += 8;
2878*09537850SAkhilesh Sanikop row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2879*09537850SAkhilesh Sanikop src += 8;
2880*09537850SAkhilesh Sanikop row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2881*09537850SAkhilesh Sanikop src += 8;
2882*09537850SAkhilesh Sanikop row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2883*09537850SAkhilesh Sanikop src += 8;
2884*09537850SAkhilesh Sanikop row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2885*09537850SAkhilesh Sanikop }
2886*09537850SAkhilesh Sanikop }
2887*09537850SAkhilesh Sanikop }
2888*09537850SAkhilesh Sanikop }
2889*09537850SAkhilesh Sanikop src += src_remainder_stride;
2890*09537850SAkhilesh Sanikop
2891*09537850SAkhilesh Sanikop int y = height;
2892*09537850SAkhilesh Sanikop do {
2893*09537850SAkhilesh Sanikop const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2894*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
2895*09537850SAkhilesh Sanikop row[0] = below_0;
2896*09537850SAkhilesh Sanikop if (width >= 16) {
2897*09537850SAkhilesh Sanikop src += 8;
2898*09537850SAkhilesh Sanikop dst += 8;
2899*09537850SAkhilesh Sanikop
2900*09537850SAkhilesh Sanikop const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2901*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
2902*09537850SAkhilesh Sanikop row[1] = below_1;
2903*09537850SAkhilesh Sanikop if (width >= 32) {
2904*09537850SAkhilesh Sanikop src += 8;
2905*09537850SAkhilesh Sanikop dst += 8;
2906*09537850SAkhilesh Sanikop
2907*09537850SAkhilesh Sanikop const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2908*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
2909*09537850SAkhilesh Sanikop row[2] = below_2;
2910*09537850SAkhilesh Sanikop src += 8;
2911*09537850SAkhilesh Sanikop dst += 8;
2912*09537850SAkhilesh Sanikop
2913*09537850SAkhilesh Sanikop const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2914*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
2915*09537850SAkhilesh Sanikop row[3] = below_3;
2916*09537850SAkhilesh Sanikop if (width >= 64) {
2917*09537850SAkhilesh Sanikop src += 8;
2918*09537850SAkhilesh Sanikop dst += 8;
2919*09537850SAkhilesh Sanikop
2920*09537850SAkhilesh Sanikop const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2921*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
2922*09537850SAkhilesh Sanikop row[4] = below_4;
2923*09537850SAkhilesh Sanikop src += 8;
2924*09537850SAkhilesh Sanikop dst += 8;
2925*09537850SAkhilesh Sanikop
2926*09537850SAkhilesh Sanikop const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2927*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
2928*09537850SAkhilesh Sanikop row[5] = below_5;
2929*09537850SAkhilesh Sanikop src += 8;
2930*09537850SAkhilesh Sanikop dst += 8;
2931*09537850SAkhilesh Sanikop
2932*09537850SAkhilesh Sanikop const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2933*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
2934*09537850SAkhilesh Sanikop row[6] = below_6;
2935*09537850SAkhilesh Sanikop src += 8;
2936*09537850SAkhilesh Sanikop dst += 8;
2937*09537850SAkhilesh Sanikop
2938*09537850SAkhilesh Sanikop const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2939*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
2940*09537850SAkhilesh Sanikop row[7] = below_7;
2941*09537850SAkhilesh Sanikop if (width == 128) {
2942*09537850SAkhilesh Sanikop src += 8;
2943*09537850SAkhilesh Sanikop dst += 8;
2944*09537850SAkhilesh Sanikop
2945*09537850SAkhilesh Sanikop const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2946*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
2947*09537850SAkhilesh Sanikop row[8] = below_8;
2948*09537850SAkhilesh Sanikop src += 8;
2949*09537850SAkhilesh Sanikop dst += 8;
2950*09537850SAkhilesh Sanikop
2951*09537850SAkhilesh Sanikop const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2952*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
2953*09537850SAkhilesh Sanikop row[9] = below_9;
2954*09537850SAkhilesh Sanikop src += 8;
2955*09537850SAkhilesh Sanikop dst += 8;
2956*09537850SAkhilesh Sanikop
2957*09537850SAkhilesh Sanikop const uint16x8_t below_10 =
2958*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2959*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
2960*09537850SAkhilesh Sanikop row[10] = below_10;
2961*09537850SAkhilesh Sanikop src += 8;
2962*09537850SAkhilesh Sanikop dst += 8;
2963*09537850SAkhilesh Sanikop
2964*09537850SAkhilesh Sanikop const uint16x8_t below_11 =
2965*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2966*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
2967*09537850SAkhilesh Sanikop row[11] = below_11;
2968*09537850SAkhilesh Sanikop src += 8;
2969*09537850SAkhilesh Sanikop dst += 8;
2970*09537850SAkhilesh Sanikop
2971*09537850SAkhilesh Sanikop const uint16x8_t below_12 =
2972*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2973*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
2974*09537850SAkhilesh Sanikop row[12] = below_12;
2975*09537850SAkhilesh Sanikop src += 8;
2976*09537850SAkhilesh Sanikop dst += 8;
2977*09537850SAkhilesh Sanikop
2978*09537850SAkhilesh Sanikop const uint16x8_t below_13 =
2979*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2980*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
2981*09537850SAkhilesh Sanikop row[13] = below_13;
2982*09537850SAkhilesh Sanikop src += 8;
2983*09537850SAkhilesh Sanikop dst += 8;
2984*09537850SAkhilesh Sanikop
2985*09537850SAkhilesh Sanikop const uint16x8_t below_14 =
2986*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2987*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
2988*09537850SAkhilesh Sanikop row[14] = below_14;
2989*09537850SAkhilesh Sanikop src += 8;
2990*09537850SAkhilesh Sanikop dst += 8;
2991*09537850SAkhilesh Sanikop
2992*09537850SAkhilesh Sanikop const uint16x8_t below_15 =
2993*09537850SAkhilesh Sanikop vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2994*09537850SAkhilesh Sanikop vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
2995*09537850SAkhilesh Sanikop row[15] = below_15;
2996*09537850SAkhilesh Sanikop }
2997*09537850SAkhilesh Sanikop }
2998*09537850SAkhilesh Sanikop }
2999*09537850SAkhilesh Sanikop }
3000*09537850SAkhilesh Sanikop src += src_remainder_stride;
3001*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
3002*09537850SAkhilesh Sanikop } while (--y != 0);
3003*09537850SAkhilesh Sanikop }
3004*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopy2D_NEON(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)3005*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopy2D_NEON(
3006*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
3007*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
3008*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
3009*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
3010*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
3011*09537850SAkhilesh Sanikop assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
3012*09537850SAkhilesh Sanikop assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
3013*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
3014*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
3015*09537850SAkhilesh Sanikop // Note: allow vertical access to height + 1. Because this function is only
3016*09537850SAkhilesh Sanikop // for u/v plane of intra block copy, such access is guaranteed to be within
3017*09537850SAkhilesh Sanikop // the prediction block.
3018*09537850SAkhilesh Sanikop
3019*09537850SAkhilesh Sanikop if (width == 128) {
3020*09537850SAkhilesh Sanikop IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
3021*09537850SAkhilesh Sanikop } else if (width == 64) {
3022*09537850SAkhilesh Sanikop IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
3023*09537850SAkhilesh Sanikop } else if (width == 32) {
3024*09537850SAkhilesh Sanikop IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
3025*09537850SAkhilesh Sanikop } else if (width == 16) {
3026*09537850SAkhilesh Sanikop IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
3027*09537850SAkhilesh Sanikop } else if (width == 8) {
3028*09537850SAkhilesh Sanikop IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
3029*09537850SAkhilesh Sanikop } else { // width == 4
3030*09537850SAkhilesh Sanikop uint8x8_t left = Load4(src);
3031*09537850SAkhilesh Sanikop uint8x8_t right = Load4(src + 1);
3032*09537850SAkhilesh Sanikop src += reference_stride;
3033*09537850SAkhilesh Sanikop
3034*09537850SAkhilesh Sanikop uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
3035*09537850SAkhilesh Sanikop
3036*09537850SAkhilesh Sanikop int y = height;
3037*09537850SAkhilesh Sanikop do {
3038*09537850SAkhilesh Sanikop left = Load4<0>(src, left);
3039*09537850SAkhilesh Sanikop right = Load4<0>(src + 1, right);
3040*09537850SAkhilesh Sanikop src += reference_stride;
3041*09537850SAkhilesh Sanikop left = Load4<1>(src, left);
3042*09537850SAkhilesh Sanikop right = Load4<1>(src + 1, right);
3043*09537850SAkhilesh Sanikop src += reference_stride;
3044*09537850SAkhilesh Sanikop
3045*09537850SAkhilesh Sanikop const uint16x8_t below = vaddl_u8(left, right);
3046*09537850SAkhilesh Sanikop
3047*09537850SAkhilesh Sanikop const uint8x8_t result = vrshrn_n_u16(
3048*09537850SAkhilesh Sanikop vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
3049*09537850SAkhilesh Sanikop StoreLo4(dest, result);
3050*09537850SAkhilesh Sanikop dest += pred_stride;
3051*09537850SAkhilesh Sanikop StoreHi4(dest, result);
3052*09537850SAkhilesh Sanikop dest += pred_stride;
3053*09537850SAkhilesh Sanikop
3054*09537850SAkhilesh Sanikop row = vget_high_u16(below);
3055*09537850SAkhilesh Sanikop y -= 2;
3056*09537850SAkhilesh Sanikop } while (y != 0);
3057*09537850SAkhilesh Sanikop }
3058*09537850SAkhilesh Sanikop }
3059*09537850SAkhilesh Sanikop
Init8bpp()3060*09537850SAkhilesh Sanikop void Init8bpp() {
3061*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
3062*09537850SAkhilesh Sanikop assert(dsp != nullptr);
3063*09537850SAkhilesh Sanikop dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
3064*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
3065*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][1] = Convolve2D_NEON;
3066*09537850SAkhilesh Sanikop
3067*09537850SAkhilesh Sanikop dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
3068*09537850SAkhilesh Sanikop dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
3069*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
3070*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
3071*09537850SAkhilesh Sanikop
3072*09537850SAkhilesh Sanikop dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
3073*09537850SAkhilesh Sanikop dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
3074*09537850SAkhilesh Sanikop dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
3075*09537850SAkhilesh Sanikop
3076*09537850SAkhilesh Sanikop dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
3077*09537850SAkhilesh Sanikop dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
3078*09537850SAkhilesh Sanikop }
3079*09537850SAkhilesh Sanikop
3080*09537850SAkhilesh Sanikop } // namespace
3081*09537850SAkhilesh Sanikop } // namespace low_bitdepth
3082*09537850SAkhilesh Sanikop
ConvolveInit_NEON()3083*09537850SAkhilesh Sanikop void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
3084*09537850SAkhilesh Sanikop
3085*09537850SAkhilesh Sanikop } // namespace dsp
3086*09537850SAkhilesh Sanikop } // namespace libgav1
3087*09537850SAkhilesh Sanikop
3088*09537850SAkhilesh Sanikop #else // !LIBGAV1_ENABLE_NEON
3089*09537850SAkhilesh Sanikop
3090*09537850SAkhilesh Sanikop namespace libgav1 {
3091*09537850SAkhilesh Sanikop namespace dsp {
3092*09537850SAkhilesh Sanikop
// Stub used when LIBGAV1_ENABLE_NEON is off: there is nothing to register.
void ConvolveInit_NEON() {
  // Intentionally empty.
}
3094*09537850SAkhilesh Sanikop
3095*09537850SAkhilesh Sanikop } // namespace dsp
3096*09537850SAkhilesh Sanikop } // namespace libgav1
3097*09537850SAkhilesh Sanikop #endif // LIBGAV1_ENABLE_NEON
3098