xref: /aosp_15_r20/external/libgav1/src/dsp/arm/obmc_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop //      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/dsp/obmc.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop 
18*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_NEON
19*09537850SAkhilesh Sanikop 
20*09537850SAkhilesh Sanikop #include <arm_neon.h>
21*09537850SAkhilesh Sanikop 
22*09537850SAkhilesh Sanikop #include <algorithm>
23*09537850SAkhilesh Sanikop #include <cassert>
24*09537850SAkhilesh Sanikop #include <cstddef>
25*09537850SAkhilesh Sanikop #include <cstdint>
26*09537850SAkhilesh Sanikop #include <cstring>
27*09537850SAkhilesh Sanikop 
28*09537850SAkhilesh Sanikop #include "src/dsp/arm/common_neon.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
30*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
31*09537850SAkhilesh Sanikop #include "src/utils/common.h"
32*09537850SAkhilesh Sanikop 
33*09537850SAkhilesh Sanikop namespace libgav1 {
34*09537850SAkhilesh Sanikop namespace dsp {
35*09537850SAkhilesh Sanikop namespace {
36*09537850SAkhilesh Sanikop #include "src/dsp/obmc.inc"
37*09537850SAkhilesh Sanikop 
38*09537850SAkhilesh Sanikop }  // namespace
39*09537850SAkhilesh Sanikop 
40*09537850SAkhilesh Sanikop namespace low_bitdepth {
41*09537850SAkhilesh Sanikop namespace {
42*09537850SAkhilesh Sanikop 
WriteObmcLine4(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint8x8_t pred_mask,const uint8x8_t obmc_pred_mask)43*09537850SAkhilesh Sanikop inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
44*09537850SAkhilesh Sanikop                            const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
45*09537850SAkhilesh Sanikop                            const uint8x8_t pred_mask,
46*09537850SAkhilesh Sanikop                            const uint8x8_t obmc_pred_mask) {
47*09537850SAkhilesh Sanikop   const uint8x8_t pred_val = Load4(pred);
48*09537850SAkhilesh Sanikop   const uint8x8_t obmc_pred_val = Load4(obmc_pred);
49*09537850SAkhilesh Sanikop   const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
50*09537850SAkhilesh Sanikop   const uint8x8_t result =
51*09537850SAkhilesh Sanikop       vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
52*09537850SAkhilesh Sanikop   StoreLo4(pred, result);
53*09537850SAkhilesh Sanikop }
54*09537850SAkhilesh Sanikop 
WriteObmcLine8(uint8_t * LIBGAV1_RESTRICT const pred,const uint8x8_t obmc_pred_val,const uint8x8_t pred_mask,const uint8x8_t obmc_pred_mask)55*09537850SAkhilesh Sanikop inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
56*09537850SAkhilesh Sanikop                            const uint8x8_t obmc_pred_val,
57*09537850SAkhilesh Sanikop                            const uint8x8_t pred_mask,
58*09537850SAkhilesh Sanikop                            const uint8x8_t obmc_pred_mask) {
59*09537850SAkhilesh Sanikop   const uint8x8_t pred_val = vld1_u8(pred);
60*09537850SAkhilesh Sanikop   const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
61*09537850SAkhilesh Sanikop   const uint8x8_t result =
62*09537850SAkhilesh Sanikop       vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
63*09537850SAkhilesh Sanikop   vst1_u8(pred, result);
64*09537850SAkhilesh Sanikop }
65*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft2xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)66*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft2xH_NEON(
67*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
68*09537850SAkhilesh Sanikop     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
69*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
70*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
71*09537850SAkhilesh Sanikop   const uint8x8_t pred_mask = Load2(kObmcMask);
72*09537850SAkhilesh Sanikop   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
73*09537850SAkhilesh Sanikop   uint8x8_t pred_val = vdup_n_u8(0);
74*09537850SAkhilesh Sanikop   uint8x8_t obmc_pred_val = vdup_n_u8(0);
75*09537850SAkhilesh Sanikop   int y = 0;
76*09537850SAkhilesh Sanikop   do {
77*09537850SAkhilesh Sanikop     pred_val = Load2<0>(pred, pred_val);
78*09537850SAkhilesh Sanikop     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
79*09537850SAkhilesh Sanikop     obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
80*09537850SAkhilesh Sanikop     const uint8x8_t result =
81*09537850SAkhilesh Sanikop         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
82*09537850SAkhilesh Sanikop     Store2<0>(pred, result);
83*09537850SAkhilesh Sanikop 
84*09537850SAkhilesh Sanikop     pred += prediction_stride;
85*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
86*09537850SAkhilesh Sanikop   } while (++y != height);
87*09537850SAkhilesh Sanikop }
88*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)89*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft4xH_NEON(
90*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
91*09537850SAkhilesh Sanikop     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
92*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
93*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
94*09537850SAkhilesh Sanikop   const uint8x8_t pred_mask = Load4(kObmcMask + 2);
95*09537850SAkhilesh Sanikop   // 64 - mask
96*09537850SAkhilesh Sanikop   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
97*09537850SAkhilesh Sanikop   int y = 0;
98*09537850SAkhilesh Sanikop   do {
99*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
100*09537850SAkhilesh Sanikop     pred += prediction_stride;
101*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
102*09537850SAkhilesh Sanikop 
103*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
104*09537850SAkhilesh Sanikop     pred += prediction_stride;
105*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
106*09537850SAkhilesh Sanikop 
107*09537850SAkhilesh Sanikop     y += 2;
108*09537850SAkhilesh Sanikop   } while (y != height);
109*09537850SAkhilesh Sanikop }
110*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft8xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred)111*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft8xH_NEON(
112*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
113*09537850SAkhilesh Sanikop     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
114*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
115*09537850SAkhilesh Sanikop   const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
116*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 8;
117*09537850SAkhilesh Sanikop   // 64 - mask
118*09537850SAkhilesh Sanikop   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
119*09537850SAkhilesh Sanikop   int y = 0;
120*09537850SAkhilesh Sanikop   do {
121*09537850SAkhilesh Sanikop     const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
122*09537850SAkhilesh Sanikop     WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
123*09537850SAkhilesh Sanikop     pred += prediction_stride;
124*09537850SAkhilesh Sanikop 
125*09537850SAkhilesh Sanikop     WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
126*09537850SAkhilesh Sanikop                    obmc_pred_mask);
127*09537850SAkhilesh Sanikop     pred += prediction_stride;
128*09537850SAkhilesh Sanikop 
129*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
130*09537850SAkhilesh Sanikop     y += 2;
131*09537850SAkhilesh Sanikop   } while (y != height);
132*09537850SAkhilesh Sanikop }
133*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)134*09537850SAkhilesh Sanikop void OverlapBlendFromLeft_NEON(
135*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
136*09537850SAkhilesh Sanikop     const int width, const int height,
137*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const obmc_prediction,
138*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
139*09537850SAkhilesh Sanikop   auto* pred = static_cast<uint8_t*>(prediction);
140*09537850SAkhilesh Sanikop   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
141*09537850SAkhilesh Sanikop   assert(width >= 2);
142*09537850SAkhilesh Sanikop   assert(height >= 4);
143*09537850SAkhilesh Sanikop 
144*09537850SAkhilesh Sanikop   if (width == 2) {
145*09537850SAkhilesh Sanikop     OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
146*09537850SAkhilesh Sanikop                                  obmc_prediction_stride);
147*09537850SAkhilesh Sanikop     return;
148*09537850SAkhilesh Sanikop   }
149*09537850SAkhilesh Sanikop   if (width == 4) {
150*09537850SAkhilesh Sanikop     OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
151*09537850SAkhilesh Sanikop                                  obmc_prediction_stride);
152*09537850SAkhilesh Sanikop     return;
153*09537850SAkhilesh Sanikop   }
154*09537850SAkhilesh Sanikop   if (width == 8) {
155*09537850SAkhilesh Sanikop     OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
156*09537850SAkhilesh Sanikop     return;
157*09537850SAkhilesh Sanikop   }
158*09537850SAkhilesh Sanikop   const uint8x16_t mask_inverter = vdupq_n_u8(64);
159*09537850SAkhilesh Sanikop   const uint8_t* mask = kObmcMask + width - 2;
160*09537850SAkhilesh Sanikop   int x = 0;
161*09537850SAkhilesh Sanikop   do {
162*09537850SAkhilesh Sanikop     pred = static_cast<uint8_t*>(prediction) + x;
163*09537850SAkhilesh Sanikop     obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
164*09537850SAkhilesh Sanikop     const uint8x16_t pred_mask = vld1q_u8(mask + x);
165*09537850SAkhilesh Sanikop     // 64 - mask
166*09537850SAkhilesh Sanikop     const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
167*09537850SAkhilesh Sanikop     int y = 0;
168*09537850SAkhilesh Sanikop     do {
169*09537850SAkhilesh Sanikop       const uint8x16_t pred_val = vld1q_u8(pred);
170*09537850SAkhilesh Sanikop       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
171*09537850SAkhilesh Sanikop       const uint16x8_t weighted_pred_lo =
172*09537850SAkhilesh Sanikop           vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
173*09537850SAkhilesh Sanikop       const uint8x8_t result_lo =
174*09537850SAkhilesh Sanikop           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
175*09537850SAkhilesh Sanikop                                 vget_low_u8(obmc_pred_val)),
176*09537850SAkhilesh Sanikop                        6);
177*09537850SAkhilesh Sanikop       const uint16x8_t weighted_pred_hi =
178*09537850SAkhilesh Sanikop           vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
179*09537850SAkhilesh Sanikop       const uint8x8_t result_hi =
180*09537850SAkhilesh Sanikop           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
181*09537850SAkhilesh Sanikop                                 vget_high_u8(obmc_pred_val)),
182*09537850SAkhilesh Sanikop                        6);
183*09537850SAkhilesh Sanikop       vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
184*09537850SAkhilesh Sanikop 
185*09537850SAkhilesh Sanikop       pred += prediction_stride;
186*09537850SAkhilesh Sanikop       obmc_pred += obmc_prediction_stride;
187*09537850SAkhilesh Sanikop     } while (++y < height);
188*09537850SAkhilesh Sanikop     x += 16;
189*09537850SAkhilesh Sanikop   } while (x < width);
190*09537850SAkhilesh Sanikop }
191*09537850SAkhilesh Sanikop 
OverlapBlendFromTop4x4_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride,const int height)192*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4x4_NEON(
193*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
194*09537850SAkhilesh Sanikop     const uint8_t* LIBGAV1_RESTRICT obmc_pred,
195*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride, const int height) {
196*09537850SAkhilesh Sanikop   uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
197*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
198*09537850SAkhilesh Sanikop   uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
199*09537850SAkhilesh Sanikop   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
200*09537850SAkhilesh Sanikop   pred += prediction_stride;
201*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
202*09537850SAkhilesh Sanikop 
203*09537850SAkhilesh Sanikop   if (height == 2) {
204*09537850SAkhilesh Sanikop     return;
205*09537850SAkhilesh Sanikop   }
206*09537850SAkhilesh Sanikop 
207*09537850SAkhilesh Sanikop   pred_mask = vdup_n_u8(kObmcMask[3]);
208*09537850SAkhilesh Sanikop   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
209*09537850SAkhilesh Sanikop   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
210*09537850SAkhilesh Sanikop   pred += prediction_stride;
211*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
212*09537850SAkhilesh Sanikop 
213*09537850SAkhilesh Sanikop   pred_mask = vdup_n_u8(kObmcMask[4]);
214*09537850SAkhilesh Sanikop   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
215*09537850SAkhilesh Sanikop   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
216*09537850SAkhilesh Sanikop }
217*09537850SAkhilesh Sanikop 
OverlapBlendFromTop4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)218*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4xH_NEON(
219*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
220*09537850SAkhilesh Sanikop     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
221*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
222*09537850SAkhilesh Sanikop   if (height < 8) {
223*09537850SAkhilesh Sanikop     OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred,
224*09537850SAkhilesh Sanikop                                 obmc_prediction_stride, height);
225*09537850SAkhilesh Sanikop     return;
226*09537850SAkhilesh Sanikop   }
227*09537850SAkhilesh Sanikop   const uint8_t* mask = kObmcMask + height - 2;
228*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
229*09537850SAkhilesh Sanikop   int y = 0;
230*09537850SAkhilesh Sanikop   // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
231*09537850SAkhilesh Sanikop   // lines are unchanged as the corresponding mask value is 64.
232*09537850SAkhilesh Sanikop   do {
233*09537850SAkhilesh Sanikop     uint8x8_t pred_mask = vdup_n_u8(mask[y]);
234*09537850SAkhilesh Sanikop     uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
235*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
236*09537850SAkhilesh Sanikop     pred += prediction_stride;
237*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
238*09537850SAkhilesh Sanikop 
239*09537850SAkhilesh Sanikop     pred_mask = vdup_n_u8(mask[y + 1]);
240*09537850SAkhilesh Sanikop     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
241*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
242*09537850SAkhilesh Sanikop     pred += prediction_stride;
243*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
244*09537850SAkhilesh Sanikop 
245*09537850SAkhilesh Sanikop     pred_mask = vdup_n_u8(mask[y + 2]);
246*09537850SAkhilesh Sanikop     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
247*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
248*09537850SAkhilesh Sanikop     pred += prediction_stride;
249*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
250*09537850SAkhilesh Sanikop 
251*09537850SAkhilesh Sanikop     pred_mask = vdup_n_u8(mask[y + 3]);
252*09537850SAkhilesh Sanikop     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
253*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
254*09537850SAkhilesh Sanikop     pred += prediction_stride;
255*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
256*09537850SAkhilesh Sanikop 
257*09537850SAkhilesh Sanikop     pred_mask = vdup_n_u8(mask[y + 4]);
258*09537850SAkhilesh Sanikop     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
259*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
260*09537850SAkhilesh Sanikop     pred += prediction_stride;
261*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
262*09537850SAkhilesh Sanikop 
263*09537850SAkhilesh Sanikop     pred_mask = vdup_n_u8(mask[y + 5]);
264*09537850SAkhilesh Sanikop     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
265*09537850SAkhilesh Sanikop     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
266*09537850SAkhilesh Sanikop     pred += prediction_stride;
267*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
268*09537850SAkhilesh Sanikop 
269*09537850SAkhilesh Sanikop     // Increment for the right mask index.
270*09537850SAkhilesh Sanikop     y += 6;
271*09537850SAkhilesh Sanikop   } while (y < height - 4);
272*09537850SAkhilesh Sanikop }
273*09537850SAkhilesh Sanikop 
OverlapBlendFromTop8xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred)274*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop8xH_NEON(
275*09537850SAkhilesh Sanikop     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
276*09537850SAkhilesh Sanikop     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
277*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 8;
278*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
279*09537850SAkhilesh Sanikop   const uint8_t* mask = kObmcMask + height - 2;
280*09537850SAkhilesh Sanikop   const int compute_height = height - (height >> 2);
281*09537850SAkhilesh Sanikop   int y = 0;
282*09537850SAkhilesh Sanikop   do {
283*09537850SAkhilesh Sanikop     const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
284*09537850SAkhilesh Sanikop     // 64 - mask
285*09537850SAkhilesh Sanikop     const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
286*09537850SAkhilesh Sanikop     const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
287*09537850SAkhilesh Sanikop 
288*09537850SAkhilesh Sanikop     WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
289*09537850SAkhilesh Sanikop                    obmc_pred_mask0);
290*09537850SAkhilesh Sanikop     pred += prediction_stride;
291*09537850SAkhilesh Sanikop     ++y;
292*09537850SAkhilesh Sanikop 
293*09537850SAkhilesh Sanikop     const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
294*09537850SAkhilesh Sanikop     // 64 - mask
295*09537850SAkhilesh Sanikop     const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
296*09537850SAkhilesh Sanikop     WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
297*09537850SAkhilesh Sanikop                    obmc_pred_mask1);
298*09537850SAkhilesh Sanikop     pred += prediction_stride;
299*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
300*09537850SAkhilesh Sanikop   } while (++y < compute_height);
301*09537850SAkhilesh Sanikop }
302*09537850SAkhilesh Sanikop 
OverlapBlendFromTop_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)303*09537850SAkhilesh Sanikop void OverlapBlendFromTop_NEON(
304*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
305*09537850SAkhilesh Sanikop     const int width, const int height,
306*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const obmc_prediction,
307*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
308*09537850SAkhilesh Sanikop   auto* pred = static_cast<uint8_t*>(prediction);
309*09537850SAkhilesh Sanikop   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
310*09537850SAkhilesh Sanikop   assert(width >= 4);
311*09537850SAkhilesh Sanikop   assert(height >= 2);
312*09537850SAkhilesh Sanikop 
313*09537850SAkhilesh Sanikop   if (width == 4) {
314*09537850SAkhilesh Sanikop     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
315*09537850SAkhilesh Sanikop                                 obmc_prediction_stride);
316*09537850SAkhilesh Sanikop     return;
317*09537850SAkhilesh Sanikop   }
318*09537850SAkhilesh Sanikop 
319*09537850SAkhilesh Sanikop   if (width == 8) {
320*09537850SAkhilesh Sanikop     OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
321*09537850SAkhilesh Sanikop     return;
322*09537850SAkhilesh Sanikop   }
323*09537850SAkhilesh Sanikop 
324*09537850SAkhilesh Sanikop   const uint8_t* mask = kObmcMask + height - 2;
325*09537850SAkhilesh Sanikop   const uint8x8_t mask_inverter = vdup_n_u8(64);
326*09537850SAkhilesh Sanikop   // Stop when mask value becomes 64. This is inferred for 4xH.
327*09537850SAkhilesh Sanikop   const int compute_height = height - (height >> 2);
328*09537850SAkhilesh Sanikop   int y = 0;
329*09537850SAkhilesh Sanikop   do {
330*09537850SAkhilesh Sanikop     const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
331*09537850SAkhilesh Sanikop     // 64 - mask
332*09537850SAkhilesh Sanikop     const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
333*09537850SAkhilesh Sanikop     int x = 0;
334*09537850SAkhilesh Sanikop     do {
335*09537850SAkhilesh Sanikop       const uint8x16_t pred_val = vld1q_u8(pred + x);
336*09537850SAkhilesh Sanikop       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
337*09537850SAkhilesh Sanikop       const uint16x8_t weighted_pred_lo =
338*09537850SAkhilesh Sanikop           vmull_u8(pred_mask, vget_low_u8(pred_val));
339*09537850SAkhilesh Sanikop       const uint8x8_t result_lo =
340*09537850SAkhilesh Sanikop           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
341*09537850SAkhilesh Sanikop                                 vget_low_u8(obmc_pred_val)),
342*09537850SAkhilesh Sanikop                        6);
343*09537850SAkhilesh Sanikop       const uint16x8_t weighted_pred_hi =
344*09537850SAkhilesh Sanikop           vmull_u8(pred_mask, vget_high_u8(pred_val));
345*09537850SAkhilesh Sanikop       const uint8x8_t result_hi =
346*09537850SAkhilesh Sanikop           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
347*09537850SAkhilesh Sanikop                                 vget_high_u8(obmc_pred_val)),
348*09537850SAkhilesh Sanikop                        6);
349*09537850SAkhilesh Sanikop       vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
350*09537850SAkhilesh Sanikop 
351*09537850SAkhilesh Sanikop       x += 16;
352*09537850SAkhilesh Sanikop     } while (x < width);
353*09537850SAkhilesh Sanikop     pred += prediction_stride;
354*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
355*09537850SAkhilesh Sanikop   } while (++y < compute_height);
356*09537850SAkhilesh Sanikop }
357*09537850SAkhilesh Sanikop 
Init8bpp()358*09537850SAkhilesh Sanikop void Init8bpp() {
359*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
360*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
361*09537850SAkhilesh Sanikop   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
362*09537850SAkhilesh Sanikop   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
363*09537850SAkhilesh Sanikop }
364*09537850SAkhilesh Sanikop 
365*09537850SAkhilesh Sanikop }  // namespace
366*09537850SAkhilesh Sanikop }  // namespace low_bitdepth
367*09537850SAkhilesh Sanikop 
368*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
369*09537850SAkhilesh Sanikop namespace high_bitdepth {
370*09537850SAkhilesh Sanikop namespace {
371*09537850SAkhilesh Sanikop 
372*09537850SAkhilesh Sanikop // This is a flat array of masks for each block dimension from 2 to 32. The
373*09537850SAkhilesh Sanikop // starting index for each length is length-2. The value 64 leaves the result
374*09537850SAkhilesh Sanikop // equal to |pred| and may be ignored if convenient. Vector loads may overread
375*09537850SAkhilesh Sanikop // values meant for larger sizes, but these values will be unused.
376*09537850SAkhilesh Sanikop constexpr uint16_t kObmcMask[62] = {
377*09537850SAkhilesh Sanikop     // Obmc Mask 2
378*09537850SAkhilesh Sanikop     45, 64,
379*09537850SAkhilesh Sanikop     // Obmc Mask 4
380*09537850SAkhilesh Sanikop     39, 50, 59, 64,
381*09537850SAkhilesh Sanikop     // Obmc Mask 8
382*09537850SAkhilesh Sanikop     36, 42, 48, 53, 57, 61, 64, 64,
383*09537850SAkhilesh Sanikop     // Obmc Mask 16
384*09537850SAkhilesh Sanikop     34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
385*09537850SAkhilesh Sanikop     // Obmc Mask 32
386*09537850SAkhilesh Sanikop     33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
387*09537850SAkhilesh Sanikop     59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
388*09537850SAkhilesh Sanikop 
// Computes the blend of one row for 2- or 4-pixel-wide 16-bit blocks and
// returns it without storing.
//
// Four lanes are always loaded from |pred|; the caller decides how many lanes
// of the result to write. The arithmetic stays in 16 bits:
// (pred_mask * pred + obmc_pred_mask * obmc_pred_val) shifted right by 6 with
// rounding.
inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
                                const uint16x4_t obmc_pred_val,
                                const uint16x4_t pred_mask,
                                const uint16x4_t obmc_pred_mask) {
  uint16x4_t sum = vmul_u16(pred_mask, vld1_u16(pred));
  sum = vmla_u16(sum, obmc_pred_mask, obmc_pred_val);
  return vrshr_n_u16(sum, 6);
}
399*09537850SAkhilesh Sanikop 
BlendObmc8(uint16_t * LIBGAV1_RESTRICT const pred,const uint16_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)400*09537850SAkhilesh Sanikop inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
401*09537850SAkhilesh Sanikop                              const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
402*09537850SAkhilesh Sanikop                              const uint16x8_t pred_mask,
403*09537850SAkhilesh Sanikop                              const uint16x8_t obmc_pred_mask) {
404*09537850SAkhilesh Sanikop   const uint16x8_t pred_val = vld1q_u16(pred);
405*09537850SAkhilesh Sanikop   const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
406*09537850SAkhilesh Sanikop   const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
407*09537850SAkhilesh Sanikop   const uint16x8_t result =
408*09537850SAkhilesh Sanikop       vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
409*09537850SAkhilesh Sanikop   return result;
410*09537850SAkhilesh Sanikop }
411*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft2xH_NEON(uint16_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint16_t * LIBGAV1_RESTRICT obmc_pred)412*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft2xH_NEON(
413*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
414*09537850SAkhilesh Sanikop     const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
415*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 2;
416*09537850SAkhilesh Sanikop   const uint16x4_t mask_inverter = vdup_n_u16(64);
417*09537850SAkhilesh Sanikop   // Second two lanes unused.
418*09537850SAkhilesh Sanikop   const uint16x4_t pred_mask = vld1_u16(kObmcMask);
419*09537850SAkhilesh Sanikop   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
420*09537850SAkhilesh Sanikop   int y = 0;
421*09537850SAkhilesh Sanikop   do {
422*09537850SAkhilesh Sanikop     const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
423*09537850SAkhilesh Sanikop     const uint16x4_t result_0 =
424*09537850SAkhilesh Sanikop         BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
425*09537850SAkhilesh Sanikop     Store2<0>(pred, result_0);
426*09537850SAkhilesh Sanikop 
427*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
428*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
429*09537850SAkhilesh Sanikop 
430*09537850SAkhilesh Sanikop     const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
431*09537850SAkhilesh Sanikop     const uint16x4_t result_1 =
432*09537850SAkhilesh Sanikop         BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
433*09537850SAkhilesh Sanikop     Store2<0>(pred, result_1);
434*09537850SAkhilesh Sanikop 
435*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
436*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride;
437*09537850SAkhilesh Sanikop 
438*09537850SAkhilesh Sanikop     y += 2;
439*09537850SAkhilesh Sanikop   } while (y != height);
440*09537850SAkhilesh Sanikop }
441*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft4xH_NEON(uint16_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint16_t * LIBGAV1_RESTRICT obmc_pred)442*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft4xH_NEON(
443*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
444*09537850SAkhilesh Sanikop     const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
445*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 4;
446*09537850SAkhilesh Sanikop   const uint16x4_t mask_inverter = vdup_n_u16(64);
447*09537850SAkhilesh Sanikop   const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
448*09537850SAkhilesh Sanikop   // 64 - mask
449*09537850SAkhilesh Sanikop   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
450*09537850SAkhilesh Sanikop   int y = 0;
451*09537850SAkhilesh Sanikop   do {
452*09537850SAkhilesh Sanikop     const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
453*09537850SAkhilesh Sanikop     const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
454*09537850SAkhilesh Sanikop                                               pred_mask, obmc_pred_mask);
455*09537850SAkhilesh Sanikop     vst1_u16(pred, result_0);
456*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
457*09537850SAkhilesh Sanikop 
458*09537850SAkhilesh Sanikop     const uint16x4_t result_1 = BlendObmc2Or4(
459*09537850SAkhilesh Sanikop         pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
460*09537850SAkhilesh Sanikop     vst1_u16(pred, result_1);
461*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
462*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
463*09537850SAkhilesh Sanikop 
464*09537850SAkhilesh Sanikop     y += 2;
465*09537850SAkhilesh Sanikop   } while (y != height);
466*09537850SAkhilesh Sanikop }
467*09537850SAkhilesh Sanikop 
OverlapBlendFromLeft_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)468*09537850SAkhilesh Sanikop void OverlapBlendFromLeft_NEON(
469*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
470*09537850SAkhilesh Sanikop     const int width, const int height,
471*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const obmc_prediction,
472*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
473*09537850SAkhilesh Sanikop   auto* pred = static_cast<uint16_t*>(prediction);
474*09537850SAkhilesh Sanikop   const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
475*09537850SAkhilesh Sanikop   assert(width >= 2);
476*09537850SAkhilesh Sanikop   assert(height >= 4);
477*09537850SAkhilesh Sanikop 
478*09537850SAkhilesh Sanikop   if (width == 2) {
479*09537850SAkhilesh Sanikop     OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
480*09537850SAkhilesh Sanikop     return;
481*09537850SAkhilesh Sanikop   }
482*09537850SAkhilesh Sanikop   if (width == 4) {
483*09537850SAkhilesh Sanikop     OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
484*09537850SAkhilesh Sanikop     return;
485*09537850SAkhilesh Sanikop   }
486*09537850SAkhilesh Sanikop   const uint16x8_t mask_inverter = vdupq_n_u16(64);
487*09537850SAkhilesh Sanikop   const uint16_t* mask = kObmcMask + width - 2;
488*09537850SAkhilesh Sanikop   int x = 0;
489*09537850SAkhilesh Sanikop   do {
490*09537850SAkhilesh Sanikop     uint16_t* pred_x = pred + x;
491*09537850SAkhilesh Sanikop     const uint16_t* obmc_pred_x = obmc_pred + x;
492*09537850SAkhilesh Sanikop     const uint16x8_t pred_mask = vld1q_u16(mask + x);
493*09537850SAkhilesh Sanikop     // 64 - mask
494*09537850SAkhilesh Sanikop     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
495*09537850SAkhilesh Sanikop     int y = 0;
496*09537850SAkhilesh Sanikop     do {
497*09537850SAkhilesh Sanikop       const uint16x8_t result =
498*09537850SAkhilesh Sanikop           BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
499*09537850SAkhilesh Sanikop       vst1q_u16(pred_x, result);
500*09537850SAkhilesh Sanikop 
501*09537850SAkhilesh Sanikop       pred_x = AddByteStride(pred_x, prediction_stride);
502*09537850SAkhilesh Sanikop       obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
503*09537850SAkhilesh Sanikop     } while (++y < height);
504*09537850SAkhilesh Sanikop     x += 8;
505*09537850SAkhilesh Sanikop   } while (x < width);
506*09537850SAkhilesh Sanikop }
507*09537850SAkhilesh Sanikop 
508*09537850SAkhilesh Sanikop template <int lane>
BlendObmcFromTop4(uint16_t * const pred,const uint16x4_t obmc_pred_val,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)509*09537850SAkhilesh Sanikop inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
510*09537850SAkhilesh Sanikop                                     const uint16x4_t obmc_pred_val,
511*09537850SAkhilesh Sanikop                                     const uint16x8_t pred_mask,
512*09537850SAkhilesh Sanikop                                     const uint16x8_t obmc_pred_mask) {
513*09537850SAkhilesh Sanikop   const uint16x4_t pred_val = vld1_u16(pred);
514*09537850SAkhilesh Sanikop   const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
515*09537850SAkhilesh Sanikop   const uint16x4_t result = vrshr_n_u16(
516*09537850SAkhilesh Sanikop       VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
517*09537850SAkhilesh Sanikop   return result;
518*09537850SAkhilesh Sanikop }
519*09537850SAkhilesh Sanikop 
520*09537850SAkhilesh Sanikop template <int lane>
BlendObmcFromTop8(uint16_t * LIBGAV1_RESTRICT const pred,const uint16_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)521*09537850SAkhilesh Sanikop inline uint16x8_t BlendObmcFromTop8(
522*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT const pred,
523*09537850SAkhilesh Sanikop     const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
524*09537850SAkhilesh Sanikop     const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
525*09537850SAkhilesh Sanikop   const uint16x8_t pred_val = vld1q_u16(pred);
526*09537850SAkhilesh Sanikop   const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
527*09537850SAkhilesh Sanikop   const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
528*09537850SAkhilesh Sanikop   const uint16x8_t result = vrshrq_n_u16(
529*09537850SAkhilesh Sanikop       VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
530*09537850SAkhilesh Sanikop   return result;
531*09537850SAkhilesh Sanikop }
532*09537850SAkhilesh Sanikop 
OverlapBlendFromTop4x2Or4_NEON(uint16_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint16_t * LIBGAV1_RESTRICT obmc_pred,const int height)533*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4x2Or4_NEON(
534*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
535*09537850SAkhilesh Sanikop     const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
536*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 4;
537*09537850SAkhilesh Sanikop   const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
538*09537850SAkhilesh Sanikop   const uint16x8_t mask_inverter = vdupq_n_u16(64);
539*09537850SAkhilesh Sanikop   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
540*09537850SAkhilesh Sanikop   const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
541*09537850SAkhilesh Sanikop   uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
542*09537850SAkhilesh Sanikop                                            pred_mask, obmc_pred_mask);
543*09537850SAkhilesh Sanikop   vst1_u16(pred, result);
544*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
545*09537850SAkhilesh Sanikop 
546*09537850SAkhilesh Sanikop   if (height == 2) {
547*09537850SAkhilesh Sanikop     // Mask value is 64, meaning |pred| is unchanged.
548*09537850SAkhilesh Sanikop     return;
549*09537850SAkhilesh Sanikop   }
550*09537850SAkhilesh Sanikop 
551*09537850SAkhilesh Sanikop   result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
552*09537850SAkhilesh Sanikop                                 obmc_pred_mask);
553*09537850SAkhilesh Sanikop   vst1_u16(pred, result);
554*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
555*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride << 1;
556*09537850SAkhilesh Sanikop 
557*09537850SAkhilesh Sanikop   const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
558*09537850SAkhilesh Sanikop   result =
559*09537850SAkhilesh Sanikop       BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
560*09537850SAkhilesh Sanikop   vst1_u16(pred, result);
561*09537850SAkhilesh Sanikop }
562*09537850SAkhilesh Sanikop 
OverlapBlendFromTop4xH_NEON(uint16_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint16_t * LIBGAV1_RESTRICT obmc_pred)563*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4xH_NEON(
564*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
565*09537850SAkhilesh Sanikop     const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
566*09537850SAkhilesh Sanikop   if (height < 8) {
567*09537850SAkhilesh Sanikop     OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
568*09537850SAkhilesh Sanikop     return;
569*09537850SAkhilesh Sanikop   }
570*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 4;
571*09537850SAkhilesh Sanikop   const uint16_t* mask = kObmcMask + height - 2;
572*09537850SAkhilesh Sanikop   const uint16x8_t mask_inverter = vdupq_n_u16(64);
573*09537850SAkhilesh Sanikop   int y = 0;
574*09537850SAkhilesh Sanikop   // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
575*09537850SAkhilesh Sanikop   // lines are unchanged as the corresponding mask value is 64.
576*09537850SAkhilesh Sanikop   do {
577*09537850SAkhilesh Sanikop     const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
578*09537850SAkhilesh Sanikop     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
579*09537850SAkhilesh Sanikop     // Load obmc row 0, 1.
580*09537850SAkhilesh Sanikop     uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
581*09537850SAkhilesh Sanikop     uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
582*09537850SAkhilesh Sanikop                                              pred_mask, obmc_pred_mask);
583*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
584*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
585*09537850SAkhilesh Sanikop 
586*09537850SAkhilesh Sanikop     result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
587*09537850SAkhilesh Sanikop                                   obmc_pred_mask);
588*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
589*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
590*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
591*09537850SAkhilesh Sanikop 
592*09537850SAkhilesh Sanikop     // Load obmc row 2, 3.
593*09537850SAkhilesh Sanikop     obmc_pred_val = vld1q_u16(obmc_pred);
594*09537850SAkhilesh Sanikop     result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
595*09537850SAkhilesh Sanikop                                   obmc_pred_mask);
596*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
597*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
598*09537850SAkhilesh Sanikop 
599*09537850SAkhilesh Sanikop     result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
600*09537850SAkhilesh Sanikop                                   obmc_pred_mask);
601*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
602*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
603*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
604*09537850SAkhilesh Sanikop 
605*09537850SAkhilesh Sanikop     // Load obmc row 4, 5.
606*09537850SAkhilesh Sanikop     obmc_pred_val = vld1q_u16(obmc_pred);
607*09537850SAkhilesh Sanikop     result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
608*09537850SAkhilesh Sanikop                                   obmc_pred_mask);
609*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
610*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
611*09537850SAkhilesh Sanikop 
612*09537850SAkhilesh Sanikop     result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
613*09537850SAkhilesh Sanikop                                   obmc_pred_mask);
614*09537850SAkhilesh Sanikop     vst1_u16(pred, result);
615*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
616*09537850SAkhilesh Sanikop     obmc_pred += obmc_prediction_stride << 1;
617*09537850SAkhilesh Sanikop 
618*09537850SAkhilesh Sanikop     // Increment for the right mask index.
619*09537850SAkhilesh Sanikop     y += 6;
620*09537850SAkhilesh Sanikop   } while (y < height - 4);
621*09537850SAkhilesh Sanikop }
622*09537850SAkhilesh Sanikop 
OverlapBlendFromTop8xH_NEON(uint16_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint16_t * LIBGAV1_RESTRICT obmc_pred,const int height)623*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop8xH_NEON(
624*09537850SAkhilesh Sanikop     uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
625*09537850SAkhilesh Sanikop     const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
626*09537850SAkhilesh Sanikop   const uint16_t* mask = kObmcMask + height - 2;
627*09537850SAkhilesh Sanikop   const uint16x8_t mask_inverter = vdupq_n_u16(64);
628*09537850SAkhilesh Sanikop   uint16x8_t pred_mask = vld1q_u16(mask);
629*09537850SAkhilesh Sanikop   uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
630*09537850SAkhilesh Sanikop   uint16x8_t result =
631*09537850SAkhilesh Sanikop       BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
632*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
633*09537850SAkhilesh Sanikop   if (height == 2) return;
634*09537850SAkhilesh Sanikop 
635*09537850SAkhilesh Sanikop   constexpr int obmc_prediction_stride = 8;
636*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
637*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
638*09537850SAkhilesh Sanikop 
639*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
640*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
641*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
642*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
643*09537850SAkhilesh Sanikop 
644*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
645*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
646*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
647*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
648*09537850SAkhilesh Sanikop 
649*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
650*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
651*09537850SAkhilesh Sanikop   if (height == 4) return;
652*09537850SAkhilesh Sanikop 
653*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
654*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
655*09537850SAkhilesh Sanikop 
656*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
657*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
658*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
659*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
660*09537850SAkhilesh Sanikop 
661*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
662*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
663*09537850SAkhilesh Sanikop 
664*09537850SAkhilesh Sanikop   if (height == 8) return;
665*09537850SAkhilesh Sanikop 
666*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
667*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
668*09537850SAkhilesh Sanikop 
669*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
670*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
671*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
672*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
673*09537850SAkhilesh Sanikop 
674*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
675*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
676*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
677*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
678*09537850SAkhilesh Sanikop 
679*09537850SAkhilesh Sanikop   pred_mask = vld1q_u16(&mask[8]);
680*09537850SAkhilesh Sanikop   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
681*09537850SAkhilesh Sanikop 
682*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
683*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
684*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
685*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
686*09537850SAkhilesh Sanikop 
687*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
688*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
689*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
690*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
691*09537850SAkhilesh Sanikop 
692*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
693*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
694*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
695*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
696*09537850SAkhilesh Sanikop 
697*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
698*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
699*09537850SAkhilesh Sanikop 
700*09537850SAkhilesh Sanikop   if (height == 16) return;
701*09537850SAkhilesh Sanikop 
702*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
703*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
704*09537850SAkhilesh Sanikop 
705*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
706*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
707*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
708*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
709*09537850SAkhilesh Sanikop 
710*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
711*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
712*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
713*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
714*09537850SAkhilesh Sanikop 
715*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
716*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
717*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
718*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
719*09537850SAkhilesh Sanikop 
720*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
721*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
722*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
723*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
724*09537850SAkhilesh Sanikop 
725*09537850SAkhilesh Sanikop   pred_mask = vld1q_u16(&mask[16]);
726*09537850SAkhilesh Sanikop   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
727*09537850SAkhilesh Sanikop 
728*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
729*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
730*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
731*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
732*09537850SAkhilesh Sanikop 
733*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
734*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
735*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
736*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
737*09537850SAkhilesh Sanikop 
738*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
739*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
740*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
741*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
742*09537850SAkhilesh Sanikop 
743*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
744*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
745*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
746*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
747*09537850SAkhilesh Sanikop 
748*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
749*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
750*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
751*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
752*09537850SAkhilesh Sanikop 
753*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
754*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
755*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
756*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
757*09537850SAkhilesh Sanikop 
758*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
759*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
760*09537850SAkhilesh Sanikop   pred = AddByteStride(pred, prediction_stride);
761*09537850SAkhilesh Sanikop   obmc_pred += obmc_prediction_stride;
762*09537850SAkhilesh Sanikop 
763*09537850SAkhilesh Sanikop   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
764*09537850SAkhilesh Sanikop   vst1q_u16(pred, result);
765*09537850SAkhilesh Sanikop }
766*09537850SAkhilesh Sanikop 
OverlapBlendFromTop_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)767*09537850SAkhilesh Sanikop void OverlapBlendFromTop_NEON(
768*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
769*09537850SAkhilesh Sanikop     const int width, const int height,
770*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const obmc_prediction,
771*09537850SAkhilesh Sanikop     const ptrdiff_t obmc_prediction_stride) {
772*09537850SAkhilesh Sanikop   auto* pred = static_cast<uint16_t*>(prediction);
773*09537850SAkhilesh Sanikop   const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
774*09537850SAkhilesh Sanikop   assert(width >= 4);
775*09537850SAkhilesh Sanikop   assert(height >= 2);
776*09537850SAkhilesh Sanikop 
777*09537850SAkhilesh Sanikop   if (width == 4) {
778*09537850SAkhilesh Sanikop     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
779*09537850SAkhilesh Sanikop     return;
780*09537850SAkhilesh Sanikop   }
781*09537850SAkhilesh Sanikop 
782*09537850SAkhilesh Sanikop   if (width == 8) {
783*09537850SAkhilesh Sanikop     OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
784*09537850SAkhilesh Sanikop     return;
785*09537850SAkhilesh Sanikop   }
786*09537850SAkhilesh Sanikop 
787*09537850SAkhilesh Sanikop   const uint16_t* mask = kObmcMask + height - 2;
788*09537850SAkhilesh Sanikop   const uint16x8_t mask_inverter = vdupq_n_u16(64);
789*09537850SAkhilesh Sanikop   const uint16x8_t pred_mask = vld1q_u16(mask);
790*09537850SAkhilesh Sanikop   // 64 - mask
791*09537850SAkhilesh Sanikop   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
792*09537850SAkhilesh Sanikop #define OBMC_ROW_FROM_TOP(n)                                   \
793*09537850SAkhilesh Sanikop   do {                                                         \
794*09537850SAkhilesh Sanikop     int x = 0;                                                 \
795*09537850SAkhilesh Sanikop     do {                                                       \
796*09537850SAkhilesh Sanikop       const uint16x8_t result = BlendObmcFromTop8<n>(          \
797*09537850SAkhilesh Sanikop           pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
798*09537850SAkhilesh Sanikop       vst1q_u16(pred + x, result);                             \
799*09537850SAkhilesh Sanikop                                                                \
800*09537850SAkhilesh Sanikop       x += 8;                                                  \
801*09537850SAkhilesh Sanikop     } while (x < width);                                       \
802*09537850SAkhilesh Sanikop   } while (false)
803*09537850SAkhilesh Sanikop 
804*09537850SAkhilesh Sanikop   // Compute 1 row.
805*09537850SAkhilesh Sanikop   if (height == 2) {
806*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
807*09537850SAkhilesh Sanikop     return;
808*09537850SAkhilesh Sanikop   }
809*09537850SAkhilesh Sanikop 
810*09537850SAkhilesh Sanikop   // Compute 3 rows.
811*09537850SAkhilesh Sanikop   if (height == 4) {
812*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
813*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
814*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
815*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(1);
816*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
817*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
818*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(2);
819*09537850SAkhilesh Sanikop     return;
820*09537850SAkhilesh Sanikop   }
821*09537850SAkhilesh Sanikop 
822*09537850SAkhilesh Sanikop   // Compute 6 rows.
823*09537850SAkhilesh Sanikop   if (height == 8) {
824*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
825*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
826*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
827*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(1);
828*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
829*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
830*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(2);
831*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
832*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
833*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(3);
834*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
835*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
836*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(4);
837*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
838*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
839*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(5);
840*09537850SAkhilesh Sanikop     return;
841*09537850SAkhilesh Sanikop   }
842*09537850SAkhilesh Sanikop 
843*09537850SAkhilesh Sanikop   // Compute 12 rows.
844*09537850SAkhilesh Sanikop   if (height == 16) {
845*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
846*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
847*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
848*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(1);
849*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
850*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
851*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(2);
852*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
853*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
854*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(3);
855*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
856*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
857*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(4);
858*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
859*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
860*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(5);
861*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
862*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
863*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(6);
864*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
865*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
866*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(7);
867*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
868*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
869*09537850SAkhilesh Sanikop 
870*09537850SAkhilesh Sanikop     const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
871*09537850SAkhilesh Sanikop     // 64 - mask
872*09537850SAkhilesh Sanikop     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
873*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
874*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
875*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
876*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(1);
877*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
878*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
879*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(2);
880*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
881*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
882*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(3);
883*09537850SAkhilesh Sanikop     return;
884*09537850SAkhilesh Sanikop   }
885*09537850SAkhilesh Sanikop 
886*09537850SAkhilesh Sanikop   // Stop when mask value becomes 64. This is a multiple of 8 for height 32
887*09537850SAkhilesh Sanikop   // and 64.
888*09537850SAkhilesh Sanikop   const int compute_height = height - (height >> 2);
889*09537850SAkhilesh Sanikop   int y = 0;
890*09537850SAkhilesh Sanikop   do {
891*09537850SAkhilesh Sanikop     const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
892*09537850SAkhilesh Sanikop     // 64 - mask
893*09537850SAkhilesh Sanikop     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
894*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(0);
895*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
896*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
897*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(1);
898*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
899*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
900*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(2);
901*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
902*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
903*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(3);
904*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
905*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
906*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(4);
907*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
908*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
909*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(5);
910*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
911*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
912*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(6);
913*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
914*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
915*09537850SAkhilesh Sanikop     OBMC_ROW_FROM_TOP(7);
916*09537850SAkhilesh Sanikop     pred = AddByteStride(pred, prediction_stride);
917*09537850SAkhilesh Sanikop     obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
918*09537850SAkhilesh Sanikop 
919*09537850SAkhilesh Sanikop     y += 8;
920*09537850SAkhilesh Sanikop   } while (y < compute_height);
921*09537850SAkhilesh Sanikop }
922*09537850SAkhilesh Sanikop 
Init10bpp()923*09537850SAkhilesh Sanikop void Init10bpp() {
924*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
925*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
926*09537850SAkhilesh Sanikop   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
927*09537850SAkhilesh Sanikop   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
928*09537850SAkhilesh Sanikop }
929*09537850SAkhilesh Sanikop 
930*09537850SAkhilesh Sanikop }  // namespace
931*09537850SAkhilesh Sanikop }  // namespace high_bitdepth
932*09537850SAkhilesh Sanikop #endif  // LIBGAV1_MAX_BITDEPTH >= 10
933*09537850SAkhilesh Sanikop 
ObmcInit_NEON()934*09537850SAkhilesh Sanikop void ObmcInit_NEON() {
935*09537850SAkhilesh Sanikop   low_bitdepth::Init8bpp();
936*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
937*09537850SAkhilesh Sanikop   high_bitdepth::Init10bpp();
938*09537850SAkhilesh Sanikop #endif
939*09537850SAkhilesh Sanikop }
940*09537850SAkhilesh Sanikop 
941*09537850SAkhilesh Sanikop }  // namespace dsp
942*09537850SAkhilesh Sanikop }  // namespace libgav1
943*09537850SAkhilesh Sanikop 
944*09537850SAkhilesh Sanikop #else   // !LIBGAV1_ENABLE_NEON
945*09537850SAkhilesh Sanikop 
946*09537850SAkhilesh Sanikop namespace libgav1 {
947*09537850SAkhilesh Sanikop namespace dsp {
948*09537850SAkhilesh Sanikop 
// Stub compiled when LIBGAV1_ENABLE_NEON is off: keeps the symbol available
// to callers while doing nothing.
void ObmcInit_NEON() {
  // Intentionally empty.
}
950*09537850SAkhilesh Sanikop 
951*09537850SAkhilesh Sanikop }  // namespace dsp
952*09537850SAkhilesh Sanikop }  // namespace libgav1
953*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENABLE_NEON
954