xref: /aosp_15_r20/external/libgav1/src/dsp/arm/film_grain_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 #include <arm_neon.h>
20 
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25 
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/film_grain_common.h"
30 #include "src/utils/array_2d.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/constants.h"
34 #include "src/utils/memory.h"
35 #include "src/utils/types.h"
36 
37 namespace libgav1 {
38 namespace dsp {
39 namespace film_grain {
40 namespace {
41 
42 // These functions are overloaded for both possible sizes in order to simplify
43 // loading and storing to and from intermediate value types from within a
44 // template function.
// Loads 8 signed 8-bit grain samples and widens them to 16 bits.
inline int16x8_t GetSignedSource8(const int8_t* src) {
  const int8x8_t samples = vld1_s8(src);
  return vmovl_s8(samples);
}
48 
// Loads 8 unsigned 8-bit samples and zero-extends them into a signed
// 16-bit vector.
inline int16x8_t GetSignedSource8(const uint8_t* src) {
  const uint8x8_t samples = vld1_u8(src);
  return ZeroExtend(samples);
}
52 
// MemorySanitizer-aware variant: masks the lanes past |valid_range| so the
// intentional overread is not reported.
inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
  const uint8x8_t samples = Load1MsanU8(src, 8 - valid_range);
  return ZeroExtend(samples);
}
56 
// Narrows 8 unsigned 16-bit values to 8 bits and stores them.
inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
  const uint8x8_t narrowed = vmovn_u16(data);
  vst1_u8(dest, narrowed);
}
60 
61 #if LIBGAV1_MAX_BITDEPTH >= 10
// 10bpp grain is already 16 bits wide; a plain load suffices.
inline int16x8_t GetSignedSource8(const int16_t* src) {
  return vld1q_s16(src);
}
63 
// Loads 8 unsigned 16-bit values and reinterprets them as signed. The bit
// pattern is unchanged.
inline int16x8_t GetSignedSource8(const uint16_t* src) {
  const uint16x8_t samples = vld1q_u16(src);
  return vreinterpretq_s16_u16(samples);
}
67 
// MemorySanitizer-aware variant of the 16-bit load; lanes beyond
// |valid_range| are masked.
inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
  const uint16x8_t samples = Load1QMsanU16(src, 16 - valid_range);
  return vreinterpretq_s16_u16(samples);
}
71 
// 10bpp destination is 16 bits wide; store the full vector without
// narrowing.
inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
  vst1q_u16(dest, data);
}
75 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
76 
77 // Each element in |sum| represents one destination value's running
78 // autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
79 // allow for a sliding window in successive calls to this function.
template <int position_offset>
inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
                                           const int16x8_t grain_hi,
                                           int16_t coeff, int32x4x2_t sum) {
  // Slide an 8-sample window across the 16 loaded samples; the template
  // parameter keeps the shift amount an immediate, as vextq_s16 requires.
  const int16x8_t window = vextq_s16(grain_lo, grain_hi, position_offset);
  const int16x4_t window_lo = vget_low_s16(window);
  const int16x4_t window_hi = vget_high_s16(window);
  // Multiply-accumulate into the 32-bit running sums.
  sum.val[0] = vmlal_n_s16(sum.val[0], window_lo, coeff);
  sum.val[1] = vmlal_n_s16(sum.val[1], window_hi, coeff);
  return sum;
}
89 
90 // Because the autoregressive filter requires the output of each pixel to
91 // compute pixels that come after in the row, we have to finish the calculations
92 // one at a time.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
                                     int32x4x2_t sum,
                                     const int8_t* LIBGAV1_RESTRICT coeffs,
                                     int pos, int shift) {
  // Begin with the vectorized partial sum for this output lane.
  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // Fold in the samples immediately to the left on the current row; they
  // were produced by the preceding calls, so they must be read scalar-wise.
  int delta_col = -auto_regression_coeff_lag;
  while (delta_col < 0) {
    result += grain_cursor[lane + delta_col] * coeffs[pos];
    ++pos;
    ++delta_col;
  }
  const int filtered =
      grain_cursor[lane] + RightShiftWithRounding(result, shift);
  grain_cursor[lane] =
      Clip3(filtered, GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
108 
109 #if LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
                                     int32x4x2_t sum,
                                     const int8_t* LIBGAV1_RESTRICT coeffs,
                                     int pos, int shift) {
  // Begin with the vectorized partial sum for this output lane.
  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // Fold in the samples immediately to the left on the current row; they
  // were produced by the preceding calls, so they must be read scalar-wise.
  int delta_col = -auto_regression_coeff_lag;
  while (delta_col < 0) {
    result += grain_cursor[lane + delta_col] * coeffs[pos];
    ++pos;
    ++delta_col;
  }
  const int filtered =
      grain_cursor[lane] + RightShiftWithRounding(result, shift);
  grain_cursor[lane] =
      Clip3(filtered, GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
125 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
126 
127 // Because the autoregressive filter requires the output of each pixel to
128 // compute pixels that come after in the row, we have to finish the calculations
129 // one at a time.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(
    int8_t* LIBGAV1_RESTRICT u_grain_cursor,
    int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
  // Finalize the U-plane sample, then the V-plane sample, at the same lane.
  // The planes are independent, so each delegates to the luma writer.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
141 
142 #if LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(
    int16_t* LIBGAV1_RESTRICT u_grain_cursor,
    int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
  // Finalize the U-plane sample, then the V-plane sample, at the same lane.
  // The planes are independent, so each delegates to the luma writer.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
154 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
155 
// Clears both halves of a 32-bit accumulator pair.
inline void SetZero(int32x4x2_t* v) {
  const int32x4_t zero = vdupq_n_s32(0);
  v->val[0] = zero;
  v->val[1] = zero;
}
160 
161 // Computes subsampled luma for use with chroma, by averaging in the x direction
162 // or y direction when applicable.
int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_y != 0) {
    // Vertical subsampling implies horizontal subsampling here: average a
    // 2x2 neighborhood. vpaddl_s8 sums horizontal pairs into 16 bits.
    assert(subsampling_x != 0);
    const int8x16_t row0 = vld1q_s8(luma);
    const int8x16_t row1 = vld1q_s8(luma + stride);
    const int16x8_t pair_sums0 = vcombine_s16(vpaddl_s8(vget_low_s8(row0)),
                                              vpaddl_s8(vget_high_s8(row0)));
    const int16x8_t pair_sums1 = vcombine_s16(vpaddl_s8(vget_low_s8(row1)),
                                              vpaddl_s8(vget_high_s8(row1)));
    // Four samples contribute; divide by 4 with rounding.
    return vrshrq_n_s16(vaddq_s16(pair_sums0, pair_sums1), 2);
  }
  if (subsampling_x != 0) {
    // Horizontal-only subsampling: average adjacent sample pairs.
    const int8x16_t row = vld1q_s8(luma);
    const int16x8_t pair_sums = vcombine_s16(vpaddl_s8(vget_low_s8(row)),
                                             vpaddl_s8(vget_high_s8(row)));
    return vrshrq_n_s16(pair_sums, 1);
  }
  // No subsampling: just widen 8 samples.
  return vmovl_s8(vld1_s8(luma));
}
183 
184 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
  // Without subsampling, widen 8 samples unchanged.
  if (subsampling_x == 0) return vmovl_u8(vld1_u8(luma));
  // With subsampling, sum adjacent pairs and halve with rounding.
  const uint8x16_t row = vld1q_u8(luma);
  return vrshrq_n_u16(vpaddlq_u8(row), 1);
}
192 
// MemorySanitizer-aware variant of GetAverageLuma; lanes past |valid_range|
// are masked so intentional overreads are not reported.
inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
                                     int subsampling_x, int valid_range) {
  if (subsampling_x == 0) {
    return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
  }
  const uint8x16_t row = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
  // MemorySanitizer registers vpaddlq_u8 as a use of the memory.
  return vrshrq_n_u16(vpaddlq_u8(row), 1);
}
202 
203 #if LIBGAV1_MAX_BITDEPTH >= 10
204 // Computes subsampled luma for use with chroma, by averaging in the x direction
205 // or y direction when applicable.
int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_y != 0) {
    // Vertical subsampling implies horizontal subsampling here: average a
    // 2x2 neighborhood. vpadd_s16 sums adjacent pairs within each row.
    assert(subsampling_x != 0);
    const int16x8_t row0_lo = vld1q_s16(luma);
    const int16x8_t row0_hi = vld1q_s16(luma + 8);
    const int16x8_t row1_lo = vld1q_s16(luma + stride);
    const int16x8_t row1_hi = vld1q_s16(luma + stride + 8);
    const int16x8_t pair_sums0 =
        vcombine_s16(vpadd_s16(vget_low_s16(row0_lo), vget_high_s16(row0_lo)),
                     vpadd_s16(vget_low_s16(row0_hi), vget_high_s16(row0_hi)));
    const int16x8_t pair_sums1 =
        vcombine_s16(vpadd_s16(vget_low_s16(row1_lo), vget_high_s16(row1_lo)),
                     vpadd_s16(vget_low_s16(row1_hi), vget_high_s16(row1_hi)));
    // Four samples contribute; divide by 4 with rounding.
    return vrshrq_n_s16(vaddq_s16(pair_sums0, pair_sums1), 2);
  }
  if (subsampling_x != 0) {
    // Horizontal-only subsampling: average adjacent sample pairs.
    const int16x8_t row_lo = vld1q_s16(luma);
    const int16x8_t row_hi = vld1q_s16(luma + 8);
    const int16x8_t pair_sums =
        vcombine_s16(vpadd_s16(vget_low_s16(row_lo), vget_high_s16(row_lo)),
                     vpadd_s16(vget_low_s16(row_hi), vget_high_s16(row_hi)));
    return vrshrq_n_s16(pair_sums, 1);
  }
  // No subsampling: samples are already 16 bits wide.
  return vld1q_s16(luma);
}
232 
233 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
                                 int subsampling_x) {
  if (subsampling_x == 0) return vld1q_u16(luma);
  // Deinterleave even/odd samples and average each pair with a single
  // rounding halving add.
  const uint16x8x2_t deinterleaved = vld2q_u16(luma);
  return vrhaddq_u16(deinterleaved.val[0], deinterleaved.val[1]);
}
242 
// MemorySanitizer-aware variant of GetAverageLuma for 16-bit samples; lanes
// past |valid_range| are masked.
inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
                                     int subsampling_x, int valid_range) {
  if (subsampling_x == 0) return Load1QMsanU16(luma, 16 - valid_range);
  const uint16x8x2_t deinterleaved = vld2q_u16(luma);
  const uint16x8_t averaged =
      vrhaddq_u16(deinterleaved.val[0], deinterleaved.val[1]);
  return MaskOverreadsQ(averaged, 16 - valid_range);
}
252 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
253 
// Applies an auto-regressive filter to the white noise in the U and V grain
// blocks. When |use_luma| is set, a (possibly subsampled) luma grain sample
// weighted by the final coefficient of each chroma filter is also added.
// |auto_regression_coeff_lag| selects the neighborhood size; see the
// pictorial representation in ApplyAutoRegressiveFilterToLumaGrain_NEON.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
          bool use_luma>
void ApplyAutoRegressiveFilterToChromaGrains_NEON(
    const FilmGrainParams& params,
    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
    void* LIBGAV1_RESTRICT v_grain_buffer) {
  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
  const int auto_regression_shift = params.auto_regression_shift;
  // Chroma block dimensions depend on the subsampling mode.
  const int chroma_width =
      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
  const int chroma_height =
      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
  // leaving [35, 40] to write at the end.
  const int chroma_width_remainder =
      (chroma_width - 2 * kAutoRegressionBorder) & 7;

  // The top |kAutoRegressionBorder| rows are filter context only and are
  // never overwritten.
  int y = kAutoRegressionBorder;
  luma_grain += kLumaWidth * y;
  u_grain += chroma_width * y;
  v_grain += chroma_width * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, 4 values remain to write. They are given a special
    // reduced iteration at the end.
    int x = kAutoRegressionBorder;
    int luma_x = kAutoRegressionBorder;
    do {
      // |pos| walks the coefficient arrays in raster order of the filter
      // support; it is shared by the U and V accumulations.
      int pos = 0;
      int32x4x2_t sum_u;
      int32x4x2_t sum_v;
      SetZero(&sum_u);
      SetZero(&sum_v);

      if (auto_regression_coeff_lag > 0) {
        // Accumulate contributions from the rows above the current sample.
        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
             ++delta_row) {
          // These loads may overflow to the next row, but they are never called
          // on the final row of a grain block. Therefore, they will never
          // exceed the block boundaries.
          // Note: this could be slightly optimized to a single load in 8bpp,
          // but requires making a special first iteration and accumulate
          // function that takes an int8x16_t.
          const int16x8_t u_grain_lo =
              GetSignedSource8(u_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag);
          const int16x8_t u_grain_hi =
              GetSignedSource8(u_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag + 8);
          const int16x8_t v_grain_lo =
              GetSignedSource8(v_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag);
          const int16x8_t v_grain_hi =
              GetSignedSource8(v_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag + 8);
// Advances |pos| once per invocation (via the V accumulation).
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                  \
  sum_u = AccumulateWeightedGrain<offset>(                                 \
      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
  sum_v = AccumulateWeightedGrain<offset>(                                 \
      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)

          ACCUMULATE_WEIGHTED_GRAIN(0);
          ACCUMULATE_WEIGHTED_GRAIN(1);
          ACCUMULATE_WEIGHTED_GRAIN(2);
          // The horizontal |auto_regression_coeff_lag| loop is replaced with
          // if-statements to give vextq_s16 an immediate param.
          if (auto_regression_coeff_lag > 1) {
            ACCUMULATE_WEIGHTED_GRAIN(3);
            ACCUMULATE_WEIGHTED_GRAIN(4);
          }
          if (auto_regression_coeff_lag > 2) {
            assert(auto_regression_coeff_lag == 3);
            ACCUMULATE_WEIGHTED_GRAIN(5);
            ACCUMULATE_WEIGHTED_GRAIN(6);
          }
        }
      }

      if (use_luma) {
        const int16x8_t luma = GetSubsampledLuma(
            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

        // Luma samples get the final coefficient in the formula, but are best
        // computed all at once before the final row.
        const int coeff_u =
            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
        const int coeff_v =
            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
      }
      // At this point in the filter, the source addresses and destination
      // addresses overlap. Because this is an auto-regressive filter, the
      // higher lanes cannot be computed without the results of the lower lanes.
      // Each call to WriteFinalAutoRegression incorporates preceding values
      // on the final row, and writes a single sample. This allows the next
      // pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
      params.auto_regression_coeff_v, pos, auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);

      x += 8;
      // The luma cursor advances twice as fast when chroma is horizontally
      // subsampled.
      luma_x += 8 << subsampling_x;
    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);

    // This is the "final iteration" of the above loop over width. We fill in
    // the remainder of the width, which is less than 8.
    int pos = 0;
    int32x4x2_t sum_u;
    int32x4x2_t sum_v;
    SetZero(&sum_u);
    SetZero(&sum_v);

    // No lag > 0 guard is needed here: a zero lag makes this loop a no-op.
    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      // These loads may overflow to the next row, but they are never called on
      // the final row of a grain block. Therefore, they will never exceed the
      // block boundaries.
      const int16x8_t u_grain_lo = GetSignedSource8(
          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t u_grain_hi =
          GetSignedSource8(u_grain + x + delta_row * chroma_width -
                           auto_regression_coeff_lag + 8);
      const int16x8_t v_grain_lo = GetSignedSource8(
          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t v_grain_hi =
          GetSignedSource8(v_grain + x + delta_row * chroma_width -
                           auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }

    if (use_luma) {
      const int16x8_t luma = GetSubsampledLuma(
          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

      // Luma samples get the final coefficient in the formula, but are best
      // computed all at once before the final row.
      const int coeff_u =
          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
      const int coeff_v =
          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
    }

    // The write pattern below implies |chroma_width_remainder| is 4 or 6:
    // lanes 0-3 are always written; lanes 4-5 only in the 6-remainder
    // (44-wide) case.
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    if (chroma_width_remainder == 6) {
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
    }

    // Advance all three planes one (chroma) row; luma skips a row when
    // vertically subsampled.
    luma_grain += kLumaWidth << subsampling_y;
    u_grain += chroma_width;
    v_grain += chroma_width;
  } while (++y < chroma_height);
#undef ACCUMULATE_WEIGHTED_GRAIN
#undef WRITE_AUTO_REGRESSION_RESULT
}
449 
450 // Applies an auto-regressive filter to the white noise in luma_grain.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
                                               void* luma_grain_buffer) {
  // Zero lag means no filtering; the caller must not invoke this function in
  // that case (see the diagram comment below).
  static_assert(auto_regression_coeff_lag > 0, "");
  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
  const uint8_t auto_regression_shift = params.auto_regression_shift;

  // The top |kAutoRegressionBorder| rows are filter context only and are
  // never overwritten.
  int y = kAutoRegressionBorder;
  auto* luma_grain =
      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, 4 values remain to write. They are given a special
    // reduced iteration at the end.
    int x = kAutoRegressionBorder;
    do {
      // |pos| walks the coefficient array in raster order of the filter
      // support.
      int pos = 0;
      int32x4x2_t sum;
      SetZero(&sum);
      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
           ++delta_row) {
        // These loads may overflow to the next row, but they are never called
        // on the final row of a grain block. Therefore, they will never exceed
        // the block boundaries.
        const int16x8_t src_grain_lo =
            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                             auto_regression_coeff_lag);
        const int16x8_t src_grain_hi =
            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                             auto_regression_coeff_lag + 8);

        // A pictorial representation of the auto-regressive filter for
        // various values of params.auto_regression_coeff_lag. The letter 'O'
        // represents the current sample. (The filter always operates on the
        // current sample with filter coefficient 1.) The letters 'X'
        // represent the neighboring samples that the filter operates on, below
        // their corresponding "offset" number.
        //
        // params.auto_regression_coeff_lag == 3:
        //   0 1 2 3 4 5 6
        //   X X X X X X X
        //   X X X X X X X
        //   X X X X X X X
        //   X X X O
        // params.auto_regression_coeff_lag == 2:
        //     0 1 2 3 4
        //     X X X X X
        //     X X X X X
        //     X X O
        // params.auto_regression_coeff_lag == 1:
        //       0 1 2
        //       X X X
        //       X O
        // params.auto_regression_coeff_lag == 0:
        //         O
        // The function relies on the caller to skip the call in the 0 lag
        // case.

// Advances |pos| once per invocation.
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
                                        auto_regression_coeff_y[pos++], sum)
        ACCUMULATE_WEIGHTED_GRAIN(0);
        ACCUMULATE_WEIGHTED_GRAIN(1);
        ACCUMULATE_WEIGHTED_GRAIN(2);
        // The horizontal |auto_regression_coeff_lag| loop is replaced with
        // if-statements to give vextq_s16 an immediate param.
        if (auto_regression_coeff_lag > 1) {
          ACCUMULATE_WEIGHTED_GRAIN(3);
          ACCUMULATE_WEIGHTED_GRAIN(4);
        }
        if (auto_regression_coeff_lag > 2) {
          assert(auto_regression_coeff_lag == 3);
          ACCUMULATE_WEIGHTED_GRAIN(5);
          ACCUMULATE_WEIGHTED_GRAIN(6);
        }
      }
      // At this point in the filter, the source addresses and destination
      // addresses overlap. Because this is an auto-regressive filter, the
      // higher lanes cannot be computed without the results of the lower lanes.
      // Each call to WriteFinalAutoRegression incorporates preceding values
      // on the final row, and writes a single sample. This allows the next
      // pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
      auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);
      x += 8;
      // Leave the final four pixels for the special iteration below.
    } while (x < kLumaWidth - kAutoRegressionBorder - 4);

    // Final 4 pixels in the row.
    int pos = 0;
    int32x4x2_t sum;
    SetZero(&sum);
    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      const int16x8_t src_grain_lo = GetSignedSource8(
          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
      const int16x8_t src_grain_hi =
          GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                           auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }
    // delta_row == 0
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    luma_grain += kLumaWidth;
  } while (++y < kLumaHeight);

#undef WRITE_AUTO_REGRESSION_RESULT
#undef ACCUMULATE_WEIGHTED_GRAIN
}
588 
589 template <int bitdepth>
InitializeScalingLookupTable_NEON(int num_points,const uint8_t point_value[],const uint8_t point_scaling[],int16_t * scaling_lut,const int scaling_lut_length)590 void InitializeScalingLookupTable_NEON(int num_points,
591                                        const uint8_t point_value[],
592                                        const uint8_t point_scaling[],
593                                        int16_t* scaling_lut,
594                                        const int scaling_lut_length) {
595   static_assert(bitdepth < kBitdepth12,
596                 "NEON Scaling lookup table only supports 8bpp and 10bpp.");
597   if (num_points == 0) {
598     memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
599     return;
600   }
601   static_assert(sizeof(scaling_lut[0]) == 2, "");
602   Memset(scaling_lut, point_scaling[0],
603          (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
604   const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
605   const int32x4_t rounding = vdupq_n_s32(32768);
606   for (int i = 0; i < num_points - 1; ++i) {
607     const int delta_y = point_scaling[i + 1] - point_scaling[i];
608     const int delta_x = point_value[i + 1] - point_value[i];
609     // |delta| corresponds to b, for the function y = a + b*x.
610     const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
611     const int delta4 = delta << 2;
612     // vmull_n_u16 will not work here because |delta| typically exceeds the
613     // range of uint16_t.
614     int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
615     const int32x4_t line_increment4 = vdupq_n_s32(delta4);
616     // Get the second set of 4 points by adding 4 steps to the first set.
617     int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
618     // We obtain the next set of 8 points by adding 8 steps to each of the
619     // current 8 points.
620     const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
621     const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
622     int x = 0;
623     // Derive and write 8 values (or 32 values, for 10bpp).
624     do {
625       const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
626       const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
627       const int16x8_t interp_points =
628           vcombine_s16(interp_points0, interp_points1);
629       // The spec guarantees that the max value of |point_value[i]| + x is 255.
630       // Writing 8 values starting at the final table byte, leaves 7 values of
631       // required padding.
632       const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
633       const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
634       if (bitdepth == kBitdepth10) {
635         const int16x8_t next_val = vaddq_s16(
636             base_point,
637             vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
638         const int16x8_t start = full_interp;
639         const int16x8_t end = vextq_s16(full_interp, next_val, 1);
640         // lut[i << 2] = start;
641         // lut[(i << 2) + 1] = start + RightShiftWithRounding(start - end, 2)
642         // lut[(i << 2) + 2] = start +
643         //                      RightShiftWithRounding(2 * (start - end), 2)
644         // lut[(i << 2) + 3] = start +
645         //                      RightShiftWithRounding(3 * (start - end), 2)
646         const int16x8_t delta = vsubq_s16(end, start);
647         const int16x8_t double_delta = vshlq_n_s16(delta, 1);
648         const int16x8_t delta2 = vrshrq_n_s16(double_delta, 2);
649         const int16x8_t delta3 =
650             vrshrq_n_s16(vaddq_s16(delta, double_delta), 2);
651         const int16x8x4_t result = {
652             start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
653             vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
654         Store4QMsanS16(&scaling_lut[x_base], result);
655       } else {
656         vst1q_s16(&scaling_lut[x_base], full_interp);
657       }
658       upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
659       upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
660       x += 8;
661     } while (x < delta_x);
662   }
663   const int16_t last_point_value = point_value[num_points - 1];
664   const int x_base = last_point_value << (bitdepth - kBitdepth8);
665   Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
666          scaling_lut_length - x_base);
667   if (bitdepth == kBitdepth10 && x_base > 0) {
668     const int start = scaling_lut[x_base - 4];
669     const int end = point_scaling[num_points - 1];
670     const int delta = end - start;
671     scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
672     scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
673     scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
674   }
675 }
676 
// Clamps every lane of |value| to the inclusive range [low, high].
inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
                       const int16x8_t high) {
  // max(low, min(value, high)) performs the two-sided clamp lane-wise.
  return vmaxq_s16(low, vminq_s16(value, high));
}
682 
template <int bitdepth, typename Pixel>
// Gathers 8 scaling-lut entries, one per source pixel, into a vector.
// Lanes at or beyond |valid_range| are loaded but must not be used by the
// caller.
inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
                                   const Pixel* source,
                                   const int valid_range = 8) {
  static_assert(bitdepth <= kBitdepth10,
                "NEON Film Grain is not yet implemented for 12bpp.");
  int16_t lut_values[8];
#if LIBGAV1_MSAN
  // Initialize the unused lanes so msan does not flag the vector load below.
  if (valid_range < 8) memset(lut_values, 0, sizeof(lut_values));
#endif
  for (int lane = 0; lane < valid_range; ++lane) {
    assert(source[lane] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
    lut_values[lane] = scaling_lut[source[lane]];
  }
  return vld1q_s16(lut_values);
}
699 
template <int bitdepth>
// Applies the scaling factors to the noise and performs the scaling shift.
inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
                            const int16x8_t scaling_shift_vect) {
  if (bitdepth == kBitdepth8) {
    // Max product is 127 * 255 = 0x7E81, which fits in int16_t, so a plain
    // multiply plus rounding shift suffices.
    return vrshlq_s16(vmulq_s16(noise, scaling), scaling_shift_vect);
  }
  // In 10bpp the product can overflow int16_t, so use the saturating
  // doubling-multiply-returning-high-half, which is equivalent to a right
  // shift by 15. |scaling_shift_vect| holds 15 - s, where s (in [8, 11]) is
  // the original shift, so the net shift equals s.
  const int16x8_t boosted_scaling = vshlq_s16(scaling, scaling_shift_vect);
  return vqrdmulhq_s16(noise, boosted_scaling);
}
714 
template <int bitdepth, typename GrainType, typename Pixel>
// Adds scaled film-grain noise to the luma plane and clips the result to
// [min_value, max_luma].
void BlendNoiseWithImageLuma_NEON(
    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
    int scaling_shift, int width, int height, int start_height,
    const int16_t* scaling_lut_y, const void* source_plane_y,
    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
  const auto* noise_image =
      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
  source_stride_y /= sizeof(Pixel);
  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
  dest_stride_y /= sizeof(Pixel);
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_luma);
  // In 8bpp the maximum upscaled noise is 127 * 255 = 0x7E81, safe for
  // int16_t, so the shift vector holds -scaling_shift. In 10bpp ScaleNoise
  // uses a doubling multiply-high (an implicit >> 15), so it holds
  // 15 - scaling_shift instead.
  const int16x8_t scaling_shift_vect = vdupq_n_s16(
      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);

  const int safe_width = width & ~15;
  int y = 0;
  do {
    // Blends 8 luma pixels starting at column |col|. Reading the unsigned
    // input as signed is safe in 8bpp because the vector is widened before
    // it is reinterpreted.
    const auto blend8 = [&](int col) {
      const int16x8_t src = GetSignedSource8(&in_y_row[col]);
      const int16x8_t scaling =
          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[col]);
      int16x8_t noise =
          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][col]));
      noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
      const int16x8_t blended = vaddq_s16(src, noise);
      // When params_.clip_to_restricted_range == false in 8bpp, vqmovun_s16
      // could replace the clip, but the gain would be very small.
      StoreUnsigned8(&out_y_row[col],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    };

    int x = 0;
    // Main loop: 16 pixels per iteration.
    for (; x + 8 <= safe_width; x += 8) {
      blend8(x);
      x += 8;
      blend8(x);
    }

    if (x < width) {
      assert(width - x < 16);
      if (x < width - 8) {
        blend8(x);
        x += 8;
      }
      // Tail of fewer than 8 pixels: restrict the source read range so msan
      // does not flag the overread past the frame edge.
      const int valid_range_pixels = width - x;
      const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
      const int16x8_t src =
          GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
          scaling_lut_y, &in_y_row[x], valid_range_pixels);
      int16x8_t noise =
          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
      noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);

      const int16x8_t blended = vaddq_s16(src, noise);
      StoreUnsigned8(&out_y_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }
    in_y_row += source_stride_y;
    out_y_row += dest_stride_y;
  } while (++y < height);
}
810 
template <int bitdepth, typename GrainType, typename Pixel>
// Blends 8 chroma pixels with scaled noise: chroma + ScaleNoise(noise).
inline int16x8_t BlendChromaValsWithCfl(
    const Pixel* LIBGAV1_RESTRICT chroma_cursor,
    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
    const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
  const int16x8_t chroma = GetSignedSource8(chroma_cursor);
  const int16x8_t raw_noise = GetSignedSource8(noise_image_cursor);
  const int16x8_t scaled_noise =
      ScaleNoise<bitdepth>(raw_noise, scaling, scaling_shift_vect);
  return vaddq_s16(chroma, scaled_noise);
}
821 
template <int bitdepth, typename GrainType, typename Pixel>
// Blends noise into one chroma plane, deriving the scaling factors from the
// (possibly subsampled) co-located luma, i.e. chroma-from-luma scaling.
LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, int scaling_shift,
    const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
    const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
    Pixel* out_chroma_row, ptrdiff_t dest_stride) {
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
  // Scratch for the right edge, where the luma read needs the last in-frame
  // pixel duplicated.
  Pixel luma_buffer[16];
  // 8bpp: max upscaled noise is 127 * 255 = 0x7E81, safe for int16_t, so the
  // vector holds -scaling_shift. 10bpp: ScaleNoise uses a doubling
  // multiply-high (implicit >> 15), so it holds 15 - scaling_shift.
  const int16x8_t scaling_shift_vect = vdupq_n_s16(
      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);

  const int chroma_height = (height + subsampling_y) >> subsampling_y;
  const int chroma_width = (width + subsampling_x) >> subsampling_x;
  const int safe_chroma_width = chroma_width & ~7;

  // Staging buffer for the averaged luma; storing once and indexing the
  // buffer beats 8 per-lane extracts in GetScalingFactors.
  Pixel avg_luma_buffer[8];
  assert(start_height % 2 == 0);
  start_height >>= subsampling_y;
  int y = 0;
  do {
    int x = 0;
    for (; x + 8 <= safe_chroma_width; x += 8) {
      const int luma_x = x << subsampling_x;
      const uint16x8_t avg_luma =
          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
      StoreUnsigned8(avg_luma_buffer, avg_luma);

      const int16x8_t scaling =
          GetScalingFactors<bitdepth, Pixel>(scaling_lut, avg_luma_buffer);
      const int16x8_t blended =
          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
              scaling_shift_vect);

      // When params_.clip_to_restricted_range == false in 8bpp, vqmovun_s16
      // could replace the clip, but it is not worth a separate function.
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    if (x < chroma_width) {
      // Right-edge iteration: same as above, except the luma is copied into
      // a padded buffer with the last pixel duplicated before averaging.
      const int luma_x = x << subsampling_x;
      const int valid_range_pixels = width - luma_x;
      const int valid_range_chroma_pixels = chroma_width - x;
      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
      assert(valid_range_pixels < 16);
      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
      const uint16x8_t avg_luma = GetAverageLumaMsan(
          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);

      StoreUnsigned8(avg_luma_buffer, avg_luma);

      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
          scaling_lut, avg_luma_buffer, valid_range_chroma_pixels);
      const int16x8_t blended =
          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
              scaling_shift_vect);
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    in_y_row += source_stride_y << subsampling_y;
    in_chroma_row += source_stride_chroma;
    out_chroma_row += dest_stride;
  } while (++y < chroma_height);
}
903 
904 // This function is for the case params_.chroma_scaling_from_luma == true.
905 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
template <int bitdepth, typename GrainType, typename Pixel>
// Entry point for blending one chroma plane when
// params_.chroma_scaling_from_luma == true (which implies
// scaling_lut_u == scaling_lut_v == scaling_lut_y).
void BlendNoiseWithImageChromaWithCfl_NEON(
    Plane plane, const FilmGrainParams& params,
    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
    const void* source_plane_uv, ptrdiff_t source_stride_uv,
    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
  const auto* noise_image =
      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
  source_stride_y /= sizeof(Pixel);

  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
  source_stride_uv /= sizeof(Pixel);
  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
  dest_stride_uv /= sizeof(Pixel);
  // Handling one plane per call is faster at higher resolutions, even though
  // the luma averages get recomputed for the second plane.
  BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
      noise_image[plane], min_value, max_chroma, width, height, start_height,
      subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
      source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
}
931 
932 }  // namespace
933 
934 namespace low_bitdepth {
935 namespace {
936 
// Blends 8 chroma pixels with noise whose scaling index is the weighted
// combination merged = sat((offset + luma * lm + chroma * cm) >> 6).
inline int16x8_t BlendChromaValsNoCfl(
    const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
    const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
    const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
    const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
  uint8_t merged_buffer[8];
  const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
  const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
  // Maximum value of the sum is 127 * 255 = 0x7E81, so the first halving add
  // keeps it within int16_t.
  const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
  // Maximum offset is (255 << 5) = 0x1FE0, and 0x7E81 + 0x1FE0 = 0x9E61
  // would overflow, so a second halving add is required. The two halvings
  // plus the final shift by 4 give the required total shift of 6, saturating
  // the result to uint8_t.
  const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
  vst1_u8(merged_buffer, merged);

  // |restrict_scaling_lookup| is set on the right edge, where only
  // |valid_range_pixels| lanes hold defined values.
  const int16x8_t scaling =
      restrict_scaling_lookup
          ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
                                                   valid_range_pixels)
          : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
  int16x8_t noise = GetSignedSource8(noise_image_cursor);
  noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
  return vaddq_s16(orig, noise);
}
962 
// Blends noise into one 8bpp chroma plane using the per-plane offset and
// multipliers (params_.chroma_scaling_from_luma == false).
LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
    const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, int scaling_shift, int chroma_offset,
    int chroma_multiplier, int luma_multiplier,
    const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
    const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
    uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
  // In 8bpp the maximum upscaled noise is 127 * 255 = 0x7E81, which fits in
  // int16_t, so a plain negated shift suffices.
  const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);

  const int chroma_height = (height + subsampling_y) >> subsampling_y;
  const int chroma_width = (width + subsampling_x) >> subsampling_x;
  const int safe_chroma_width = chroma_width & ~7;
  // Scratch for the right edge, where the luma read needs the last in-frame
  // pixel duplicated.
  uint8_t luma_buffer[16];
  // The offset is pre-shifted to line up with the 6-bit shift applied inside
  // BlendChromaValsNoCfl (see the halving adds there).
  const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);

  start_height >>= subsampling_y;
  int y = 0;
  do {
    int x = 0;
    for (; x + 8 <= safe_chroma_width; x += 8) {
      const int luma_x = x << subsampling_x;
      const int valid_range_chroma_pixels = chroma_width - x;

      const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
      const int16x8_t avg_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
          &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          avg_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/false);
      // When params_.clip_to_restricted_range == false, vqmovun_s16 could
      // replace the clip, but the gain would be small.
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    if (x < chroma_width) {
      // Right-edge iteration: identical to the loop above, except the luma
      // is staged in a buffer with the final pixel duplicated, and the
      // scaling lookup is restricted to the defined lanes.
      const int luma_x = x << subsampling_x;
      const int valid_range_pixels = width - luma_x;
      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
      assert(valid_range_pixels < 16);
      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
      const int valid_range_chroma_pixels = chroma_width - x;

      const int16x8_t orig_chroma =
          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
      const int16x8_t avg_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          avg_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/true,
          valid_range_chroma_pixels);
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    in_y_row += source_stride_y << subsampling_y;
    in_chroma_row += source_stride_chroma;
    out_chroma_row += dest_stride;
  } while (++y < chroma_height);
}
1037 
1038 // This function is for the case params_.chroma_scaling_from_luma == false.
// Entry point for 8bpp chroma blending when
// params_.chroma_scaling_from_luma == false. Selects the U or V parameter
// set and delegates to BlendChromaPlane8bpp_NEON.
void BlendNoiseWithImageChroma8bpp_NEON(
    Plane plane, const FilmGrainParams& params,
    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
    const void* source_plane_uv, ptrdiff_t source_stride_uv,
    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
  assert(plane == kPlaneU || plane == kPlaneV);
  const auto* noise_image =
      static_cast<const Array2D<int8_t>*>(noise_image_ptr);
  const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
  const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
  auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);

  const bool is_u = (plane == kPlaneU);
  const int offset = is_u ? params.u_offset : params.v_offset;
  const int luma_multiplier =
      is_u ? params.u_luma_multiplier : params.v_luma_multiplier;
  const int multiplier = is_u ? params.u_multiplier : params.v_multiplier;
  BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
                            height, start_height, subsampling_x, subsampling_y,
                            params.chroma_scaling, offset, multiplier,
                            luma_multiplier, scaling_lut, in_y, source_stride_y,
                            in_uv, source_stride_uv, out_uv, dest_stride_uv);
}
1065 
// Blends one row of the current noise stripe with the overlapping row of the
// previous stripe:
//   out = sat((grain_coeff * grain + old_coeff * old + 16) >> 5).
inline void WriteOverlapLine8bpp_NEON(
    const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
    const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
    const int8x8_t grain_coeff, const int8x8_t old_coeff,
    int8_t* LIBGAV1_RESTRICT noise_image_row) {
  int x = 0;
  do {
    // These reads, and the write below, may overrun the row by up to 7
    // bytes; the buffers are sized to permit that.
    const int8x8_t grain = vld1_s8(noise_stripe_row + x);
    const int8x8_t old_grain = vld1_s8(noise_stripe_row_prev + x);
    const int16x8_t weighted = vmull_s8(grain_coeff, grain);
    const int16x8_t blended = vmlal_s8(weighted, old_coeff, old_grain);
    vst1_s8(noise_image_row + x, vqrshrn_n_s16(blended, 5));
    x += 8;
  } while (x < plane_width);
}
1084 
// Smooths the seams between 32-row (16 for subsampled chroma) noise stripes
// by cross-fading each stripe's first row(s) with the tail of the previous
// stripe.
void ConstructNoiseImageOverlap8bpp_NEON(
    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
    int subsampling_x, int subsampling_y,
    void* LIBGAV1_RESTRICT noise_image_buffer) {
  const auto* noise_stripes =
      static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
  auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
  const int plane_width = (width + subsampling_x) >> subsampling_x;
  const int plane_height = (height + subsampling_y) >> subsampling_y;
  const int stripe_height = 32 >> subsampling_y;
  const int stripe_mask = stripe_height - 1;
  // The first stripe has no predecessor, so blending starts at the second.
  int y = stripe_height;
  int luma_num = 1;
  if (subsampling_y == 0) {
    // Unsubsampled planes overlap two rows, weighted 17/27 then 27/17.
    const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
    const int8x8_t first_row_old_coeff = vdup_n_s8(27);
    const int8x8_t second_row_grain_coeff = first_row_old_coeff;
    const int8x8_t second_row_old_coeff = first_row_grain_coeff;
    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
      WriteOverlapLine8bpp_NEON(
          noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);

      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
                                &noise_stripe_prev[(32 + 1) * plane_width],
                                plane_width, second_row_grain_coeff,
                                second_row_old_coeff, (*noise_image)[y + 1]);
    }
    // At this point either one partial stripe remains (remaining_height > 0),
    // or the image is shorter than one stripe (remaining_height < 0), or all
    // stripes are done (remaining_height == 0).
    const int remaining_height = plane_height - y;
    if (remaining_height <= 0) {
      return;
    }
    const int8_t* noise_stripe = (*noise_stripes)[luma_num];
    const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
    WriteOverlapLine8bpp_NEON(
        noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
        first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);

    if (remaining_height > 1) {
      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
                                &noise_stripe_prev[(32 + 1) * plane_width],
                                plane_width, second_row_grain_coeff,
                                second_row_old_coeff, (*noise_image)[y + 1]);
    }
  } else {  // subsampling_y == 1
    // Vertically subsampled planes overlap a single row, weighted 22/23.
    const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
    const int8x8_t first_row_old_coeff = vdup_n_s8(23);
    for (; y < plane_height; ++luma_num, y += stripe_height) {
      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
      WriteOverlapLine8bpp_NEON(
          noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
    }
  }
}
1146 
Init8bpp()1147 void Init8bpp() {
1148   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1149   assert(dsp != nullptr);
1150 
1151   // LumaAutoRegressionFunc
1152   dsp->film_grain.luma_auto_regression[0] =
1153       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 1>;
1154   dsp->film_grain.luma_auto_regression[1] =
1155       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 2>;
1156   dsp->film_grain.luma_auto_regression[2] =
1157       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 3>;
1158 
1159   // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
1160   // Chroma autoregression should never be called when lag is 0 and use_luma
1161   // is false.
1162   dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
1163   dsp->film_grain.chroma_auto_regression[0][1] =
1164       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
1165                                                    false>;
1166   dsp->film_grain.chroma_auto_regression[0][2] =
1167       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
1168                                                    false>;
1169   dsp->film_grain.chroma_auto_regression[0][3] =
1170       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
1171                                                    false>;
1172   dsp->film_grain.chroma_auto_regression[1][0] =
1173       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0, true>;
1174   dsp->film_grain.chroma_auto_regression[1][1] =
1175       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1, true>;
1176   dsp->film_grain.chroma_auto_regression[1][2] =
1177       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2, true>;
1178   dsp->film_grain.chroma_auto_regression[1][3] =
1179       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3, true>;
1180 
1181   dsp->film_grain.construct_noise_image_overlap =
1182       ConstructNoiseImageOverlap8bpp_NEON;
1183 
1184   dsp->film_grain.initialize_scaling_lut =
1185       InitializeScalingLookupTable_NEON<kBitdepth8>;
1186 
1187   dsp->film_grain.blend_noise_luma =
1188       BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
1189   dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
1190   dsp->film_grain.blend_noise_chroma[1] =
1191       BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
1192 }
1193 
1194 }  // namespace
1195 }  // namespace low_bitdepth
1196 
1197 #if LIBGAV1_MAX_BITDEPTH >= 10
1198 namespace high_bitdepth {
1199 namespace {
1200 
// Blends one row of grain from the current noise stripe with the overlapping
// row of the previous stripe and writes the clipped result to the noise
// image. Loads and the store may run up to 7 values past |plane_width|;
// callers must guarantee the rows are padded accordingly.
inline void WriteOverlapLine10bpp_NEON(
    const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
    const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
    const int16x8_t grain_coeff, const int16x8_t old_coeff,
    int16_t* LIBGAV1_RESTRICT noise_image_row) {
  const int16x8_t grain_min = vdupq_n_s16(GetGrainMin<kBitdepth10>());
  const int16x8_t grain_max = vdupq_n_s16(GetGrainMax<kBitdepth10>());
  int pos = 0;
  do {
    // These loads may read up to 7 values beyond the row's width.
    const int16x8_t cur_grain = vld1q_s16(noise_stripe_row + pos);
    const int16x8_t old_grain = vld1q_s16(noise_stripe_row_prev + pos);
    // Largest single product is 511 * 27 = 0x35E5 and the largest weighted
    // sum is 511 * (22 + 23) = 0x59D3, so int16 accumulation cannot overflow.
    const int16x8_t weighted_sum =
        vmlaq_s16(vmulq_s16(grain_coeff, cur_grain), old_coeff, old_grain);
    // Round, shift out the coefficient scale (5 bits), and clip to the valid
    // 10bpp grain range. The store may also extend up to 7 values past the
    // row's width.
    vst1q_s16(noise_image_row + pos,
              Clip3S16(vrshrq_n_s16(weighted_sum, 5), grain_min, grain_max));
    pos += 8;
  } while (pos < plane_width);
}
1226 
// Blends the overlapping rows between consecutive 10bpp noise stripes into
// the assembled noise image. |noise_stripes_buffer| is an
// Array2DView<int16_t> of stripes, each stripe holding (32 >> subsampling_y)
// rows plus overlap rows; |noise_image_buffer| is the destination
// Array2D<int16_t>. Rows that do not overlap a previous stripe are assumed
// to have been copied elsewhere; only the overlap rows are written here.
void ConstructNoiseImageOverlap10bpp_NEON(
    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
    int subsampling_x, int subsampling_y,
    void* LIBGAV1_RESTRICT noise_image_buffer) {
  const auto* noise_stripes =
      static_cast<const Array2DView<int16_t>*>(noise_stripes_buffer);
  auto* noise_image = static_cast<Array2D<int16_t>*>(noise_image_buffer);
  // Dimensions of the (possibly subsampled) plane being assembled.
  const int plane_width = (width + subsampling_x) >> subsampling_x;
  const int plane_height = (height + subsampling_y) >> subsampling_y;
  // Luma stripes are 32 rows tall; chroma stripes are 16 when vertically
  // subsampled. stripe_height is a power of two, so stripe_mask can round
  // plane_height down to a whole number of stripes.
  const int stripe_height = 32 >> subsampling_y;
  const int stripe_mask = stripe_height - 1;
  // The first stripe has no predecessor, so blending starts at the second
  // stripe (y == stripe_height, luma_num == 1).
  int y = stripe_height;
  int luma_num = 1;
  if (subsampling_y == 0) {
    // Full-height planes blend two overlap rows per stripe boundary. The
    // weights are mirrored between the two rows: (17, 27) then (27, 17).
    const int16x8_t first_row_grain_coeff = vdupq_n_s16(17);
    const int16x8_t first_row_old_coeff = vdupq_n_s16(27);
    const int16x8_t second_row_grain_coeff = first_row_old_coeff;
    const int16x8_t second_row_old_coeff = first_row_grain_coeff;
    // Handle all boundaries where both overlap rows fit within plane_height.
    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
      const int16_t* noise_stripe = (*noise_stripes)[luma_num];
      const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
      // Row 32 of the previous stripe overlaps row 0 of the current stripe.
      WriteOverlapLine10bpp_NEON(
          noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);

      // Row 33 of the previous stripe overlaps row 1 of the current stripe.
      WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
                                 &noise_stripe_prev[(32 + 1) * plane_width],
                                 plane_width, second_row_grain_coeff,
                                 second_row_old_coeff, (*noise_image)[y + 1]);
    }
    // Either one partial stripe remains (remaining_height > 0),
    // OR image is less than one stripe high (remaining_height < 0),
    // OR all stripes are completed (remaining_height == 0).
    const int remaining_height = plane_height - y;
    if (remaining_height <= 0) {
      return;
    }
    // Blend the first overlap row of the final partial stripe.
    const int16_t* noise_stripe = (*noise_stripes)[luma_num];
    const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
    WriteOverlapLine10bpp_NEON(
        noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
        first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);

    // The second overlap row exists only if at least two rows remain.
    if (remaining_height > 1) {
      WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
                                 &noise_stripe_prev[(32 + 1) * plane_width],
                                 plane_width, second_row_grain_coeff,
                                 second_row_old_coeff, (*noise_image)[y + 1]);
    }
  } else {  // subsampling_y == 1
    // Vertically-subsampled planes blend a single overlap row per boundary,
    // with weights (22, 23), so no partial-stripe tail handling is needed.
    const int16x8_t first_row_grain_coeff = vdupq_n_s16(22);
    const int16x8_t first_row_old_coeff = vdupq_n_s16(23);
    for (; y < plane_height; ++luma_num, y += stripe_height) {
      const int16_t* noise_stripe = (*noise_stripes)[luma_num];
      const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
      // Row 16 of the previous (16-row) stripe overlaps row 0 of this one.
      WriteOverlapLine10bpp_NEON(
          noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
    }
  }
}
1288 
// Computes the blended chroma values for one vector of 8 pixels when
// chroma_scaling_from_luma is false: the scaling index is a weighted
// combination of the averaged luma and the original chroma, the scaling
// factor is looked up in |scaling_lut|, and the scaled noise is added to the
// original chroma. |valid_range_pixels| is only consulted when
// |restrict_scaling_lookup| is true (right-edge iterations).
inline int16x8_t BlendChromaValsNoCfl(
    const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
    const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
    const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
    const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
  uint16_t index_buffer[8];
  // luma_multiplier * average_luma, widened to 32 bits in two halves.
  const int32x4_t luma_lo =
      vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
  const int32x4_t luma_hi =
      vmull_n_s16(vget_high_s16(average_luma), luma_multiplier);
  // Accumulate chroma_multiplier * orig. Maximum value of the combination is
  // 127 * 1023 = 0x1FB81.
  const int32x4_t combined_lo =
      vmlal_n_s16(luma_lo, vget_low_s16(orig), chroma_multiplier);
  const int32x4_t combined_hi =
      vmlal_n_s16(luma_hi, vget_high_s16(orig), chroma_multiplier);
  // Add the (possibly negative, at most (255 << 8) = 0xFF00) offset, then
  // shift out the 6 fractional bits with unsigned saturation so negative
  // results clamp to zero.
  const uint16x4_t index_lo = vqshrun_n_s32(vaddq_s32(offset, combined_lo), 6);
  const uint16x4_t index_hi = vqshrun_n_s32(vaddq_s32(offset, combined_hi), 6);
  // Clamp the lookup indices to the 10bpp pixel maximum before spilling them
  // to memory for the table lookup.
  const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
  vst1q_u16(index_buffer,
            vminq_u16(vcombine_u16(index_lo, index_hi), max_pixel));
  int16x8_t scaling;
  if (restrict_scaling_lookup) {
    scaling = GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, index_buffer,
                                                       valid_range_pixels);
  } else {
    scaling = GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
                                                       index_buffer);
  }
  const int16x8_t noise_vals = GetSignedSource8(noise_image_cursor);
  const int16x8_t scaled_noise =
      ScaleNoise<kBitdepth10>(noise_vals, scaling, scaling_shift_vect);
  return vaddq_s16(orig, scaled_noise);
}
1324 
// Blends scaled noise into one 10bpp chroma plane, 8 chroma pixels at a
// time. The scaling index for each pixel combines the averaged co-located
// luma with the original chroma (see BlendChromaValsNoCfl). The final
// partial vector of each row is handled separately with padded buffers so
// reads do not run past the valid image data.
LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
    const Array2D<int16_t>& noise_image, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, int scaling_shift, int chroma_offset,
    int chroma_multiplier, int luma_multiplier,
    const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
    const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma,
    uint16_t* out_chroma_row, ptrdiff_t dest_stride) {
  // Output clipping bounds for the blended chroma values.
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
  const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift);

  const int chroma_height = (height + subsampling_y) >> subsampling_y;
  const int chroma_width = (width + subsampling_x) >> subsampling_x;
  // Widest x for which a full 8-pixel vector stays inside the row.
  const int safe_chroma_width = chroma_width & ~7;
  // Scratch space for the right-edge iteration: up to 15 valid luma pixels
  // plus one duplicated pixel (see the assert below).
  uint16_t luma_buffer[16];
  // Offset is added before downshifting in order to take advantage of
  // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
  const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));

  start_height >>= subsampling_y;
  int y = 0;
  do {
    int x = 0;
    // Full-vector iterations: all loads are within the row.
    for (; x + 8 <= safe_chroma_width; x += 8) {
      const int luma_x = x << subsampling_x;
      const int16x8_t average_luma = vreinterpretq_s16_u16(
          GetAverageLuma(&in_y_row[luma_x], subsampling_x));
      const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          average_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/false);
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    if (x < chroma_width) {
      // Begin right edge iteration. Same as the normal iterations, but the
      // |average_luma| computation requires a duplicated luma value at the
      // end.
      const int luma_x = x << subsampling_x;
      const int valid_range_pixels = width - luma_x;
      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
      assert(valid_range_pixels < 16);
      // Copy the remaining valid luma pixels and duplicate the last one so
      // the pairwise average at an odd width has a partner value.
      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
      const int valid_range_chroma_pixels = chroma_width - x;
      const int valid_range_chroma_bytes =
          (chroma_width - x) * sizeof(in_chroma_row[0]);
      // Msan-annotated load: only |valid_range_chroma_bytes| are initialized.
      const int16x8_t orig_chroma =
          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);

      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          average_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/true,
          valid_range_chroma_pixels);
      // The store may write a few values past chroma_width; the dest buffer
      // is presumably padded to allow this — consistent with the overread
      // comments elsewhere in this file.
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
      // End of right edge iteration.
    }

    // Advance one chroma row; the luma pointer skips subsampled rows.
    in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y);
    in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma);
    out_chroma_row = AddByteStride(out_chroma_row, dest_stride);
  } while (++y < chroma_height);
}
1396 
1397 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma10bpp_NEON(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * LIBGAV1_RESTRICT scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)1398 void BlendNoiseWithImageChroma10bpp_NEON(
1399     Plane plane, const FilmGrainParams& params,
1400     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
1401     int width, int height, int start_height, int subsampling_x,
1402     int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
1403     const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
1404     const void* source_plane_uv, ptrdiff_t source_stride_uv,
1405     void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
1406   assert(plane == kPlaneU || plane == kPlaneV);
1407   const auto* noise_image =
1408       static_cast<const Array2D<int16_t>*>(noise_image_ptr);
1409   const auto* in_y = static_cast<const uint16_t*>(source_plane_y);
1410   const auto* in_uv = static_cast<const uint16_t*>(source_plane_uv);
1411   auto* out_uv = static_cast<uint16_t*>(dest_plane_uv);
1412 
1413   const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
1414   const int luma_multiplier =
1415       (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
1416   const int multiplier =
1417       (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
1418   BlendChromaPlane10bpp_NEON(
1419       noise_image[plane], min_value, max_chroma, width, height, start_height,
1420       subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
1421       luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
1422       source_stride_uv, out_uv, dest_stride_uv);
1423 }
1424 
Init10bpp()1425 void Init10bpp() {
1426   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
1427   assert(dsp != nullptr);
1428 
1429   // LumaAutoRegressionFunc
1430   dsp->film_grain.luma_auto_regression[0] =
1431       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 1>;
1432   dsp->film_grain.luma_auto_regression[1] =
1433       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 2>;
1434   dsp->film_grain.luma_auto_regression[2] =
1435       ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 3>;
1436 
1437   // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling]
1438   // Chroma autoregression should never be called when lag is 0 and use_luma
1439   // is false.
1440   dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
1441   dsp->film_grain.chroma_auto_regression[0][1] =
1442       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
1443                                                    false>;
1444   dsp->film_grain.chroma_auto_regression[0][2] =
1445       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
1446                                                    false>;
1447   dsp->film_grain.chroma_auto_regression[0][3] =
1448       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
1449                                                    false>;
1450   dsp->film_grain.chroma_auto_regression[1][0] =
1451       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
1452                                                    true>;
1453   dsp->film_grain.chroma_auto_regression[1][1] =
1454       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
1455                                                    true>;
1456   dsp->film_grain.chroma_auto_regression[1][2] =
1457       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
1458                                                    true>;
1459   dsp->film_grain.chroma_auto_regression[1][3] =
1460       ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
1461                                                    true>;
1462 
1463   dsp->film_grain.construct_noise_image_overlap =
1464       ConstructNoiseImageOverlap10bpp_NEON;
1465 
1466   dsp->film_grain.initialize_scaling_lut =
1467       InitializeScalingLookupTable_NEON<kBitdepth10>;
1468 
1469   dsp->film_grain.blend_noise_luma =
1470       BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
1471   dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
1472   dsp->film_grain.blend_noise_chroma[1] =
1473       BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
1474 }
1475 
1476 }  // namespace
1477 }  // namespace high_bitdepth
1478 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1479 
1480 }  // namespace film_grain
1481 
// Public entry point: registers the NEON film grain implementations for
// 8bpp and, when the build supports it, 10bpp.
void FilmGrainInit_NEON() {
  film_grain::low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  film_grain::high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}
1488 
1489 }  // namespace dsp
1490 }  // namespace libgav1
1491 
1492 #else   // !LIBGAV1_ENABLE_NEON
1493 
1494 namespace libgav1 {
1495 namespace dsp {
1496 
// NEON is not enabled in this build; provide an empty stub so callers can
// invoke FilmGrainInit_NEON() unconditionally.
void FilmGrainInit_NEON() {}
1498 
1499 }  // namespace dsp
1500 }  // namespace libgav1
1501 #endif  // LIBGAV1_ENABLE_NEON
1502