1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19 #include <arm_neon.h>
20
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/film_grain_common.h"
30 #include "src/utils/array_2d.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/constants.h"
34 #include "src/utils/memory.h"
35 #include "src/utils/types.h"
36
37 namespace libgav1 {
38 namespace dsp {
39 namespace film_grain {
40 namespace {
41
// These functions are overloaded for both possible sizes in order to simplify
// loading and storing to and from intermediate value types from within a
// template function.
inline int16x8_t GetSignedSource8(const int8_t* src) {
  // Load 8 signed 8-bit grain samples and widen them to 16 bits.
  const int8x8_t samples = vld1_s8(src);
  return vmovl_s8(samples);
}
48
// Loads 8 unsigned 8-bit samples and zero-extends them into a signed 16-bit
// vector.
inline int16x8_t GetSignedSource8(const uint8_t* src) {
  const uint8x8_t samples = vld1_u8(src);
  return ZeroExtend(samples);
}
52
// MemorySanitizer-safe variant: masks the bytes beyond |valid_range| before
// zero-extending, so intentional overreads are not reported.
inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
  const uint8x8_t masked = Load1MsanU8(src, 8 - valid_range);
  return ZeroExtend(masked);
}
56
// Narrows 8 unsigned 16-bit values to 8 bits and stores them.
inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
  const uint8x8_t narrowed = vmovn_u16(data);
  vst1_u8(dest, narrowed);
}
60
61 #if LIBGAV1_MAX_BITDEPTH >= 10
// 16-bit (high bitdepth) source: a plain signed load, no widening needed.
inline int16x8_t GetSignedSource8(const int16_t* src) {
  return vld1q_s16(src);
}
63
// Loads 8 unsigned 16-bit samples and reinterprets them as signed.
inline int16x8_t GetSignedSource8(const uint16_t* src) {
  const uint16x8_t samples = vld1q_u16(src);
  return vreinterpretq_s16_u16(samples);
}
67
// MemorySanitizer-safe 16-bit load: masks lanes beyond |valid_range| before
// the signed reinterpretation.
inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
  const uint16x8_t masked = Load1QMsanU16(src, 16 - valid_range);
  return vreinterpretq_s16_u16(masked);
}
71
// 16-bit destination: values already fit, store directly.
inline void StoreUnsigned8(uint16_t* dst, const uint16x8_t value) {
  vst1q_u16(dst, value);
}
75 #endif // LIBGAV1_MAX_BITDEPTH >= 10
76
// Each element in |sum| represents one destination value's running
// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
// allow for a sliding window in successive calls to this function: the
// template parameter |position_offset| selects where the window starts.
template <int position_offset>
inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
                                           const int16x8_t grain_hi,
                                           int16_t coeff, int32x4x2_t sum) {
  // vextq_s16 requires an immediate, hence the template parameter.
  const int16x8_t window = vextq_s16(grain_lo, grain_hi, position_offset);
  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(window), coeff);
  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(window), coeff);
  return sum;
}
89
// Because the autoregressive filter requires the output of each pixel to
// compute pixels that come after in the row, we have to finish the
// calculations one at a time. This writes the sample at horizontal position
// |lane|, folding in the already-written samples that precede it on the
// current row.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
                                     int32x4x2_t sum,
                                     const int8_t* LIBGAV1_RESTRICT coeffs,
                                     int pos, int shift) {
  // Extract this lane's partial sum from the vector accumulators.
  int32_t accumulator = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // Accumulate the current-row taps, which depend on freshly written output.
  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
    accumulator += grain_cursor[lane + delta_col] * coeffs[pos++];
  }
  grain_cursor[lane] =
      Clip3(grain_cursor[lane] + RightShiftWithRounding(accumulator, shift),
            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
108
109 #if LIBGAV1_MAX_BITDEPTH >= 10
// 16-bit (high bitdepth) overload; see the 8-bit version above for the
// rationale behind the serial, lane-by-lane completion.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
                                     int32x4x2_t sum,
                                     const int8_t* LIBGAV1_RESTRICT coeffs,
                                     int pos, int shift) {
  // Extract this lane's partial sum from the vector accumulators.
  int32_t accumulator = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // Accumulate the current-row taps, which depend on freshly written output.
  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
    accumulator += grain_cursor[lane + delta_col] * coeffs[pos++];
  }
  grain_cursor[lane] =
      Clip3(grain_cursor[lane] + RightShiftWithRounding(accumulator, shift),
            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
125 #endif // LIBGAV1_MAX_BITDEPTH >= 10
126
// Because the autoregressive filter requires the output of each pixel to
// compute pixels that come after in the row, we have to finish the
// calculations one at a time. This helper simply applies the scalar
// completion step to both chroma planes at the same lane.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(
    int8_t* LIBGAV1_RESTRICT u_grain_cursor,
    int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
  // U plane first, then V; the planes are independent of each other.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
141
142 #if LIBGAV1_MAX_BITDEPTH >= 10
// 16-bit (high bitdepth) overload of the chroma completion helper.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(
    int16_t* LIBGAV1_RESTRICT u_grain_cursor,
    int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
  // U plane first, then V; the planes are independent of each other.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
154 #endif // LIBGAV1_MAX_BITDEPTH >= 10
155
// Clears both halves of a 2x int32x4 accumulator pair.
inline void SetZero(int32x4x2_t* v) {
  const int32x4_t zero = vdupq_n_s32(0);
  v->val[0] = zero;
  v->val[1] = zero;
}
160
// Computes subsampled luma for use with chroma, by averaging in the x
// direction or y direction when applicable.
int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_y != 0) {
    assert(subsampling_x != 0);
    // 4:2:0 case: horizontal pairwise sums on two rows, then a rounded
    // divide-by-4 of the row total (a 2x2 box average).
    const int8x16_t row0 = vld1q_s8(luma);
    const int8x16_t row1 = vld1q_s8(luma + stride);
    const int16x8_t row0_sums = vcombine_s16(vpaddl_s8(vget_low_s8(row0)),
                                             vpaddl_s8(vget_high_s8(row0)));
    const int16x8_t row1_sums = vcombine_s16(vpaddl_s8(vget_low_s8(row1)),
                                             vpaddl_s8(vget_high_s8(row1)));
    return vrshrq_n_s16(vaddq_s16(row0_sums, row1_sums), 2);
  }
  if (subsampling_x != 0) {
    // 4:2:2 case: rounded average of horizontally adjacent pairs only.
    const int8x16_t row = vld1q_s8(luma);
    const int16x8_t pair_sums = vcombine_s16(vpaddl_s8(vget_low_s8(row)),
                                             vpaddl_s8(vget_high_s8(row)));
    return vrshrq_n_s16(pair_sums, 1);
  }
  // 4:4:4 case: no averaging, just widen 8 samples.
  return vmovl_s8(vld1_s8(luma));
}
183
// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
  if (subsampling_x == 0) return vmovl_u8(vld1_u8(luma));
  // Rounded average of adjacent horizontal pairs.
  return vrshrq_n_u16(vpaddlq_u8(vld1q_u8(luma)), 1);
}
192
// MemorySanitizer-safe variant of GetAverageLuma: masks lanes past
// |valid_range| so deliberate overreads are not flagged.
inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
                                     int subsampling_x, int valid_range) {
  if (subsampling_x != 0) {
    // Mask before the pairwise add: MemorySanitizer registers vpaddlq_u8 as a
    // use of the loaded memory.
    const uint8x16_t masked = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
    return vrshrq_n_u16(vpaddlq_u8(masked), 1);
  }
  return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
}
202
203 #if LIBGAV1_MAX_BITDEPTH >= 10
// Computes subsampled luma for use with chroma, by averaging in the x
// direction or y direction when applicable. 16-bit (high bitdepth) overload.
int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_y != 0) {
    assert(subsampling_x != 0);
    // 4:2:0 case: pairwise-sum 16 samples from each of two rows, then take a
    // rounded divide-by-4 of the combined total (a 2x2 box average).
    const int16x8_t row0_lo = vld1q_s16(luma);
    const int16x8_t row0_hi = vld1q_s16(luma + 8);
    const int16x8_t row1_lo = vld1q_s16(luma + stride);
    const int16x8_t row1_hi = vld1q_s16(luma + stride + 8);
    const int16x8_t row0_sums =
        vcombine_s16(vpadd_s16(vget_low_s16(row0_lo), vget_high_s16(row0_lo)),
                     vpadd_s16(vget_low_s16(row0_hi), vget_high_s16(row0_hi)));
    const int16x8_t row1_sums =
        vcombine_s16(vpadd_s16(vget_low_s16(row1_lo), vget_high_s16(row1_lo)),
                     vpadd_s16(vget_low_s16(row1_hi), vget_high_s16(row1_hi)));
    return vrshrq_n_s16(vaddq_s16(row0_sums, row1_sums), 2);
  }
  if (subsampling_x != 0) {
    // 4:2:2 case: rounded average of horizontally adjacent pairs only.
    const int16x8_t row_lo = vld1q_s16(luma);
    const int16x8_t row_hi = vld1q_s16(luma + 8);
    const int16x8_t pair_sums =
        vcombine_s16(vpadd_s16(vget_low_s16(row_lo), vget_high_s16(row_lo)),
                     vpadd_s16(vget_low_s16(row_hi), vget_high_s16(row_hi)));
    return vrshrq_n_s16(pair_sums, 1);
  }
  // 4:4:4 case: no averaging required.
  return vld1q_s16(luma);
}
232
// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
// 16-bit (high bitdepth) overload.
inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
                                 int subsampling_x) {
  if (subsampling_x == 0) return vld1q_u16(luma);
  // De-interleave even/odd samples and take their rounded halving average.
  const uint16x8x2_t deinterleaved = vld2q_u16(luma);
  return vrhaddq_u16(deinterleaved.val[0], deinterleaved.val[1]);
}
242
// MemorySanitizer-safe 16-bit variant: masks lanes past |valid_range| so
// deliberate overreads are not flagged.
inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
                                     int subsampling_x, int valid_range) {
  if (subsampling_x != 0) {
    const uint16x8x2_t deinterleaved = vld2q_u16(luma);
    const uint16x8_t averaged =
        vrhaddq_u16(deinterleaved.val[0], deinterleaved.val[1]);
    return MaskOverreadsQ(averaged, 16 - valid_range);
  }
  return Load1QMsanU16(luma, 16 - valid_range);
}
252 #endif // LIBGAV1_MAX_BITDEPTH >= 10
253
// Applies the auto-regressive filter to the noise in the U and V grain
// blocks. When |use_luma| is true, a subsampled luma grain sample is mixed
// in with the final coefficient of the filter. Each row is processed 8
// samples at a time, with a reduced final iteration for the remaining
// samples of the row.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
          bool use_luma>
void ApplyAutoRegressiveFilterToChromaGrains_NEON(
    const FilmGrainParams& params,
    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
    void* LIBGAV1_RESTRICT v_grain_buffer) {
  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
  const int auto_regression_shift = params.auto_regression_shift;
  const int chroma_width =
      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
  const int chroma_height =
      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
  // leaving [35, 40] to write at the end.
  const int chroma_width_remainder =
      (chroma_width - 2 * kAutoRegressionBorder) & 7;

  // The filter starts after the top border rows, which are read but never
  // written.
  int y = kAutoRegressionBorder;
  luma_grain += kLumaWidth * y;
  u_grain += chroma_width * y;
  v_grain += chroma_width * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, the remaining (fewer than 8) values are given a
    // special reduced iteration.
    int x = kAutoRegressionBorder;
    int luma_x = kAutoRegressionBorder;
    do {
      // |pos| indexes the coefficient arrays as the 2D filter support is
      // traversed in raster order.
      int pos = 0;
      int32x4x2_t sum_u;
      int32x4x2_t sum_v;
      SetZero(&sum_u);
      SetZero(&sum_v);

      if (auto_regression_coeff_lag > 0) {
        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
             ++delta_row) {
          // These loads may overflow to the next row, but they are never called
          // on the final row of a grain block. Therefore, they will never
          // exceed the block boundaries.
          // Note: this could be slightly optimized to a single load in 8bpp,
          // but requires making a special first iteration and accumulate
          // function that takes an int8x16_t.
          const int16x8_t u_grain_lo =
              GetSignedSource8(u_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag);
          const int16x8_t u_grain_hi =
              GetSignedSource8(u_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag + 8);
          const int16x8_t v_grain_lo =
              GetSignedSource8(v_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag);
          const int16x8_t v_grain_hi =
              GetSignedSource8(v_grain + x + delta_row * chroma_width -
                               auto_regression_coeff_lag + 8);
// Accumulates the tap at horizontal |offset| within the sliding window into
// both chroma running sums, advancing |pos| to the next coefficient.
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                  \
  sum_u = AccumulateWeightedGrain<offset>(                                 \
      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
  sum_v = AccumulateWeightedGrain<offset>(                                 \
      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)

          ACCUMULATE_WEIGHTED_GRAIN(0);
          ACCUMULATE_WEIGHTED_GRAIN(1);
          ACCUMULATE_WEIGHTED_GRAIN(2);
          // The horizontal |auto_regression_coeff_lag| loop is replaced with
          // if-statements to give vextq_s16 an immediate param.
          if (auto_regression_coeff_lag > 1) {
            ACCUMULATE_WEIGHTED_GRAIN(3);
            ACCUMULATE_WEIGHTED_GRAIN(4);
          }
          if (auto_regression_coeff_lag > 2) {
            assert(auto_regression_coeff_lag == 3);
            ACCUMULATE_WEIGHTED_GRAIN(5);
            ACCUMULATE_WEIGHTED_GRAIN(6);
          }
        }
      }

      if (use_luma) {
        const int16x8_t luma = GetSubsampledLuma(
            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

        // Luma samples get the final coefficient in the formula, but are best
        // computed all at once before the final row.
        const int coeff_u =
            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
        const int coeff_v =
            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
      }
// At this point in the filter, the source addresses and destination
// addresses overlap. Because this is an auto-regressive filter, the
// higher lanes cannot be computed without the results of the lower lanes.
// Each call to WriteFinalAutoRegression incorporates preceding values
// on the final row, and writes a single sample. This allows the next
// pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
      params.auto_regression_coeff_v, pos, auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);

      x += 8;
      // Luma advances at the full-resolution rate.
      luma_x += 8 << subsampling_x;
    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);

    // This is the "final iteration" of the above loop over width. We fill in
    // the remainder of the width, which is less than 8.
    int pos = 0;
    int32x4x2_t sum_u;
    int32x4x2_t sum_v;
    SetZero(&sum_u);
    SetZero(&sum_v);

    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      // These loads may overflow to the next row, but they are never called on
      // the final row of a grain block. Therefore, they will never exceed the
      // block boundaries.
      const int16x8_t u_grain_lo = GetSignedSource8(
          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t u_grain_hi =
          GetSignedSource8(u_grain + x + delta_row * chroma_width -
                           auto_regression_coeff_lag + 8);
      const int16x8_t v_grain_lo = GetSignedSource8(
          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t v_grain_hi =
          GetSignedSource8(v_grain + x + delta_row * chroma_width -
                           auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }

    if (use_luma) {
      const int16x8_t luma = GetSubsampledLuma(
          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

      // Luma samples get the final coefficient in the formula, but are best
      // computed all at once before the final row.
      const int coeff_u =
          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
      const int coeff_v =
          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
    }

    // Always write the first 4 remainder samples; when the remainder is 6,
    // write 2 more.
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    if (chroma_width_remainder == 6) {
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
    }

    // Advance the cursors one chroma row; luma advances by the vertical
    // subsampling factor.
    luma_grain += kLumaWidth << subsampling_y;
    u_grain += chroma_width;
    v_grain += chroma_width;
  } while (++y < chroma_height);
#undef ACCUMULATE_WEIGHTED_GRAIN
#undef WRITE_AUTO_REGRESSION_RESULT
}
449
// Applies an auto-regressive filter to the white noise in luma_grain.
// Each row is processed 8 samples at a time; the final 4 samples of the row
// get a reduced iteration. The top/left/right kAutoRegressionBorder samples
// are read as filter inputs but never written.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
                                               void* luma_grain_buffer) {
  static_assert(auto_regression_coeff_lag > 0, "");
  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
  const uint8_t auto_regression_shift = params.auto_regression_shift;

  // Skip the top border rows.
  int y = kAutoRegressionBorder;
  auto* luma_grain =
      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, 4 values remain to write. They are given a special
    // reduced iteration at the end.
    int x = kAutoRegressionBorder;
    do {
      // |pos| indexes auto_regression_coeff_y as the filter support is
      // traversed in raster order.
      int pos = 0;
      int32x4x2_t sum;
      SetZero(&sum);
      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
           ++delta_row) {
        // These loads may overflow to the next row, but they are never called
        // on the final row of a grain block. Therefore, they will never exceed
        // the block boundaries.
        const int16x8_t src_grain_lo =
            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                             auto_regression_coeff_lag);
        const int16x8_t src_grain_hi =
            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                             auto_regression_coeff_lag + 8);

        // A pictorial representation of the auto-regressive filter for
        // various values of params.auto_regression_coeff_lag. The letter 'O'
        // represents the current sample. (The filter always operates on the
        // current sample with filter coefficient 1.) The letters 'X'
        // represent the neighboring samples that the filter operates on, below
        // their corresponding "offset" number.
        //
        // params.auto_regression_coeff_lag == 3:
        //    0 1 2 3 4 5 6
        //    X X X X X X X
        //    X X X X X X X
        //    X X X X X X X
        //    X X X O
        // params.auto_regression_coeff_lag == 2:
        //      0 1 2 3 4
        //      X X X X X
        //      X X X X X
        //      X X O
        // params.auto_regression_coeff_lag == 1:
        //        0 1 2
        //        X X X
        //        X O
        // params.auto_regression_coeff_lag == 0:
        //          O
        // The function relies on the caller to skip the call in the 0 lag
        // case.

// Accumulates the tap at horizontal |offset| within the sliding window into
// the running sum, advancing |pos| to the next coefficient.
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
                                        auto_regression_coeff_y[pos++], sum)
        ACCUMULATE_WEIGHTED_GRAIN(0);
        ACCUMULATE_WEIGHTED_GRAIN(1);
        ACCUMULATE_WEIGHTED_GRAIN(2);
        // The horizontal |auto_regression_coeff_lag| loop is replaced with
        // if-statements to give vextq_s16 an immediate param.
        if (auto_regression_coeff_lag > 1) {
          ACCUMULATE_WEIGHTED_GRAIN(3);
          ACCUMULATE_WEIGHTED_GRAIN(4);
        }
        if (auto_regression_coeff_lag > 2) {
          assert(auto_regression_coeff_lag == 3);
          ACCUMULATE_WEIGHTED_GRAIN(5);
          ACCUMULATE_WEIGHTED_GRAIN(6);
        }
      }
// At this point in the filter, the source addresses and destination
// addresses overlap. Because this is an auto-regressive filter, the
// higher lanes cannot be computed without the results of the lower lanes.
// Each call to WriteFinalAutoRegression incorporates preceding values
// on the final row, and writes a single sample. This allows the next
// pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
      auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);
      x += 8;
      // Leave the final four pixels for the special iteration below.
    } while (x < kLumaWidth - kAutoRegressionBorder - 4);

    // Final 4 pixels in the row.
    int pos = 0;
    int32x4x2_t sum;
    SetZero(&sum);
    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      const int16x8_t src_grain_lo = GetSignedSource8(
          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
      const int16x8_t src_grain_hi =
          GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
                           auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }
    // delta_row == 0
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    luma_grain += kLumaWidth;
  } while (++y < kLumaHeight);

#undef WRITE_AUTO_REGRESSION_RESULT
#undef ACCUMULATE_WEIGHTED_GRAIN
}
588
// Builds the piecewise-linear scaling lookup table from |num_points| control
// points given by |point_value| (x) and |point_scaling| (y). Between control
// points, values are linearly interpolated in 16.16 fixed point. In 10bpp the
// table is 4x as long: each interpolated value is expanded into 4 sub-step
// entries.
template <int bitdepth>
void InitializeScalingLookupTable_NEON(int num_points,
                                       const uint8_t point_value[],
                                       const uint8_t point_scaling[],
                                       int16_t* scaling_lut,
                                       const int scaling_lut_length) {
  static_assert(bitdepth < kBitdepth12,
                "NEON Scaling lookup table only supports 8bpp and 10bpp.");
  if (num_points == 0) {
    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
    return;
  }
  static_assert(sizeof(scaling_lut[0]) == 2, "");
  // Fill everything left of the first control point with its scaling value.
  Memset(scaling_lut, point_scaling[0],
         (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
  const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
  // 0.5 in 16.16 fixed point, for round-to-nearest on the >> 16 below.
  const int32x4_t rounding = vdupq_n_s32(32768);
  for (int i = 0; i < num_points - 1; ++i) {
    const int delta_y = point_scaling[i + 1] - point_scaling[i];
    const int delta_x = point_value[i + 1] - point_value[i];
    // |delta| corresponds to b, for the function y = a + b*x.
    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
    const int delta4 = delta << 2;
    // vmull_n_u16 will not work here because |delta| typically exceeds the
    // range of uint16_t.
    int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
    const int32x4_t line_increment4 = vdupq_n_s32(delta4);
    // Get the second set of 4 points by adding 4 steps to the first set.
    int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
    // We obtain the next set of 8 points by adding 8 steps to each of the
    // current 8 points.
    const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
    const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
    int x = 0;
    // Derive and write 8 values (or 32 values, for 10bpp).
    do {
      const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
      const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
      const int16x8_t interp_points =
          vcombine_s16(interp_points0, interp_points1);
      // The spec guarantees that the max value of |point_value[i]| + x is 255.
      // Writing 8 values starting at the final table byte, leaves 7 values of
      // required padding.
      const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
      const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
      if (bitdepth == kBitdepth10) {
        // The next interpolated value, needed as the right neighbor of the
        // last lane when expanding to sub-steps.
        const int16x8_t next_val = vaddq_s16(
            base_point,
            vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
        const int16x8_t start = full_interp;
        const int16x8_t end = vextq_s16(full_interp, next_val, 1);
        // Expand each value into 4 sub-step entries toward its neighbor:
        // lut[i << 2] = start;
        // lut[(i << 2) + 1] = start + RightShiftWithRounding(end - start, 2)
        // lut[(i << 2) + 2] = start +
        //                     RightShiftWithRounding(2 * (end - start), 2)
        // lut[(i << 2) + 3] = start +
        //                     RightShiftWithRounding(3 * (end - start), 2)
        // Named |diff| (not |delta|) to avoid shadowing the scalar |delta|
        // above, which is still referenced to compute |next_val|.
        const int16x8_t diff = vsubq_s16(end, start);
        const int16x8_t double_diff = vshlq_n_s16(diff, 1);
        const int16x8_t diff2 = vrshrq_n_s16(double_diff, 2);
        const int16x8_t diff3 = vrshrq_n_s16(vaddq_s16(diff, double_diff), 2);
        const int16x8x4_t result = {
            start, vaddq_s16(start, vrshrq_n_s16(diff, 2)),
            vaddq_s16(start, diff2), vaddq_s16(start, diff3)};
        Store4QMsanS16(&scaling_lut[x_base], result);
      } else {
        vst1q_s16(&scaling_lut[x_base], full_interp);
      }
      upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
      upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
      x += 8;
    } while (x < delta_x);
  }
  // Fill everything right of the last control point with its scaling value.
  const int16_t last_point_value = point_value[num_points - 1];
  const int x_base = last_point_value << (bitdepth - kBitdepth8);
  Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
         scaling_lut_length - x_base);
  if (bitdepth == kBitdepth10 && x_base > 0) {
    // Re-derive the 3 sub-step entries just before the last control point so
    // they ramp into its exact scaling value.
    const int start = scaling_lut[x_base - 4];
    const int end = point_scaling[num_points - 1];
    const int delta = end - start;
    scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
    scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
    scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
  }
}
676
// Vector clamp: limits each lane of |value| to the range [low, high].
inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
                       const int16x8_t high) {
  return vmaxq_s16(low, vminq_s16(high, value));
}
682
// Gathers the scaling factor for each of 8 source pixels from |scaling_lut|.
// Only the first |valid_range| lanes are looked up; the rest are undefined
// (or zeroed under MemorySanitizer).
template <int bitdepth, typename Pixel>
inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
                                   const Pixel* source,
                                   const int valid_range = 8) {
  static_assert(bitdepth <= kBitdepth10,
                "NEON Film Grain is not yet implemented for 12bpp.");
  int16_t lut_values[8];
#if LIBGAV1_MSAN
  // Initialize the unread tail so the vector load is fully defined.
  if (valid_range < 8) memset(lut_values, 0, sizeof(lut_values));
#endif
  for (int i = 0; i < valid_range; ++i) {
    assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
    lut_values[i] = scaling_lut[source[i]];
  }
  return vld1q_s16(lut_values);
}
699
// Multiplies grain noise by its per-pixel scaling factor and shifts the
// product down by the scaling shift.
template <int bitdepth>
inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
                            const int16x8_t scaling_shift_vect) {
  if (bitdepth == kBitdepth8) {
    // In 8bpp the product fits in int16, so multiply then round-shift down
    // (|scaling_shift_vect| holds a negative shift for vrshlq_s16).
    return vrshlq_s16(vmulq_s16(noise, scaling), scaling_shift_vect);
  }
  // Scaling shift is in the range [8, 11]. The doubling multiply returning
  // high half is equivalent to a right shift by 15, so |scaling_shift_vect|
  // should provide a left shift equal to 15 - s, where s is the original
  // shift parameter.
  const int16x8_t scaling_up = vshlq_s16(scaling, scaling_shift_vect);
  return vqrdmulhq_s16(noise, scaling_up);
}
714
// Blends the luma noise image into the source luma plane. For each pixel, a
// scaling factor is looked up from |scaling_lut_y| indexed by the source luma
// value, the corresponding noise sample is scaled by it, added to the source,
// and the result is clipped to [min_value, max_luma] before being stored to
// the destination plane.
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageLuma_NEON(
    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
    int scaling_shift, int width, int height, int start_height,
    const int16_t* scaling_lut_y, const void* source_plane_y,
    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
  const auto* noise_image =
      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
  // Strides arrive in bytes; convert to pixel units.
  source_stride_y /= sizeof(Pixel);
  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
  dest_stride_y /= sizeof(Pixel);
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_luma);
  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
  // for 16 bit signed integers. In higher bitdepths, however, we have to
  // expand to 32 to protect the sign bit.
  const int16x8_t scaling_shift_vect = vdupq_n_s16(
      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);

  // The main loop consumes 16 pixels per pass (two 8-lane batches), so round
  // the width down to a multiple of 16.
  const int safe_width = width & ~15;
  int y = 0;
  do {
    int x = 0;
    for (; x + 8 <= safe_width; x += 8) {
      // This operation on the unsigned input is safe in 8bpp because the vector
      // is widened before it is reinterpreted.
      const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
      const int16x8_t scaling0 =
          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
      int16x8_t noise =
          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));

      noise = ScaleNoise<bitdepth>(noise, scaling0, scaling_shift_vect);
      const int16x8_t combined0 = vaddq_s16(orig0, noise);
      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
      // clipping with vqmovun_s16, but it's not likely to be worth copying the
      // function for just that case, though the gain would be very small.
      StoreUnsigned8(&out_y_row[x],
                     vreinterpretq_u16_s16(Clip3(combined0, floor, ceiling)));
      // Advance to the second 8-lane batch of this 16-pixel pass.
      x += 8;

      // This operation on the unsigned input is safe in 8bpp because the vector
      // is widened before it is reinterpreted.
      const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
      const int16x8_t scaling1 =
          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
      noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));

      noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
      const int16x8_t combined1 = vaddq_s16(orig1, noise);
      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
      // clipping with vqmovun_s16, but it's not likely to be worth copying the
      // function for just that case, though the gain would be very small.
      StoreUnsigned8(&out_y_row[x],
                     vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
    }

    // Tail: up to 15 pixels remain past |safe_width|.
    if (x < width) {
      assert(width - x < 16);
      // If at least 8 full pixels remain, process one more full batch.
      if (x < width - 8) {
        const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
        const int16x8_t scaling =
            GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
        int16x8_t noise =
            GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));

        noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
        const int16x8_t combined = vaddq_s16(orig, noise);
        // In 8bpp, when params_.clip_to_restricted_range == false, we can
        // replace clipping with vqmovun_s16, but it's not likely to be worth
        // copying the function for just that case, though the gain would be
        // very small.
        StoreUnsigned8(&out_y_row[x],
                       vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
        x += 8;
      }
      // Final partial batch: restrict both the source load and the lookup
      // loop to the valid pixels so msan does not flag the padding lanes.
      const int valid_range_pixels = width - x;
      const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
      const int16x8_t orig =
          GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
          scaling_lut_y, &in_y_row[x], valid_range_pixels);
      int16x8_t noise =
          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
      noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);

      const int16x8_t combined = vaddq_s16(orig, noise);
      StoreUnsigned8(&out_y_row[x],
                     vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
    }
    in_y_row += source_stride_y;
    out_y_row += dest_stride_y;
  } while (++y < height);
}
810
// Blends one 8-lane batch of chroma pixels with their scaled noise samples
// and returns the (unclipped) sums.
template <int bitdepth, typename GrainType, typename Pixel>
inline int16x8_t BlendChromaValsWithCfl(
    const Pixel* LIBGAV1_RESTRICT chroma_cursor,
    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
    const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
  const int16x8_t chroma = GetSignedSource8(chroma_cursor);
  const int16x8_t raw_noise = GetSignedSource8(noise_image_cursor);
  const int16x8_t scaled_noise =
      ScaleNoise<bitdepth>(raw_noise, scaling, scaling_shift_vect);
  return vaddq_s16(chroma, scaled_noise);
}
821
822 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaPlaneWithCfl_NEON(const Array2D<GrainType> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,const int16_t * LIBGAV1_RESTRICT scaling_lut,const Pixel * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const Pixel * in_chroma_row,ptrdiff_t source_stride_chroma,Pixel * out_chroma_row,ptrdiff_t dest_stride)823 LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
824 const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
825 int width, int height, int start_height, int subsampling_x,
826 int subsampling_y, int scaling_shift,
827 const int16_t* LIBGAV1_RESTRICT scaling_lut,
828 const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
829 const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
830 Pixel* out_chroma_row, ptrdiff_t dest_stride) {
831 const int16x8_t floor = vdupq_n_s16(min_value);
832 const int16x8_t ceiling = vdupq_n_s16(max_chroma);
833 Pixel luma_buffer[16];
834 // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
835 // for 16 bit signed integers. In higher bitdepths, however, we have to
836 // expand to 32 to protect the sign bit.
837 const int16x8_t scaling_shift_vect = vdupq_n_s16(
838 (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
839
840 const int chroma_height = (height + subsampling_y) >> subsampling_y;
841 const int chroma_width = (width + subsampling_x) >> subsampling_x;
842 const int safe_chroma_width = chroma_width & ~7;
843
844 // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
845 // in GetScalingFactors.
846 Pixel average_luma_buffer[8];
847 assert(start_height % 2 == 0);
848 start_height >>= subsampling_y;
849 int y = 0;
850 do {
851 int x = 0;
852 for (; x + 8 <= safe_chroma_width; x += 8) {
853 const int luma_x = x << subsampling_x;
854 const uint16x8_t average_luma =
855 GetAverageLuma(&in_y_row[luma_x], subsampling_x);
856 StoreUnsigned8(average_luma_buffer, average_luma);
857
858 const int16x8_t scaling =
859 GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
860 const int16x8_t blended =
861 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
862 &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
863 scaling_shift_vect);
864
865 // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
866 // clipping with vqmovun_s16, but it's not likely to be worth copying the
867 // function for just that case.
868 StoreUnsigned8(&out_chroma_row[x],
869 vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
870 }
871
872 if (x < chroma_width) {
873 const int luma_x = x << subsampling_x;
874 const int valid_range_pixels = width - luma_x;
875 const int valid_range_chroma_pixels = chroma_width - x;
876 const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
877 assert(valid_range_pixels < 16);
878 memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
879 luma_buffer[valid_range_pixels] = in_y_row[width - 1];
880 const uint16x8_t average_luma = GetAverageLumaMsan(
881 luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
882
883 StoreUnsigned8(average_luma_buffer, average_luma);
884
885 const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
886 scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
887 const int16x8_t blended =
888 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
889 &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
890 scaling_shift_vect);
891 // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
892 // clipping with vqmovun_s16, but it's not likely to be worth copying the
893 // function for just that case.
894 StoreUnsigned8(&out_chroma_row[x],
895 vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
896 }
897
898 in_y_row += source_stride_y << subsampling_y;
899 in_chroma_row += source_stride_chroma;
900 out_chroma_row += dest_stride;
901 } while (++y < chroma_height);
902 }
903
904 // This function is for the case params_.chroma_scaling_from_luma == true.
905 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
906 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageChromaWithCfl_NEON(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * LIBGAV1_RESTRICT scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)907 void BlendNoiseWithImageChromaWithCfl_NEON(
908 Plane plane, const FilmGrainParams& params,
909 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
910 int width, int height, int start_height, int subsampling_x,
911 int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
912 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
913 const void* source_plane_uv, ptrdiff_t source_stride_uv,
914 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
915 const auto* noise_image =
916 static_cast<const Array2D<GrainType>*>(noise_image_ptr);
917 const auto* in_y = static_cast<const Pixel*>(source_plane_y);
918 source_stride_y /= sizeof(Pixel);
919
920 const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
921 source_stride_uv /= sizeof(Pixel);
922 auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
923 dest_stride_uv /= sizeof(Pixel);
924 // Looping over one plane at a time is faster in higher resolutions, despite
925 // re-computing luma.
926 BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
927 noise_image[plane], min_value, max_chroma, width, height, start_height,
928 subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
929 source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
930 }
931
932 } // namespace
933
934 namespace low_bitdepth {
935 namespace {
936
BlendChromaValsNoCfl(const int16_t * LIBGAV1_RESTRICT scaling_lut,const int16x8_t orig,const int8_t * LIBGAV1_RESTRICT noise_image_cursor,const int16x8_t & average_luma,const int16x8_t & scaling_shift_vect,const int16x8_t & offset,int luma_multiplier,int chroma_multiplier,bool restrict_scaling_lookup,int valid_range_pixels=0)937 inline int16x8_t BlendChromaValsNoCfl(
938 const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
939 const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
940 const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
941 const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
942 bool restrict_scaling_lookup, int valid_range_pixels = 0) {
943 uint8_t merged_buffer[8];
944 const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
945 const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
946 // Maximum value of |combined_u| is 127*255 = 0x7E81.
947 const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
948 // Maximum value of u_offset is (255 << 5) = 0x1FE0.
949 // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
950 const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
951 vst1_u8(merged_buffer, merged);
952
953 const int16x8_t scaling =
954 restrict_scaling_lookup
955 ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
956 valid_range_pixels)
957 : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
958 int16x8_t noise = GetSignedSource8(noise_image_cursor);
959 noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
960 return vaddq_s16(orig, noise);
961 }
962
BlendChromaPlane8bpp_NEON(const Array2D<int8_t> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,int chroma_offset,int chroma_multiplier,int luma_multiplier,const int16_t * LIBGAV1_RESTRICT scaling_lut,const uint8_t * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const uint8_t * in_chroma_row,ptrdiff_t source_stride_chroma,uint8_t * out_chroma_row,ptrdiff_t dest_stride)963 LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
964 const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
965 int width, int height, int start_height, int subsampling_x,
966 int subsampling_y, int scaling_shift, int chroma_offset,
967 int chroma_multiplier, int luma_multiplier,
968 const int16_t* LIBGAV1_RESTRICT scaling_lut,
969 const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
970 const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
971 uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
972 const int16x8_t floor = vdupq_n_s16(min_value);
973 const int16x8_t ceiling = vdupq_n_s16(max_chroma);
974 // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
975 // for 16 bit signed integers. In higher bitdepths, however, we have to
976 // expand to 32 to protect the sign bit.
977 const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
978
979 const int chroma_height = (height + subsampling_y) >> subsampling_y;
980 const int chroma_width = (width + subsampling_x) >> subsampling_x;
981 const int safe_chroma_width = chroma_width & ~7;
982 uint8_t luma_buffer[16];
983 const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
984
985 start_height >>= subsampling_y;
986 int y = 0;
987 do {
988 int x = 0;
989 for (; x + 8 <= safe_chroma_width; x += 8) {
990 const int luma_x = x << subsampling_x;
991 const int valid_range_chroma_pixels = chroma_width - x;
992
993 const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
994 const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
995 &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
996 const int16x8_t blended = BlendChromaValsNoCfl(
997 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
998 average_luma, scaling_shift_vect, offset, luma_multiplier,
999 chroma_multiplier, /*restrict_scaling_lookup=*/false);
1000 // In 8bpp, when params_.clip_to_restricted_range == false, we can
1001 // replace clipping with vqmovun_s16, but the gain would be small.
1002 StoreUnsigned8(&out_chroma_row[x],
1003 vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
1004 }
1005
1006 if (x < chroma_width) {
1007 // Begin right edge iteration. Same as the normal iterations, but the
1008 // |average_luma| computation requires a duplicated luma value at the
1009 // end.
1010 const int luma_x = x << subsampling_x;
1011 const int valid_range_pixels = width - luma_x;
1012 const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
1013 assert(valid_range_pixels < 16);
1014 memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
1015 luma_buffer[valid_range_pixels] = in_y_row[width - 1];
1016 const int valid_range_chroma_pixels = chroma_width - x;
1017
1018 const int16x8_t orig_chroma =
1019 GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
1020 const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
1021 luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
1022 const int16x8_t blended = BlendChromaValsNoCfl(
1023 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
1024 average_luma, scaling_shift_vect, offset, luma_multiplier,
1025 chroma_multiplier, /*restrict_scaling_lookup=*/true,
1026 valid_range_chroma_pixels);
1027 StoreUnsigned8(&out_chroma_row[x],
1028 vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
1029 // End of right edge iteration.
1030 }
1031
1032 in_y_row += source_stride_y << subsampling_y;
1033 in_chroma_row += source_stride_chroma;
1034 out_chroma_row += dest_stride;
1035 } while (++y < chroma_height);
1036 }
1037
1038 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma8bpp_NEON(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * LIBGAV1_RESTRICT scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)1039 void BlendNoiseWithImageChroma8bpp_NEON(
1040 Plane plane, const FilmGrainParams& params,
1041 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
1042 int width, int height, int start_height, int subsampling_x,
1043 int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
1044 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
1045 const void* source_plane_uv, ptrdiff_t source_stride_uv,
1046 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
1047 assert(plane == kPlaneU || plane == kPlaneV);
1048 const auto* noise_image =
1049 static_cast<const Array2D<int8_t>*>(noise_image_ptr);
1050 const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
1051 const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
1052 auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
1053
1054 const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
1055 const int luma_multiplier =
1056 (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
1057 const int multiplier =
1058 (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
1059 BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
1060 height, start_height, subsampling_x, subsampling_y,
1061 params.chroma_scaling, offset, multiplier,
1062 luma_multiplier, scaling_lut, in_y, source_stride_y,
1063 in_uv, source_stride_uv, out_uv, dest_stride_uv);
1064 }
1065
WriteOverlapLine8bpp_NEON(const int8_t * LIBGAV1_RESTRICT noise_stripe_row,const int8_t * LIBGAV1_RESTRICT noise_stripe_row_prev,int plane_width,const int8x8_t grain_coeff,const int8x8_t old_coeff,int8_t * LIBGAV1_RESTRICT noise_image_row)1066 inline void WriteOverlapLine8bpp_NEON(
1067 const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
1068 const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
1069 const int8x8_t grain_coeff, const int8x8_t old_coeff,
1070 int8_t* LIBGAV1_RESTRICT noise_image_row) {
1071 int x = 0;
1072 do {
1073 // Note that these reads may exceed noise_stripe_row's width by up to 7
1074 // bytes.
1075 const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
1076 const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
1077 const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
1078 const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
1079 // Note that this write may exceed noise_image_row's width by up to 7 bytes.
1080 vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
1081 x += 8;
1082 } while (x < plane_width);
1083 }
1084
ConstructNoiseImageOverlap8bpp_NEON(const void * LIBGAV1_RESTRICT noise_stripes_buffer,int width,int height,int subsampling_x,int subsampling_y,void * LIBGAV1_RESTRICT noise_image_buffer)1085 void ConstructNoiseImageOverlap8bpp_NEON(
1086 const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
1087 int subsampling_x, int subsampling_y,
1088 void* LIBGAV1_RESTRICT noise_image_buffer) {
1089 const auto* noise_stripes =
1090 static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
1091 auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
1092 const int plane_width = (width + subsampling_x) >> subsampling_x;
1093 const int plane_height = (height + subsampling_y) >> subsampling_y;
1094 const int stripe_height = 32 >> subsampling_y;
1095 const int stripe_mask = stripe_height - 1;
1096 int y = stripe_height;
1097 int luma_num = 1;
1098 if (subsampling_y == 0) {
1099 const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
1100 const int8x8_t first_row_old_coeff = vdup_n_s8(27);
1101 const int8x8_t second_row_grain_coeff = first_row_old_coeff;
1102 const int8x8_t second_row_old_coeff = first_row_grain_coeff;
1103 for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
1104 const int8_t* noise_stripe = (*noise_stripes)[luma_num];
1105 const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1106 WriteOverlapLine8bpp_NEON(
1107 noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
1108 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1109
1110 WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
1111 &noise_stripe_prev[(32 + 1) * plane_width],
1112 plane_width, second_row_grain_coeff,
1113 second_row_old_coeff, (*noise_image)[y + 1]);
1114 }
1115 // Either one partial stripe remains (remaining_height > 0),
1116 // OR image is less than one stripe high (remaining_height < 0),
1117 // OR all stripes are completed (remaining_height == 0).
1118 const int remaining_height = plane_height - y;
1119 if (remaining_height <= 0) {
1120 return;
1121 }
1122 const int8_t* noise_stripe = (*noise_stripes)[luma_num];
1123 const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1124 WriteOverlapLine8bpp_NEON(
1125 noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
1126 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1127
1128 if (remaining_height > 1) {
1129 WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
1130 &noise_stripe_prev[(32 + 1) * plane_width],
1131 plane_width, second_row_grain_coeff,
1132 second_row_old_coeff, (*noise_image)[y + 1]);
1133 }
1134 } else { // subsampling_y == 1
1135 const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
1136 const int8x8_t first_row_old_coeff = vdup_n_s8(23);
1137 for (; y < plane_height; ++luma_num, y += stripe_height) {
1138 const int8_t* noise_stripe = (*noise_stripes)[luma_num];
1139 const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1140 WriteOverlapLine8bpp_NEON(
1141 noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
1142 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1143 }
1144 }
1145 }
1146
Init8bpp()1147 void Init8bpp() {
1148 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1149 assert(dsp != nullptr);
1150
1151 // LumaAutoRegressionFunc
1152 dsp->film_grain.luma_auto_regression[0] =
1153 ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 1>;
1154 dsp->film_grain.luma_auto_regression[1] =
1155 ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 2>;
1156 dsp->film_grain.luma_auto_regression[2] =
1157 ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 3>;
1158
1159 // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
1160 // Chroma autoregression should never be called when lag is 0 and use_luma
1161 // is false.
1162 dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
1163 dsp->film_grain.chroma_auto_regression[0][1] =
1164 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
1165 false>;
1166 dsp->film_grain.chroma_auto_regression[0][2] =
1167 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
1168 false>;
1169 dsp->film_grain.chroma_auto_regression[0][3] =
1170 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
1171 false>;
1172 dsp->film_grain.chroma_auto_regression[1][0] =
1173 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0, true>;
1174 dsp->film_grain.chroma_auto_regression[1][1] =
1175 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1, true>;
1176 dsp->film_grain.chroma_auto_regression[1][2] =
1177 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2, true>;
1178 dsp->film_grain.chroma_auto_regression[1][3] =
1179 ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3, true>;
1180
1181 dsp->film_grain.construct_noise_image_overlap =
1182 ConstructNoiseImageOverlap8bpp_NEON;
1183
1184 dsp->film_grain.initialize_scaling_lut =
1185 InitializeScalingLookupTable_NEON<kBitdepth8>;
1186
1187 dsp->film_grain.blend_noise_luma =
1188 BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
1189 dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
1190 dsp->film_grain.blend_noise_chroma[1] =
1191 BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
1192 }
1193
1194 } // namespace
1195 } // namespace low_bitdepth
1196
1197 #if LIBGAV1_MAX_BITDEPTH >= 10
1198 namespace high_bitdepth {
1199 namespace {
1200
WriteOverlapLine10bpp_NEON(const int16_t * LIBGAV1_RESTRICT noise_stripe_row,const int16_t * LIBGAV1_RESTRICT noise_stripe_row_prev,int plane_width,const int16x8_t grain_coeff,const int16x8_t old_coeff,int16_t * LIBGAV1_RESTRICT noise_image_row)1201 inline void WriteOverlapLine10bpp_NEON(
1202 const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
1203 const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
1204 const int16x8_t grain_coeff, const int16x8_t old_coeff,
1205 int16_t* LIBGAV1_RESTRICT noise_image_row) {
1206 int x = 0;
1207 do {
1208 // Note that these reads may exceed noise_stripe_row's width by up to 7
1209 // values.
1210 const int16x8_t source_grain = vld1q_s16(noise_stripe_row + x);
1211 const int16x8_t source_old = vld1q_s16(noise_stripe_row_prev + x);
1212 // Maximum product is 511 * 27 = 0x35E5.
1213 const int16x8_t weighted_grain = vmulq_s16(grain_coeff, source_grain);
1214 // Maximum sum is 511 * (22 + 23) = 0x59D3.
1215 const int16x8_t grain_sum =
1216 vmlaq_s16(weighted_grain, old_coeff, source_old);
1217 // Note that this write may exceed noise_image_row's width by up to 7
1218 // values.
1219 const int16x8_t grain = Clip3S16(vrshrq_n_s16(grain_sum, 5),
1220 vdupq_n_s16(GetGrainMin<kBitdepth10>()),
1221 vdupq_n_s16(GetGrainMax<kBitdepth10>()));
1222 vst1q_s16(noise_image_row + x, grain);
1223 x += 8;
1224 } while (x < plane_width);
1225 }
1226
ConstructNoiseImageOverlap10bpp_NEON(const void * LIBGAV1_RESTRICT noise_stripes_buffer,int width,int height,int subsampling_x,int subsampling_y,void * LIBGAV1_RESTRICT noise_image_buffer)1227 void ConstructNoiseImageOverlap10bpp_NEON(
1228 const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
1229 int subsampling_x, int subsampling_y,
1230 void* LIBGAV1_RESTRICT noise_image_buffer) {
1231 const auto* noise_stripes =
1232 static_cast<const Array2DView<int16_t>*>(noise_stripes_buffer);
1233 auto* noise_image = static_cast<Array2D<int16_t>*>(noise_image_buffer);
1234 const int plane_width = (width + subsampling_x) >> subsampling_x;
1235 const int plane_height = (height + subsampling_y) >> subsampling_y;
1236 const int stripe_height = 32 >> subsampling_y;
1237 const int stripe_mask = stripe_height - 1;
1238 int y = stripe_height;
1239 int luma_num = 1;
1240 if (subsampling_y == 0) {
1241 const int16x8_t first_row_grain_coeff = vdupq_n_s16(17);
1242 const int16x8_t first_row_old_coeff = vdupq_n_s16(27);
1243 const int16x8_t second_row_grain_coeff = first_row_old_coeff;
1244 const int16x8_t second_row_old_coeff = first_row_grain_coeff;
1245 for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
1246 const int16_t* noise_stripe = (*noise_stripes)[luma_num];
1247 const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1248 WriteOverlapLine10bpp_NEON(
1249 noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
1250 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1251
1252 WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
1253 &noise_stripe_prev[(32 + 1) * plane_width],
1254 plane_width, second_row_grain_coeff,
1255 second_row_old_coeff, (*noise_image)[y + 1]);
1256 }
1257 // Either one partial stripe remains (remaining_height > 0),
1258 // OR image is less than one stripe high (remaining_height < 0),
1259 // OR all stripes are completed (remaining_height == 0).
1260 const int remaining_height = plane_height - y;
1261 if (remaining_height <= 0) {
1262 return;
1263 }
1264 const int16_t* noise_stripe = (*noise_stripes)[luma_num];
1265 const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1266 WriteOverlapLine10bpp_NEON(
1267 noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
1268 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1269
1270 if (remaining_height > 1) {
1271 WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
1272 &noise_stripe_prev[(32 + 1) * plane_width],
1273 plane_width, second_row_grain_coeff,
1274 second_row_old_coeff, (*noise_image)[y + 1]);
1275 }
1276 } else { // subsampling_y == 1
1277 const int16x8_t first_row_grain_coeff = vdupq_n_s16(22);
1278 const int16x8_t first_row_old_coeff = vdupq_n_s16(23);
1279 for (; y < plane_height; ++luma_num, y += stripe_height) {
1280 const int16_t* noise_stripe = (*noise_stripes)[luma_num];
1281 const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
1282 WriteOverlapLine10bpp_NEON(
1283 noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
1284 first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
1285 }
1286 }
1287 }
1288
BlendChromaValsNoCfl(const int16_t * LIBGAV1_RESTRICT scaling_lut,const int16x8_t orig,const int16_t * LIBGAV1_RESTRICT noise_image_cursor,const int16x8_t & average_luma,const int16x8_t & scaling_shift_vect,const int32x4_t & offset,int luma_multiplier,int chroma_multiplier,bool restrict_scaling_lookup,int valid_range_pixels=0)1289 inline int16x8_t BlendChromaValsNoCfl(
1290 const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
1291 const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
1292 const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
1293 const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
1294 bool restrict_scaling_lookup, int valid_range_pixels = 0) {
1295 uint16_t merged_buffer[8];
1296 const int32x4_t weighted_luma_low =
1297 vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
1298 const int32x4_t weighted_luma_high =
1299 vmull_n_s16(vget_high_s16(average_luma), luma_multiplier);
1300 // Maximum value of combined is 127 * 1023 = 0x1FB81.
1301 const int32x4_t combined_low =
1302 vmlal_n_s16(weighted_luma_low, vget_low_s16(orig), chroma_multiplier);
1303 const int32x4_t combined_high =
1304 vmlal_n_s16(weighted_luma_high, vget_high_s16(orig), chroma_multiplier);
1305 // Maximum value of offset is (255 << 8) = 0xFF00. Offset may be negative.
1306 const uint16x4_t merged_low =
1307 vqshrun_n_s32(vaddq_s32(offset, combined_low), 6);
1308 const uint16x4_t merged_high =
1309 vqshrun_n_s32(vaddq_s32(offset, combined_high), 6);
1310 const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
1311 vst1q_u16(merged_buffer,
1312 vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
1313 const int16x8_t scaling =
1314 restrict_scaling_lookup
1315 ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
1316 valid_range_pixels)
1317 : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
1318 merged_buffer);
1319 const int16x8_t noise = GetSignedSource8(noise_image_cursor);
1320 const int16x8_t scaled_noise =
1321 ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
1322 return vaddq_s16(orig, scaled_noise);
1323 }
1324
// Blends film grain noise into one chroma plane for 10bpp, for the case
// where chroma scaling is not derived from luma alone: the scaling lookup
// index combines the averaged collocated luma with the chroma sample itself
// (see BlendChromaValsNoCfl). Strides are in bytes.
LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
    const Array2D<int16_t>& noise_image, int min_value, int max_chroma,
    int width, int height, int start_height, int subsampling_x,
    int subsampling_y, int scaling_shift, int chroma_offset,
    int chroma_multiplier, int luma_multiplier,
    const int16_t* LIBGAV1_RESTRICT scaling_lut,
    const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
    const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma,
    uint16_t* out_chroma_row, ptrdiff_t dest_stride) {
  // Clamp bounds for the blended output.
  const int16x8_t floor = vdupq_n_s16(min_value);
  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
  // Shift operand handed to ScaleNoise<kBitdepth10>; the 15 - scaling_shift
  // form matches that helper's fixed-point convention.
  const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift);

  // Chroma plane dimensions after subsampling (rounded up).
  const int chroma_height = (height + subsampling_y) >> subsampling_y;
  const int chroma_width = (width + subsampling_x) >> subsampling_x;
  // Widest multiple of 8 that full vector iterations may cover.
  const int safe_chroma_width = chroma_width & ~7;
  // Scratch for the right-edge iteration, where the final luma sample must
  // be duplicated before averaging.
  uint16_t luma_buffer[16];
  // Offset is added before downshifting in order to take advantage of
  // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
  const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));

  start_height >>= subsampling_y;
  int y = 0;
  do {
    int x = 0;
    // Full 8-wide iterations over the safely loadable part of the row.
    for (; x + 8 <= safe_chroma_width; x += 8) {
      const int luma_x = x << subsampling_x;
      const int16x8_t average_luma = vreinterpretq_s16_u16(
          GetAverageLuma(&in_y_row[luma_x], subsampling_x));
      const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          average_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/false);
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
    }

    if (x < chroma_width) {
      // Begin right edge iteration. Same as the normal iterations, but the
      // |average_luma| computation requires a duplicated luma value at the
      // end.
      const int luma_x = x << subsampling_x;
      const int valid_range_pixels = width - luma_x;
      const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
      // luma_buffer has room for 16 samples; the remainder is < 16 pixels.
      assert(valid_range_pixels < 16);
      memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
      // Duplicate the last in-bounds luma sample so averaging has a pair.
      luma_buffer[valid_range_pixels] = in_y_row[width - 1];
      const int valid_range_chroma_pixels = chroma_width - x;
      const int valid_range_chroma_bytes =
          (chroma_width - x) * sizeof(in_chroma_row[0]);
      // The Msan load variants are told how much of the vector read is
      // backed by initialized data.
      const int16x8_t orig_chroma =
          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);

      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
      const int16x8_t blended = BlendChromaValsNoCfl(
          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
          average_luma, scaling_shift_vect, offset, luma_multiplier,
          chroma_multiplier, /*restrict_scaling_lookup=*/true,
          valid_range_chroma_pixels);
      StoreUnsigned8(&out_chroma_row[x],
                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
      // End of right edge iteration.
    }

    // Strides are byte counts; luma advances by a doubled stride when the
    // plane is vertically subsampled.
    in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y);
    in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma);
    out_chroma_row = AddByteStride(out_chroma_row, dest_stride);
  } while (++y < chroma_height);
}
1396
1397 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma10bpp_NEON(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * LIBGAV1_RESTRICT scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)1398 void BlendNoiseWithImageChroma10bpp_NEON(
1399 Plane plane, const FilmGrainParams& params,
1400 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
1401 int width, int height, int start_height, int subsampling_x,
1402 int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
1403 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
1404 const void* source_plane_uv, ptrdiff_t source_stride_uv,
1405 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
1406 assert(plane == kPlaneU || plane == kPlaneV);
1407 const auto* noise_image =
1408 static_cast<const Array2D<int16_t>*>(noise_image_ptr);
1409 const auto* in_y = static_cast<const uint16_t*>(source_plane_y);
1410 const auto* in_uv = static_cast<const uint16_t*>(source_plane_uv);
1411 auto* out_uv = static_cast<uint16_t*>(dest_plane_uv);
1412
1413 const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
1414 const int luma_multiplier =
1415 (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
1416 const int multiplier =
1417 (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
1418 BlendChromaPlane10bpp_NEON(
1419 noise_image[plane], min_value, max_chroma, width, height, start_height,
1420 subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
1421 luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
1422 source_stride_uv, out_uv, dest_stride_uv);
1423 }
1424
// Registers every 10bpp NEON film grain implementation defined in this file
// into the writable Dsp table for kBitdepth10.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);

  // LumaAutoRegressionFunc
  // Indexed by auto_regression_coeff_lag - 1 (lags 1, 2, 3).
  dsp->film_grain.luma_auto_regression[0] =
      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 1>;
  dsp->film_grain.luma_auto_regression[1] =
      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 2>;
  dsp->film_grain.luma_auto_regression[2] =
      ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 3>;

  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling]
  // Chroma autoregression should never be called when lag is 0 and use_luma
  // is false.
  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
  dsp->film_grain.chroma_auto_regression[0][1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
                                                   false>;
  dsp->film_grain.chroma_auto_regression[0][2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
                                                   false>;
  dsp->film_grain.chroma_auto_regression[0][3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
                                                   false>;
  dsp->film_grain.chroma_auto_regression[1][0] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
                                                   true>;
  dsp->film_grain.chroma_auto_regression[1][1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
                                                   true>;
  dsp->film_grain.chroma_auto_regression[1][2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
                                                   true>;
  dsp->film_grain.chroma_auto_regression[1][3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
                                                   true>;

  dsp->film_grain.construct_noise_image_overlap =
      ConstructNoiseImageOverlap10bpp_NEON;

  dsp->film_grain.initialize_scaling_lut =
      InitializeScalingLookupTable_NEON<kBitdepth10>;

  dsp->film_grain.blend_noise_luma =
      BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
  // blend_noise_chroma[0]: chroma_scaling_from_luma == false;
  // blend_noise_chroma[1]: chroma-from-luma (CFL) path.
  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
  dsp->film_grain.blend_noise_chroma[1] =
      BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
}
1475
1476 } // namespace
1477 } // namespace high_bitdepth
1478 #endif // LIBGAV1_MAX_BITDEPTH >= 10
1479
1480 } // namespace film_grain
1481
// Public entry point: installs the NEON film grain implementations for every
// bitdepth this build supports.
void FilmGrainInit_NEON() {
  film_grain::low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  film_grain::high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}
1488
1489 } // namespace dsp
1490 } // namespace libgav1
1491
1492 #else // !LIBGAV1_ENABLE_NEON
1493
1494 namespace libgav1 {
1495 namespace dsp {
1496
// NEON is not enabled in this build; leave the Dsp table unchanged.
void FilmGrainInit_NEON() {}
1498
1499 } // namespace dsp
1500 } // namespace libgav1
1501 #endif // LIBGAV1_ENABLE_NEON
1502