// xref: /aosp_15_r20/external/libgav1/src/dsp/x86/film_grain_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 #include <smmintrin.h>
20 
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25 
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/film_grain_common.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/utils/array_2d.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/constants.h"
34 #include "src/utils/types.h"
35 
36 namespace libgav1 {
37 namespace dsp {
38 namespace film_grain {
39 namespace {
40 
41 // Load 8 values from source, widening to int16_t intermediate value size.
42 // The function is overloaded for each type and bitdepth for simplicity.
LoadSource(const int8_t * src)43 inline __m128i LoadSource(const int8_t* src) {
44   return _mm_cvtepi8_epi16(LoadLo8(src));
45 }
46 
47 // Load 8 values from source, widening to int16_t intermediate value size.
LoadSource(const uint8_t * src)48 inline __m128i LoadSource(const uint8_t* src) {
49   return _mm_cvtepu8_epi16(LoadLo8(src));
50 }
51 
LoadSourceMsan(const uint8_t * src,const int valid_range)52 inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
53   return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
54 }
55 
56 // Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
StoreUnsigned(uint8_t * dest,const __m128i data)57 inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
58   StoreLo8(dest, _mm_packus_epi16(data, data));
59 }
60 
61 #if LIBGAV1_MAX_BITDEPTH >= 10
62 // Load 8 values from source.
LoadSource(const int16_t * src)63 inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
64 
65 // Load 8 values from source.
LoadSource(const uint16_t * src)66 inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
67 
68 // Store 8 values to dest.
StoreUnsigned(uint16_t * dest,const __m128i data)69 inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
70   StoreUnaligned16(dest, data);
71 }
72 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
73 
74 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint8_t * const luma,int subsampling_x)75 inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
76   if (subsampling_x != 0) {
77     const __m128i src = LoadUnaligned16(luma);
78 
79     return RightShiftWithRounding_U16(
80         _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
81                        _mm_unpackhi_epi8(src, _mm_setzero_si128())),
82         1);
83   }
84   return _mm_cvtepu8_epi16(LoadLo8(luma));
85 }
86 
GetAverageLumaMsan(const uint8_t * const luma,int subsampling_x,int valid_range)87 inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
88                                   int valid_range) {
89   if (subsampling_x != 0) {
90     const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
91 
92     return RightShiftWithRounding_U16(
93         _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
94                        _mm_unpackhi_epi8(src, _mm_setzero_si128())),
95         1);
96   }
97   return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
98 }
99 
100 #if LIBGAV1_MAX_BITDEPTH >= 10
101 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint16_t * const luma,int subsampling_x)102 inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
103   if (subsampling_x != 0) {
104     return RightShiftWithRounding_U16(
105         _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
106   }
107   return LoadUnaligned16(luma);
108 }
109 
GetAverageLumaMsan(const uint16_t * const luma,int subsampling_x,int valid_range)110 inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
111                                   int valid_range) {
112   if (subsampling_x != 0) {
113     return RightShiftWithRounding_U16(
114         _mm_hadd_epi16(
115             LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
116             LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
117         1);
118   }
119   return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
120 }
121 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
122 
// Clamps every signed 16-bit lane of |value| to the range [|low|, |high|],
// lane by lane. The upper bound is applied first, then the lower bound, so
// |low| wins if the two bounds are inconsistent.
inline __m128i Clip3(const __m128i value, const __m128i low,
                     const __m128i high) {
  return _mm_max_epi16(low, _mm_min_epi16(high, value));
}
128 
129 template <int bitdepth, typename Pixel>
GetScalingFactors(const int16_t * scaling_lut,const Pixel * source)130 inline __m128i GetScalingFactors(const int16_t* scaling_lut,
131                                  const Pixel* source) {
132   alignas(16) int16_t start_vals[8];
133   static_assert(bitdepth <= kBitdepth10,
134                 "SSE4 Film Grain is not yet implemented for 12bpp.");
135   for (int i = 0; i < 8; ++i) {
136     assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
137     start_vals[i] = scaling_lut[source[i]];
138   }
139   return LoadAligned16(start_vals);
140 }
141 
// Multiplies each 16-bit |noise| lane by the matching |scaling| lane and
// shifts the product right with rounding.
// |scaling_shift| carries a left-shift count that is applied to |scaling|
// before _mm_mulhrs_epi16, which itself computes (a * b + (1 << 14)) >> 15.
// Callers in this file pass 15 - s, where s is the film grain scaling shift
// (in range [8,11]), so the net result is (noise * scaling + round) >> s.
template <int bitdepth>
inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
                          const __m128i scaling_shift) {
  return _mm_mulhrs_epi16(noise, _mm_sll_epi16(scaling, scaling_shift));
}
149 
150 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageLuma_SSE4_1(const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_luma,int scaling_shift,int width,int height,int start_height,const int16_t * scaling_lut_y,const void * source_plane_y,ptrdiff_t source_stride_y,void * dest_plane_y,ptrdiff_t dest_stride_y)151 void BlendNoiseWithImageLuma_SSE4_1(
152     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
153     int scaling_shift, int width, int height, int start_height,
154     const int16_t* scaling_lut_y, const void* source_plane_y,
155     ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
156   const auto* noise_image =
157       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
158   const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
159   source_stride_y /= sizeof(Pixel);
160   auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
161   dest_stride_y /= sizeof(Pixel);
162   const __m128i floor = _mm_set1_epi16(min_value);
163   const __m128i ceiling = _mm_set1_epi16(max_luma);
164   const int safe_width = width & ~7;
165   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
166   int y = 0;
167   do {
168     int x = 0;
169     for (; x + 8 <= safe_width; x += 8) {
170       const __m128i orig = LoadSource(&in_y_row[x]);
171       const __m128i scaling =
172           GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
173       __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
174 
175       noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
176       const __m128i combined = _mm_add_epi16(orig, noise);
177       StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
178     }
179 
180     if (x < width) {
181       Pixel luma_buffer[8];
182       // Prevent arbitrary indices from entering GetScalingFactors.
183       memset(luma_buffer, 0, sizeof(luma_buffer));
184       const int valid_range = width - x;
185       assert(valid_range < 8);
186       memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
187       luma_buffer[valid_range] = in_y_row[width - 1];
188       const __m128i orig = LoadSource(&in_y_row[x]);
189       const __m128i scaling =
190           GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
191       __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
192 
193       noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
194       const __m128i combined = _mm_add_epi16(orig, noise);
195       StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
196     }
197     in_y_row += source_stride_y;
198     out_y_row += dest_stride_y;
199   } while (++y < height);
200   out_y_row = static_cast<Pixel*>(dest_plane_y);
201 }
202 
203 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaValsWithCfl(const Pixel * LIBGAV1_RESTRICT average_luma_buffer,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT chroma_cursor,const GrainType * LIBGAV1_RESTRICT noise_image_cursor,const __m128i scaling_shift)204 inline __m128i BlendChromaValsWithCfl(
205     const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
206     const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
207     const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
208     const __m128i scaling_shift) {
209   const __m128i scaling =
210       GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
211   const __m128i orig = LoadSource(chroma_cursor);
212   __m128i noise = LoadSource(noise_image_cursor);
213   noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
214   return _mm_add_epi16(orig, noise);
215 }
216 
217 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaPlaneWithCfl_SSE4_1(const Array2D<GrainType> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const Pixel * in_chroma_row,ptrdiff_t source_stride_chroma,Pixel * out_chroma_row,ptrdiff_t dest_stride)218 LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
219     const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
220     int width, int height, int start_height, int subsampling_x,
221     int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
222     const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
223     const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
224     Pixel* out_chroma_row, ptrdiff_t dest_stride) {
225   const __m128i floor = _mm_set1_epi16(min_value);
226   const __m128i ceiling = _mm_set1_epi16(max_chroma);
227   alignas(16) Pixel luma_buffer[16];
228 
229   const int chroma_height = (height + subsampling_y) >> subsampling_y;
230   const int chroma_width = (width + subsampling_x) >> subsampling_x;
231   // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
232   // need to be guarded from overread, even if |chroma_width| is divisible by 8.
233   const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
234 
235   // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
236   // in GetScalingFactors.
237   Pixel average_luma_buffer[8];
238   assert(start_height % 2 == 0);
239   start_height >>= subsampling_y;
240   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
241   int y = 0;
242   do {
243     int x = 0;
244     for (; x + 8 <= safe_chroma_width; x += 8) {
245       const int luma_x = x << subsampling_x;
246       const __m128i average_luma =
247           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
248       StoreUnsigned(average_luma_buffer, average_luma);
249 
250       const __m128i blended =
251           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
252               average_luma_buffer, scaling_lut, &in_chroma_row[x],
253               &(noise_image[y + start_height][x]), derived_scaling_shift);
254       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
255     }
256 
257     if (x < chroma_width) {
258       // Prevent huge indices from entering GetScalingFactors due to
259       // uninitialized values. This is not a problem in 8bpp because the table
260       // is made larger than 255 values.
261       if (bitdepth > kBitdepth8) {
262         memset(luma_buffer, 0, sizeof(luma_buffer));
263       }
264       const int luma_x = x << subsampling_x;
265       const int valid_range = width - luma_x;
266       assert(valid_range < 16);
267       memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
268       luma_buffer[valid_range] = in_y_row[width - 1];
269       const __m128i average_luma =
270           GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
271       StoreUnsigned(average_luma_buffer, average_luma);
272 
273       const __m128i blended =
274           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
275               average_luma_buffer, scaling_lut, &in_chroma_row[x],
276               &(noise_image[y + start_height][x]), derived_scaling_shift);
277       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
278     }
279 
280     in_y_row += source_stride_y << subsampling_y;
281     in_chroma_row += source_stride_chroma;
282     out_chroma_row += dest_stride;
283   } while (++y < chroma_height);
284 }
285 
286 // This function is for the case params_.chroma_scaling_from_luma == true.
287 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
288 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageChromaWithCfl_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)289 void BlendNoiseWithImageChromaWithCfl_SSE4_1(
290     Plane plane, const FilmGrainParams& params,
291     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
292     int width, int height, int start_height, int subsampling_x,
293     int subsampling_y, const int16_t* scaling_lut,
294     const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
295     const void* source_plane_uv, ptrdiff_t source_stride_uv,
296     void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
297   const auto* noise_image =
298       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
299   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
300   source_stride_y /= sizeof(Pixel);
301 
302   const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
303   source_stride_uv /= sizeof(Pixel);
304   auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
305   dest_stride_uv /= sizeof(Pixel);
306   BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
307       noise_image[plane], min_value, max_chroma, width, height, start_height,
308       subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
309       source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
310 }
311 
312 }  // namespace
313 
314 namespace low_bitdepth {
315 namespace {
316 
317 // |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
BlendChromaValsNoCfl8bpp(const int16_t * scaling_lut,const __m128i & orig,const int8_t * LIBGAV1_RESTRICT noise_image_cursor,const __m128i & average_luma,const __m128i & scaling_shift,const __m128i & offset,const __m128i & weights)318 inline __m128i BlendChromaValsNoCfl8bpp(
319     const int16_t* scaling_lut, const __m128i& orig,
320     const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
321     const __m128i& average_luma, const __m128i& scaling_shift,
322     const __m128i& offset, const __m128i& weights) {
323   uint8_t merged_buffer[8];
324   const __m128i combined_lo =
325       _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
326   const __m128i combined_hi =
327       _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
328   const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
329                                               _mm_srai_epi32((combined_hi), 6));
330 
331   const __m128i merged = _mm_add_epi16(merged_base, offset);
332 
333   StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
334   const __m128i scaling =
335       GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
336   __m128i noise = LoadSource(noise_image_cursor);
337   noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
338   return _mm_add_epi16(orig, noise);
339 }
340 
BlendChromaPlane8bpp_SSE4_1(const Array2D<int8_t> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,int chroma_offset,int chroma_multiplier,int luma_multiplier,const int16_t * scaling_lut,const uint8_t * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const uint8_t * in_chroma_row,ptrdiff_t source_stride_chroma,uint8_t * out_chroma_row,ptrdiff_t dest_stride)341 LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
342     const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
343     int width, int height, int start_height, int subsampling_x,
344     int subsampling_y, int scaling_shift, int chroma_offset,
345     int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
346     const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
347     const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
348     uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
349   const __m128i floor = _mm_set1_epi16(min_value);
350   const __m128i ceiling = _mm_set1_epi16(max_chroma);
351 
352   const int chroma_height = (height + subsampling_y) >> subsampling_y;
353   const int chroma_width = (width + subsampling_x) >> subsampling_x;
354   // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
355   // will need to be guarded from overread, even if |chroma_width| is a
356   // multiple of 8.
357   const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
358   alignas(16) uint8_t luma_buffer[16];
359   const __m128i offset = _mm_set1_epi16(chroma_offset);
360   const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
361                                              (luma_multiplier & 0xFFFF));
362   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
363 
364   start_height >>= subsampling_y;
365   int y = 0;
366   do {
367     int x = 0;
368     for (; x + 8 <= safe_chroma_width; x += 8) {
369       const int luma_x = x << subsampling_x;
370       const __m128i average_luma =
371           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
372       const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
373       const __m128i blended = BlendChromaValsNoCfl8bpp(
374           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
375           average_luma, derived_scaling_shift, offset, multipliers);
376       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
377     }
378 
379     if (x < chroma_width) {
380       // Begin right edge iteration. Same as the normal iterations, but the
381       // |average_luma| computation requires a duplicated luma value at the
382       // end.
383       const int luma_x = x << subsampling_x;
384       const int valid_range = width - luma_x;
385       assert(valid_range < 16);
386       // There is no need to pre-initialize this buffer, because merged values
387       // used as indices are saturated in the 8bpp case. Uninitialized values
388       // are written outside the frame.
389       memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
390       luma_buffer[valid_range] = in_y_row[width - 1];
391       const int valid_range_chroma = chroma_width - x;
392       uint8_t chroma_buffer[8];
393       memcpy(chroma_buffer, &in_chroma_row[x],
394              valid_range_chroma * sizeof(in_chroma_row[0]));
395 
396       const __m128i average_luma =
397           GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
398       const __m128i orig_chroma =
399           LoadSourceMsan(chroma_buffer, valid_range_chroma);
400       const __m128i blended = BlendChromaValsNoCfl8bpp(
401           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
402           average_luma, derived_scaling_shift, offset, multipliers);
403       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
404       // End of right edge iteration.
405     }
406 
407     in_y_row += source_stride_y << subsampling_y;
408     in_chroma_row += source_stride_chroma;
409     out_chroma_row += dest_stride;
410   } while (++y < chroma_height);
411 }
412 
413 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma8bpp_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)414 void BlendNoiseWithImageChroma8bpp_SSE4_1(
415     Plane plane, const FilmGrainParams& params,
416     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
417     int width, int height, int start_height, int subsampling_x,
418     int subsampling_y, const int16_t* scaling_lut,
419     const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
420     const void* source_plane_uv, ptrdiff_t source_stride_uv,
421     void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
422   assert(plane == kPlaneU || plane == kPlaneV);
423   const auto* noise_image =
424       static_cast<const Array2D<int8_t>*>(noise_image_ptr);
425   const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
426   const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
427   auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
428 
429   const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
430   const int luma_multiplier =
431       (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
432   const int multiplier =
433       (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
434   BlendChromaPlane8bpp_SSE4_1(
435       noise_image[plane], min_value, max_chroma, width, height, start_height,
436       subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
437       luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
438       source_stride_uv, out_uv, dest_stride_uv);
439 }
440 
Init8bpp()441 void Init8bpp() {
442   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
443   assert(dsp != nullptr);
444 
445   dsp->film_grain.blend_noise_luma =
446       BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
447   dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
448   dsp->film_grain.blend_noise_chroma[1] =
449       BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
450 }
451 
452 }  // namespace
453 }  // namespace low_bitdepth
454 
455 #if LIBGAV1_MAX_BITDEPTH >= 10
456 namespace high_bitdepth {
457 namespace {
458 
Init10bpp()459 void Init10bpp() {
460   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
461   assert(dsp != nullptr);
462 
463   dsp->film_grain.blend_noise_luma =
464       BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
465   dsp->film_grain.blend_noise_chroma[1] =
466       BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
467 }
468 
469 }  // namespace
470 }  // namespace high_bitdepth
471 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
472 
473 }  // namespace film_grain
474 
FilmGrainInit_SSE4_1()475 void FilmGrainInit_SSE4_1() {
476   film_grain::low_bitdepth::Init8bpp();
477 #if LIBGAV1_MAX_BITDEPTH >= 10
478   film_grain::high_bitdepth::Init10bpp();
479 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
480 }
481 
482 }  // namespace dsp
483 }  // namespace libgav1
484 
#else   // !LIBGAV1_TARGETING_SSE4_1
486 
487 namespace libgav1 {
488 namespace dsp {
489 
// Stub used when the build does not target SSE4.1: nothing is registered.
void FilmGrainInit_SSE4_1() {
}
491 
492 }  // namespace dsp
493 }  // namespace libgav1
494 #endif  // LIBGAV1_TARGETING_SSE4_1
495