1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19 #include <smmintrin.h>
20
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/film_grain_common.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/utils/array_2d.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/constants.h"
34 #include "src/utils/types.h"
35
36 namespace libgav1 {
37 namespace dsp {
38 namespace film_grain {
39 namespace {
40
41 // Load 8 values from source, widening to int16_t intermediate value size.
42 // The function is overloaded for each type and bitdepth for simplicity.
LoadSource(const int8_t * src)43 inline __m128i LoadSource(const int8_t* src) {
44 return _mm_cvtepi8_epi16(LoadLo8(src));
45 }
46
47 // Load 8 values from source, widening to int16_t intermediate value size.
LoadSource(const uint8_t * src)48 inline __m128i LoadSource(const uint8_t* src) {
49 return _mm_cvtepu8_epi16(LoadLo8(src));
50 }
51
LoadSourceMsan(const uint8_t * src,const int valid_range)52 inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
53 return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
54 }
55
56 // Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
StoreUnsigned(uint8_t * dest,const __m128i data)57 inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
58 StoreLo8(dest, _mm_packus_epi16(data, data));
59 }
60
61 #if LIBGAV1_MAX_BITDEPTH >= 10
62 // Load 8 values from source.
LoadSource(const int16_t * src)63 inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
64
65 // Load 8 values from source.
LoadSource(const uint16_t * src)66 inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
67
68 // Store 8 values to dest.
StoreUnsigned(uint16_t * dest,const __m128i data)69 inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
70 StoreUnaligned16(dest, data);
71 }
72 #endif // LIBGAV1_MAX_BITDEPTH >= 10
73
74 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint8_t * const luma,int subsampling_x)75 inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
76 if (subsampling_x != 0) {
77 const __m128i src = LoadUnaligned16(luma);
78
79 return RightShiftWithRounding_U16(
80 _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
81 _mm_unpackhi_epi8(src, _mm_setzero_si128())),
82 1);
83 }
84 return _mm_cvtepu8_epi16(LoadLo8(luma));
85 }
86
GetAverageLumaMsan(const uint8_t * const luma,int subsampling_x,int valid_range)87 inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
88 int valid_range) {
89 if (subsampling_x != 0) {
90 const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
91
92 return RightShiftWithRounding_U16(
93 _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
94 _mm_unpackhi_epi8(src, _mm_setzero_si128())),
95 1);
96 }
97 return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
98 }
99
100 #if LIBGAV1_MAX_BITDEPTH >= 10
101 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint16_t * const luma,int subsampling_x)102 inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
103 if (subsampling_x != 0) {
104 return RightShiftWithRounding_U16(
105 _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
106 }
107 return LoadUnaligned16(luma);
108 }
109
GetAverageLumaMsan(const uint16_t * const luma,int subsampling_x,int valid_range)110 inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
111 int valid_range) {
112 if (subsampling_x != 0) {
113 return RightShiftWithRounding_U16(
114 _mm_hadd_epi16(
115 LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
116 LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
117 1);
118 }
119 return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
120 }
121 #endif // LIBGAV1_MAX_BITDEPTH >= 10
122
// Clamps every signed 16-bit lane of |value| to the range given by the
// matching lanes of |low| and |high|. The upper bound is applied first and the
// lower bound second, so |low| wins if the bounds are inverted.
inline __m128i Clip3(const __m128i value, const __m128i low,
                     const __m128i high) {
  return _mm_max_epi16(low, _mm_min_epi16(high, value));
}
128
129 template <int bitdepth, typename Pixel>
GetScalingFactors(const int16_t * scaling_lut,const Pixel * source)130 inline __m128i GetScalingFactors(const int16_t* scaling_lut,
131 const Pixel* source) {
132 alignas(16) int16_t start_vals[8];
133 static_assert(bitdepth <= kBitdepth10,
134 "SSE4 Film Grain is not yet implemented for 12bpp.");
135 for (int i = 0; i < 8; ++i) {
136 assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
137 start_vals[i] = scaling_lut[source[i]];
138 }
139 return LoadAligned16(start_vals);
140 }
141
// Multiplies each 16-bit |noise| lane by the matching |scaling| lane and
// shifts the product down with rounding.
//
// The film grain scaling shift is in the range [8,11]. Callers in this file
// pass 15 minus that value in the low 64 bits of |scaling_shift|: |scaling| is
// shifted left by that amount, and _mm_mulhrs_epi16 then yields
// ((noise * shifted_scaling >> 14) + 1) >> 1 per lane, i.e. the product
// rounded and shifted right by the original scaling shift.
// |bitdepth| is not used in the body.
template <int bitdepth>
inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
                          const __m128i scaling_shift) {
  return _mm_mulhrs_epi16(noise, _mm_sll_epi16(scaling, scaling_shift));
}
149
150 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageLuma_SSE4_1(const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_luma,int scaling_shift,int width,int height,int start_height,const int16_t * scaling_lut_y,const void * source_plane_y,ptrdiff_t source_stride_y,void * dest_plane_y,ptrdiff_t dest_stride_y)151 void BlendNoiseWithImageLuma_SSE4_1(
152 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
153 int scaling_shift, int width, int height, int start_height,
154 const int16_t* scaling_lut_y, const void* source_plane_y,
155 ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
156 const auto* noise_image =
157 static_cast<const Array2D<GrainType>*>(noise_image_ptr);
158 const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
159 source_stride_y /= sizeof(Pixel);
160 auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
161 dest_stride_y /= sizeof(Pixel);
162 const __m128i floor = _mm_set1_epi16(min_value);
163 const __m128i ceiling = _mm_set1_epi16(max_luma);
164 const int safe_width = width & ~7;
165 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
166 int y = 0;
167 do {
168 int x = 0;
169 for (; x + 8 <= safe_width; x += 8) {
170 const __m128i orig = LoadSource(&in_y_row[x]);
171 const __m128i scaling =
172 GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
173 __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
174
175 noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
176 const __m128i combined = _mm_add_epi16(orig, noise);
177 StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
178 }
179
180 if (x < width) {
181 Pixel luma_buffer[8];
182 // Prevent arbitrary indices from entering GetScalingFactors.
183 memset(luma_buffer, 0, sizeof(luma_buffer));
184 const int valid_range = width - x;
185 assert(valid_range < 8);
186 memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
187 luma_buffer[valid_range] = in_y_row[width - 1];
188 const __m128i orig = LoadSource(&in_y_row[x]);
189 const __m128i scaling =
190 GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
191 __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
192
193 noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
194 const __m128i combined = _mm_add_epi16(orig, noise);
195 StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
196 }
197 in_y_row += source_stride_y;
198 out_y_row += dest_stride_y;
199 } while (++y < height);
200 out_y_row = static_cast<Pixel*>(dest_plane_y);
201 }
202
203 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaValsWithCfl(const Pixel * LIBGAV1_RESTRICT average_luma_buffer,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT chroma_cursor,const GrainType * LIBGAV1_RESTRICT noise_image_cursor,const __m128i scaling_shift)204 inline __m128i BlendChromaValsWithCfl(
205 const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
206 const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
207 const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
208 const __m128i scaling_shift) {
209 const __m128i scaling =
210 GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
211 const __m128i orig = LoadSource(chroma_cursor);
212 __m128i noise = LoadSource(noise_image_cursor);
213 noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
214 return _mm_add_epi16(orig, noise);
215 }
216
217 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaPlaneWithCfl_SSE4_1(const Array2D<GrainType> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const Pixel * in_chroma_row,ptrdiff_t source_stride_chroma,Pixel * out_chroma_row,ptrdiff_t dest_stride)218 LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
219 const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
220 int width, int height, int start_height, int subsampling_x,
221 int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
222 const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
223 const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
224 Pixel* out_chroma_row, ptrdiff_t dest_stride) {
225 const __m128i floor = _mm_set1_epi16(min_value);
226 const __m128i ceiling = _mm_set1_epi16(max_chroma);
227 alignas(16) Pixel luma_buffer[16];
228
229 const int chroma_height = (height + subsampling_y) >> subsampling_y;
230 const int chroma_width = (width + subsampling_x) >> subsampling_x;
231 // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
232 // need to be guarded from overread, even if |chroma_width| is divisible by 8.
233 const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
234
235 // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
236 // in GetScalingFactors.
237 Pixel average_luma_buffer[8];
238 assert(start_height % 2 == 0);
239 start_height >>= subsampling_y;
240 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
241 int y = 0;
242 do {
243 int x = 0;
244 for (; x + 8 <= safe_chroma_width; x += 8) {
245 const int luma_x = x << subsampling_x;
246 const __m128i average_luma =
247 GetAverageLuma(&in_y_row[luma_x], subsampling_x);
248 StoreUnsigned(average_luma_buffer, average_luma);
249
250 const __m128i blended =
251 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
252 average_luma_buffer, scaling_lut, &in_chroma_row[x],
253 &(noise_image[y + start_height][x]), derived_scaling_shift);
254 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
255 }
256
257 if (x < chroma_width) {
258 // Prevent huge indices from entering GetScalingFactors due to
259 // uninitialized values. This is not a problem in 8bpp because the table
260 // is made larger than 255 values.
261 if (bitdepth > kBitdepth8) {
262 memset(luma_buffer, 0, sizeof(luma_buffer));
263 }
264 const int luma_x = x << subsampling_x;
265 const int valid_range = width - luma_x;
266 assert(valid_range < 16);
267 memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
268 luma_buffer[valid_range] = in_y_row[width - 1];
269 const __m128i average_luma =
270 GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
271 StoreUnsigned(average_luma_buffer, average_luma);
272
273 const __m128i blended =
274 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
275 average_luma_buffer, scaling_lut, &in_chroma_row[x],
276 &(noise_image[y + start_height][x]), derived_scaling_shift);
277 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
278 }
279
280 in_y_row += source_stride_y << subsampling_y;
281 in_chroma_row += source_stride_chroma;
282 out_chroma_row += dest_stride;
283 } while (++y < chroma_height);
284 }
285
286 // This function is for the case params_.chroma_scaling_from_luma == true.
287 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
288 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageChromaWithCfl_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)289 void BlendNoiseWithImageChromaWithCfl_SSE4_1(
290 Plane plane, const FilmGrainParams& params,
291 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
292 int width, int height, int start_height, int subsampling_x,
293 int subsampling_y, const int16_t* scaling_lut,
294 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
295 const void* source_plane_uv, ptrdiff_t source_stride_uv,
296 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
297 const auto* noise_image =
298 static_cast<const Array2D<GrainType>*>(noise_image_ptr);
299 const auto* in_y = static_cast<const Pixel*>(source_plane_y);
300 source_stride_y /= sizeof(Pixel);
301
302 const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
303 source_stride_uv /= sizeof(Pixel);
304 auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
305 dest_stride_uv /= sizeof(Pixel);
306 BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
307 noise_image[plane], min_value, max_chroma, width, height, start_height,
308 subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
309 source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
310 }
311
312 } // namespace
313
314 namespace low_bitdepth {
315 namespace {
316
317 // |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
BlendChromaValsNoCfl8bpp(const int16_t * scaling_lut,const __m128i & orig,const int8_t * LIBGAV1_RESTRICT noise_image_cursor,const __m128i & average_luma,const __m128i & scaling_shift,const __m128i & offset,const __m128i & weights)318 inline __m128i BlendChromaValsNoCfl8bpp(
319 const int16_t* scaling_lut, const __m128i& orig,
320 const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
321 const __m128i& average_luma, const __m128i& scaling_shift,
322 const __m128i& offset, const __m128i& weights) {
323 uint8_t merged_buffer[8];
324 const __m128i combined_lo =
325 _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
326 const __m128i combined_hi =
327 _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
328 const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
329 _mm_srai_epi32((combined_hi), 6));
330
331 const __m128i merged = _mm_add_epi16(merged_base, offset);
332
333 StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
334 const __m128i scaling =
335 GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
336 __m128i noise = LoadSource(noise_image_cursor);
337 noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
338 return _mm_add_epi16(orig, noise);
339 }
340
BlendChromaPlane8bpp_SSE4_1(const Array2D<int8_t> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,int chroma_offset,int chroma_multiplier,int luma_multiplier,const int16_t * scaling_lut,const uint8_t * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const uint8_t * in_chroma_row,ptrdiff_t source_stride_chroma,uint8_t * out_chroma_row,ptrdiff_t dest_stride)341 LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
342 const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
343 int width, int height, int start_height, int subsampling_x,
344 int subsampling_y, int scaling_shift, int chroma_offset,
345 int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
346 const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
347 const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
348 uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
349 const __m128i floor = _mm_set1_epi16(min_value);
350 const __m128i ceiling = _mm_set1_epi16(max_chroma);
351
352 const int chroma_height = (height + subsampling_y) >> subsampling_y;
353 const int chroma_width = (width + subsampling_x) >> subsampling_x;
354 // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
355 // will need to be guarded from overread, even if |chroma_width| is a
356 // multiple of 8.
357 const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
358 alignas(16) uint8_t luma_buffer[16];
359 const __m128i offset = _mm_set1_epi16(chroma_offset);
360 const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
361 (luma_multiplier & 0xFFFF));
362 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
363
364 start_height >>= subsampling_y;
365 int y = 0;
366 do {
367 int x = 0;
368 for (; x + 8 <= safe_chroma_width; x += 8) {
369 const int luma_x = x << subsampling_x;
370 const __m128i average_luma =
371 GetAverageLuma(&in_y_row[luma_x], subsampling_x);
372 const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
373 const __m128i blended = BlendChromaValsNoCfl8bpp(
374 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
375 average_luma, derived_scaling_shift, offset, multipliers);
376 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
377 }
378
379 if (x < chroma_width) {
380 // Begin right edge iteration. Same as the normal iterations, but the
381 // |average_luma| computation requires a duplicated luma value at the
382 // end.
383 const int luma_x = x << subsampling_x;
384 const int valid_range = width - luma_x;
385 assert(valid_range < 16);
386 // There is no need to pre-initialize this buffer, because merged values
387 // used as indices are saturated in the 8bpp case. Uninitialized values
388 // are written outside the frame.
389 memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
390 luma_buffer[valid_range] = in_y_row[width - 1];
391 const int valid_range_chroma = chroma_width - x;
392 uint8_t chroma_buffer[8];
393 memcpy(chroma_buffer, &in_chroma_row[x],
394 valid_range_chroma * sizeof(in_chroma_row[0]));
395
396 const __m128i average_luma =
397 GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
398 const __m128i orig_chroma =
399 LoadSourceMsan(chroma_buffer, valid_range_chroma);
400 const __m128i blended = BlendChromaValsNoCfl8bpp(
401 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
402 average_luma, derived_scaling_shift, offset, multipliers);
403 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
404 // End of right edge iteration.
405 }
406
407 in_y_row += source_stride_y << subsampling_y;
408 in_chroma_row += source_stride_chroma;
409 out_chroma_row += dest_stride;
410 } while (++y < chroma_height);
411 }
412
413 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma8bpp_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)414 void BlendNoiseWithImageChroma8bpp_SSE4_1(
415 Plane plane, const FilmGrainParams& params,
416 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
417 int width, int height, int start_height, int subsampling_x,
418 int subsampling_y, const int16_t* scaling_lut,
419 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
420 const void* source_plane_uv, ptrdiff_t source_stride_uv,
421 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
422 assert(plane == kPlaneU || plane == kPlaneV);
423 const auto* noise_image =
424 static_cast<const Array2D<int8_t>*>(noise_image_ptr);
425 const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
426 const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
427 auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
428
429 const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
430 const int luma_multiplier =
431 (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
432 const int multiplier =
433 (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
434 BlendChromaPlane8bpp_SSE4_1(
435 noise_image[plane], min_value, max_chroma, width, height, start_height,
436 subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
437 luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
438 source_stride_uv, out_uv, dest_stride_uv);
439 }
440
Init8bpp()441 void Init8bpp() {
442 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
443 assert(dsp != nullptr);
444
445 dsp->film_grain.blend_noise_luma =
446 BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
447 dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
448 dsp->film_grain.blend_noise_chroma[1] =
449 BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
450 }
451
452 } // namespace
453 } // namespace low_bitdepth
454
455 #if LIBGAV1_MAX_BITDEPTH >= 10
456 namespace high_bitdepth {
457 namespace {
458
Init10bpp()459 void Init10bpp() {
460 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
461 assert(dsp != nullptr);
462
463 dsp->film_grain.blend_noise_luma =
464 BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
465 dsp->film_grain.blend_noise_chroma[1] =
466 BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
467 }
468
469 } // namespace
470 } // namespace high_bitdepth
471 #endif // LIBGAV1_MAX_BITDEPTH >= 10
472
473 } // namespace film_grain
474
FilmGrainInit_SSE4_1()475 void FilmGrainInit_SSE4_1() {
476 film_grain::low_bitdepth::Init8bpp();
477 #if LIBGAV1_MAX_BITDEPTH >= 10
478 film_grain::high_bitdepth::Init10bpp();
479 #endif // LIBGAV1_MAX_BITDEPTH >= 10
480 }
481
482 } // namespace dsp
483 } // namespace libgav1
484
#else  // !LIBGAV1_TARGETING_SSE4_1
486
487 namespace libgav1 {
488 namespace dsp {
489
// SSE4.1 is not targeted in this build, so there is nothing to register.
void FilmGrainInit_SSE4_1() {
}
491
492 } // namespace dsp
493 } // namespace libgav1
494 #endif // LIBGAV1_TARGETING_SSE4_1
495