1 /*
2 * Copyright © 2018, Niklas Haas
3 * Copyright © 2018, VideoLAN and dav1d authors
4 * Copyright © 2018, Two Orioles, LLC
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "common/attributes.h"
30 #include "common/intops.h"
31
32 #include "src/filmgrain.h"
33 #include "src/tables.h"
34
/* Dimensions of the chroma grain template that generate_grain_uv_c() fills
 * when the corresponding chroma axis is subsampled; the full GRAIN_WIDTH /
 * GRAIN_HEIGHT are used otherwise. */
#define SUB_GRAIN_WIDTH  44 /* used when subx is set */
#define SUB_GRAIN_HEIGHT 38 /* used when suby is set */
37
/*
 * Advances the linear-feedback shift register stored in *state by one step
 * and returns `bits` bits taken from just below bit 16 of the new value.
 *
 * The feedback bit is the XOR of register bits 0, 1, 3 and 12. It is inserted
 * at bit 15 while the remaining bits move one position to the right.
 * NOTE(review): this behaves as a 16-bit register only if the caller's seed
 * fits in 16 bits - confirm against callers.
 */
static inline int get_random_number(const int bits, unsigned *const state) {
    const int cur = *state;
    const unsigned feedback = (cur ^ (cur >> 1) ^ (cur >> 3) ^ (cur >> 12)) & 1;
    const unsigned next = (cur >> 1) | (feedback << 15);

    *state = next;
    return (next >> (16 - bits)) & ((1 << bits) - 1);
}
45
/*
 * Divides x by 2^shift with rounding: half of the divisor is added before the
 * right shift. A shift of 0 returns x unchanged.
 */
static inline int round2(const int x, const uint64_t shift) {
    const int half = (1 << shift) >> 1;

    return (x + half) >> shift;
}
49
/*
 * Fills the luma grain template `buf` (GRAIN_HEIGHT rows of GRAIN_WIDTH
 * entries) from the parameters in `data`.
 *
 * Pass 1 writes one scaled sample of dav1d_gaussian_sequence per entry, picked
 * by an 11-bit pseudo-random index seeded from data->seed.
 * Pass 2 filters the template in place: for every entry outside a 3-entry
 * border (top, left, right), the rounded sum of data->ar_coeffs_y weighted
 * neighbours (rows above, plus entries to the left on the same row) is added
 * and the result is clipped to [grain_min, grain_max].
 */
static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
                               const Dav1dFilmGrainData *const data
                               HIGHBD_DECL_SUFFIX)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
    unsigned seed = data->seed;

    // Pass 1: pseudo-random gaussian samples
    for (int row = 0; row < GRAIN_HEIGHT; row++) {
        for (int col = 0; col < GRAIN_WIDTH; col++) {
            const int idx = get_random_number(11, &seed);
            buf[row][col] = round2(dav1d_gaussian_sequence[idx], shift);
        }
    }

    const int ar_pad = 3;
    const int ar_lag = data->ar_coeff_lag;

    // Pass 2: auto-regressive filter, applied in raster order
    for (int row = ar_pad; row < GRAIN_HEIGHT; row++) {
        for (int col = ar_pad; col < GRAIN_WIDTH - ar_pad; col++) {
            const int8_t *coeff = data->ar_coeffs_y;
            int sum = 0;

            for (int dy = -ar_lag; dy <= 0; dy++) {
                // On the current row only entries left of (col, row) count;
                // rows above contribute the full [-ar_lag, ar_lag] window.
                const int dx_last = dy ? ar_lag : -1;
                for (int dx = -ar_lag; dx <= dx_last; dx++)
                    sum += *coeff++ * buf[row + dy][col + dx];
            }

            const int grain = buf[row][col] + round2(sum, data->ar_coeff_shift);
            buf[row][col] = iclip(grain, grain_min, grain_max);
        }
    }
}
87
88 static NOINLINE void
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],const entry buf_y[][GRAIN_WIDTH],const Dav1dFilmGrainData * const data,const intptr_t uv,const int subx,const int suby HIGHBD_DECL_SUFFIX)89 generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
90 const entry buf_y[][GRAIN_WIDTH],
91 const Dav1dFilmGrainData *const data, const intptr_t uv,
92 const int subx, const int suby HIGHBD_DECL_SUFFIX)
93 {
94 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
95 unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
96 const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
97 const int grain_ctr = 128 << bitdepth_min_8;
98 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
99
100 const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
101 const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
102
103 for (int y = 0; y < chromaH; y++) {
104 for (int x = 0; x < chromaW; x++) {
105 const int value = get_random_number(11, &seed);
106 buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
107 }
108 }
109
110 const int ar_pad = 3;
111 const int ar_lag = data->ar_coeff_lag;
112
113 for (int y = ar_pad; y < chromaH; y++) {
114 for (int x = ar_pad; x < chromaW - ar_pad; x++) {
115 const int8_t *coeff = data->ar_coeffs_uv[uv];
116 int sum = 0;
117 for (int dy = -ar_lag; dy <= 0; dy++) {
118 for (int dx = -ar_lag; dx <= ar_lag; dx++) {
119 // For the final (current) pixel, we need to add in the
120 // contribution from the luma grain texture
121 if (!dx && !dy) {
122 if (!data->num_y_points)
123 break;
124 int luma = 0;
125 const int lumaX = ((x - ar_pad) << subx) + ar_pad;
126 const int lumaY = ((y - ar_pad) << suby) + ar_pad;
127 for (int i = 0; i <= suby; i++) {
128 for (int j = 0; j <= subx; j++) {
129 luma += buf_y[lumaY + i][lumaX + j];
130 }
131 }
132 luma = round2(luma, subx + suby);
133 sum += luma * (*coeff);
134 break;
135 }
136
137 sum += *(coeff++) * buf[y + dy][x + dx];
138 }
139 }
140
141 const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
142 buf[y][x] = iclip(grain, grain_min, grain_max);
143 }
144 }
145 }
146
147 #define gnuv_ss_fn(nm, ss_x, ss_y) \
148 static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
149 generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
150 }
151
152 gnuv_ss_fn(420, 1, 1);
153 gnuv_ss_fn(422, 1, 0);
154 gnuv_ss_fn(444, 0, 0);
155
156 // samples from the correct block of a grain LUT, while taking into account the
157 // offsets provided by the offsets cache
sample_lut(const entry grain_lut[][GRAIN_WIDTH],const int offsets[2][2],const int subx,const int suby,const int bx,const int by,const int x,const int y)158 static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
159 const int offsets[2][2], const int subx, const int suby,
160 const int bx, const int by, const int x, const int y)
161 {
162 const int randval = offsets[bx][by];
163 const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
164 const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
165 return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
166 [offx + x + (FG_BLOCK_SIZE >> subx) * bx];
167 }
168
/*
 * Applies luma film grain to one row of blocks: `bh` pixel rows, `pw` pixels
 * wide, handled FG_BLOCK_SIZE columns at a time.
 *
 * For each pixel, the grain sampled from `grain_lut` is multiplied by
 * scaling[source pixel], rounded down by data->scaling_shift, added to the
 * source pixel and clipped to the output range before being stored in dst_row.
 *
 * With data->overlap_flag set, the first (up to) two columns of every block
 * after the first are blended with the grain of the block to the left, and the
 * first (up to) two rows are blended with the grain of the block row above
 * when row_num is non-zero; the corner uses both blends.
 */
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
                          const ptrdiff_t stride,
                          const Dav1dFilmGrainData *const data, const size_t pw,
                          const uint8_t scaling[SCALING_SIZE],
                          const entry grain_lut[][GRAIN_WIDTH],
                          const int bh, const int row_num HIGHBD_DECL_SUFFIX)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    const int rows = 1 + (data->overlap_flag && row_num > 0);

    // Output clipping range
    const int min_value =
        data->clip_to_restricted_range ? (16 << bitdepth_min_8) : 0;
    const int max_value =
        data->clip_to_restricted_range ? (235 << bitdepth_min_8) : BITDEPTH_MAX;

    // seed[0] contains the current row, seed[1] contains the previous
    unsigned seed[2];
    for (int i = 0; i < rows; i++) {
        const int r = row_num - i;
        const unsigned hi = ((r * 37 + 178) & 0xFF) << 8;
        const unsigned lo = (r * 173 + 105) & 0xFF;
        seed[i] = data->seed ^ hi ^ lo;
    }

    assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);

    // Blend weights for overlapped positions 0 and 1:
    // { weight of the neighbouring block's grain, weight of this block's grain }
    static const int w[2][2] = { { 27, 17 }, { 17, 27 } };

    int offsets[2 /* col offset */][2 /* row offset */];

    // process this row in FG_BLOCK_SIZE^2 blocks
    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
        const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx);

        for (int i = 0; i < rows; i++) {
            // keep the previous block's offset around for the overlap blend
            if (data->overlap_flag && bx)
                offsets[1][i] = offsets[0][i];
            // draw the offset of the current block
            offsets[0][i] = get_random_number(8, &seed[i]);
        }

        // x/y block offsets to compensate for overlapped regions
        const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
        const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0;

#define add_noise_y(x, y, grain) \
        do { \
            const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
            pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
            const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
            *dst = iclip(*src + noise, min_value, max_value); \
        } while (0)

        for (int y = ystart; y < bh; y++) {
            // Non-overlapped image region (straightforward)
            for (int x = xstart; x < bw; x++) {
                const int cur = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
                add_noise_y(x, y, cur);
            }

            // Special case for overlapped column
            for (int x = 0; x < xstart; x++) {
                const int cur = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
                const int left = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
                const int mixed = iclip(round2(left * w[x][0] + cur * w[x][1], 5),
                                        grain_min, grain_max);
                add_noise_y(x, y, mixed);
            }
        }

        for (int y = 0; y < ystart; y++) {
            // Special case for overlapped row (sans corner)
            for (int x = xstart; x < bw; x++) {
                const int cur = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
                const int above = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
                const int mixed = iclip(round2(above * w[y][0] + cur * w[y][1], 5),
                                        grain_min, grain_max);
                add_noise_y(x, y, mixed);
            }

            // Special case for doubly-overlapped corner
            for (int x = 0; x < xstart; x++) {
                // Blend the top pixel with the top left block
                const int above = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
                const int above_left = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
                const int top = iclip(round2(above_left * w[x][0] + above * w[x][1], 5),
                                      grain_min, grain_max);

                // Blend the current pixel with the left block
                const int cur = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
                const int left = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
                const int bottom = iclip(round2(left * w[x][0] + cur * w[x][1], 5),
                                         grain_min, grain_max);

                // Mix the two rows together and apply grain
                const int mixed = iclip(round2(top * w[y][0] + bottom * w[y][1], 5),
                                        grain_min, grain_max);
                add_noise_y(x, y, mixed);
            }
        }
    }
}
277
278 static NOINLINE void
fguv_32x32xn_c(pixel * const dst_row,const pixel * const src_row,const ptrdiff_t stride,const Dav1dFilmGrainData * const data,const size_t pw,const uint8_t scaling[SCALING_SIZE],const entry grain_lut[][GRAIN_WIDTH],const int bh,const int row_num,const pixel * const luma_row,const ptrdiff_t luma_stride,const int uv,const int is_id,const int sx,const int sy HIGHBD_DECL_SUFFIX)279 fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
280 const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
281 const size_t pw, const uint8_t scaling[SCALING_SIZE],
282 const entry grain_lut[][GRAIN_WIDTH], const int bh,
283 const int row_num, const pixel *const luma_row,
284 const ptrdiff_t luma_stride, const int uv, const int is_id,
285 const int sx, const int sy HIGHBD_DECL_SUFFIX)
286 {
287 const int rows = 1 + (data->overlap_flag && row_num > 0);
288 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
289 const int grain_ctr = 128 << bitdepth_min_8;
290 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
291
292 int min_value, max_value;
293 if (data->clip_to_restricted_range) {
294 min_value = 16 << bitdepth_min_8;
295 max_value = (is_id ? 235 : 240) << bitdepth_min_8;
296 } else {
297 min_value = 0;
298 max_value = BITDEPTH_MAX;
299 }
300
301 // seed[0] contains the current row, seed[1] contains the previous
302 unsigned seed[2];
303 for (int i = 0; i < rows; i++) {
304 seed[i] = data->seed;
305 seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
306 seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
307 }
308
309 assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
310
311 int offsets[2 /* col offset */][2 /* row offset */];
312
313 // process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
314 for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
315 const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
316 if (data->overlap_flag && bx) {
317 // shift previous offsets left
318 for (int i = 0; i < rows; i++)
319 offsets[1][i] = offsets[0][i];
320 }
321
322 // update current offsets
323 for (int i = 0; i < rows; i++)
324 offsets[0][i] = get_random_number(8, &seed[i]);
325
326 // x/y block offsets to compensate for overlapped regions
327 const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
328 const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0;
329
330 static const int w[2 /* sub */][2 /* off */][2] = {
331 { { 27, 17 }, { 17, 27 } },
332 { { 23, 22 } },
333 };
334
335 #define add_noise_uv(x, y, grain) \
336 const int lx = (bx + x) << sx; \
337 const int ly = y << sy; \
338 const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
339 pixel avg = luma[0]; \
340 if (sx) \
341 avg = (avg + luma[1] + 1) >> 1; \
342 const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
343 pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
344 int val = avg; \
345 if (!data->chroma_scaling_from_luma) { \
346 const int combined = avg * data->uv_luma_mult[uv] + \
347 *src * data->uv_mult[uv]; \
348 val = iclip_pixel( (combined >> 6) + \
349 (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
350 } \
351 const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
352 *dst = iclip(*src + noise, min_value, max_value);
353
354 for (int y = ystart; y < bh; y++) {
355 // Non-overlapped image region (straightforward)
356 for (int x = xstart; x < bw; x++) {
357 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
358 add_noise_uv(x, y, grain);
359 }
360
361 // Special case for overlapped column
362 for (int x = 0; x < xstart; x++) {
363 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
364 int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
365 grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
366 grain = iclip(grain, grain_min, grain_max);
367 add_noise_uv(x, y, grain);
368 }
369 }
370
371 for (int y = 0; y < ystart; y++) {
372 // Special case for overlapped row (sans corner)
373 for (int x = xstart; x < bw; x++) {
374 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
375 int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
376 grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
377 grain = iclip(grain, grain_min, grain_max);
378 add_noise_uv(x, y, grain);
379 }
380
381 // Special case for doubly-overlapped corner
382 for (int x = 0; x < xstart; x++) {
383 // Blend the top pixel with the top left block
384 int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
385 int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
386 top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
387 top = iclip(top, grain_min, grain_max);
388
389 // Blend the current pixel with the left block
390 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
391 old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
392 grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
393 grain = iclip(grain, grain_min, grain_max);
394
395 // Mix the row rows together and apply to image
396 grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
397 grain = iclip(grain, grain_min, grain_max);
398 add_noise_uv(x, y, grain);
399 }
400 }
401 }
402 }
403
404 #define fguv_ss_fn(nm, ss_x, ss_y) \
405 static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
406 fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
407 row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
408 HIGHBD_TAIL_SUFFIX); \
409 }
410
411 fguv_ss_fn(420, 1, 1);
412 fguv_ss_fn(422, 1, 0);
413 fguv_ss_fn(444, 0, 0);
414
415 #if HAVE_ASM
416 #if ARCH_AARCH64 || ARCH_ARM
417 #include "src/arm/filmgrain.h"
418 #elif ARCH_X86
419 #include "src/x86/filmgrain.h"
420 #endif
421 #endif
422
bitfn(dav1d_film_grain_dsp_init)423 COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
424 c->generate_grain_y = generate_grain_y_c;
425 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
426 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
427 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
428
429 c->fgy_32x32xn = fgy_32x32xn_c;
430 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
431 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
432 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
433
434 #if HAVE_ASM
435 #if ARCH_AARCH64 || ARCH_ARM
436 film_grain_dsp_init_arm(c);
437 #elif ARCH_X86
438 film_grain_dsp_init_x86(c);
439 #endif
440 #endif
441 }
442