/*
 * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <string.h>
#include <stdio.h>

#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"

#include "src/cdef_apply.h"
#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
#include "src/recon.h"
#include "src/scan.h"
#include "src/tables.h"
#include "src/wedge.h"
48
/* Decodes an Exp-Golomb-coded value from the arithmetic decoder.
 * First counts leading "zero" equiprobable bools (capped at 32), then reads
 * that many payload bits. Returns the decoded value minus one. */
static inline unsigned read_golomb(MsacContext *const msac) {
    /* NOTE: the msac call must come first in the condition — a bit is
     * consumed on every iteration, including the terminating one. */
    int n_zeros = 0;
    while (!dav1d_msac_decode_bool_equi(msac) && n_zeros < 32)
        n_zeros++;

    unsigned v = 1;
    while (n_zeros-- > 0)
        v = (v << 1) | dav1d_msac_decode_bool_equi(msac);

    return v - 1;
}
58
/* Computes the entropy context for the per-transform-block "skip" flag from
 * the above (a) and left (l) coefficient-context bytes.
 *
 * Chroma returns 7..12 (base 7, +3 if the block spans multiple transform
 * units, +1 per non-neutral neighbor direction); luma returns 0 when the
 * transform covers the whole block, otherwise a value from dav1d_skip_ctx
 * indexed by the merged above/left context magnitudes. A context byte of
 * 0x40 is the neutral "all-skip" value (see decode_coefs, which stores it). */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        /* b_dim[2]/[3] are log2 width/height; adjust for subsampling and
         * compare against the transform's log2 dimensions. */
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        unsigned ca, cl;

        /* Compare all context bytes covered by the transform edge against the
         * neutral 0x40 pattern in a single widened load. */
#define MERGE_CTX(dir, type, no_val) \
        c##dir = *(const type *) dir != no_val; \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        /* Luma transform covers the whole block: fixed context 0. */
        return 0;
    } else {
        unsigned la, ll;

        /* Fold all context bytes along the transform edge into a single
         * byte-wide magnitude (OR-reduction via widened loads + shifts). */
#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        /* Low 6 bits of each merged byte hold the coefficient magnitude. */
        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}
139
/* Derives the entropy context (0, 1 or 2) for coding the DC coefficient sign.
 *
 * Each above (a) / left (l) context byte encodes previously decoded DC-sign
 * information in its top two bits (see decode_coefs: 0x80 = positive,
 * 0x00 = negative, 0x40 = zero DC). Each case sums those 2-bit fields over
 * the transform's width + height in 4px units via SWAR (masked loads,
 * shift, multiply-accumulate into the top byte), then subtracts the neutral
 * total (one per unit) so that s < 0 means mostly negative neighbors,
 * s > 0 mostly positive. Returns (s != 0) + (s > 0). */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
                                       const uint8_t *const a,
                                       const uint8_t *const l)
{
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    int s;

#if ARCH_X86_64 && defined(__GNUC__)
    /* Coerce compilers into producing better code. For some reason
     * every x86-64 compiler is awful at handling 64-bit constants. */
    __asm__("" : "+r"(mask), "+r"(mul));
#endif

    /* Per-size cases: narrower transforms use 32-bit arithmetic (with the
     * >>6 folded into a 0x04040404 multiplier where possible); 64-unit
     * edges read two 8-byte halves. The "- w - h" terms subtract the
     * neutral value (1 per covered 4px unit along each edge). */
    switch(tx) {
    default: assert(0); /* fall-through */
    case TX_4X4: {
        int t = *(const uint8_t *) a >> 6;
        t += *(const uint8_t *) l >> 6;
        s = t - 1 - 1;
        break;
    }
    case TX_8X8: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 2;
        break;
    }
    case TX_16X16: {
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
        t += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
        t *= (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 4;
        break;
    }
    case TX_32X32: {
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
        t += (*(const uint64_t *) l & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 8;
        break;
    }
    case TX_64X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &a[8] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 16;
        break;
    }
    case RTX_4X8: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 1 - 2;
        break;
    }
    case RTX_8X4: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint8_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 1;
        break;
    }
    case RTX_8X16: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 2 - 4;
        break;
    }
    case RTX_16X8: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 2;
        break;
    }
    case RTX_16X32: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 4 - 8;
        break;
    }
    case RTX_32X16: {
        uint64_t t = *(const uint64_t *) a & mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 4;
        break;
    }
    case RTX_32X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 16;
        break;
    }
    case RTX_64X32: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &a[8] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 8;
        break;
    }
    case RTX_4X16: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 1 - 4;
        break;
    }
    case RTX_16X4: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint8_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 1;
        break;
    }
    case RTX_8X32: {
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 2 - 8;
        break;
    }
    case RTX_32X8: {
        uint64_t t = *(const uint64_t *) a & mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 2;
        break;
    }
    case RTX_16X64: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) &l[0] & mask;
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 4 - 16;
        break;
    }
    case RTX_64X16: {
        uint64_t t = *(const uint64_t *) &a[0] & mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 16 - 4;
        break;
    }
    }

    /* Map the signed balance to a 3-way context: 0 = negative-leaning,
     * 1 = neutral, 2 = positive-leaning. */
    return (s != 0) + (s > 0);
}
297
get_lo_ctx(const uint8_t * const levels,const enum TxClass tx_class,unsigned * const hi_mag,const uint8_t (* const ctx_offsets)[5],const unsigned x,const unsigned y,const ptrdiff_t stride)298 static inline unsigned get_lo_ctx(const uint8_t *const levels,
299 const enum TxClass tx_class,
300 unsigned *const hi_mag,
301 const uint8_t (*const ctx_offsets)[5],
302 const unsigned x, const unsigned y,
303 const ptrdiff_t stride)
304 {
305 unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
306 unsigned offset;
307 if (tx_class == TX_CLASS_2D) {
308 mag += levels[1 * stride + 1];
309 *hi_mag = mag;
310 mag += levels[0 * stride + 2] + levels[2 * stride + 0];
311 offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
312 } else {
313 mag += levels[0 * stride + 2];
314 *hi_mag = mag;
315 mag += levels[0 * stride + 3] + levels[0 * stride + 4];
316 offset = 26 + (y > 1 ? 10 : y * 5);
317 }
318 return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
319 }
320
/* Decodes and dequantizes the coefficients of one transform block into cf[].
 *
 * Outputs:
 *  - return value: the end-of-block position (eob), or -1 for an all-skip
 *    (all-zero) block;
 *  - *txtp: the transform type used (explicitly coded for luma, derived for
 *    chroma, forced for lossless / large transforms);
 *  - *res_ctx: the coefficient-context byte propagated to neighbors
 *    (bits 0-5: clamped cumulative level, bits 6-7: DC sign state).
 *
 * Fix vs. previous revision: DECODE_COEFS_CLASS tested the bare enum
 * constant `TX_CLASS_2D` (value 0, i.e. always false) instead of the macro
 * parameter, so the intended `levels + rc` fast path was dead code, and the
 * in-loop variant even referenced the stale `rc` instead of `rc_i`. For the
 * 2D class rc == x * stride + y, so the fallback produced the same address
 * and behavior is unchanged — but the condition and index are now correct. */
static int decode_coefs(Dav1dTaskContext *const t,
                        uint8_t *const a, uint8_t *const l,
                        const enum RectTxfmSize tx, const enum BlockSize bs,
                        const Av1Block *const b, const int intra,
                        const int plane, coef *cf,
                        enum TxfmType *const txtp, uint8_t *res_ctx)
{
    Dav1dTileState *const ts = t->ts;
    const int chroma = !!plane;
    const Dav1dFrameContext *const f = t->f;
    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
    const int dbg = DEBUG_BLOCK_INFO && plane && 0;

    if (dbg)
        printf("Start: r=%d\n", ts->msac.rng);

    // does this block have any non-zero coefficients
    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
    if (dbg)
        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
               t_dim->ctx, sctx, all_skip, ts->msac.rng);
    if (all_skip) {
        *res_ctx = 0x40;
        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
        return -1;
    }

    // transform type (chroma: derived, luma: explicitly coded)
    if (lossless) {
        assert(t_dim->max == TX_4X4);
        *txtp = WHT_WHT;
    } else if (t_dim->max + intra >= TX_64X64) {
        *txtp = DCT_DCT;
    } else if (chroma) {
        // inferred from either the luma txtp (inter) or a LUT (intra)
        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
                        get_uv_inter_txtp(t_dim, *txtp);
    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
        // In libaom, lossless is checked by a literal qidx == 0, but not all
        // such blocks are actually lossless. The remainder gets an implicit
        // transform type (for luma)
        *txtp = DCT_DCT;
    } else {
        unsigned idx;
        if (intra) {
            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
                *txtp = dav1d_tx_types_per_set[idx + 0];
            } else {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
                *txtp = dav1d_tx_types_per_set[idx + 5];
            }
            if (dbg)
                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
        } else {
            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
                          ts->cdf.m.txtp_inter3[t_dim->min]);
                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
            } else if (t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter2, 11);
                *txtp = dav1d_tx_types_per_set[idx + 12];
            } else {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
                *txtp = dav1d_tx_types_per_set[idx + 24];
            }
            if (dbg)
                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
        }
    }

    // find end-of-block (eob)
    int eob_bin;
    /* Dimensions clamped to 32x32 (slw/slh are log2 units); tx2dszctx selects
     * the eob CDF set for the effective transform area. */
    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
    const int tx2dszctx = slw + slh;
    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
    const int is_1d = tx_class != TX_CLASS_2D;
    switch (tx2dszctx) {
#define case_sz(sz, bin, ns, is_1d) \
    case sz: { \
        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
        break; \
    }
    case_sz(0,   16,  8, [is_1d]);
    case_sz(1,   32,  8, [is_1d]);
    case_sz(2,   64,  8, [is_1d]);
    case_sz(3,  128,  8, [is_1d]);
    case_sz(4,  256, 16, [is_1d]);
    case_sz(5,  512, 16,         );
    case_sz(6, 1024, 16,         );
#undef case_sz
    }
    if (dbg)
        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
               16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
    int eob;
    if (eob_bin > 1) {
        /* eob_bin encodes a range; read the high bit adaptively and the
         * remaining (eob_bin - 2) bits as raw equiprobable bits. */
        uint16_t *const eob_hi_bit_cdf =
            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
        if (dbg)
            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
        eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
              dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
        if (dbg)
            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
    } else {
        eob = eob_bin;
    }
    assert(eob >= 0);

    // base tokens
    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
    unsigned rc, dc_tok;

    if (eob) {
        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok

        /* eob */
        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
        int tok = eob_tok + 1;
        int level_tok = tok * 0x41;
        unsigned mag;

/* Decodes all coefficients from eob down to DC for one transform class.
 * Expects scan/stride/shift/shift2/mask/lo_ctx_offsets in scope. */
#define DECODE_COEFS_CLASS(tx_class) \
        unsigned x, y; \
        uint8_t *level; \
        if (tx_class == TX_CLASS_2D) \
            rc = scan[eob], x = rc >> shift, y = rc & mask; \
        else if (tx_class == TX_CLASS_H) \
            /* Transposing reduces the stride and padding requirements */ \
            x = eob & mask, y = eob >> shift, rc = eob; \
        else /* tx_class == TX_CLASS_V */ \
            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
        if (dbg) \
            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
        if (eob_tok == 2) { \
            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            level_tok = tok + (3 << 6); \
            if (dbg) \
                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
                       ts->msac.rng); \
        } \
        cf[rc] = tok << 11; \
        /* For TX_CLASS_2D, rc == x * stride + y, so the direct rc form \
         * addresses the same level byte while avoiding the multiply. */ \
        if (tx_class == TX_CLASS_2D) \
            level = levels + rc; \
        else \
            level = levels + x * stride + y; \
        *level = (uint8_t) level_tok; \
        for (int i = eob - 1; i > 0; i--) { /* ac */ \
            unsigned rc_i; \
            if (tx_class == TX_CLASS_2D) \
                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
            else if (tx_class == TX_CLASS_H) \
                x = i & mask, y = i >> shift, rc_i = i; \
            else /* tx_class == TX_CLASS_V */ \
                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
            assert(x < 32 && y < 32); \
            if (tx_class == TX_CLASS_2D) \
                level = levels + rc_i; \
            else \
                level = levels + x * stride + y; \
            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
            if (tx_class == TX_CLASS_2D) \
                y |= x; \
            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
            if (dbg) \
                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
            if (tok == 3) { \
                mag &= 63; \
                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
                      (mag > 12 ? 6 : (mag + 1) >> 1); \
                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
                if (dbg) \
                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
                           ts->msac.rng); \
                *level = (uint8_t) (tok + (3 << 6)); \
                cf[rc_i] = (tok << 11) | rc; \
                rc = rc_i; \
            } else { \
                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
                tok *= 0x17ff41; \
                *level = (uint8_t) tok; \
                /* tok ? (tok << 11) | rc : 0 */ \
                tok = (tok >> 9) & (rc + ~0x7ffu); \
                if (tok) rc = rc_i; \
                cf[rc_i] = tok; \
            } \
        } \
        /* dc */ \
        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
        if (dbg) \
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
        if (dc_tok == 3) { \
            if (tx_class == TX_CLASS_2D) \
                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
                      levels[1 * stride + 1]; \
            mag &= 63; \
            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            if (dbg) \
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
        } \
        break

        const uint16_t *scan;
        switch (tx_class) {
        case TX_CLASS_2D: {
            const unsigned nonsquare_tx = tx >= RTX_4X8;
            const uint8_t (*const lo_ctx_offsets)[5] =
                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
            scan = dav1d_scans[tx];
            const ptrdiff_t stride = 4 << slh;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_2D);
        }
        case TX_CLASS_H: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slh) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_H);
        }
        case TX_CLASS_V: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slw + 2, shift2 = slh + 2;
            const unsigned mask = (4 << slw) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_V);
        }
#undef DECODE_COEFS_CLASS
        default: assert(0);
        }
    } else { // dc-only
        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
        dc_tok = 1 + tok_br;
        if (dbg)
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
        if (tok_br == 2) {
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
            if (dbg)
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
        }
        rc = 0;
    }

    // residual and sign
    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
    const int dq_shift = imax(0, t_dim->ctx - 2);
    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
    unsigned cul_level, dc_sign_level;

    if (!dc_tok) {
        cul_level = 0;
        dc_sign_level = 1 << 6; /* neutral "zero DC" marker for *res_ctx */
        if (qm_tbl) goto ac_qm;
        goto ac_noqm;
    }

    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
    if (dbg)
        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);

    int dc_dq = dq_tbl[0];
    /* 0x80 if positive (dc_sign == 0), 0x00 if negative */
    dc_sign_level = (dc_sign - 1) & (2 << 6);

    if (qm_tbl) {
        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;

        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = (dc_dq * dc_tok) & 0xffffff;
        } else {
            dc_dq *= dc_tok;
            assert(dc_dq <= 0xffffff);
        }
        cul_level = dc_tok;
        dc_dq >>= dq_shift;
        dc_dq = umin(dc_dq, cf_max + dc_sign);
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_qm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
                int dq_sat;

                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    tok &= 0xfffff;
                    dq = (dq * tok) & 0xffffff;
                } else {
                    tok = rc_tok >> 11;
                    dq *= tok;
                    assert(dq <= 0xffffff);
                }
                cul_level += tok;
                dq >>= dq_shift;
                dq_sat = umin(dq, cf_max + sign);
                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);

                rc = rc_tok & 0x3ff; /* next non-zero rc, zero if eob */
            } while (rc);
        }
    } else {
        // non-qmatrix is the common case and allows for additional optimizations
        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
            dc_dq = umin(dc_dq, cf_max + dc_sign);
        } else {
            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
            assert(dc_dq <= cf_max);
        }
        cul_level = dc_tok;
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_noqm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok;
                int dq;

                // residual
                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    // coefficient parsing, see 5.11.39
                    tok &= 0xfffff;

                    // dequant, see 7.12.3
                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
                    dq = umin(dq, cf_max + sign);
                } else {
                    // cannot exceed cf_max, so we can avoid the clipping
                    tok = rc_tok >> 11;
                    dq = ((ac_dq * tok) >> dq_shift);
                    assert(dq <= cf_max);
                }
                cul_level += tok;
                cf[rc] = (coef) (sign ? -dq : dq);

                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
            } while (rc);
        }
    }

    // context
    *res_ctx = umin(cul_level, 63) | dc_sign_level;

    return eob;
}
733
/* Recursively walks the transform-split tree of an inter block's luma plane.
 * At each split node it recurses into the four (or two, at frame edges)
 * sub-transforms, adjusting t->bx/t->by around each call; at each leaf it
 * decodes the coefficients and — except in the frame-thread coefficient
 * pass — applies the inverse transform onto dst.
 *
 * dst may be NULL during the coefficient-only pass (pass 1); eob/txtp are
 * then packed into the per-block cbi stream for pass 2 to consume. */
static void read_coef_tree(Dav1dTaskContext *const t,
                           const enum BlockSize bs, const Av1Block *const b,
                           const enum RectTxfmSize ytx, const int depth,
                           const uint16_t *const tx_split,
                           const int x_off, const int y_off, pixel *dst)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const Dav1dDSPContext *const dsp = f->dsp;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    const int txw = t_dim->w, txh = t_dim->h;

    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
     * be split. Avoids an undefined left shift. */
    if (depth < 2 && tx_split[depth] &&
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    {
        /* Split node: recurse into the sub-transforms, skipping those that
         * fall outside the frame (bw/bh bounds). */
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;

        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                       x_off * 2 + 0, y_off * 2 + 0, dst);
        t->bx += txsw;
        if (txw >= txh && t->bx < f->bw)
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
        t->bx -= txsw;
        t->by += txsh;
        if (txh >= txw && t->by < f->bh) {
            if (dst)
                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                           x_off * 2 + 0, y_off * 2 + 1, dst);
            t->bx += txsw;
            if (txw >= txh && t->bx < f->bw)
                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
            t->bx -= txsw;
        }
        t->by -= txsh;
    } else {
        /* Leaf: decode one transform block. */
        const int bx4 = t->bx & 31, by4 = t->by & 31;
        enum TxfmType txtp;
        uint8_t cf_ctx;
        int eob;
        coef *cf;

        if (t->frame_thread.pass) {
            /* Frame threading: coefficients live in a per-pass ring buffer. */
            const int p = t->frame_thread.pass & 1;
            assert(ts->frame_thread[p].cf);
            cf = ts->frame_thread[p].cf;
            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        } else {
            cf = bitfn(t->cf);
        }
        if (t->frame_thread.pass != 2) {
            /* Pass 0/1: actually parse the bitstream. */
            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
            if (DEBUG_BLOCK_INFO)
                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                       ytx, txtp, eob, ts->msac.rng);
            /* Propagate the coefficient context to above/left neighbors,
             * clipped to the frame. */
            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
            /* Record the leaf's txtp for later chroma derivation. */
#define set_ctx(rep_macro) \
            for (int y = 0; y < txh; y++) { \
                rep_macro(txtp_map, 0, txtp); \
                txtp_map += 32; \
            }
            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
            case_set_upto16(t_dim->lw);
#undef set_ctx
            if (t->frame_thread.pass == 1)
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
        } else {
            /* Pass 2: re-read eob/txtp parsed by pass 1. */
            const int cbi = *ts->frame_thread[0].cbi++;
            eob = cbi >> 5;
            txtp = cbi & 0x1f;
        }
        if (!(t->frame_thread.pass & 1)) {
            /* Reconstruction passes (0 and 2): inverse-transform + add. */
            assert(dst);
            if (eob >= 0) {
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
                                              HIGHBD_CALL_SUFFIX);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
            }
        }
    }
}
826
/* Frame-thread pass-1 entry point: parses all coefficient blocks (luma and,
 * when present, both chroma planes) of one coding block into the
 * frame-thread coefficient/cbi buffers, without reconstructing pixels.
 *
 * For skipped blocks only the neighbor coefficient contexts are reset to the
 * neutral 0x40 value. Otherwise the block is traversed in up-to-64px (16
 * 4px-unit) strips, matching the traversal order of the reconstruction pass. */
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    const enum BlockSize bs, const Av1Block *const b)
{
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    /* Odd-position small blocks in subsampled layouts carry no chroma. */
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);

    if (b->skip) {
        /* No coefficients: reset above/left contexts to neutral (0x40). */
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return;
    }

    Dav1dTileState *const ts = t->ts;
    /* Dimensions clipped to the frame. */
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    assert(t->frame_thread.pass == 1);
    assert(!b->skip);
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            const int sub_w4 = imin(w4, init_x + 16);
            int y_off = !!init_y, y, x;
            /* Luma: walk the transform grid; inter blocks descend the
             * split tree, intra blocks decode a single tx per position. */
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h, y_off++)
            {
                int x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
                {
                    if (!b->intra) {
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                       x_off, y_off, NULL);
                    } else {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        const int eob =
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                   b->tx, txtp, eob, ts->msac.rng);
                        /* Pack eob + txtp for the reconstruction pass. */
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                    }
                }
                t->bx -= x;
            }
            t->by -= y;

            if (!has_chroma) continue;

            /* Chroma: one b->uvtx-sized transform per position, both planes. */
            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            for (int pl = 0; pl < 2; pl++) {
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        if (!b->intra)
                            /* Inter chroma txtp is derived from the co-located
                             * luma leaf recorded in txtp_map. */
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                       bx4 + (x << ss_hor)];
                        const int eob =
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
                                         &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                   "txtp=%d,eob=%d]: r=%d\n",
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}
940
// Motion compensation for one block of one plane.
//
// Exactly one of dst8/dst16 is non-NULL: dst8 receives final pixels via
// dsp->mc.mc[] / mc_scaled[], dst16 receives the 16-bit intermediate via
// dsp->mc.mct[] / mct_scaled[] (HIGHBD_CALL_SUFFIX carries the bitdepth
// argument in high-bitdepth builds).
//
// t         - task context (provides frame context and scratch buffers)
// dst_stride- stride of dst8 in bytes (unused for dst16, which is packed)
// bw4, bh4  - block size in 4px luma units
// bx, by    - block position in 4px luma units
// pl        - plane (0 = luma, 1/2 = chroma; selects subsampling)
// mv        - motion vector in 1/8th luma pel units
// refp      - reference picture; refidx selects f->svc[] scaling parameters
// filter_2d - interpolation filter pair
//
// Returns 0 (kept int for symmetry with other recon helpers that can fail).
static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int mvx = mv.x, mvy = mv.y;
    // Subpel phase: 3 fractional bits for luma, 4 for subsampled chroma.
    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
        // Unscaled path: reference has the same dimensions as the current frame.
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
        int w, h;

        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
            w = (f->cur.p.w + ss_hor) >> ss_hor;
            h = (f->cur.p.h + ss_ver) >> ss_ver;
        } else {
            // intrabc references the frame being decoded; clamp against the
            // padded 4px-aligned frame size instead of the display size.
            w = f->bw * 4 >> ss_hor;
            h = f->bh * 4 >> ss_ver;
        }
        // If the filter footprint (3 left/top, 4 right/bottom taps when
        // subpel) would read outside the reference, synthesize the edge
        // pixels into the scratch emu_edge buffer first.
        if (dx < !!mx * 3 || dy < !!my * 3 ||
            dx + bw4 * h_mul + !!mx * 4 > w ||
            dy + bh4 * v_mul + !!my * 4 > h)
        {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
                                w, h, dx - !!mx * 3, dy - !!my * 3,
                                emu_edge_buf, 192 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
            ref_stride = 192 * sizeof(pixel);
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                     HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                      HIGHBD_CALL_SUFFIX);
        }
    } else {
        // Scaled (superres/reference-scaling) path.
        assert(refp != &f->sr_cur);

        // Position in the reference in 1/16384 units before scaling.
        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
#define scale_mv(res, val, scale) do { \
        const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
        res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \
    } while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv
        // pos_x/pos_y have 10 fractional bits after scaling.
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
                                            bw4 * h_mul, bh4 * v_mul,
                                            pos_x & 0x3ff, pos_y & 0x3ff,
                                            f->svc[refidx][0].step,
                                            f->svc[refidx][1].step
                                            HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
                                             bw4 * h_mul, bh4 * v_mul,
                                             pos_x & 0x3ff, pos_y & 0x3ff,
                                             f->svc[refidx][0].step,
                                             f->svc[refidx][1].step
                                             HIGHBD_CALL_SUFFIX);
        }
    }

    return 0;
}
1054
// Overlapped block motion compensation (OBMC) for one plane.
//
// Re-predicts strips along the top and left edges of the block from the
// neighboring blocks' motion vectors (via mc() into the scratch 'lap'
// buffer) and blends them into dst with dsp->mc.blend_h / blend_v.
//
// b_dim      - block dimensions table entry for this block size
// pl         - plane index (0 = luma, 1/2 = chroma)
// bx4, by4   - block position within the 32x4-unit context window
// w4, h4     - clipped block size in 4px units
//
// Returns 0 on success, or the first non-zero error from mc().
static int obmc(Dav1dTaskContext *const t,
                pixel *const dst, const ptrdiff_t dst_stride,
                const uint8_t *const b_dim, const int pl,
                const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
    pixel *const lap = bitfn(t->scratch.lap);
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    int res;

    // Top neighbors: only if not at the tile's top row; for chroma, only
    // for sufficiently large blocks.
    if (t->by > t->ts->tiling.row_start &&
        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
    {
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
            const int step4 = iclip(a_b_dim[0], 2, 16);

            if (a_r->ref.ref[0] > 0) { // neighbor is inter-coded
                const int ow4 = imin(step4, b_dim[0]);
                const int oh4 = imin(b_dim[1], 16) >> 1;
                // overlap height is 3/4 of the half-block, rounded up
                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                         t->bx + x, t->by, pl, a_r->mv.mv[0],
                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                if (res) return res;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
                                   h_mul * ow4, v_mul * oh4);
                i++;
            }
            x += step4;
        }
    }

    // Left neighbors: only if not at the tile's left column.
    if (t->bx > t->ts->tiling.col_start)
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
            const int step4 = iclip(l_b_dim[1], 2, 16);

            if (l_r->ref.ref[0] > 0) { // neighbor is inter-coded
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                         t->bx, t->by + y, pl, l_r->mv.mv[0],
                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                if (res) return res;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                i++;
            }
            y += step4;
        }
    return 0;
}
1117
// Warped (affine) motion compensation for one plane of one block.
//
// Processes the block in 8x8 tiles: for each tile, the affine matrix wmp
// is evaluated at the tile center to get an integer source position
// (dx, dy) plus 64th-pel fractional offsets (mx, my), then
// dsp->mc.warp8x8 (pixel output, dst8) or warp8x8t (16-bit intermediate,
// dst16) applies the per-row alpha/beta and per-column gamma/delta shear.
// Out-of-frame reads are handled via the scratch emu_edge buffer.
//
// Exactly one of dst8/dst16 is non-NULL; dstride is in bytes for dst8 and
// in int16_t units for dst16. Returns 0.
static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    // warp operates on whole 8x8 tiles only
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
    const int32_t *const mat = wmp->matrix;
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;

    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        // hoist the y-dependent part of the matrix product out of the x loop
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            // integer source position (top-left of the 8x8 filter region)
            // and fractional phase, pre-compensated for the shear applied
            // across the 8x8 tile; kept in 64th-pel units (& ~0x3f).
            const int dx = (int) (mvx >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                            wmp->u.p.beta * 7) & ~0x3f;
            const int dy = (int) (mvy >> 16) - 4;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                            wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            // 8-tap filter needs 3 pixels left/top and 4 right/bottom.
            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
                                    emu_edge_buf, 32 * sizeof(pixel),
                                    refp->p.data[pl], ref_stride);
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
                ref_stride = 32 * sizeof(pixel);
            } else {
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
            }
            if (dst16 != NULL)
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            else
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
        }
        if (dst8) dst8 += 8 * PXSTRIDE(dstride);
        else dst16 += 8 * dstride;
    }
    return 0;
}
1178
bytefn(dav1d_recon_b_intra)1179 void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
1180 const enum EdgeFlags intra_edge_flags,
1181 const Av1Block *const b)
1182 {
1183 Dav1dTileState *const ts = t->ts;
1184 const Dav1dFrameContext *const f = t->f;
1185 const Dav1dDSPContext *const dsp = f->dsp;
1186 const int bx4 = t->bx & 31, by4 = t->by & 31;
1187 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1188 const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1189 const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1190 const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1191 const int bw4 = b_dim[0], bh4 = b_dim[1];
1192 const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1193 const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1194 const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1195 (bw4 > ss_hor || t->bx & 1) &&
1196 (bh4 > ss_ver || t->by & 1);
1197 const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
1198 const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
1199
1200 // coefficient coding
1201 pixel *const edge = bitfn(t->scratch.edge) + 128;
1202 const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
1203
1204 const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
1205
1206 for (int init_y = 0; init_y < h4; init_y += 16) {
1207 const int sub_h4 = imin(h4, 16 + init_y);
1208 const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
1209 for (int init_x = 0; init_x < w4; init_x += 16) {
1210 if (b->pal_sz[0]) {
1211 pixel *dst = ((pixel *) f->cur.data[0]) +
1212 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1213 const uint8_t *pal_idx;
1214 if (t->frame_thread.pass) {
1215 const int p = t->frame_thread.pass & 1;
1216 assert(ts->frame_thread[p].pal_idx);
1217 pal_idx = ts->frame_thread[p].pal_idx;
1218 ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
1219 } else {
1220 pal_idx = t->scratch.pal_idx_y;
1221 }
1222 const pixel *const pal = t->frame_thread.pass ?
1223 f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1224 ((t->bx >> 1) + (t->by & 1))][0] :
1225 bytefn(t->scratch.pal)[0];
1226 f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
1227 pal_idx, bw4 * 4, bh4 * 4);
1228 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1229 hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
1230 bw4 * 4, bh4 * 4, "y-pal-pred");
1231 }
1232
1233 const int intra_flags = (sm_flag(t->a, bx4) |
1234 sm_flag(&t->l, by4) |
1235 intra_edge_filter_flag);
1236 const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
1237 intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
1238 const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
1239 intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
1240 int y, x;
1241 const int sub_w4 = imin(w4, init_x + 16);
1242 for (y = init_y, t->by += init_y; y < sub_h4;
1243 y += t_dim->h, t->by += t_dim->h)
1244 {
1245 pixel *dst = ((pixel *) f->cur.data[0]) +
1246 4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
1247 t->bx + init_x);
1248 for (x = init_x, t->bx += init_x; x < sub_w4;
1249 x += t_dim->w, t->bx += t_dim->w)
1250 {
1251 if (b->pal_sz[0]) goto skip_y_pred;
1252
1253 int angle = b->y_angle;
1254 const enum EdgeFlags edge_flags =
1255 (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
1256 0 : EDGE_I444_TOP_HAS_RIGHT) |
1257 ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
1258 0 : EDGE_I444_LEFT_HAS_BOTTOM);
1259 const pixel *top_sb_edge = NULL;
1260 if (!(t->by & (f->sb_step - 1))) {
1261 top_sb_edge = f->ipred_edge[0];
1262 const int sby = t->by >> f->sb_shift;
1263 top_sb_edge += f->sb128w * 128 * (sby - 1);
1264 }
1265 const enum IntraPredMode m =
1266 bytefn(dav1d_prepare_intra_edges)(t->bx,
1267 t->bx > ts->tiling.col_start,
1268 t->by,
1269 t->by > ts->tiling.row_start,
1270 ts->tiling.col_end,
1271 ts->tiling.row_end,
1272 edge_flags, dst,
1273 f->cur.stride[0], top_sb_edge,
1274 b->y_mode, &angle,
1275 t_dim->w, t_dim->h,
1276 f->seq_hdr->intra_edge_filter,
1277 edge HIGHBD_CALL_SUFFIX);
1278 dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
1279 t_dim->w * 4, t_dim->h * 4,
1280 angle | intra_flags,
1281 4 * f->bw - 4 * t->bx,
1282 4 * f->bh - 4 * t->by
1283 HIGHBD_CALL_SUFFIX);
1284
1285 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1286 hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
1287 t_dim->h * 4, 2, "l");
1288 hex_dump(edge, 0, 1, 1, "tl");
1289 hex_dump(edge + 1, t_dim->w * 4,
1290 t_dim->w * 4, 2, "t");
1291 hex_dump(dst, f->cur.stride[0],
1292 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
1293 }
1294
1295 skip_y_pred: {}
1296 if (!b->skip) {
1297 coef *cf;
1298 int eob;
1299 enum TxfmType txtp;
1300 if (t->frame_thread.pass) {
1301 const int p = t->frame_thread.pass & 1;
1302 const int cbi = *ts->frame_thread[p].cbi++;
1303 cf = ts->frame_thread[p].cf;
1304 ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
1305 eob = cbi >> 5;
1306 txtp = cbi & 0x1f;
1307 } else {
1308 uint8_t cf_ctx;
1309 cf = bitfn(t->cf);
1310 eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
1311 &t->l.lcoef[by4 + y], b->tx, bs,
1312 b, 1, 0, cf, &txtp, &cf_ctx);
1313 if (DEBUG_BLOCK_INFO)
1314 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
1315 b->tx, txtp, eob, ts->msac.rng);
1316 dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
1317 dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
1318 }
1319 if (eob >= 0) {
1320 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1321 coef_dump(cf, imin(t_dim->h, 8) * 4,
1322 imin(t_dim->w, 8) * 4, 3, "dq");
1323 dsp->itx.itxfm_add[b->tx]
1324 [txtp](dst,
1325 f->cur.stride[0],
1326 cf, eob HIGHBD_CALL_SUFFIX);
1327 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1328 hex_dump(dst, f->cur.stride[0],
1329 t_dim->w * 4, t_dim->h * 4, "recon");
1330 }
1331 } else if (!t->frame_thread.pass) {
1332 dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
1333 dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
1334 }
1335 dst += 4 * t_dim->w;
1336 }
1337 t->bx -= x;
1338 }
1339 t->by -= y;
1340
1341 if (!has_chroma) continue;
1342
1343 const ptrdiff_t stride = f->cur.stride[1];
1344
1345 if (b->uv_mode == CFL_PRED) {
1346 assert(!init_x && !init_y);
1347
1348 int16_t *const ac = t->scratch.ac;
1349 pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
1350 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
1351 const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
1352 (t->by >> ss_ver) * PXSTRIDE(stride));
1353 pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
1354 ((pixel *) f->cur.data[2]) + uv_off };
1355
1356 const int furthest_r =
1357 ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
1358 const int furthest_b =
1359 ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
1360 dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
1361 cbw4 - (furthest_r >> ss_hor),
1362 cbh4 - (furthest_b >> ss_ver),
1363 cbw4 * 4, cbh4 * 4);
1364 for (int pl = 0; pl < 2; pl++) {
1365 if (!b->cfl_alpha[pl]) continue;
1366 int angle = 0;
1367 const pixel *top_sb_edge = NULL;
1368 if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1369 top_sb_edge = f->ipred_edge[pl + 1];
1370 const int sby = t->by >> f->sb_shift;
1371 top_sb_edge += f->sb128w * 128 * (sby - 1);
1372 }
1373 const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1374 const int xstart = ts->tiling.col_start >> ss_hor;
1375 const int ystart = ts->tiling.row_start >> ss_ver;
1376 const enum IntraPredMode m =
1377 bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1378 ypos, ypos > ystart,
1379 ts->tiling.col_end >> ss_hor,
1380 ts->tiling.row_end >> ss_ver,
1381 0, uv_dst[pl], stride,
1382 top_sb_edge, DC_PRED, &angle,
1383 uv_t_dim->w, uv_t_dim->h, 0,
1384 edge HIGHBD_CALL_SUFFIX);
1385 dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
1386 uv_t_dim->w * 4,
1387 uv_t_dim->h * 4,
1388 ac, b->cfl_alpha[pl]
1389 HIGHBD_CALL_SUFFIX);
1390 }
1391 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1392 ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
1393 hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
1394 hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
1395 }
1396 } else if (b->pal_sz[1]) {
1397 const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
1398 (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1399 const pixel (*pal)[8];
1400 const uint8_t *pal_idx;
1401 if (t->frame_thread.pass) {
1402 const int p = t->frame_thread.pass & 1;
1403 assert(ts->frame_thread[p].pal_idx);
1404 pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1405 ((t->bx >> 1) + (t->by & 1))];
1406 pal_idx = ts->frame_thread[p].pal_idx;
1407 ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
1408 } else {
1409 pal = bytefn(t->scratch.pal);
1410 pal_idx = t->scratch.pal_idx_uv;
1411 }
1412
1413 f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
1414 f->cur.stride[1], pal[1],
1415 pal_idx, cbw4 * 4, cbh4 * 4);
1416 f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
1417 f->cur.stride[1], pal[2],
1418 pal_idx, cbw4 * 4, cbh4 * 4);
1419 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1420 hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
1421 PXSTRIDE(f->cur.stride[1]),
1422 cbw4 * 4, cbh4 * 4, "u-pal-pred");
1423 hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
1424 PXSTRIDE(f->cur.stride[1]),
1425 cbw4 * 4, cbh4 * 4, "v-pal-pred");
1426 }
1427 }
1428
1429 const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
1430 sm_uv_flag(&t->l, cby4);
1431 const int uv_sb_has_tr =
1432 ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
1433 intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
1434 const int uv_sb_has_bl =
1435 init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
1436 intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
1437 const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
1438 for (int pl = 0; pl < 2; pl++) {
1439 for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
1440 y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
1441 {
1442 pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
1443 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
1444 ((t->bx + init_x) >> ss_hor));
1445 for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
1446 x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
1447 {
1448 if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
1449 b->pal_sz[1])
1450 {
1451 goto skip_uv_pred;
1452 }
1453
1454 int angle = b->uv_angle;
1455 // this probably looks weird because we're using
1456 // luma flags in a chroma loop, but that's because
1457 // prepare_intra_edges() expects luma flags as input
1458 const enum EdgeFlags edge_flags =
1459 (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
1460 (x + uv_t_dim->w >= sub_cw4)) ?
1461 0 : EDGE_I444_TOP_HAS_RIGHT) |
1462 ((x > (init_x >> ss_hor) ||
1463 (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
1464 0 : EDGE_I444_LEFT_HAS_BOTTOM);
1465 const pixel *top_sb_edge = NULL;
1466 if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1467 top_sb_edge = f->ipred_edge[1 + pl];
1468 const int sby = t->by >> f->sb_shift;
1469 top_sb_edge += f->sb128w * 128 * (sby - 1);
1470 }
1471 const enum IntraPredMode uv_mode =
1472 b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
1473 const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1474 const int xstart = ts->tiling.col_start >> ss_hor;
1475 const int ystart = ts->tiling.row_start >> ss_ver;
1476 const enum IntraPredMode m =
1477 bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1478 ypos, ypos > ystart,
1479 ts->tiling.col_end >> ss_hor,
1480 ts->tiling.row_end >> ss_ver,
1481 edge_flags, dst, stride,
1482 top_sb_edge, uv_mode,
1483 &angle, uv_t_dim->w,
1484 uv_t_dim->h,
1485 f->seq_hdr->intra_edge_filter,
1486 edge HIGHBD_CALL_SUFFIX);
1487 angle |= intra_edge_filter_flag;
1488 dsp->ipred.intra_pred[m](dst, stride, edge,
1489 uv_t_dim->w * 4,
1490 uv_t_dim->h * 4,
1491 angle | sm_uv_fl,
1492 (4 * f->bw + ss_hor -
1493 4 * (t->bx & ~ss_hor)) >> ss_hor,
1494 (4 * f->bh + ss_ver -
1495 4 * (t->by & ~ss_ver)) >> ss_ver
1496 HIGHBD_CALL_SUFFIX);
1497 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1498 hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
1499 uv_t_dim->h * 4, 2, "l");
1500 hex_dump(edge, 0, 1, 1, "tl");
1501 hex_dump(edge + 1, uv_t_dim->w * 4,
1502 uv_t_dim->w * 4, 2, "t");
1503 hex_dump(dst, stride, uv_t_dim->w * 4,
1504 uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
1505 }
1506
1507 skip_uv_pred: {}
1508 if (!b->skip) {
1509 enum TxfmType txtp;
1510 int eob;
1511 coef *cf;
1512 if (t->frame_thread.pass) {
1513 const int p = t->frame_thread.pass & 1;
1514 const int cbi = *ts->frame_thread[p].cbi++;
1515 cf = ts->frame_thread[p].cf;
1516 ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
1517 eob = cbi >> 5;
1518 txtp = cbi & 0x1f;
1519 } else {
1520 uint8_t cf_ctx;
1521 cf = bitfn(t->cf);
1522 eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1523 &t->l.ccoef[pl][cby4 + y],
1524 b->uvtx, bs, b, 1, 1 + pl, cf,
1525 &txtp, &cf_ctx);
1526 if (DEBUG_BLOCK_INFO)
1527 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1528 "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
1529 pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
1530 int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1531 int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
1532 dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1533 dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1534 }
1535 if (eob >= 0) {
1536 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1537 coef_dump(cf, uv_t_dim->h * 4,
1538 uv_t_dim->w * 4, 3, "dq");
1539 dsp->itx.itxfm_add[b->uvtx]
1540 [txtp](dst, stride,
1541 cf, eob HIGHBD_CALL_SUFFIX);
1542 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1543 hex_dump(dst, stride, uv_t_dim->w * 4,
1544 uv_t_dim->h * 4, "recon");
1545 }
1546 } else if (!t->frame_thread.pass) {
1547 dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
1548 dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
1549 }
1550 dst += uv_t_dim->w * 4;
1551 }
1552 t->bx -= x << ss_hor;
1553 }
1554 t->by -= y << ss_ver;
1555 }
1556 }
1557 }
1558 }
1559
bytefn(dav1d_recon_b_inter)1560 int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
1561 const Av1Block *const b)
1562 {
1563 Dav1dTileState *const ts = t->ts;
1564 const Dav1dFrameContext *const f = t->f;
1565 const Dav1dDSPContext *const dsp = f->dsp;
1566 const int bx4 = t->bx & 31, by4 = t->by & 31;
1567 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1568 const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1569 const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1570 const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1571 const int bw4 = b_dim[0], bh4 = b_dim[1];
1572 const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1573 const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1574 (bw4 > ss_hor || t->bx & 1) &&
1575 (bh4 > ss_ver || t->by & 1);
1576 const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
1577 DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
1578 int res;
1579
1580 // prediction
1581 const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
1582 pixel *dst = ((pixel *) f->cur.data[0]) +
1583 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1584 const ptrdiff_t uvdstoff =
1585 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1586 if (IS_KEY_OR_INTRA(f->frame_hdr)) {
1587 // intrabc
1588 assert(!f->frame_hdr->super_res.enabled);
1589 res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
1590 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1591 if (res) return res;
1592 if (has_chroma) for (int pl = 1; pl < 3; pl++) {
1593 res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
1594 bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1595 t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
1596 &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1597 if (res) return res;
1598 }
1599 } else if (b->comp_type == COMP_INTER_NONE) {
1600 const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
1601 const enum Filter2d filter_2d = b->filter2d;
1602
1603 if (imin(bw4, bh4) > 1 &&
1604 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1605 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1606 {
1607 res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
1608 b->motion_mode == MM_WARP ? &t->warpmv :
1609 &f->frame_hdr->gmv[b->ref[0]]);
1610 if (res) return res;
1611 } else {
1612 res = mc(t, dst, NULL, f->cur.stride[0],
1613 bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
1614 if (res) return res;
1615 if (b->motion_mode == MM_OBMC) {
1616 res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
1617 if (res) return res;
1618 }
1619 }
1620 if (b->interintra_type) {
1621 pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1622 enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
1623 SMOOTH_PRED : b->interintra_mode;
1624 pixel *const tmp = bitfn(t->scratch.interintra);
1625 int angle = 0;
1626 const pixel *top_sb_edge = NULL;
1627 if (!(t->by & (f->sb_step - 1))) {
1628 top_sb_edge = f->ipred_edge[0];
1629 const int sby = t->by >> f->sb_shift;
1630 top_sb_edge += f->sb128w * 128 * (sby - 1);
1631 }
1632 m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
1633 t->by, t->by > ts->tiling.row_start,
1634 ts->tiling.col_end, ts->tiling.row_end,
1635 0, dst, f->cur.stride[0], top_sb_edge,
1636 m, &angle, bw4, bh4, 0, tl_edge
1637 HIGHBD_CALL_SUFFIX);
1638 dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
1639 tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
1640 HIGHBD_CALL_SUFFIX);
1641 dsp->mc.blend(dst, f->cur.stride[0], tmp,
1642 bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
1643 }
1644
1645 if (!has_chroma) goto skip_inter_chroma_pred;
1646
1647 // sub8x8 derivation
1648 int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
1649 refmvs_block *const *r;
1650 if (is_sub8x8) {
1651 assert(ss_hor == 1);
1652 r = &t->rt.r[(t->by & 31) + 5];
1653 if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
1654 if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
1655 if (bw4 == 1 && bh4 == ss_ver)
1656 is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
1657 }
1658
1659 // chroma prediction
1660 if (is_sub8x8) {
1661 assert(ss_hor == 1);
1662 ptrdiff_t h_off = 0, v_off = 0;
1663 if (bw4 == 1 && bh4 == ss_ver) {
1664 for (int pl = 0; pl < 2; pl++) {
1665 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1666 NULL, f->cur.stride[1],
1667 bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
1668 r[-1][t->bx - 1].mv.mv[0],
1669 &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
1670 r[-1][t->bx - 1].ref.ref[0] - 1,
1671 t->frame_thread.pass != 2 ? t->tl_4x4_filter :
1672 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
1673 if (res) return res;
1674 }
1675 v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1676 h_off = 2;
1677 }
1678 if (bw4 == 1) {
1679 const enum Filter2d left_filter_2d =
1680 dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
1681 for (int pl = 0; pl < 2; pl++) {
1682 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
1683 f->cur.stride[1], bw4, bh4, t->bx - 1,
1684 t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
1685 &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
1686 r[0][t->bx - 1].ref.ref[0] - 1,
1687 t->frame_thread.pass != 2 ? left_filter_2d :
1688 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
1689 if (res) return res;
1690 }
1691 h_off = 2;
1692 }
1693 if (bh4 == ss_ver) {
1694 const enum Filter2d top_filter_2d =
1695 dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
1696 for (int pl = 0; pl < 2; pl++) {
1697 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
1698 f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1699 1 + pl, r[-1][t->bx].mv.mv[0],
1700 &f->refp[r[-1][t->bx].ref.ref[0] - 1],
1701 r[-1][t->bx].ref.ref[0] - 1,
1702 t->frame_thread.pass != 2 ? top_filter_2d :
1703 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
1704 if (res) return res;
1705 }
1706 v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1707 }
1708 for (int pl = 0; pl < 2; pl++) {
1709 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
1710 bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
1711 refp, b->ref[0], filter_2d);
1712 if (res) return res;
1713 }
1714 } else {
1715 if (imin(cbw4, cbh4) > 1 &&
1716 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1717 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1718 {
1719 for (int pl = 0; pl < 2; pl++) {
1720 res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
1721 f->cur.stride[1], b_dim, 1 + pl, refp,
1722 b->motion_mode == MM_WARP ? &t->warpmv :
1723 &f->frame_hdr->gmv[b->ref[0]]);
1724 if (res) return res;
1725 }
1726 } else {
1727 for (int pl = 0; pl < 2; pl++) {
1728 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1729 NULL, f->cur.stride[1],
1730 bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1731 t->bx & ~ss_hor, t->by & ~ss_ver,
1732 1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
1733 if (res) return res;
1734 if (b->motion_mode == MM_OBMC) {
1735 res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1736 f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
1737 if (res) return res;
1738 }
1739 }
1740 }
1741 if (b->interintra_type) {
1742 // FIXME for 8x32 with 4:2:2 subsampling, this probably does
1743 // the wrong thing since it will select 4x16, not 4x32, as a
1744 // transform size...
1745 const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
1746
1747 for (int pl = 0; pl < 2; pl++) {
1748 pixel *const tmp = bitfn(t->scratch.interintra);
1749 pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1750 enum IntraPredMode m =
1751 b->interintra_mode == II_SMOOTH_PRED ?
1752 SMOOTH_PRED : b->interintra_mode;
1753 int angle = 0;
1754 pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1755 const pixel *top_sb_edge = NULL;
1756 if (!(t->by & (f->sb_step - 1))) {
1757 top_sb_edge = f->ipred_edge[pl + 1];
1758 const int sby = t->by >> f->sb_shift;
1759 top_sb_edge += f->sb128w * 128 * (sby - 1);
1760 }
1761 m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
1762 (t->bx >> ss_hor) >
1763 (ts->tiling.col_start >> ss_hor),
1764 t->by >> ss_ver,
1765 (t->by >> ss_ver) >
1766 (ts->tiling.row_start >> ss_ver),
1767 ts->tiling.col_end >> ss_hor,
1768 ts->tiling.row_end >> ss_ver,
1769 0, uvdst, f->cur.stride[1],
1770 top_sb_edge, m,
1771 &angle, cbw4, cbh4, 0, tl_edge
1772 HIGHBD_CALL_SUFFIX);
1773 dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
1774 tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
1775 HIGHBD_CALL_SUFFIX);
1776 dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
1777 cbw4 * 4, cbh4 * 4, ii_mask);
1778 }
1779 }
1780 }
1781
1782 skip_inter_chroma_pred: {}
1783 t->tl_4x4_filter = filter_2d;
1784 } else {
1785 const enum Filter2d filter_2d = b->filter2d;
1786 // Maximum super block size is 128x128
1787 int16_t (*tmp)[128 * 128] = t->scratch.compinter;
1788 int jnt_weight;
1789 uint8_t *const seg_mask = t->scratch.seg_mask;
1790 const uint8_t *mask;
1791
1792 for (int i = 0; i < 2; i++) {
1793 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1794
1795 if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
1796 res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
1797 &f->frame_hdr->gmv[b->ref[i]]);
1798 if (res) return res;
1799 } else {
1800 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
1801 b->mv[i], refp, b->ref[i], filter_2d);
1802 if (res) return res;
1803 }
1804 }
1805 switch (b->comp_type) {
1806 case COMP_INTER_AVG:
1807 dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1808 bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
1809 break;
1810 case COMP_INTER_WEIGHTED_AVG:
1811 jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
1812 dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1813 bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
1814 break;
1815 case COMP_INTER_SEG:
1816 dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
1817 tmp[b->mask_sign], tmp[!b->mask_sign],
1818 bw4 * 4, bh4 * 4, seg_mask,
1819 b->mask_sign HIGHBD_CALL_SUFFIX);
1820 mask = seg_mask;
1821 break;
1822 case COMP_INTER_WEDGE:
1823 mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
1824 dsp->mc.mask(dst, f->cur.stride[0],
1825 tmp[b->mask_sign], tmp[!b->mask_sign],
1826 bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
1827 if (has_chroma)
1828 mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
1829 break;
1830 }
1831
1832 // chroma
1833 if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1834 for (int i = 0; i < 2; i++) {
1835 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1836 if (b->inter_mode == GLOBALMV_GLOBALMV &&
1837 imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
1838 {
1839 res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
1840 b_dim, 1 + pl,
1841 refp, &f->frame_hdr->gmv[b->ref[i]]);
1842 if (res) return res;
1843 } else {
1844 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
1845 1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
1846 if (res) return res;
1847 }
1848 }
1849 pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1850 switch (b->comp_type) {
1851 case COMP_INTER_AVG:
1852 dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1853 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
1854 HIGHBD_CALL_SUFFIX);
1855 break;
1856 case COMP_INTER_WEIGHTED_AVG:
1857 dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1858 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
1859 HIGHBD_CALL_SUFFIX);
1860 break;
1861 case COMP_INTER_WEDGE:
1862 case COMP_INTER_SEG:
1863 dsp->mc.mask(uvdst, f->cur.stride[1],
1864 tmp[b->mask_sign], tmp[!b->mask_sign],
1865 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
1866 HIGHBD_CALL_SUFFIX);
1867 break;
1868 }
1869 }
1870 }
1871
1872 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1873 hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
1874 if (has_chroma) {
1875 hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
1876 cbw4 * 4, cbh4 * 4, "u-pred");
1877 hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
1878 cbw4 * 4, cbh4 * 4, "v-pred");
1879 }
1880 }
1881
1882 const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1883
1884 if (b->skip) {
1885 // reset coef contexts
1886 BlockContext *const a = t->a;
1887 dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
1888 dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
1889 if (has_chroma) {
1890 dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
1891 dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
1892 memset_cw(&a->ccoef[0][cbx4], 0x40);
1893 memset_cw(&a->ccoef[1][cbx4], 0x40);
1894 memset_ch(&t->l.ccoef[0][cby4], 0x40);
1895 memset_ch(&t->l.ccoef[1][cby4], 0x40);
1896 }
1897 return 0;
1898 }
1899
1900 const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
1901 const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
1902 const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
1903
1904 for (int init_y = 0; init_y < bh4; init_y += 16) {
1905 for (int init_x = 0; init_x < bw4; init_x += 16) {
1906 // coefficient coding & inverse transforms
1907 int y_off = !!init_y, y;
1908 dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
1909 for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
1910 y += ytx->h, y_off++)
1911 {
1912 int x, x_off = !!init_x;
1913 for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
1914 x += ytx->w, x_off++)
1915 {
1916 read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
1917 x_off, y_off, &dst[x * 4]);
1918 t->bx += ytx->w;
1919 }
1920 dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
1921 t->bx -= x;
1922 t->by += ytx->h;
1923 }
1924 dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
1925 t->by -= y;
1926
1927 // chroma coefs and inverse transform
1928 if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1929 pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
1930 (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
1931 for (y = init_y >> ss_ver, t->by += init_y;
1932 y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
1933 {
1934 int x;
1935 for (x = init_x >> ss_hor, t->bx += init_x;
1936 x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
1937 {
1938 coef *cf;
1939 int eob;
1940 enum TxfmType txtp;
1941 if (t->frame_thread.pass) {
1942 const int p = t->frame_thread.pass & 1;
1943 const int cbi = *ts->frame_thread[p].cbi++;
1944 cf = ts->frame_thread[p].cf;
1945 ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
1946 eob = cbi >> 5;
1947 txtp = cbi & 0x1f;
1948 } else {
1949 uint8_t cf_ctx;
1950 cf = bitfn(t->cf);
1951 txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
1952 bx4 + (x << ss_hor)];
1953 eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1954 &t->l.ccoef[pl][cby4 + y],
1955 b->uvtx, bs, b, 0, 1 + pl,
1956 cf, &txtp, &cf_ctx);
1957 if (DEBUG_BLOCK_INFO)
1958 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1959 "txtp=%d,eob=%d]: r=%d\n",
1960 pl, b->uvtx, txtp, eob, ts->msac.rng);
1961 int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1962 int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
1963 dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1964 dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1965 }
1966 if (eob >= 0) {
1967 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1968 coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
1969 dsp->itx.itxfm_add[b->uvtx]
1970 [txtp](&uvdst[4 * x],
1971 f->cur.stride[1],
1972 cf, eob HIGHBD_CALL_SUFFIX);
1973 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1974 hex_dump(&uvdst[4 * x], f->cur.stride[1],
1975 uvtx->w * 4, uvtx->h * 4, "recon");
1976 }
1977 t->bx += uvtx->w << ss_hor;
1978 }
1979 uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
1980 t->bx -= x << ss_hor;
1981 t->by += uvtx->h << ss_ver;
1982 }
1983 t->by -= y << ss_ver;
1984 }
1985 }
1986 }
1987 return 0;
1988 }
1989
bytefn(dav1d_filter_sbrow_deblock_cols)1990 void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
1991 if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
1992 (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
1993 {
1994 return;
1995 }
1996 const int y = sby * f->sb_step * 4;
1997 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1998 pixel *const p[3] = {
1999 f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2000 f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2001 f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2002 };
2003 Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2004 bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
2005 f->lf.start_of_tile_row[sby]);
2006 }
2007
bytefn(dav1d_filter_sbrow_deblock_rows)2008 void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
2009 const int y = sby * f->sb_step * 4;
2010 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2011 pixel *const p[3] = {
2012 f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2013 f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2014 f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2015 };
2016 Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2017 if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
2018 (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
2019 {
2020 bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
2021 }
2022 if (f->seq_hdr->cdef || f->lf.restore_planes) {
2023 // Store loop filtered pixels required by CDEF / LR
2024 bytefn(dav1d_copy_lpf)(f, p, sby);
2025 }
2026 }
2027
bytefn(dav1d_filter_sbrow_cdef)2028 void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
2029 const Dav1dFrameContext *const f = tc->f;
2030 if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
2031 const int sbsz = f->sb_step;
2032 const int y = sby * sbsz * 4;
2033 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2034 pixel *const p[3] = {
2035 f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2036 f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2037 f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2038 };
2039 Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
2040 Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2041 const int start = sby * sbsz;
2042 if (sby) {
2043 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2044 pixel *p_up[3] = {
2045 p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
2046 p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2047 p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2048 };
2049 bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
2050 }
2051 const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
2052 const int end = imin(start + n_blks, f->bh);
2053 bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
2054 }
2055
/* Horizontally resize (superres upscale) one superblock row from the
 * pre-upscale buffer (f->lf.p / f->cur) into the full-resolution output
 * (f->lf.sr_p / f->sr_cur). Processes luma plus both chroma planes when the
 * layout has chroma. */
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    // Source plane pointers at the top of this superblock row (pre-upscale).
    const pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    // Destination plane pointers in the superres output picture.
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        // For rows after the first, start 8 luma rows above the nominal top
        // (the rows deferred by the previous row's filtering).
        const int h_start = 8 * !!sby >> ss_ver;
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
        // Rows processed now; 2 blocks (8 luma rows) are deferred unless this
        // is the last superblock row.
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
        // Remaining picture height below this row's top, in this plane's units.
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;

        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
                          imin(img_h, h_end) + h_start, src_w,
                          f->resize_step[!!pl], f->resize_start[!!pl]
                          HIGHBD_CALL_SUFFIX);
    }
}
2090
bytefn(dav1d_filter_sbrow_lr)2091 void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
2092 if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
2093 const int y = sby * f->sb_step * 4;
2094 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2095 pixel *const sr_p[3] = {
2096 f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2097 f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2098 f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2099 };
2100 bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
2101 }
2102
/* Run the full post-reconstruction filter pipeline for one superblock row,
 * in bitstream order: deblock (cols, then rows), CDEF, superres resize
 * (only when render width differs from coded width), loop restoration. */
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
    if (f->seq_hdr->cdef)
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
    // width[0] != width[1] signals superres scaling is in effect.
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
    if (f->lf.restore_planes)
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
}
2113
bytefn(dav1d_backup_ipred_edge)2114 void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
2115 const Dav1dFrameContext *const f = t->f;
2116 Dav1dTileState *const ts = t->ts;
2117 const int sby = t->by >> f->sb_shift;
2118 const int sby_off = f->sb128w * 128 * sby;
2119 const int x_off = ts->tiling.col_start;
2120
2121 const pixel *const y =
2122 ((const pixel *) f->cur.data[0]) + x_off * 4 +
2123 ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
2124 pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
2125 4 * (ts->tiling.col_end - x_off));
2126
2127 if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
2128 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2129 const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2130
2131 const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
2132 (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
2133 for (int pl = 1; pl <= 2; pl++)
2134 pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
2135 &((const pixel *) f->cur.data[pl])[uv_off],
2136 4 * (ts->tiling.col_end - x_off) >> ss_hor);
2137 }
2138 }
2139
bytefn(dav1d_copy_pal_block_y)2140 void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
2141 const int bx4, const int by4,
2142 const int bw4, const int bh4)
2143
2144 {
2145 const Dav1dFrameContext *const f = t->f;
2146 pixel *const pal = t->frame_thread.pass ?
2147 f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2148 ((t->bx >> 1) + (t->by & 1))][0] :
2149 bytefn(t->scratch.pal)[0];
2150 for (int x = 0; x < bw4; x++)
2151 memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
2152 for (int y = 0; y < bh4; y++)
2153 memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
2154 }
2155
bytefn(dav1d_copy_pal_block_uv)2156 void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
2157 const int bx4, const int by4,
2158 const int bw4, const int bh4)
2159
2160 {
2161 const Dav1dFrameContext *const f = t->f;
2162 const pixel (*const pal)[8] = t->frame_thread.pass ?
2163 f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2164 ((t->bx >> 1) + (t->by & 1))] :
2165 bytefn(t->scratch.pal);
2166 // see aomedia bug 2183 for why we use luma coordinates here
2167 for (int pl = 1; pl <= 2; pl++) {
2168 for (int x = 0; x < bw4; x++)
2169 memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
2170 for (int y = 0; y < bh4; y++)
2171 memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
2172 }
2173 }
2174
/* Decode one palette plane (pl=0: luma, pl=1: chroma U) from the bitstream:
 * build a predictor cache from the left/above neighbor palettes, read which
 * cache entries are reused, read the remaining entries explicitly, and merge
 * both sorted lists into the final palette. Entropy-decoder calls must occur
 * in exactly this order. */
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
                                  const int pl, const int sz_ctx,
                                  const int bx4, const int by4)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    // Palette size symbol in [0,6] maps to a palette of 2..8 entries.
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
    pixel cache[16], used_cache[8];
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
    int n_cache = 0;
    // don't reuse above palette outside SB64 boundaries
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];

    // fill/sort cache: merge the two (already sorted) neighbor palettes into
    // a single ascending, deduplicated predictor cache.
    while (l_cache && a_cache) {
        if (*l < *a) {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
            l_cache--;
        } else {
            if (*a == *l) {
                l++;
                l_cache--;
            }
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
            a_cache--;
        }
    }
    // Drain whichever neighbor list still has entries.
    if (l_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
        } while (--l_cache > 0);
    } else if (a_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
        } while (--a_cache > 0);
    }

    // find reused cache entries: one reuse flag per cache entry.
    int i = 0;
    for (int n = 0; n < n_cache && i < pal_sz; n++)
        if (dav1d_msac_decode_bool_equi(&ts->msac))
            used_cache[i++] = cache[n];
    const int n_used_cache = i;

    // parse new entries
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][pl] :
        bytefn(t->scratch.pal)[pl];
    if (i < pal_sz) {
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
        // First explicit entry is coded with full bit depth.
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);

        if (i < pal_sz) {
            // Subsequent entries are coded as ascending deltas; luma (pl=0)
            // deltas are offset by 1 to guarantee strict ordering.
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
            const int max = (1 << bpc) - 1;

            do {
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                prev = pal[i++] = imin(prev + delta + !pl, max);
                if (prev + !pl >= max) {
                    // Remaining entries are forced to max; nothing left to code.
                    for (; i < pal_sz; i++)
                        pal[i] = max;
                    break;
                }
                // Shrink delta width as the remaining value range shrinks.
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
            } while (i < pal_sz);
        }

        // merge cache+new entries: both lists are sorted, merge ascending.
        int n = 0, m = n_used_cache;
        for (i = 0; i < pal_sz; i++) {
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
                pal[i] = used_cache[n++];
            } else {
                assert(m < pal_sz);
                pal[i] = pal[m++];
            }
        }
    } else {
        // Entire palette came from the cache.
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
        for (int n = 0; n < n_cache; n++)
            printf("%c%02x", n ? ' ' : '[', cache[n]);
        printf("%s, pal=", n_cache ? "]" : "[]");
        for (int n = 0; n < pal_sz; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}
2280
/* Decode both chroma palettes: the U plane uses the generic plane reader,
 * then the V plane is coded either as signed deltas from a base value
 * (wrapping at the bit-depth max) or as raw full-depth values. Entropy-
 * decoder calls must occur in exactly this order. */
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][2] :
        bytefn(t->scratch.pal)[2];
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
        // Delta coding: base value plus signed per-entry deltas, wrapped
        // into [0, max] via the & max mask.
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        const int max = (1 << bpc) - 1;
        for (int i = 1; i < b->pal_sz[1]; i++) {
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
            // A sign bit is only coded for nonzero deltas.
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
            prev = pal[i] = (prev + delta) & max;
        }
    } else {
        // Raw coding: each entry at full bit depth.
        for (int i = 0; i < b->pal_sz[1]; i++)
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
    }
    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int n = 0; n < b->pal_sz[1]; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}
2314