/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/ppc/dav1d_types.h"
#include "src/ppc/itx.h"
#include "src/ppc/utils.h"

#if BITDEPTH == 8

#define LOAD_4(src, stride, a, b, c, d) \
{ \
    uint8_t *s = src; \
    a = vec_xl(0, s); \
    s += stride; \
    b = vec_xl(0, s); \
    s += stride; \
    c = vec_xl(0, s); \
    s += stride; \
    d = vec_xl(0, s); \
}

#define LOAD_DECLARE_2_I16(src, a, b) \
    i16x8 a = vec_xl(0, src); \
    i16x8 b = vec_xl(0, src + 8);

#define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
    i32x4 a = i16h_to_i32(sa); \
    i32x4 b = i16l_to_i32(sa); \
    i32x4 c = i16h_to_i32(sb); \
    i32x4 d = i16l_to_i32(sb);

#define LOAD_COEFF_4(coeff) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)

#define LOAD_SCALE_COEFF_4x8(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c04, c15) \
    LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
    i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
    i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
    i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
    i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)

#define LOAD_SCALE_COEFF_8x4(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)

#define LOAD_COEFF_8x8(coeff) \
    LOAD_DECLARE_2_I16(coeff, c0, c1) \
    LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
    LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
    LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
    UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
    UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
    UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
    UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l)
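/* Loads the 64 coefficients of a 4x16 block and unpacks them from i16 to
 * sixteen i32x4 vectors (cA0..cD3), ready for the 4-point first pass. */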
#define LOAD_COEFF_4x16(coeff) \
    LOAD_DECLARE_2_I16(coeff, a0b0, c0d0) \
    LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
    LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
    LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
    UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
    UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
    UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
    UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)

#define LOAD_DECLARE_4(src, stride, a, b, c, d) \
    u8x16 a, b, c, d; \
    LOAD_4(src, stride, a, b, c, d)

#define STORE_LEN(l, dst, stride, a, b, c, d) \
{ \
    uint8_t *dst2 = dst; \
    vec_xst_len(a, dst2, l); \
    dst2 += stride; \
    vec_xst_len(b, dst2, l); \
    dst2 += stride; \
    vec_xst_len(c, dst2, l); \
    dst2 += stride; \
    vec_xst_len(d, dst2, l); \
}

#define STORE_4(dst, stride, a, b, c, d) \
    STORE_LEN(4, dst, stride, a, b, c, d)

#define STORE_8(dst, stride, ab, cd, ef, gh) \
    STORE_LEN(8, dst, stride, ab, cd, ef, gh)

#define STORE_16(dst, stride, l0, l1, l2, l3) \
{ \
    uint8_t *dst##2 = dst; \
    vec_xst(l0, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l1, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l2, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l3, 0, dst##2); \
}

#define APPLY_COEFF_4(a, b, c, d, c01, c23) \
{ \
    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
\
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    a = vec_packsu(abs, abs); \
    c = vec_packsu(cds, cds); \
\
    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
}

#define APPLY_COEFF_8x4(ab, cd, c01, c23) \
{ \
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    ab = vec_packsu(abs, abs); \
    cd = vec_packsu(cds, cds); \
}

#define APPLY_COEFF_16x4(a, b, c, d, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 ah = u8h_to_i16(a); \
    i16x8 al = u8l_to_i16(a); \
    i16x8 bh = u8h_to_i16(b); \
    i16x8 bl = u8l_to_i16(b); \
    i16x8 ch = u8h_to_i16(c); \
    i16x8 cl = u8l_to_i16(c); \
    i16x8 dh = u8h_to_i16(d); \
    i16x8 dl = u8l_to_i16(d); \
    SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
    SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
\
    ah = vec_adds(ah, c00c01); \
    al = vec_adds(al, c02c03); \
    bh = vec_adds(bh, c04c05); \
    bl = vec_adds(bl, c06c07); \
    ch = vec_adds(ch, c08c09); \
    cl = vec_adds(cl, c10c11); \
    dh = vec_adds(dh, c12c13); \
    dl = vec_adds(dl, c14c15); \
\
    a = vec_packsu(ah, al); \
    b = vec_packsu(bh, bl); \
    c = vec_packsu(ch, cl); \
    d = vec_packsu(dh, dl); \
}
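/* 4-point inverse DCT on four i32 lanes, in AV1's 12-bit fixed point:
 *   t0 = (c0 + c2) * 2896 >> 12        (2896/4096 ~= 1/sqrt(2))
 *   t1 = (c0 - c2) * 2896 >> 12
 *   t2 = (c1 * 1567 - c3 * 3784) >> 12
 *   t3 = (c1 * 3784 + c3 * 1567) >> 12
 * with +2048 rounding before each shift; the outputs are the
 * butterflies t0+t3, t1+t2, t1-t2, t0-t3. */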
#define IDCT_4_INNER(c0, c1, c2, c3) \
{ \
    i32x4 o0 = vec_add(c0, c2); \
    i32x4 o1 = vec_sub(c0, c2); \
\
    i32x4 v2896 = vec_splats(2896); \
    i32x4 v1567 = vec_splats(1567); \
    i32x4 v3784 = vec_splats(3784); \
    i32x4 v2048 = vec_splats(2048); \
\
    o0 = vec_mul(o0, v2896); \
    o1 = vec_mul(o1, v2896); \
\
    i32x4 o2a = vec_mul(c1, v1567); \
    i32x4 o2b = vec_mul(c3, v3784); \
    i32x4 o3a = vec_mul(c1, v3784); \
    i32x4 o3b = vec_mul(c3, v1567); \
\
    i32x4 o2 = vec_sub(o2a, o2b); \
    i32x4 o3 = vec_add(o3a, o3b); \
\
    u32x4 v12 = vec_splat_u32(12); \
\
    o0 = vec_add(o0, v2048); \
    o1 = vec_add(o1, v2048); \
    o2 = vec_add(o2, v2048); \
    o3 = vec_add(o3, v2048); \
\
    o0 = vec_sra(o0, v12); \
    o1 = vec_sra(o1, v12); \
    o2 = vec_sra(o2, v12); \
    o3 = vec_sra(o3, v12); \
\
    c0 = vec_add(o0, o3); \
    c1 = vec_add(o1, o2); \
    c2 = vec_sub(o1, o2); \
    c3 = vec_sub(o0, o3); \
}

#define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c03 = vec_packs(c0, c3); \
    c12 = vec_packs(c1, c2);

#define dct_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define dct_4_out(c0, c1, c2, c3, c01, c23) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3);

#define IDENTITY_4(c01, c23) \
{ \
    i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
    i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
    i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
    c01 = vec_adds(c01, o01); \
    c23 = vec_adds(c23, o23); \
}

#define identity_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDENTITY_4(c01, c23) \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define identity_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    IDENTITY_4(c01, c23) \
}

#define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
{ \
    i32x4 v1321 = vec_splats(1321); \
    i32x4 v3803 = vec_splats(3803); \
    i32x4 v2482 = vec_splats(2482); \
    i32x4 v3344 = vec_splats(3344); \
    i32x4 v2048 = vec_splats(2048); \
    i32x4 i0_v1321 = vec_mul(c0, v1321); \
    i32x4 i0_v2482 = vec_mul(c0, v2482); \
    i32x4 i0_v3803 = vec_mul(c0, v3803); \
    i32x4 i1 = vec_mul(c1, v3344); \
    i32x4 i2_v1321 = vec_mul(c2, v1321); \
    i32x4 i2_v2482 = vec_mul(c2, v2482); \
    i32x4 i2_v3803 = vec_mul(c2, v3803); \
    i32x4 i3_v1321 = vec_mul(c3, v1321); \
    i32x4 i3_v2482 = vec_mul(c3, v2482); \
    i32x4 i3_v3803 = vec_mul(c3, v3803); \
\
    i32x4 n1 = vec_sub(i1, v2048); \
    i1 = vec_add(i1, v2048); \
\
    i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
    i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
    i32x4 o2 = vec_sub(c0, c2); \
    i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
\
    o0 = vec_add(o0, i3_v2482); \
    o1 = vec_sub(o1, i3_v3803); \
    o2 = vec_add(o2, c3); \
    o3 = vec_sub(o3, i3_v1321); \
\
    o0 = vec_add(o0, i1); \
    o1 = vec_add(o1, i1); \
    o2 = vec_mul(o2, v3344); \
    o3 = vec_sub(o3, n1); \
\
    o2 = vec_add(o2, v2048); \
\
    oc0 = vec_sra(o0, vec_splat_u32(12)); \
    oc1 = vec_sra(o1, vec_splat_u32(12)); \
    oc2 = vec_sra(o2, vec_splat_u32(12)); \
    oc3 = vec_sra(o3, vec_splat_u32(12)); \
}

#define adst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
}

#define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
}

#define adst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}

#define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}
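/* DC-only fast paths (eob < 1): the DC coefficient is run through the
 * scaling of both transform passes (181/256 ~= 1/sqrt(2) per pass, one
 * extra for rect2 blocks, plus the inter-pass rounding shift), then
 * splatted and added to n groups of four destination rows. */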
static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride,
                        int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;
    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)
        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);
        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);
        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);
        STORE_4(dst, stride, a, b, c, d)
    }
}

static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride,
                        int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;
    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)
        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);
        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);
        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);
        STORE_8(dst, stride, a, b, c, d)
    }
}

static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride,
                         int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;
    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)
        i16x8 ah = u8h_to_i16(a);
        i16x8 bh = u8h_to_i16(b);
        i16x8 ch = u8h_to_i16(c);
        i16x8 dh = u8h_to_i16(d);
        i16x8 al = u8l_to_i16(a);
        i16x8 bl = u8l_to_i16(b);
        i16x8 cl = u8l_to_i16(c);
        i16x8 dl = u8l_to_i16(d);
        ah = vec_adds(ah, vdc);
        bh = vec_adds(bh, vdc);
        ch = vec_adds(ch, vdc);
        dh = vec_adds(dh, vdc);
        al = vec_adds(al, vdc);
        bl = vec_adds(bl, vdc);
        cl = vec_adds(cl, vdc);
        dl = vec_adds(dl, vdc);
        a = vec_packsu(ah, al);
        b = vec_packsu(bh, bl);
        c = vec_packsu(ch, cl);
        d = vec_packsu(dh, dl);
        STORE_16(dst, stride, a, b, c, d)
    }
}

void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    assert(eob >= 0);

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
    }

    LOAD_COEFF_4(coeff)
    dct_4_in(c0, c1, c2, c3, c01, c23)
    TRANSPOSE4_I32(c0, c1, c2, c3)
    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
    dct_4_out(c0, c1, c2, c3, c01, c23)
    LOAD_DECLARE_4(dst, stride, a, b, c, d)
    APPLY_COEFF_4(a, b, c, d, c01, c23)
    STORE_4(dst, stride, a, b, c, d)
}
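/* Inverse Walsh-Hadamard 4x4 for lossless mode. Inputs are pre-shifted
 * down by 2 and the transform is the exact lifting form, so no rounding
 * is applied before adding the result to dst. */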
void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride,
                                              coef *const coeff, const int eob)
{
    LOAD_COEFF_4(coeff)
    u32x4 v2 = vec_splat_u32(2);
    c0 = vec_sra(c0, v2);
    c1 = vec_sra(c1, v2);
    c2 = vec_sra(c2, v2);
    c3 = vec_sra(c3, v2);

    i32x4 t0 = vec_add(c0, c1);
    i32x4 t2 = vec_sub(c2, c3);
    i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    i32x4 t3 = vec_sub(t4, c3);
    i32x4 t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    TRANSPOSE4_I32(c0, c1, c2, c3)

    t0 = vec_add(c0, c1);
    t2 = vec_sub(c2, c3);
    t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    t3 = vec_sub(t4, c3);
    t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    c01 = vec_packs(c0, c1);
    c23 = vec_packs(c2, c3);

    LOAD_DECLARE_4(dst, stride, a, b, c, d)
    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);
    i16x8 abs = u8h_to_i16(ab);
    i16x8 cds = u8h_to_i16(cd);
    abs = vec_adds(abs, c01);
    cds = vec_adds(cds, c23);
    a = vec_packsu(abs, abs);
    c = vec_packsu(cds, cds);
    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);
    STORE_4(dst, stride, a, b, c, d)
}

#define inv_txfm_fn4x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4(coeff) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    STORE_4(dst, stride, a, b, c, d) \
}

inv_txfm_fn4x4(adst, dct )
inv_txfm_fn4x4(dct, adst )
inv_txfm_fn4x4(dct, flipadst)
inv_txfm_fn4x4(flipadst, dct )
inv_txfm_fn4x4(adst, flipadst)
inv_txfm_fn4x4(flipadst, adst )
inv_txfm_fn4x4(identity, dct )
inv_txfm_fn4x4(dct, identity)
inv_txfm_fn4x4(identity, flipadst)
inv_txfm_fn4x4(flipadst, identity)
inv_txfm_fn4x4(identity, adst )
inv_txfm_fn4x4(adst, identity)
inv_txfm_fn4x4(identity, identity)
inv_txfm_fn4x4(adst, adst )
inv_txfm_fn4x4(flipadst, flipadst)

#define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
\
    i32x4 v799 = vec_splats(799); \
    i32x4 v4017 = vec_splats(4017); \
    i32x4 v3406 = vec_splats(3406); \
    i32x4 v2276 = vec_splats(2276); \
    i32x4 v2048 = vec_splats(2048); \
    u32x4 v12 = vec_splat_u32(12); \
\
    i32x4 c1v799 = vec_mul(c1, v799); \
    i32x4 c7v4017 = vec_mul(c7, v4017); \
    i32x4 c5v3406 = vec_mul(c5, v3406); \
    i32x4 c3v2276 = vec_mul(c3, v2276); \
    i32x4 c5v2276 = vec_mul(c5, v2276); \
    i32x4 c3v3406 = vec_mul(c3, v3406); \
    i32x4 c1v4017 = vec_mul(c1, v4017); \
    i32x4 c7v799 = vec_mul(c7, v799); \
\
    i32x4 t4a = vec_subs(c1v799, c7v4017); \
    i32x4 t5a = vec_subs(c5v3406, c3v2276); \
    i32x4 t6a = vec_adds(c5v2276, c3v3406); \
    i32x4 t7a = vec_adds(c1v4017, c7v799); \
\
    t4a = vec_adds(t4a, v2048); \
    t5a = vec_adds(t5a, v2048); \
    t6a = vec_adds(t6a, v2048); \
    t7a = vec_adds(t7a, v2048); \
\
    t4a = vec_sra(t4a, v12); \
    t7a = vec_sra(t7a, v12); \
    t5a = vec_sra(t5a, v12); \
    t6a = vec_sra(t6a, v12); \
\
    i16x8 t7at4a = vec_packs(t7a, t4a); \
    i16x8 t6at5a = vec_packs(t6a, t5a); \
\
    i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
    t6at5a = vec_subs(t7at4a, t6at5a); \
\
    t6a = i16h_to_i32(t6at5a); \
    t5a = i16l_to_i32(t6at5a); \
\
    i32x4 t6 = vec_add(t6a, t5a); \
    i32x4 t5 = vec_sub(t6a, t5a); \
\
    t6 = vec_mul(t6, vec_splats(181)); \
    t5 = vec_mul(t5, vec_splats(181)); \
    t6 = vec_add(t6, vec_splats(128)); \
    t5 = vec_add(t5, vec_splats(128)); \
\
    t6 = vec_sra(t6, vec_splat_u32(8)); \
    t5 = vec_sra(t5, vec_splat_u32(8)); \
\
    i16x8 t6t5 = vec_packs(t6, t5); \
\
    c74 = vec_subs(c03, t7t4); \
    c65 = vec_subs(c12, t6t5); \
    c03 = vec_adds(c03, t7t4); \
    c12 = vec_adds(c12, t6t5);

#define UNPACK_4_I16_I32(t0, t1, t2, t3) \
    t0 = i16h_to_i32(t0##t1); \
    t1 = i16l_to_i32(t0##t1); \
    t2 = i16h_to_i32(t2##t3); \
    t3 = i16l_to_i32(t2##t3);

#define UNPACK_PAIR_I16_I32(hi, lo, v) \
    hi = i16h_to_i32(v); \
    lo = i16l_to_i32(v);
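/* The *_in variants of the 8-point transforms leave the result unpacked
 * as i32 hi/lo halves for the next pass; the *_out variants pack it
 * into saturated i16 row pairs, reordered for the add-to-dst stage. */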
#define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
{ \
    i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
    UNPACK_4_I16_I32(c0, c3, c1, c2) \
    UNPACK_4_I16_I32(c7, c4, c6, c5) \
}

#define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    i16x8 c03, c12, c74, c65; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
    c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
    c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
    c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
}

#define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
    dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
}

#define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    i16x8 c03h, c12h, c74h, c65h; \
    i16x8 c03l, c12l, c74l, c65l; \
    { \
        IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
    } \
    { \
        IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
    } \
    c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
    c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
    c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
    c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
    c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
    c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
    c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
    c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
}

#define IDENTITY_8(c01, c23, c45, c67) \
{ \
    c01 = vec_adds(c01, c01); \
    c23 = vec_adds(c23, c23); \
    c45 = vec_adds(c45, c45); \
    c67 = vec_adds(c67, c67); \
}

#define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    IDENTITY_8(c01, c23, c45, c67) \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    IDENTITY_8(c01, c23, c45, c67)

#define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
    UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
    UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
    UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
    UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
    UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
    UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
    UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
    UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
}

#define PACK_4(c0, c1, c2, c3, \
               c0h, c1h, c2h, c3h, \
               c0l, c1l, c2l, c3l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
}

#define DECLARE_PACK_4(c0, c1, c2, c3, \
                       c0h, c1h, c2h, c3h, \
                       c0l, c1l, c2l, c3l) \
    i16x8 c0, c1, c2, c3; \
    PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);

#define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
               c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
    c4 = vec_packs(c4h, c4l); \
    c5 = vec_packs(c5h, c5l); \
    c6 = vec_packs(c6h, c6l); \
    c7 = vec_packs(c7h, c7l); \
}
#define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
}

#define DECLARE_SPLAT_I32(val) \
    i32x4 v##val = vec_splats(val);

#define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
    i32x4 ca##va = vec_mul(ca, va); \
    i32x4 cb##vb = vec_mul(cb, vb); \
    i32x4 ca##vb = vec_mul(ca, vb); \
    i32x4 cb##va = vec_mul(cb, va);

#define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    r0 = vec_adds(ca##va, cb##vb); \
    r1 = vec_subs(ca##vb, cb##va);

#define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    i32x4 r0, r1; \
    ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)

#define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
    a = vec_adds(a, rnd); \
    b = vec_adds(b, rnd); \
    c = vec_adds(c, rnd); \
    d = vec_adds(d, rnd); \
    a = vec_sra(a, shift); \
    b = vec_sra(b, shift); \
    c = vec_sra(c, shift); \
    d = vec_sra(d, shift);

#define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                     o0, o1, o2, o3, o4, o5, o6, o7) \
{ \
    DECLARE_SPLAT_I32(4076) \
    DECLARE_SPLAT_I32(401) \
\
    DECLARE_SPLAT_I32(3612) \
    DECLARE_SPLAT_I32(1931) \
\
    DECLARE_SPLAT_I32(2598) \
    DECLARE_SPLAT_I32(3166) \
\
    DECLARE_SPLAT_I32(1189) \
    DECLARE_SPLAT_I32(3920) \
\
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
\
    DECLARE_SPLAT_I32(2048) \
    u32x4 v12 = vec_splat_u32(12); \
\
    DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
    DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
    DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
    DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
\
    DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
    DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
    DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
    DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
\
    SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    i32x4 t0 = vec_add(t0a, t4a); \
    i32x4 t1 = vec_add(t1a, t5a); \
    i32x4 t2 = vec_add(t2a, t6a); \
    i32x4 t3 = vec_add(t3a, t7a); \
    i32x4 t4 = vec_sub(t0a, t4a); \
    i32x4 t5 = vec_sub(t1a, t5a); \
    i32x4 t6 = vec_sub(t2a, t6a); \
    i32x4 t7 = vec_sub(t3a, t7a); \
\
    i16x8 t0t1 = vec_packs(t0, t1); \
    i16x8 t2t3 = vec_packs(t2, t3); \
    i16x8 t4t5 = vec_packs(t4, t5); \
    i16x8 t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t4, t5, t6, t7) \
    UNPACK_4_I16_I32(t0, t1, t2, t3) \
\
    DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
\
    ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
    ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
\
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    o0 = vec_add(t0, t2); \
    o1 = vec_add(t4a, t6a); \
    o7 = vec_add(t1, t3); \
    o6 = vec_add(t5a, t7a); \
    t2 = vec_sub(t0, t2); \
    t3 = vec_sub(t1, t3); \
    t6 = vec_sub(t4a, t6a); \
    t7 = vec_sub(t5a, t7a); \
\
    i16x8 o7##o1 = vec_packs(o7, o1); \
    i16x8 o0##o6 = vec_packs(o0, o6); \
    t2t3 = vec_packs(t2, t3); \
    t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t2, t3, t6, t7) \
    UNPACK_4_I16_I32(o7, o1, o0, o6) \
\
    o7 = -o7; \
    o1 = -o1; \
\
    o3 = vec_add(t2, t3); \
    o4 = vec_sub(t2, t3); \
    o5 = vec_sub(t6, t7); \
    o2 = vec_add(t6, t7); \
\
    i32x4 v181 = vec_splats(181); \
    i32x4 v128 = vec_splats(128); \
    u32x4 v8 = vec_splat_u32(8); \
\
    o2 = vec_mul(o2, v181); \
    o3 = vec_mul(o3, v181); \
    o4 = vec_mul(o4, v181); \
    o5 = vec_mul(o5, v181); \
\
    SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
\
    o3 = -o3; \
    o5 = -o5; \
}
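/* The adst_8/flipadst_8 wrappers pack the i32 results to i16 with
 * saturation and (in the *_in forms) unpack them again, which doubles
 * as the 16-bit clipping between transform stages; the flipadst forms
 * simply write the outputs in reversed order. */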
#define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                     c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
}

#define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
    }

    LOAD_SCALE_COEFF_4x8(coeff, v)
    dct_4_in(c0, c1, c2, c3, c01, c23)
    dct_4_in(c4, c5, c6, c7, c45, c67)
    memset(coeff, 0, sizeof(*coeff) * 4 * 8);
    TRANSPOSE4_I32(c0, c1, c2, c3);
    TRANSPOSE4_I32(c4, c5, c6, c7);
    dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
    APPLY_COEFF_4(a, b, cc, d, c01, c23)
    APPLY_COEFF_4(e, f, g, hh, c45, c67)
    STORE_4(dst, stride, a, b, cc, d)
    STORE_4(dst + 4 * stride, stride, e, f, g, hh)
}
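/* Generates the remaining 4x8 transform combinations. The coefficients
 * are pre-multiplied by 2896/4096 ~= 1/sqrt(2) (vec_mradds with 2896*8),
 * the extra scaling AV1 applies to 2:1 rectangular blocks. */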
#define inv_txfm_fn4x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_4x8(coeff, v) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    type1##_4_in(c4, c5, c6, c7, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
    TRANSPOSE4_I32(c0, c1, c2, c3); \
    TRANSPOSE4_I32(c4, c5, c6, c7); \
    type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    APPLY_COEFF_4(e, f, g, h, c45, c67) \
    STORE_4(dst, stride, a, b, c, d) \
    STORE_4(dst + 4 * stride, stride, e, f, g, h) \
}

inv_txfm_fn4x8(adst, dct )
inv_txfm_fn4x8(dct, adst )
inv_txfm_fn4x8(dct, flipadst)
inv_txfm_fn4x8(flipadst, dct )
inv_txfm_fn4x8(adst, flipadst)
inv_txfm_fn4x8(flipadst, adst )
inv_txfm_fn4x8(identity, dct )
inv_txfm_fn4x8(dct, identity)
inv_txfm_fn4x8(identity, flipadst)
inv_txfm_fn4x8(flipadst, identity)
inv_txfm_fn4x8(identity, adst )
inv_txfm_fn4x8(adst, identity)
inv_txfm_fn4x8(identity, identity)
inv_txfm_fn4x8(adst, adst )
inv_txfm_fn4x8(flipadst, flipadst)

void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
    }

    LOAD_SCALE_COEFF_8x4(coeff, v)
    dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
    memset(coeff, 0, sizeof(*coeff) * 8 * 4);
    TRANSPOSE4_I32(c0, c1, c2, c3)
    TRANSPOSE4_I32(c4, c5, c6, c7)
    dct_4_out(c0, c1, c2, c3, c01, c23)
    dct_4_out(c4, c5, c6, c7, c45, c67)
    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)
    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);
    APPLY_COEFF_8x4(ae, bf, c04, c15)
    APPLY_COEFF_8x4(cg, dh, c26, c37)
    STORE_8(dst, stride, ae, bf, cg, dh)
}

#define inv_txfm_fn8x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_8x4(coeff, v) \
    type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    TRANSPOSE4_I32(c4, c5, c6, c7) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    type2##_4_out(c4, c5, c6, c7, c45, c67) \
    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
    APPLY_COEFF_8x4(ae, bf, c04, c15) \
    APPLY_COEFF_8x4(cg, dh, c26, c37) \
    STORE_8(dst, stride, ae, bf, cg, dh) \
}

inv_txfm_fn8x4(adst, dct )
inv_txfm_fn8x4(dct, adst )
inv_txfm_fn8x4(dct, flipadst)
inv_txfm_fn8x4(flipadst, dct )
inv_txfm_fn8x4(adst, flipadst)
inv_txfm_fn8x4(flipadst, adst )
inv_txfm_fn8x4(identity, dct )
inv_txfm_fn8x4(dct, identity)
inv_txfm_fn8x4(identity, flipadst)
inv_txfm_fn8x4(flipadst, identity)
inv_txfm_fn8x4(identity, adst )
inv_txfm_fn8x4(adst, identity)
inv_txfm_fn8x4(identity, identity)
inv_txfm_fn8x4(adst, adst )
inv_txfm_fn8x4(flipadst, flipadst)
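/* 8x8: both passes run as 8x2 transforms over the i32 hi/lo halves of
 * each row; between passes the intermediates are rounded by
 * (x + 1) >> 1 and transposed with TRANSPOSE8_I32. */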
void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
    }

    LOAD_COEFF_8x8(coeff)
    dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
               c0, c1, c2, c3, c4, c5, c6, c7)
    memset(coeff, 0, sizeof(*coeff) * 8 * 8);
    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)
    dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
                c0, c1, c2, c3, c4, c5, c6, c7)
    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
    APPLY_COEFF_8x4(a, b, c0, c1)
    APPLY_COEFF_8x4(cc, d, c2, c3)
    APPLY_COEFF_8x4(e, f, c4, c5)
    APPLY_COEFF_8x4(g, hh, c6, c7)
    STORE_8(dst, stride, a, b, cc, d)
    STORE_8(dst + 4 * stride, stride, e, f, g, hh)
}

#define inv_txfm_fn8x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}

inv_txfm_fn8x8(adst, dct )
inv_txfm_fn8x8(dct, adst )
inv_txfm_fn8x8(dct, flipadst)
inv_txfm_fn8x8(flipadst, dct )
inv_txfm_fn8x8(adst, flipadst)
inv_txfm_fn8x8(flipadst, adst )
inv_txfm_fn8x8(dct, identity)
inv_txfm_fn8x8(flipadst, identity)
inv_txfm_fn8x8(adst, identity)
inv_txfm_fn8x8(adst, adst )
inv_txfm_fn8x8(flipadst, flipadst)

// identity + scale is a no op
#define inv_txfm_fn8x8_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                         int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}

inv_txfm_fn8x8_identity(dct )
inv_txfm_fn8x8_identity(flipadst)
inv_txfm_fn8x8_identity(adst )
inv_txfm_fn8x8_identity(identity)

#define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
                     ab, cd, ef, gh) \
{ \
    ab = vec_packs(a, b); \
    cd = vec_packs(c, d); \
    ef = vec_packs(e, f); \
    gh = vec_packs(g, h); \
    UNPACK_PAIR_I16_I32(a, b, ab) \
    UNPACK_PAIR_I16_I32(c, d, cd) \
    UNPACK_PAIR_I16_I32(e, f, ef) \
    UNPACK_PAIR_I16_I32(g, h, gh) \
}

#define MUL_4_INPLACE(a, b, c, d, v) \
    a = vec_mul(a, v); \
    b = vec_mul(b, v); \
    c = vec_mul(c, v); \
    d = vec_mul(d, v);

#define IDENTITY_16_V(v) \
{ \
    i16x8 v_ = vec_adds(v, v); \
    v = vec_mradds(v, v1697_16, v_); \
}

#define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                          c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
    IDENTITY_16_V(c00c01) \
    IDENTITY_16_V(c02c03) \
    IDENTITY_16_V(c04c05) \
    IDENTITY_16_V(c06c07) \
    IDENTITY_16_V(c08c09) \
    IDENTITY_16_V(c10c11) \
    IDENTITY_16_V(c12c13) \
    IDENTITY_16_V(c14c15) \
}

#define IDENTITY_16_4_I32(a, b, c, d) \
{ \
    i32x4 a2 = vec_add(a, a); \
    i32x4 b2 = vec_add(b, b); \
    i32x4 c2 = vec_add(c, c); \
    i32x4 d2 = vec_add(d, d); \
    MUL_4_INPLACE(a, b, c, d, v1697) \
    SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
    a = vec_add(a2, a); \
    b = vec_add(b2, b); \
    c = vec_add(c2, c); \
    d = vec_add(d2, d); \
}

#define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, \
                       c08c09, c10c11, c12c13, c14c15) \
{ \
    DECLARE_SPLAT_I32(1697) \
    DECLARE_SPLAT_I32(1024) \
    IDENTITY_16_4_I32(c00, c01, c02, c03) \
    IDENTITY_16_4_I32(c04, c05, c06, c07) \
    IDENTITY_16_4_I32(c08, c09, c10, c11) \
    IDENTITY_16_4_I32(c12, c13, c14, c15) \
}

#define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, \
                        c08c09, c10c11, c12c13, c14c15) \
{ \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
    IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                      c08c09, c10c11, c12c13, c14c15) \
}
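/* 16-point inverse DCT: reuses IDCT_8_INNER for the even half and adds
 * the odd butterflies, clipping intermediates to 16 bits via the
 * pack/unpack round-trip in CLIP16_I32_8 to stay within the range the
 * transform stages expect. */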
#define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
                      c08, c09, c10, c11, c12, c13, c14, c15, \
                      c00c03, c01c02, c07c04, c06c05, \
                      c08c11, c09c10, c14c13, c15c12) \
    IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
                 c00c03, c01c02, c07c04, c06c05) \
    DECLARE_SPLAT_I32(128) \
    DECLARE_SPLAT_I32(181) \
    DECLARE_SPLAT_I32(401) \
    DECLARE_SPLAT_I32(4076) \
    DECLARE_SPLAT_I32(3166) \
    DECLARE_SPLAT_I32(2598) \
    DECLARE_SPLAT_I32(1931) \
    DECLARE_SPLAT_I32(3612) \
    DECLARE_SPLAT_I32(3920) \
    DECLARE_SPLAT_I32(1189) \
    DECLARE_SPLAT_I32(1567) \
    DECLARE_SPLAT_I32(3784) \
\
    DECLARE_MUL_PAIR_I32(c01, c15, v401, v4076) \
    DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
    DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
    DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
\
    DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076, v401) \
    DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
    DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
    DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
\
    SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
    SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
\
    CLIP16_I32_8(t15a, t08a, t14a, t09a, \
                 t13a, t10a, t12a, t11a, \
                 c08c11, c09c10, c14c13, c15c12) \
    DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
    DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
    DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
    DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
\
    CLIP16_I32_8(t08, t09, t11, t10, \
                 t12, t13, t15, t14, \
                 c08c11, c09c10, c14c13, c15c12) \
\
    DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
    DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
\
    ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
    ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
    t10a = -t10a; \
\
    SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
\
    ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
    ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
    ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
    ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
\
    CLIP16_I32_8(t08a, t11a, t09, t10, \
                 t15a, t12a, t14, t13, \
                 c08c11, c09c10, c14c13, c15c12) \
    ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
    ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
\
    MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
    SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
\
    DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
                   t15a, t14, t08a, t09, \
                   t12, t13a, t11, t10a) \
\
    c15c12 = vec_subs(c00c03, t15at12); \
    c14c13 = vec_subs(c01c02, t14t13a); \
    c08c11 = vec_subs(c07c04, t08at11); \
    c09c10 = vec_subs(c06c05, t09t10a); \
    c00c03 = vec_adds(c00c03, t15at12); \
    c01c02 = vec_adds(c01c02, t14t13a); \
    c07c04 = vec_adds(c07c04, t08at11); \
    c06c05 = vec_adds(c06c05, t09t10a);

#define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
\
    i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
    c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
    c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
    c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
    c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
    c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
    c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
    c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
    c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12);

#define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                  c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
\
    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
    UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
    UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
    UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
    UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
    UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
    UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
    UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
    UNPACK_PAIR_I16_I32(c15, c12, c15c12)

#define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
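/* Packs sixteen i32x4 vectors (four groups of four) into eight i16x8
 * vectors, interleaving the groups into the layout APPLY_COEFF_16x4
 * expects when adding the result to 16-pixel destination rows. */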
#define PACK_4x4(c00, c01, c02, c03, \
                 c04, c05, c06, c07, \
                 c08, c09, c10, c11, \
                 c12, c13, c14, c15, \
                 c00c01, c02c03, c04c05, c06c07, \
                 c08c09, c10c11, c12c13, c14c15) \
{ \
    c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
    c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
    c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
    c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
}

#define dct_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15) \
{ \
    IDCT_4_INNER(c00, c01, c02, c03) \
    IDCT_4_INNER(c04, c05, c06, c07) \
    IDCT_4_INNER(c08, c09, c10, c11) \
    IDCT_4_INNER(c12, c13, c14, c15) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}

#define IDENTITY_4_I32(a, b, c, d) \
{ \
    DECLARE_SPLAT_I32(5793) \
    DECLARE_SPLAT_I32(2048) \
    MUL_4_INPLACE(a, b, c, d, v5793) \
    SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
}

#define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
{ \
    IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
    IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
    IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
    IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
}

#define identity_4x4_out(c00, c01, c02, c03, \
                         c04, c05, c06, c07, \
                         c08, c09, c10, c11, \
                         c12, c13, c14, c15, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
    IDENTITY_4(c00c01, c02c03) \
    IDENTITY_4(c04c05, c06c07) \
    IDENTITY_4(c08c09, c10c11) \
    IDENTITY_4(c12c13, c14c15) \
}

#define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)

#define adst_4x4_out(c00, c01, c02, c03, \
                     c04, c05, c06, c07, \
                     c08, c09, c10, c11, \
                     c12, c13, c14, c15, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
    ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
    ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
    ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}

#define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)

#define flipadst_4x4_out(c00, c01, c02, c03, \
                         c04, c05, c06, c07, \
                         c08, c09, c10, c11, \
                         c12, c13, c14, c15, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
    ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
    ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
    ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}
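/* 16-point inverse ADST in 12-bit fixed point. The constant list holds
 * the 4096-scaled sines/cosines; intermediates are clipped to 16 bits
 * with CLIP16_I32_8 between stages, and alternating outputs are
 * negated at the end, matching the reference transform's sign
 * conventions. */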
#define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
                      c08, c09, c10, c11, c12, c13, c14, c15, \
                      o00, o01, o02, o03, o04, o05, o06, o07, \
                      o08, o09, o10, o11, o12, o13, o14, o15, \
                      c00c01, c02c03, c04c05, c06c07) \
    DECLARE_SPLAT_I32(2048); \
    u32x4 v12 = vec_splat_u32(12); \
    DECLARE_SPLAT_I32(4091) \
    DECLARE_SPLAT_I32(201) \
    DECLARE_SPLAT_I32(3973) \
    DECLARE_SPLAT_I32(995) \
    DECLARE_SPLAT_I32(3703) \
    DECLARE_SPLAT_I32(1751) \
    DECLARE_SPLAT_I32(3290) \
    DECLARE_SPLAT_I32(2440) \
    DECLARE_SPLAT_I32(2751) \
    DECLARE_SPLAT_I32(3035) \
    DECLARE_SPLAT_I32(2106) \
    DECLARE_SPLAT_I32(3513) \
    DECLARE_SPLAT_I32(1380) \
    DECLARE_SPLAT_I32(3857) \
    DECLARE_SPLAT_I32(601) \
    DECLARE_SPLAT_I32(4052) \
\
    DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
    DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
    DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
    DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
    DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
    DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
    DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
    DECLARE_MUL_PAIR_I32(c01, c14, v601, v4052) \
\
    DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\
    DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
    DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
    DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
    DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
    DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
    DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
    DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14, v601, v4052) \
\
    SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
    SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
    SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
\
    DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
    DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
    DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
    DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
    DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
    DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
    DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
    DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
\
    CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
                 c00c01, c02c03, c04c05, c06c07); \
    CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
                 c00c01, c02c03, c04c05, c06c07); \
\
    DECLARE_SPLAT_I32(4017) \
    DECLARE_SPLAT_I32(799) \
    DECLARE_SPLAT_I32(2276) \
    DECLARE_SPLAT_I32(3406) \
\
    DECLARE_MUL_PAIR_I32(t08a, t09a, v4017, v799); \
    DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
    DECLARE_MUL_PAIR_I32(t13a, t12a, v799, v4017); \
    DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
\
    ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017, v799); \
    ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
    ADD_SUB_PAIR(t13, t12, t13a, t12a, v799, v4017); \
    ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
\
    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
    SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
\
    ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
    ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
    ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
    ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
    ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
    ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
    ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
    ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
\
    CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
                 c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
                 c00c01, c02c03, c04c05, c06c07) \
\
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
\
    DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
    DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
\
    ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
    ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
    ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
    ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
\
    SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
    SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
\
    ADD_SUB_PAIR(o00, t02a, t00, t02,,) \
    ADD_SUB_PAIR(o15, t03a, t01, t03,,) \
    ADD_SUB_PAIR(o03, t06, t04a, t06a,,) \
    ADD_SUB_PAIR(o12, t07, t05a, t07a,,) \
    ADD_SUB_PAIR(o01, t10, t08a, t10a,,) \
    ADD_SUB_PAIR(o14, t11, t09a, t11a,,) \
    ADD_SUB_PAIR(o02, t14a, t12, t14,,) \
    ADD_SUB_PAIR(o13, t15a, t13, t15,,) \
\
    CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
                 c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
                 c00c01, c02c03, c04c05, c06c07) \
\
    DECLARE_SPLAT_I32(181) \
    DECLARE_SPLAT_I32(128) \
    u32x4 v8 = vec_splat_u32(8); \
\
    ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
    ADD_SUB_PAIR(o04, o11, t06, t07,,) \
    ADD_SUB_PAIR(o06, o09, t10, t11,,) \
    ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
\
    MUL_4_INPLACE(o07, o08, o04, o11, v181) \
    MUL_4_INPLACE(o06, o09, o05, o10, v181) \
\
    SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
    SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
\
    o01 = -o01; \
    o03 = -o03; \
    o05 = -o05; \
    o07 = -o07; \
    o09 = -o09; \
    o11 = -o11; \
    o13 = -o13; \
    o15 = -o15;

#define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                    c08, c09, c10, c11, c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}

#define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}
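/* 4x16: the first pass runs four 4-point DCTs, one per 4x4 coefficient
 * group; the intermediates are rounded by (x + 1) >> 1 and transposed,
 * and the second pass is a single 16-point DCT. */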
void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob
                                               HIGHBD_DECL_SUFFIX)
{
    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
    }

    LOAD_COEFF_4x16(coeff)
    dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
    memset(coeff, 0, sizeof(*coeff) * 4 * 16);
    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)
    dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)
    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);
    STORE_4(dst, stride, l00, l01, l02, l03);
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07);
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11);
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
}

#define inv_txfm_fn4x16(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4x16(coeff) \
    type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
    type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
    STORE_4(dst, stride, l00, l01, l02, l03); \
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); \
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); \
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
}

inv_txfm_fn4x16(adst, dct )
inv_txfm_fn4x16(dct, adst )
inv_txfm_fn4x16(dct, flipadst)
inv_txfm_fn4x16(flipadst, dct )
inv_txfm_fn4x16(adst, flipadst)
inv_txfm_fn4x16(flipadst, adst )
inv_txfm_fn4x16(identity, dct )
inv_txfm_fn4x16(dct, identity)
inv_txfm_fn4x16(identity, flipadst)
inv_txfm_fn4x16(flipadst, identity)
inv_txfm_fn4x16(identity, adst )
inv_txfm_fn4x16(adst, identity)
inv_txfm_fn4x16(identity, identity)
inv_txfm_fn4x16(adst, adst )
inv_txfm_fn4x16(flipadst, flipadst)
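/* 16x4: the 16-point DCT runs first; after the (x + 1) >> 1 rounding
 * and four 4x4 transposes, the second pass is four 4-point DCTs whose
 * outputs are repacked into 16-pixel destination rows. */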
void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
    }

    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)
    dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
              c08, c09, c10, c11, c12, c13, c14, c15,
              c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
    memset(coeff, 0, sizeof(*coeff) * 16 * 4);
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))
    TRANSPOSE4_I32(c00, c01, c02, c03);
    TRANSPOSE4_I32(c04, c05, c06, c07);
    TRANSPOSE4_I32(c08, c09, c10, c11);
    TRANSPOSE4_I32(c12, c13, c14, c15);
    dct_4x4_out(c00, c01, c02, c03, c04, c05, c06, c07,
                c08, c09, c10, c11, c12, c13, c14, c15,
                c00c01, c02c03, c04c05, c06c07,
                c08c09, c10c11, c12c13, c14c15)
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)
    APPLY_COEFF_16x4(l0, l1, l2, l3,
                     c00c01, c02c03, c04c05, c06c07,
                     c08c09, c10c11, c12c13, c14c15)
    STORE_16(dst, stride, l0, l1, l2, l3)
}

#define inv_txfm_fn16x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                  c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4(adst, dct )
inv_txfm_fn16x4(dct, adst )
inv_txfm_fn16x4(dct, flipadst)
inv_txfm_fn16x4(flipadst, dct )
inv_txfm_fn16x4(adst, flipadst)
inv_txfm_fn16x4(flipadst, adst )
inv_txfm_fn16x4(dct, identity)
inv_txfm_fn16x4(flipadst, identity)
inv_txfm_fn16x4(adst, identity)
inv_txfm_fn16x4(identity, identity)
inv_txfm_fn16x4(adst, adst )
inv_txfm_fn16x4(flipadst, flipadst)
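/* Identity-first 16x4 variants: the identity_16 scaling (~2*sqrt(2))
 * can push intermediates past the i16 range, so the result is
 * explicitly clipped with CLIP16_I32_8 before the transpose, unlike
 * the generic inv_txfm_fn16x4 path. */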
#define inv_txfm_fn16x4_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4_identity(dct)
inv_txfm_fn16x4_identity(adst)
inv_txfm_fn16x4_identity(flipadst)

#endif // BITDEPTH