1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker *
4*77c1e3ccSAndroid Build Coastguard Worker * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/cdef_block.h"
19*77c1e3ccSAndroid Build Coastguard Worker
20*77c1e3ccSAndroid Build Coastguard Worker /* partial A is a 16-bit vector of the form:
21*77c1e3ccSAndroid Build Coastguard Worker [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
22*77c1e3ccSAndroid Build Coastguard Worker [0 y1 y2 y3 y4 y5 y6 y7].
23*77c1e3ccSAndroid Build Coastguard Worker This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
24*77c1e3ccSAndroid Build Coastguard Worker (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
25*77c1e3ccSAndroid Build Coastguard Worker and const2. */
fold_mul_and_sum(v128 partiala,v128 partialb,v128 const1,v128 const2)26*77c1e3ccSAndroid Build Coastguard Worker static inline v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
27*77c1e3ccSAndroid Build Coastguard Worker v128 const2) {
28*77c1e3ccSAndroid Build Coastguard Worker v128 tmp;
29*77c1e3ccSAndroid Build Coastguard Worker /* Reverse partial B. */
30*77c1e3ccSAndroid Build Coastguard Worker partialb = v128_shuffle_8(
31*77c1e3ccSAndroid Build Coastguard Worker partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
32*77c1e3ccSAndroid Build Coastguard Worker /* Interleave the x and y values of identical indices and pair x8 with 0. */
33*77c1e3ccSAndroid Build Coastguard Worker tmp = partiala;
34*77c1e3ccSAndroid Build Coastguard Worker partiala = v128_ziplo_16(partialb, partiala);
35*77c1e3ccSAndroid Build Coastguard Worker partialb = v128_ziphi_16(partialb, tmp);
36*77c1e3ccSAndroid Build Coastguard Worker /* Square and add the corresponding x and y values. */
37*77c1e3ccSAndroid Build Coastguard Worker partiala = v128_madd_s16(partiala, partiala);
38*77c1e3ccSAndroid Build Coastguard Worker partialb = v128_madd_s16(partialb, partialb);
39*77c1e3ccSAndroid Build Coastguard Worker /* Multiply by constant. */
40*77c1e3ccSAndroid Build Coastguard Worker partiala = v128_mullo_s32(partiala, const1);
41*77c1e3ccSAndroid Build Coastguard Worker partialb = v128_mullo_s32(partialb, const2);
42*77c1e3ccSAndroid Build Coastguard Worker /* Sum all results. */
43*77c1e3ccSAndroid Build Coastguard Worker partiala = v128_add_32(partiala, partialb);
44*77c1e3ccSAndroid Build Coastguard Worker return partiala;
45*77c1e3ccSAndroid Build Coastguard Worker }
46*77c1e3ccSAndroid Build Coastguard Worker
/* Horizontal sums of four vectors at once.
 *
 * The two zip stages rearrange the 32-bit lanes of x0..x3 like a 4x4
 * transpose; adding the four rearranged vectors then leaves, in each 32-bit
 * lane of the result, the sum of all four lanes of one input vector. */
static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  /* Stage 1: interleave 32-bit lanes of (x0, x1) and of (x2, x3). */
  const v128 lo01 = v128_ziplo_32(x1, x0);
  const v128 lo23 = v128_ziplo_32(x3, x2);
  const v128 hi01 = v128_ziphi_32(x1, x0);
  const v128 hi23 = v128_ziphi_32(x3, x2);

  /* Stage 2: interleave 64-bit halves to finish the rearrangement. */
  const v128 col0 = v128_ziplo_64(lo23, lo01);
  const v128 col1 = v128_ziphi_64(lo23, lo01);
  const v128 col2 = v128_ziplo_64(hi23, hi01);
  const v128 col3 = v128_ziphi_64(hi23, hi01);

  const v128 sum01 = v128_add_32(col0, col1);
  const v128 sum23 = v128_add_32(col2, col3);
  return v128_add_32(sum01, sum23);
}
59*77c1e3ccSAndroid Build Coastguard Worker
60*77c1e3ccSAndroid Build Coastguard Worker /* Computes cost for directions 0, 5, 6 and 7. We can call this function again
61*77c1e3ccSAndroid Build Coastguard Worker to compute the remaining directions. */
compute_directions(v128 lines[8],int32_t tmp_cost1[4])62*77c1e3ccSAndroid Build Coastguard Worker static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
63*77c1e3ccSAndroid Build Coastguard Worker v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
64*77c1e3ccSAndroid Build Coastguard Worker v128 partial6;
65*77c1e3ccSAndroid Build Coastguard Worker v128 tmp;
66*77c1e3ccSAndroid Build Coastguard Worker /* Partial sums for lines 0 and 1. */
67*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_shl_n_byte(lines[0], 14);
68*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_shr_n_byte(lines[0], 2);
69*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
70*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
71*77c1e3ccSAndroid Build Coastguard Worker tmp = v128_add_16(lines[0], lines[1]);
72*77c1e3ccSAndroid Build Coastguard Worker partial5a = v128_shl_n_byte(tmp, 10);
73*77c1e3ccSAndroid Build Coastguard Worker partial5b = v128_shr_n_byte(tmp, 6);
74*77c1e3ccSAndroid Build Coastguard Worker partial7a = v128_shl_n_byte(tmp, 4);
75*77c1e3ccSAndroid Build Coastguard Worker partial7b = v128_shr_n_byte(tmp, 12);
76*77c1e3ccSAndroid Build Coastguard Worker partial6 = tmp;
77*77c1e3ccSAndroid Build Coastguard Worker
78*77c1e3ccSAndroid Build Coastguard Worker /* Partial sums for lines 2 and 3. */
79*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
80*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
81*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
82*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
83*77c1e3ccSAndroid Build Coastguard Worker tmp = v128_add_16(lines[2], lines[3]);
84*77c1e3ccSAndroid Build Coastguard Worker partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
85*77c1e3ccSAndroid Build Coastguard Worker partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
86*77c1e3ccSAndroid Build Coastguard Worker partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
87*77c1e3ccSAndroid Build Coastguard Worker partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
88*77c1e3ccSAndroid Build Coastguard Worker partial6 = v128_add_16(partial6, tmp);
89*77c1e3ccSAndroid Build Coastguard Worker
90*77c1e3ccSAndroid Build Coastguard Worker /* Partial sums for lines 4 and 5. */
91*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
92*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
93*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
94*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
95*77c1e3ccSAndroid Build Coastguard Worker tmp = v128_add_16(lines[4], lines[5]);
96*77c1e3ccSAndroid Build Coastguard Worker partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
97*77c1e3ccSAndroid Build Coastguard Worker partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
98*77c1e3ccSAndroid Build Coastguard Worker partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
99*77c1e3ccSAndroid Build Coastguard Worker partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
100*77c1e3ccSAndroid Build Coastguard Worker partial6 = v128_add_16(partial6, tmp);
101*77c1e3ccSAndroid Build Coastguard Worker
102*77c1e3ccSAndroid Build Coastguard Worker /* Partial sums for lines 6 and 7. */
103*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
104*77c1e3ccSAndroid Build Coastguard Worker partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
105*77c1e3ccSAndroid Build Coastguard Worker partial4a = v128_add_16(partial4a, lines[7]);
106*77c1e3ccSAndroid Build Coastguard Worker tmp = v128_add_16(lines[6], lines[7]);
107*77c1e3ccSAndroid Build Coastguard Worker partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
108*77c1e3ccSAndroid Build Coastguard Worker partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
109*77c1e3ccSAndroid Build Coastguard Worker partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
110*77c1e3ccSAndroid Build Coastguard Worker partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
111*77c1e3ccSAndroid Build Coastguard Worker partial6 = v128_add_16(partial6, tmp);
112*77c1e3ccSAndroid Build Coastguard Worker
113*77c1e3ccSAndroid Build Coastguard Worker /* Compute costs in terms of partial sums. */
114*77c1e3ccSAndroid Build Coastguard Worker partial4a =
115*77c1e3ccSAndroid Build Coastguard Worker fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
116*77c1e3ccSAndroid Build Coastguard Worker v128_from_32(105, 120, 140, 168));
117*77c1e3ccSAndroid Build Coastguard Worker partial7a =
118*77c1e3ccSAndroid Build Coastguard Worker fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
119*77c1e3ccSAndroid Build Coastguard Worker v128_from_32(105, 105, 105, 140));
120*77c1e3ccSAndroid Build Coastguard Worker partial5a =
121*77c1e3ccSAndroid Build Coastguard Worker fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
122*77c1e3ccSAndroid Build Coastguard Worker v128_from_32(105, 105, 105, 140));
123*77c1e3ccSAndroid Build Coastguard Worker partial6 = v128_madd_s16(partial6, partial6);
124*77c1e3ccSAndroid Build Coastguard Worker partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
125*77c1e3ccSAndroid Build Coastguard Worker
126*77c1e3ccSAndroid Build Coastguard Worker partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
127*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(tmp_cost1, partial4a);
128*77c1e3ccSAndroid Build Coastguard Worker return partial4a;
129*77c1e3ccSAndroid Build Coastguard Worker }
130*77c1e3ccSAndroid Build Coastguard Worker
131*77c1e3ccSAndroid Build Coastguard Worker /* transpose and reverse the order of the lines -- equivalent to a 90-degree
132*77c1e3ccSAndroid Build Coastguard Worker counter-clockwise rotation of the pixels. */
array_reverse_transpose_8x8(v128 * in,v128 * res)133*77c1e3ccSAndroid Build Coastguard Worker static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) {
134*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
135*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
136*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
137*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
138*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
139*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
140*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
141*77c1e3ccSAndroid Build Coastguard Worker const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
142*77c1e3ccSAndroid Build Coastguard Worker
143*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
144*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
145*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
146*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
147*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
148*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
149*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
150*77c1e3ccSAndroid Build Coastguard Worker const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
151*77c1e3ccSAndroid Build Coastguard Worker
152*77c1e3ccSAndroid Build Coastguard Worker res[7] = v128_ziplo_64(tr1_1, tr1_0);
153*77c1e3ccSAndroid Build Coastguard Worker res[6] = v128_ziphi_64(tr1_1, tr1_0);
154*77c1e3ccSAndroid Build Coastguard Worker res[5] = v128_ziplo_64(tr1_3, tr1_2);
155*77c1e3ccSAndroid Build Coastguard Worker res[4] = v128_ziphi_64(tr1_3, tr1_2);
156*77c1e3ccSAndroid Build Coastguard Worker res[3] = v128_ziplo_64(tr1_5, tr1_4);
157*77c1e3ccSAndroid Build Coastguard Worker res[2] = v128_ziphi_64(tr1_5, tr1_4);
158*77c1e3ccSAndroid Build Coastguard Worker res[1] = v128_ziplo_64(tr1_7, tr1_6);
159*77c1e3ccSAndroid Build Coastguard Worker res[0] = v128_ziphi_64(tr1_7, tr1_6);
160*77c1e3ccSAndroid Build Coastguard Worker }
161*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_find_dir)162*77c1e3ccSAndroid Build Coastguard Worker int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
163*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift) {
164*77c1e3ccSAndroid Build Coastguard Worker int i;
165*77c1e3ccSAndroid Build Coastguard Worker int32_t cost[8];
166*77c1e3ccSAndroid Build Coastguard Worker int32_t best_cost = 0;
167*77c1e3ccSAndroid Build Coastguard Worker int best_dir = 0;
168*77c1e3ccSAndroid Build Coastguard Worker v128 lines[8];
169*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < 8; i++) {
170*77c1e3ccSAndroid Build Coastguard Worker lines[i] = v128_load_unaligned(&img[i * stride]);
171*77c1e3ccSAndroid Build Coastguard Worker lines[i] =
172*77c1e3ccSAndroid Build Coastguard Worker v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
173*77c1e3ccSAndroid Build Coastguard Worker }
174*77c1e3ccSAndroid Build Coastguard Worker
175*77c1e3ccSAndroid Build Coastguard Worker /* Compute "mostly vertical" directions. */
176*77c1e3ccSAndroid Build Coastguard Worker v128 dir47 = compute_directions(lines, cost + 4);
177*77c1e3ccSAndroid Build Coastguard Worker
178*77c1e3ccSAndroid Build Coastguard Worker array_reverse_transpose_8x8(lines, lines);
179*77c1e3ccSAndroid Build Coastguard Worker
180*77c1e3ccSAndroid Build Coastguard Worker /* Compute "mostly horizontal" directions. */
181*77c1e3ccSAndroid Build Coastguard Worker v128 dir03 = compute_directions(lines, cost);
182*77c1e3ccSAndroid Build Coastguard Worker
183*77c1e3ccSAndroid Build Coastguard Worker v128 max = v128_max_s32(dir03, dir47);
184*77c1e3ccSAndroid Build Coastguard Worker max = v128_max_s32(max, v128_align(max, max, 8));
185*77c1e3ccSAndroid Build Coastguard Worker max = v128_max_s32(max, v128_align(max, max, 4));
186*77c1e3ccSAndroid Build Coastguard Worker best_cost = v128_low_u32(max);
187*77c1e3ccSAndroid Build Coastguard Worker v128 t =
188*77c1e3ccSAndroid Build Coastguard Worker v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
189*77c1e3ccSAndroid Build Coastguard Worker best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
190*77c1e3ccSAndroid Build Coastguard Worker best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
191*77c1e3ccSAndroid Build Coastguard Worker
192*77c1e3ccSAndroid Build Coastguard Worker /* Difference between the optimal variance and the variance along the
193*77c1e3ccSAndroid Build Coastguard Worker orthogonal direction. Again, the sum(x^2) terms cancel out. */
194*77c1e3ccSAndroid Build Coastguard Worker *var = best_cost - cost[(best_dir + 4) & 7];
195*77c1e3ccSAndroid Build Coastguard Worker /* We'd normally divide by 840, but dividing by 1024 is close enough
196*77c1e3ccSAndroid Build Coastguard Worker for what we're going to do with this. */
197*77c1e3ccSAndroid Build Coastguard Worker *var >>= 10;
198*77c1e3ccSAndroid Build Coastguard Worker return best_dir;
199*77c1e3ccSAndroid Build Coastguard Worker }
200*77c1e3ccSAndroid Build Coastguard Worker
// Visual Studio 2017, 2019 and 2022 (version 17.10.3) have been observed to
// run out of memory when compiling Win32 builds of this file. As a
// workaround, that target uses a plain `static inline` for the CDEF helpers;
// every other target uses SIMD_INLINE.
#if !defined(_MSC_VER) || !defined(_M_IX86)
#define CDEF_INLINE SIMD_INLINE
#else
#define CDEF_INLINE static inline
#endif
208*77c1e3ccSAndroid Build Coastguard Worker
209*77c1e3ccSAndroid Build Coastguard Worker // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
constrain16(v256 a,v256 b,unsigned int threshold,unsigned int adjdamp)210*77c1e3ccSAndroid Build Coastguard Worker CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
211*77c1e3ccSAndroid Build Coastguard Worker unsigned int adjdamp) {
212*77c1e3ccSAndroid Build Coastguard Worker v256 diff = v256_sub_16(a, b);
213*77c1e3ccSAndroid Build Coastguard Worker const v256 sign = v256_shr_n_s16(diff, 15);
214*77c1e3ccSAndroid Build Coastguard Worker diff = v256_abs_s16(diff);
215*77c1e3ccSAndroid Build Coastguard Worker const v256 s =
216*77c1e3ccSAndroid Build Coastguard Worker v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
217*77c1e3ccSAndroid Build Coastguard Worker return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
218*77c1e3ccSAndroid Build Coastguard Worker }
219*77c1e3ccSAndroid Build Coastguard Worker
get_max_primary(const int is_lowbd,v256 * tap,v256 max,v256 cdef_large_value_mask)220*77c1e3ccSAndroid Build Coastguard Worker SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max,
221*77c1e3ccSAndroid Build Coastguard Worker v256 cdef_large_value_mask) {
222*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
223*77c1e3ccSAndroid Build Coastguard Worker v256 max_u8;
224*77c1e3ccSAndroid Build Coastguard Worker max_u8 = tap[0];
225*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[1]);
226*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[2]);
227*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[3]);
228*77c1e3ccSAndroid Build Coastguard Worker /* The source is 16 bits, however, we only really care about the lower
229*77c1e3ccSAndroid Build Coastguard Worker 8 bits. The upper 8 bits contain the "large" flag. After the final
230*77c1e3ccSAndroid Build Coastguard Worker primary max has been calculated, zero out the upper 8 bits. Use this
231*77c1e3ccSAndroid Build Coastguard Worker to find the "16 bit" max. */
232*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
233*77c1e3ccSAndroid Build Coastguard Worker } else {
234*77c1e3ccSAndroid Build Coastguard Worker /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
235*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
236*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
237*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
238*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
239*77c1e3ccSAndroid Build Coastguard Worker }
240*77c1e3ccSAndroid Build Coastguard Worker return max;
241*77c1e3ccSAndroid Build Coastguard Worker }
242*77c1e3ccSAndroid Build Coastguard Worker
get_max_secondary(const int is_lowbd,v256 * tap,v256 max,v256 cdef_large_value_mask)243*77c1e3ccSAndroid Build Coastguard Worker SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max,
244*77c1e3ccSAndroid Build Coastguard Worker v256 cdef_large_value_mask) {
245*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
246*77c1e3ccSAndroid Build Coastguard Worker v256 max_u8;
247*77c1e3ccSAndroid Build Coastguard Worker max_u8 = tap[0];
248*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[1]);
249*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[2]);
250*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[3]);
251*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[4]);
252*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[5]);
253*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[6]);
254*77c1e3ccSAndroid Build Coastguard Worker max_u8 = v256_max_u8(max_u8, tap[7]);
255*77c1e3ccSAndroid Build Coastguard Worker /* The source is 16 bits, however, we only really care about the lower
256*77c1e3ccSAndroid Build Coastguard Worker 8 bits. The upper 8 bits contain the "large" flag. After the final
257*77c1e3ccSAndroid Build Coastguard Worker primary max has been calculated, zero out the upper 8 bits. Use this
258*77c1e3ccSAndroid Build Coastguard Worker to find the "16 bit" max. */
259*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
260*77c1e3ccSAndroid Build Coastguard Worker } else {
261*77c1e3ccSAndroid Build Coastguard Worker /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
262*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
263*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
264*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
265*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
266*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask));
267*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask));
268*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask));
269*77c1e3ccSAndroid Build Coastguard Worker max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask));
270*77c1e3ccSAndroid Build Coastguard Worker }
271*77c1e3ccSAndroid Build Coastguard Worker return max;
272*77c1e3ccSAndroid Build Coastguard Worker }
273*77c1e3ccSAndroid Build Coastguard Worker
274*77c1e3ccSAndroid Build Coastguard Worker // MSVC takes far too much time optimizing these.
275*77c1e3ccSAndroid Build Coastguard Worker // https://bugs.chromium.org/p/aomedia/issues/detail?id=3395
276*77c1e3ccSAndroid Build Coastguard Worker #if defined(_MSC_VER) && !defined(__clang__)
277*77c1e3ccSAndroid Build Coastguard Worker #pragma optimize("", off)
278*77c1e3ccSAndroid Build Coastguard Worker #endif
279*77c1e3ccSAndroid Build Coastguard Worker
filter_block_4x4(const int is_lowbd,void * dest,int dstride,const uint16_t * in,int pri_strength,int sec_strength,int dir,int pri_damping,int sec_damping,int coeff_shift,int height,int enable_primary,int enable_secondary)280*77c1e3ccSAndroid Build Coastguard Worker CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
281*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *in, int pri_strength,
282*77c1e3ccSAndroid Build Coastguard Worker int sec_strength, int dir, int pri_damping,
283*77c1e3ccSAndroid Build Coastguard Worker int sec_damping, int coeff_shift, int height,
284*77c1e3ccSAndroid Build Coastguard Worker int enable_primary, int enable_secondary) {
285*77c1e3ccSAndroid Build Coastguard Worker uint8_t *dst8 = (uint8_t *)dest;
286*77c1e3ccSAndroid Build Coastguard Worker uint16_t *dst16 = (uint16_t *)dest;
287*77c1e3ccSAndroid Build Coastguard Worker const int clipping_required = enable_primary && enable_secondary;
288*77c1e3ccSAndroid Build Coastguard Worker v256 p0, p1, p2, p3;
289*77c1e3ccSAndroid Build Coastguard Worker v256 sum, row, res;
290*77c1e3ccSAndroid Build Coastguard Worker v256 max, min;
291*77c1e3ccSAndroid Build Coastguard Worker const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
292*77c1e3ccSAndroid Build Coastguard Worker const int po1 = cdef_directions[dir][0];
293*77c1e3ccSAndroid Build Coastguard Worker const int po2 = cdef_directions[dir][1];
294*77c1e3ccSAndroid Build Coastguard Worker const int s1o1 = cdef_directions[dir + 2][0];
295*77c1e3ccSAndroid Build Coastguard Worker const int s1o2 = cdef_directions[dir + 2][1];
296*77c1e3ccSAndroid Build Coastguard Worker const int s2o1 = cdef_directions[dir - 2][0];
297*77c1e3ccSAndroid Build Coastguard Worker const int s2o2 = cdef_directions[dir - 2][1];
298*77c1e3ccSAndroid Build Coastguard Worker const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
299*77c1e3ccSAndroid Build Coastguard Worker const int *sec_taps = cdef_sec_taps;
300*77c1e3ccSAndroid Build Coastguard Worker int i;
301*77c1e3ccSAndroid Build Coastguard Worker
302*77c1e3ccSAndroid Build Coastguard Worker if (enable_primary && pri_strength)
303*77c1e3ccSAndroid Build Coastguard Worker pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
304*77c1e3ccSAndroid Build Coastguard Worker if (enable_secondary && sec_strength)
305*77c1e3ccSAndroid Build Coastguard Worker sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
306*77c1e3ccSAndroid Build Coastguard Worker
307*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; i += 4) {
308*77c1e3ccSAndroid Build Coastguard Worker sum = v256_zero();
309*77c1e3ccSAndroid Build Coastguard Worker row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
310*77c1e3ccSAndroid Build Coastguard Worker v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
311*77c1e3ccSAndroid Build Coastguard Worker v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
312*77c1e3ccSAndroid Build Coastguard Worker v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
313*77c1e3ccSAndroid Build Coastguard Worker max = min = row;
314*77c1e3ccSAndroid Build Coastguard Worker
315*77c1e3ccSAndroid Build Coastguard Worker if (enable_primary) {
316*77c1e3ccSAndroid Build Coastguard Worker v256 tap[4];
317*77c1e3ccSAndroid Build Coastguard Worker // Primary near taps
318*77c1e3ccSAndroid Build Coastguard Worker tap[0] =
319*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]),
320*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
321*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
322*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
323*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[0], row, pri_strength, pri_damping);
324*77c1e3ccSAndroid Build Coastguard Worker tap[1] =
325*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]),
326*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
327*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
328*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
329*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[1], row, pri_strength, pri_damping);
330*77c1e3ccSAndroid Build Coastguard Worker
331*77c1e3ccSAndroid Build Coastguard Worker // sum += pri_taps[0] * (p0 + p1)
332*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(
333*77c1e3ccSAndroid Build Coastguard Worker sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
334*77c1e3ccSAndroid Build Coastguard Worker
335*77c1e3ccSAndroid Build Coastguard Worker // Primary far taps
336*77c1e3ccSAndroid Build Coastguard Worker tap[2] =
337*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]),
338*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
339*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
340*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
341*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[2], row, pri_strength, pri_damping);
342*77c1e3ccSAndroid Build Coastguard Worker tap[3] =
343*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]),
344*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
345*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
346*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
347*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[3], row, pri_strength, pri_damping);
348*77c1e3ccSAndroid Build Coastguard Worker
349*77c1e3ccSAndroid Build Coastguard Worker // sum += pri_taps[1] * (p0 + p1)
350*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(
351*77c1e3ccSAndroid Build Coastguard Worker sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
352*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
353*77c1e3ccSAndroid Build Coastguard Worker max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
354*77c1e3ccSAndroid Build Coastguard Worker
355*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[0]);
356*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[1]);
357*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[2]);
358*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[3]);
359*77c1e3ccSAndroid Build Coastguard Worker }
360*77c1e3ccSAndroid Build Coastguard Worker }
361*77c1e3ccSAndroid Build Coastguard Worker
362*77c1e3ccSAndroid Build Coastguard Worker if (enable_secondary) {
363*77c1e3ccSAndroid Build Coastguard Worker v256 tap[8];
364*77c1e3ccSAndroid Build Coastguard Worker // Secondary near taps
365*77c1e3ccSAndroid Build Coastguard Worker tap[0] =
366*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]),
367*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
368*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
369*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
370*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[0], row, sec_strength, sec_damping);
371*77c1e3ccSAndroid Build Coastguard Worker tap[1] =
372*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]),
373*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
374*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
375*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
376*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[1], row, sec_strength, sec_damping);
377*77c1e3ccSAndroid Build Coastguard Worker tap[2] =
378*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]),
379*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
380*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
381*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
382*77c1e3ccSAndroid Build Coastguard Worker p2 = constrain16(tap[2], row, sec_strength, sec_damping);
383*77c1e3ccSAndroid Build Coastguard Worker tap[3] =
384*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]),
385*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
386*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
387*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
388*77c1e3ccSAndroid Build Coastguard Worker p3 = constrain16(tap[3], row, sec_strength, sec_damping);
389*77c1e3ccSAndroid Build Coastguard Worker
390*77c1e3ccSAndroid Build Coastguard Worker // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
391*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
392*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(v256_add_16(p0, p1),
393*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(p2, p3))));
394*77c1e3ccSAndroid Build Coastguard Worker
395*77c1e3ccSAndroid Build Coastguard Worker // Secondary far taps
396*77c1e3ccSAndroid Build Coastguard Worker tap[4] =
397*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]),
398*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
399*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
400*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
401*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[4], row, sec_strength, sec_damping);
402*77c1e3ccSAndroid Build Coastguard Worker tap[5] =
403*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]),
404*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
405*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
406*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
407*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[5], row, sec_strength, sec_damping);
408*77c1e3ccSAndroid Build Coastguard Worker tap[6] =
409*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]),
410*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
411*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
412*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
413*77c1e3ccSAndroid Build Coastguard Worker p2 = constrain16(tap[6], row, sec_strength, sec_damping);
414*77c1e3ccSAndroid Build Coastguard Worker tap[7] =
415*77c1e3ccSAndroid Build Coastguard Worker v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]),
416*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
417*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
418*77c1e3ccSAndroid Build Coastguard Worker v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
419*77c1e3ccSAndroid Build Coastguard Worker p3 = constrain16(tap[7], row, sec_strength, sec_damping);
420*77c1e3ccSAndroid Build Coastguard Worker
421*77c1e3ccSAndroid Build Coastguard Worker // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
422*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
423*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(v256_add_16(p0, p1),
424*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(p2, p3))));
425*77c1e3ccSAndroid Build Coastguard Worker
426*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
427*77c1e3ccSAndroid Build Coastguard Worker max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
428*77c1e3ccSAndroid Build Coastguard Worker
429*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[0]);
430*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[1]);
431*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[2]);
432*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[3]);
433*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[4]);
434*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[5]);
435*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[6]);
436*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[7]);
437*77c1e3ccSAndroid Build Coastguard Worker }
438*77c1e3ccSAndroid Build Coastguard Worker }
439*77c1e3ccSAndroid Build Coastguard Worker
440*77c1e3ccSAndroid Build Coastguard Worker // res = row + ((sum - (sum < 0) + 8) >> 4)
441*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
442*77c1e3ccSAndroid Build Coastguard Worker res = v256_add_16(sum, v256_dup_16(8));
443*77c1e3ccSAndroid Build Coastguard Worker res = v256_shr_n_s16(res, 4);
444*77c1e3ccSAndroid Build Coastguard Worker res = v256_add_16(row, res);
445*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
446*77c1e3ccSAndroid Build Coastguard Worker res = v256_min_s16(v256_max_s16(res, min), max);
447*77c1e3ccSAndroid Build Coastguard Worker }
448*77c1e3ccSAndroid Build Coastguard Worker
449*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
450*77c1e3ccSAndroid Build Coastguard Worker const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
451*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 0) * dstride],
452*77c1e3ccSAndroid Build Coastguard Worker v64_high_u32(v128_high_v64(res_128)));
453*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 1) * dstride],
454*77c1e3ccSAndroid Build Coastguard Worker v64_low_u32(v128_high_v64(res_128)));
455*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 2) * dstride],
456*77c1e3ccSAndroid Build Coastguard Worker v64_high_u32(v128_low_v64(res_128)));
457*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 3) * dstride],
458*77c1e3ccSAndroid Build Coastguard Worker v64_low_u32(v128_low_v64(res_128)));
459*77c1e3ccSAndroid Build Coastguard Worker } else {
460*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 0) * dstride],
461*77c1e3ccSAndroid Build Coastguard Worker v128_high_v64(v256_high_v128(res)));
462*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 1) * dstride],
463*77c1e3ccSAndroid Build Coastguard Worker v128_low_v64(v256_high_v128(res)));
464*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 2) * dstride],
465*77c1e3ccSAndroid Build Coastguard Worker v128_high_v64(v256_low_v128(res)));
466*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 3) * dstride],
467*77c1e3ccSAndroid Build Coastguard Worker v128_low_v64(v256_low_v128(res)));
468*77c1e3ccSAndroid Build Coastguard Worker }
469*77c1e3ccSAndroid Build Coastguard Worker }
470*77c1e3ccSAndroid Build Coastguard Worker }
471*77c1e3ccSAndroid Build Coastguard Worker
filter_block_8x8(const int is_lowbd,void * dest,int dstride,const uint16_t * in,int pri_strength,int sec_strength,int dir,int pri_damping,int sec_damping,int coeff_shift,int height,int enable_primary,int enable_secondary)472*77c1e3ccSAndroid Build Coastguard Worker CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
473*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *in, int pri_strength,
474*77c1e3ccSAndroid Build Coastguard Worker int sec_strength, int dir, int pri_damping,
475*77c1e3ccSAndroid Build Coastguard Worker int sec_damping, int coeff_shift, int height,
476*77c1e3ccSAndroid Build Coastguard Worker int enable_primary, int enable_secondary) {
477*77c1e3ccSAndroid Build Coastguard Worker uint8_t *dst8 = (uint8_t *)dest;
478*77c1e3ccSAndroid Build Coastguard Worker uint16_t *dst16 = (uint16_t *)dest;
479*77c1e3ccSAndroid Build Coastguard Worker const int clipping_required = enable_primary && enable_secondary;
480*77c1e3ccSAndroid Build Coastguard Worker int i;
481*77c1e3ccSAndroid Build Coastguard Worker v256 sum, p0, p1, p2, p3, row, res;
482*77c1e3ccSAndroid Build Coastguard Worker const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
483*77c1e3ccSAndroid Build Coastguard Worker v256 max, min;
484*77c1e3ccSAndroid Build Coastguard Worker const int po1 = cdef_directions[dir][0];
485*77c1e3ccSAndroid Build Coastguard Worker const int po2 = cdef_directions[dir][1];
486*77c1e3ccSAndroid Build Coastguard Worker const int s1o1 = cdef_directions[dir + 2][0];
487*77c1e3ccSAndroid Build Coastguard Worker const int s1o2 = cdef_directions[dir + 2][1];
488*77c1e3ccSAndroid Build Coastguard Worker const int s2o1 = cdef_directions[dir - 2][0];
489*77c1e3ccSAndroid Build Coastguard Worker const int s2o2 = cdef_directions[dir - 2][1];
490*77c1e3ccSAndroid Build Coastguard Worker const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
491*77c1e3ccSAndroid Build Coastguard Worker const int *sec_taps = cdef_sec_taps;
492*77c1e3ccSAndroid Build Coastguard Worker
493*77c1e3ccSAndroid Build Coastguard Worker if (enable_primary && pri_strength)
494*77c1e3ccSAndroid Build Coastguard Worker pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
495*77c1e3ccSAndroid Build Coastguard Worker if (enable_secondary && sec_strength)
496*77c1e3ccSAndroid Build Coastguard Worker sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
497*77c1e3ccSAndroid Build Coastguard Worker
498*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; i += 2) {
499*77c1e3ccSAndroid Build Coastguard Worker v256 tap[8];
500*77c1e3ccSAndroid Build Coastguard Worker sum = v256_zero();
501*77c1e3ccSAndroid Build Coastguard Worker row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
502*77c1e3ccSAndroid Build Coastguard Worker v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
503*77c1e3ccSAndroid Build Coastguard Worker
504*77c1e3ccSAndroid Build Coastguard Worker min = max = row;
505*77c1e3ccSAndroid Build Coastguard Worker if (enable_primary) {
506*77c1e3ccSAndroid Build Coastguard Worker // Primary near taps
507*77c1e3ccSAndroid Build Coastguard Worker tap[0] = v256_from_v128(
508*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
509*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
510*77c1e3ccSAndroid Build Coastguard Worker tap[1] = v256_from_v128(
511*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
512*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
513*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[0], row, pri_strength, pri_damping);
514*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[1], row, pri_strength, pri_damping);
515*77c1e3ccSAndroid Build Coastguard Worker
516*77c1e3ccSAndroid Build Coastguard Worker // sum += pri_taps[0] * (p0 + p1)
517*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(
518*77c1e3ccSAndroid Build Coastguard Worker sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
519*77c1e3ccSAndroid Build Coastguard Worker
520*77c1e3ccSAndroid Build Coastguard Worker // Primary far taps
521*77c1e3ccSAndroid Build Coastguard Worker tap[2] = v256_from_v128(
522*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
523*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
524*77c1e3ccSAndroid Build Coastguard Worker tap[3] = v256_from_v128(
525*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
526*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
527*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[2], row, pri_strength, pri_damping);
528*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[3], row, pri_strength, pri_damping);
529*77c1e3ccSAndroid Build Coastguard Worker
530*77c1e3ccSAndroid Build Coastguard Worker // sum += pri_taps[1] * (p0 + p1)
531*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(
532*77c1e3ccSAndroid Build Coastguard Worker sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
533*77c1e3ccSAndroid Build Coastguard Worker
534*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
535*77c1e3ccSAndroid Build Coastguard Worker max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
536*77c1e3ccSAndroid Build Coastguard Worker
537*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[0]);
538*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[1]);
539*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[2]);
540*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[3]);
541*77c1e3ccSAndroid Build Coastguard Worker }
542*77c1e3ccSAndroid Build Coastguard Worker // End primary
543*77c1e3ccSAndroid Build Coastguard Worker }
544*77c1e3ccSAndroid Build Coastguard Worker
545*77c1e3ccSAndroid Build Coastguard Worker if (enable_secondary) {
546*77c1e3ccSAndroid Build Coastguard Worker // Secondary near taps
547*77c1e3ccSAndroid Build Coastguard Worker tap[0] = v256_from_v128(
548*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
549*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
550*77c1e3ccSAndroid Build Coastguard Worker tap[1] = v256_from_v128(
551*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
552*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
553*77c1e3ccSAndroid Build Coastguard Worker tap[2] = v256_from_v128(
554*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
555*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
556*77c1e3ccSAndroid Build Coastguard Worker tap[3] = v256_from_v128(
557*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
558*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
559*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[0], row, sec_strength, sec_damping);
560*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[1], row, sec_strength, sec_damping);
561*77c1e3ccSAndroid Build Coastguard Worker p2 = constrain16(tap[2], row, sec_strength, sec_damping);
562*77c1e3ccSAndroid Build Coastguard Worker p3 = constrain16(tap[3], row, sec_strength, sec_damping);
563*77c1e3ccSAndroid Build Coastguard Worker
564*77c1e3ccSAndroid Build Coastguard Worker // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
565*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
566*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(v256_add_16(p0, p1),
567*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(p2, p3))));
568*77c1e3ccSAndroid Build Coastguard Worker
569*77c1e3ccSAndroid Build Coastguard Worker // Secondary far taps
570*77c1e3ccSAndroid Build Coastguard Worker tap[4] = v256_from_v128(
571*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
572*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
573*77c1e3ccSAndroid Build Coastguard Worker tap[5] = v256_from_v128(
574*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
575*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
576*77c1e3ccSAndroid Build Coastguard Worker tap[6] = v256_from_v128(
577*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
578*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
579*77c1e3ccSAndroid Build Coastguard Worker tap[7] = v256_from_v128(
580*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
581*77c1e3ccSAndroid Build Coastguard Worker v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
582*77c1e3ccSAndroid Build Coastguard Worker p0 = constrain16(tap[4], row, sec_strength, sec_damping);
583*77c1e3ccSAndroid Build Coastguard Worker p1 = constrain16(tap[5], row, sec_strength, sec_damping);
584*77c1e3ccSAndroid Build Coastguard Worker p2 = constrain16(tap[6], row, sec_strength, sec_damping);
585*77c1e3ccSAndroid Build Coastguard Worker p3 = constrain16(tap[7], row, sec_strength, sec_damping);
586*77c1e3ccSAndroid Build Coastguard Worker
587*77c1e3ccSAndroid Build Coastguard Worker // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
588*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
589*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(v256_add_16(p0, p1),
590*77c1e3ccSAndroid Build Coastguard Worker v256_add_16(p2, p3))));
591*77c1e3ccSAndroid Build Coastguard Worker
592*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
593*77c1e3ccSAndroid Build Coastguard Worker max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
594*77c1e3ccSAndroid Build Coastguard Worker
595*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[0]);
596*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[1]);
597*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[2]);
598*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[3]);
599*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[4]);
600*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[5]);
601*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[6]);
602*77c1e3ccSAndroid Build Coastguard Worker min = v256_min_s16(min, tap[7]);
603*77c1e3ccSAndroid Build Coastguard Worker }
604*77c1e3ccSAndroid Build Coastguard Worker // End secondary
605*77c1e3ccSAndroid Build Coastguard Worker }
606*77c1e3ccSAndroid Build Coastguard Worker
607*77c1e3ccSAndroid Build Coastguard Worker // res = row + ((sum - (sum < 0) + 8) >> 4)
608*77c1e3ccSAndroid Build Coastguard Worker sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
609*77c1e3ccSAndroid Build Coastguard Worker res = v256_add_16(sum, v256_dup_16(8));
610*77c1e3ccSAndroid Build Coastguard Worker res = v256_shr_n_s16(res, 4);
611*77c1e3ccSAndroid Build Coastguard Worker res = v256_add_16(row, res);
612*77c1e3ccSAndroid Build Coastguard Worker if (clipping_required) {
613*77c1e3ccSAndroid Build Coastguard Worker res = v256_min_s16(v256_max_s16(res, min), max);
614*77c1e3ccSAndroid Build Coastguard Worker }
615*77c1e3ccSAndroid Build Coastguard Worker
616*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
617*77c1e3ccSAndroid Build Coastguard Worker const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
618*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128));
619*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128));
620*77c1e3ccSAndroid Build Coastguard Worker } else {
621*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res));
622*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res));
623*77c1e3ccSAndroid Build Coastguard Worker }
624*77c1e3ccSAndroid Build Coastguard Worker }
625*77c1e3ccSAndroid Build Coastguard Worker }
626*77c1e3ccSAndroid Build Coastguard Worker
627*77c1e3ccSAndroid Build Coastguard Worker #if defined(_MSC_VER) && !defined(__clang__)
628*77c1e3ccSAndroid Build Coastguard Worker #pragma optimize("", on)
629*77c1e3ccSAndroid Build Coastguard Worker #endif
630*77c1e3ccSAndroid Build Coastguard Worker
copy_block_4xh(const int is_lowbd,void * dest,int dstride,const uint16_t * in,int height)631*77c1e3ccSAndroid Build Coastguard Worker SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride,
632*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *in, int height) {
633*77c1e3ccSAndroid Build Coastguard Worker uint8_t *dst8 = (uint8_t *)dest;
634*77c1e3ccSAndroid Build Coastguard Worker uint16_t *dst16 = (uint16_t *)dest;
635*77c1e3ccSAndroid Build Coastguard Worker int i;
636*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; i += 4) {
637*77c1e3ccSAndroid Build Coastguard Worker const v128 row0 =
638*77c1e3ccSAndroid Build Coastguard Worker v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
639*77c1e3ccSAndroid Build Coastguard Worker v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
640*77c1e3ccSAndroid Build Coastguard Worker const v128 row1 =
641*77c1e3ccSAndroid Build Coastguard Worker v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
642*77c1e3ccSAndroid Build Coastguard Worker v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
643*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
644*77c1e3ccSAndroid Build Coastguard Worker /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
645*77c1e3ccSAndroid Build Coastguard Worker const v128 res_128 = v128_pack_s16_u8(row1, row0);
646*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 0) * dstride],
647*77c1e3ccSAndroid Build Coastguard Worker v64_high_u32(v128_low_v64(res_128)));
648*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 1) * dstride],
649*77c1e3ccSAndroid Build Coastguard Worker v64_low_u32(v128_low_v64(res_128)));
650*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 2) * dstride],
651*77c1e3ccSAndroid Build Coastguard Worker v64_high_u32(v128_high_v64(res_128)));
652*77c1e3ccSAndroid Build Coastguard Worker u32_store_aligned(&dst8[(i + 3) * dstride],
653*77c1e3ccSAndroid Build Coastguard Worker v64_low_u32(v128_high_v64(res_128)));
654*77c1e3ccSAndroid Build Coastguard Worker } else {
655*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0));
656*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0));
657*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1));
658*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1));
659*77c1e3ccSAndroid Build Coastguard Worker }
660*77c1e3ccSAndroid Build Coastguard Worker }
661*77c1e3ccSAndroid Build Coastguard Worker }
662*77c1e3ccSAndroid Build Coastguard Worker
copy_block_8xh(const int is_lowbd,void * dest,int dstride,const uint16_t * in,int height)663*77c1e3ccSAndroid Build Coastguard Worker SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride,
664*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *in, int height) {
665*77c1e3ccSAndroid Build Coastguard Worker uint8_t *dst8 = (uint8_t *)dest;
666*77c1e3ccSAndroid Build Coastguard Worker uint16_t *dst16 = (uint16_t *)dest;
667*77c1e3ccSAndroid Build Coastguard Worker int i;
668*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; i += 2) {
669*77c1e3ccSAndroid Build Coastguard Worker const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
670*77c1e3ccSAndroid Build Coastguard Worker const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]);
671*77c1e3ccSAndroid Build Coastguard Worker if (is_lowbd) {
672*77c1e3ccSAndroid Build Coastguard Worker /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
673*77c1e3ccSAndroid Build Coastguard Worker const v128 res_128 = v128_pack_s16_u8(row1, row0);
674*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128));
675*77c1e3ccSAndroid Build Coastguard Worker v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128));
676*77c1e3ccSAndroid Build Coastguard Worker } else {
677*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(&dst16[i * dstride], row0);
678*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(&dst16[(i + 1) * dstride], row1);
679*77c1e3ccSAndroid Build Coastguard Worker }
680*77c1e3ccSAndroid Build Coastguard Worker }
681*77c1e3ccSAndroid Build Coastguard Worker }
682*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_8_0)683*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in,
684*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
685*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
686*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
687*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
688*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
689*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
690*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
691*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
692*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
693*77c1e3ccSAndroid Build Coastguard Worker } else {
694*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
695*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
696*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
697*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
698*77c1e3ccSAndroid Build Coastguard Worker }
699*77c1e3ccSAndroid Build Coastguard Worker }
700*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_8_1)701*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in,
702*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
703*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
704*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
705*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
706*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
707*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
708*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
709*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
710*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/0);
711*77c1e3ccSAndroid Build Coastguard Worker } else {
712*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
713*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
714*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
715*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/0);
716*77c1e3ccSAndroid Build Coastguard Worker }
717*77c1e3ccSAndroid Build Coastguard Worker }
SIMD_FUNC(cdef_filter_8_2)718*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in,
719*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
720*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
721*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
722*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
723*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
724*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
725*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
726*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/0,
727*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
728*77c1e3ccSAndroid Build Coastguard Worker } else {
729*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
730*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
731*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/0,
732*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
733*77c1e3ccSAndroid Build Coastguard Worker }
734*77c1e3ccSAndroid Build Coastguard Worker }
735*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_8_3)736*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in,
737*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
738*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
739*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
740*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
741*77c1e3ccSAndroid Build Coastguard Worker (void)pri_strength;
742*77c1e3ccSAndroid Build Coastguard Worker (void)sec_strength;
743*77c1e3ccSAndroid Build Coastguard Worker (void)dir;
744*77c1e3ccSAndroid Build Coastguard Worker (void)pri_damping;
745*77c1e3ccSAndroid Build Coastguard Worker (void)sec_damping;
746*77c1e3ccSAndroid Build Coastguard Worker (void)coeff_shift;
747*77c1e3ccSAndroid Build Coastguard Worker (void)block_width;
748*77c1e3ccSAndroid Build Coastguard Worker
749*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
750*77c1e3ccSAndroid Build Coastguard Worker copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
751*77c1e3ccSAndroid Build Coastguard Worker } else {
752*77c1e3ccSAndroid Build Coastguard Worker copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
753*77c1e3ccSAndroid Build Coastguard Worker }
754*77c1e3ccSAndroid Build Coastguard Worker }
755*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_16_0)756*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in,
757*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
758*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
759*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
760*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
761*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
762*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
763*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
764*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
765*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
766*77c1e3ccSAndroid Build Coastguard Worker } else {
767*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
768*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
769*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
770*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
771*77c1e3ccSAndroid Build Coastguard Worker }
772*77c1e3ccSAndroid Build Coastguard Worker }
773*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_16_1)774*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in,
775*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
776*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
777*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
778*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
779*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
780*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
781*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
782*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
783*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/0);
784*77c1e3ccSAndroid Build Coastguard Worker } else {
785*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
786*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
787*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/1,
788*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/0);
789*77c1e3ccSAndroid Build Coastguard Worker }
790*77c1e3ccSAndroid Build Coastguard Worker }
SIMD_FUNC(cdef_filter_16_2)791*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in,
792*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
793*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
794*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
795*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
796*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
797*77c1e3ccSAndroid Build Coastguard Worker filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
798*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
799*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/0,
800*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
801*77c1e3ccSAndroid Build Coastguard Worker } else {
802*77c1e3ccSAndroid Build Coastguard Worker filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
803*77c1e3ccSAndroid Build Coastguard Worker sec_strength, dir, pri_damping, sec_damping, coeff_shift,
804*77c1e3ccSAndroid Build Coastguard Worker block_height, /*enable_primary=*/0,
805*77c1e3ccSAndroid Build Coastguard Worker /*enable_secondary=*/1);
806*77c1e3ccSAndroid Build Coastguard Worker }
807*77c1e3ccSAndroid Build Coastguard Worker }
808*77c1e3ccSAndroid Build Coastguard Worker
SIMD_FUNC(cdef_filter_16_3)809*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in,
810*77c1e3ccSAndroid Build Coastguard Worker int pri_strength, int sec_strength, int dir,
811*77c1e3ccSAndroid Build Coastguard Worker int pri_damping, int sec_damping,
812*77c1e3ccSAndroid Build Coastguard Worker int coeff_shift, int block_width,
813*77c1e3ccSAndroid Build Coastguard Worker int block_height) {
814*77c1e3ccSAndroid Build Coastguard Worker (void)pri_strength;
815*77c1e3ccSAndroid Build Coastguard Worker (void)sec_strength;
816*77c1e3ccSAndroid Build Coastguard Worker (void)dir;
817*77c1e3ccSAndroid Build Coastguard Worker (void)pri_damping;
818*77c1e3ccSAndroid Build Coastguard Worker (void)sec_damping;
819*77c1e3ccSAndroid Build Coastguard Worker (void)coeff_shift;
820*77c1e3ccSAndroid Build Coastguard Worker (void)block_width;
821*77c1e3ccSAndroid Build Coastguard Worker if (block_width == 8) {
822*77c1e3ccSAndroid Build Coastguard Worker copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
823*77c1e3ccSAndroid Build Coastguard Worker } else {
824*77c1e3ccSAndroid Build Coastguard Worker copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
825*77c1e3ccSAndroid Build Coastguard Worker }
826*77c1e3ccSAndroid Build Coastguard Worker }
827*77c1e3ccSAndroid Build Coastguard Worker
828*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)829*77c1e3ccSAndroid Build Coastguard Worker void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
830*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src, int sstride,
831*77c1e3ccSAndroid Build Coastguard Worker int width, int height) {
832*77c1e3ccSAndroid Build Coastguard Worker int i, j;
833*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
834*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j < (width & ~0x7); j += 8) {
835*77c1e3ccSAndroid Build Coastguard Worker v128 row = v128_load_unaligned(&src[i * sstride + j]);
836*77c1e3ccSAndroid Build Coastguard Worker v128_store_unaligned(&dst[i * dstride + j], row);
837*77c1e3ccSAndroid Build Coastguard Worker }
838*77c1e3ccSAndroid Build Coastguard Worker for (; j < width; j++) {
839*77c1e3ccSAndroid Build Coastguard Worker dst[i * dstride + j] = src[i * sstride + j];
840*77c1e3ccSAndroid Build Coastguard Worker }
841*77c1e3ccSAndroid Build Coastguard Worker }
842*77c1e3ccSAndroid Build Coastguard Worker }
843*77c1e3ccSAndroid Build Coastguard Worker #endif // CONFIG_AV1_HIGHBITDEPTH
844*77c1e3ccSAndroid Build Coastguard Worker
845*77c1e3ccSAndroid Build Coastguard Worker #undef CDEF_INLINE
846*77c1e3ccSAndroid Build Coastguard Worker
847*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
848