/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker #include "src/cpu.h"
29*c0909341SAndroid Build Coastguard Worker #include "src/looprestoration.h"
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard Worker #if ARCH_AARCH64
32*c0909341SAndroid Build Coastguard Worker void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
33*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4], const pixel *lpf,
34*c0909341SAndroid Build Coastguard Worker const int w, int h,
35*c0909341SAndroid Build Coastguard Worker const LooprestorationParams *const params,
36*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges
37*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX);
38*c0909341SAndroid Build Coastguard Worker void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
39*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4], const pixel *lpf,
40*c0909341SAndroid Build Coastguard Worker const int w, int h,
41*c0909341SAndroid Build Coastguard Worker const LooprestorationParams *const params,
42*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges
43*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX);
44*c0909341SAndroid Build Coastguard Worker #else
45*c0909341SAndroid Build Coastguard Worker
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
//     int16_t sum = 0;
//     for (int i = 0; i < 7; i++)
//         sum += src[idx] * fh[i];
//     int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
//     sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
//     sum += 1 << (bitdepth + 6 - round_bits_h);
// Compared to the reference C version, this is the output of the first pass
// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
// with round_offset precompensated.
// The 16bpc version calculates things pretty much the same way as the
// reference C version, but with the end result subtracted by
// 1 << (bitdepth + 6 - round_bits_h).
60*c0909341SAndroid Build Coastguard Worker void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
61*c0909341SAndroid Build Coastguard Worker const pixel *src, ptrdiff_t stride,
62*c0909341SAndroid Build Coastguard Worker const int16_t fh[8], intptr_t w,
63*c0909341SAndroid Build Coastguard Worker int h, enum LrEdgeFlags edges
64*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
//     int32_t sum = 0;
//     for (int i = 0; i < 7; i++)
//         sum += mid[idx] * fv[i];
//     sum = (sum + rounding_off_v) >> round_bits_v;
// This function assumes that the width is a multiple of 8.
72*c0909341SAndroid Build Coastguard Worker void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
73*c0909341SAndroid Build Coastguard Worker const int16_t *mid, int w, int h,
74*c0909341SAndroid Build Coastguard Worker const int16_t fv[8], enum LrEdgeFlags edges,
75*c0909341SAndroid Build Coastguard Worker ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
76*c0909341SAndroid Build Coastguard Worker
/*
 * Two-pass Wiener filter built on the separate NEON horizontal and vertical
 * kernels (this path is compiled only when ARCH_AARCH64 is false).
 *
 * The horizontal kernel writes 16-bit intermediates into the stack buffer
 * `mid`, whose row pitch is w rounded up to a multiple of 8. Layout of `mid`:
 *   rows 0..1       - filled from lpf when LR_HAVE_TOP is set
 *   rows 2..2+h-1   - filled from the h rows at dst
 *   rows 2+h..3+h   - filled from lpf + 6 rows when LR_HAVE_BOTTOM is set
 * The vertical kernel then reads `mid` starting at row 2 and writes the
 * result back to dst.
 *
 * dst/stride: pixels to filter in place and their row stride.
 * left:       forwarded to the horizontal kernel for the dst rows only; the
 *             lpf rows are filtered with a NULL left pointer.
 * params:     params->filter[0] is passed to the horizontal kernel,
 *             params->filter[1] to the vertical kernel.
 * edges:      LR_HAVE_* flags, forwarded unchanged to both kernels.
 */
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
                               const pixel (*const left)[4], const pixel *lpf,
                               const int w, const int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, mid, 68 * 384,);
    const int16_t (*const coefs)[8] = params->filter;
    const int mid_stride = (w + 7) & ~7;
    // First intermediate row that belongs to the dst rows themselves.
    int16_t *const mid_body = &mid[2 * mid_stride];

    // Horizontal filter
    BF(dav1d_wiener_filter_h, neon)(mid_body, left, dst, stride,
                                    coefs[0], w, h, edges HIGHBD_TAIL_SUFFIX);
    if (edges & LR_HAVE_TOP) {
        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
                                        coefs[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);
    }
    if (edges & LR_HAVE_BOTTOM) {
        int16_t *const mid_below = mid_body + h * mid_stride;
        const pixel *const lpf_below = lpf + 6 * PXSTRIDE(stride);

        BF(dav1d_wiener_filter_h, neon)(mid_below, NULL, lpf_below,
                                        stride, coefs[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);
    }

    // Vertical filter; the kernel takes the intermediate pitch in bytes.
    BF(dav1d_wiener_filter_v, neon)(dst, stride, mid_body,
                                    w, h, coefs[1], edges,
                                    mid_stride * sizeof(*mid)
                                    HIGHBD_TAIL_SUFFIX);
}
106*c0909341SAndroid Build Coastguard Worker #endif
107*c0909341SAndroid Build Coastguard Worker
108*c0909341SAndroid Build Coastguard Worker #if ARCH_ARM
109*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
110*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4],
111*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
112*c0909341SAndroid Build Coastguard Worker const int w, const int h,
113*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
114*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
115*c0909341SAndroid Build Coastguard Worker const int w, const int h,
116*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
117*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
118*c0909341SAndroid Build Coastguard Worker const int w, const int h, const int strength,
119*c0909341SAndroid Build Coastguard Worker const int bitdepth_max);
120*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
121*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
122*c0909341SAndroid Build Coastguard Worker const int32_t *a, const int16_t *b,
123*c0909341SAndroid Build Coastguard Worker const int w, const int h);
124*c0909341SAndroid Build Coastguard Worker
125*c0909341SAndroid Build Coastguard Worker /* filter with a 3x3 box (radius=1) */
dav1d_sgr_filter1_neon(int16_t * tmp,const pixel * src,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,const int w,const int h,const int strength,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)126*c0909341SAndroid Build Coastguard Worker static void dav1d_sgr_filter1_neon(int16_t *tmp,
127*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
128*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4], const pixel *lpf,
129*c0909341SAndroid Build Coastguard Worker const int w, const int h, const int strength,
130*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges
131*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX)
132*c0909341SAndroid Build Coastguard Worker {
133*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
134*c0909341SAndroid Build Coastguard Worker int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
135*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
136*c0909341SAndroid Build Coastguard Worker int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
137*c0909341SAndroid Build Coastguard Worker
138*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
139*c0909341SAndroid Build Coastguard Worker if (edges & LR_HAVE_TOP)
140*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
141*c0909341SAndroid Build Coastguard Worker NULL, lpf, stride, w, 2, edges);
142*c0909341SAndroid Build Coastguard Worker
143*c0909341SAndroid Build Coastguard Worker if (edges & LR_HAVE_BOTTOM)
144*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
145*c0909341SAndroid Build Coastguard Worker NULL, lpf + 6 * PXSTRIDE(stride),
146*c0909341SAndroid Build Coastguard Worker stride, w, 2, edges);
147*c0909341SAndroid Build Coastguard Worker
148*c0909341SAndroid Build Coastguard Worker dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
149*c0909341SAndroid Build Coastguard Worker dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
150*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
151*c0909341SAndroid Build Coastguard Worker }
152*c0909341SAndroid Build Coastguard Worker
153*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
154*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4],
155*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
156*c0909341SAndroid Build Coastguard Worker const int w, const int h,
157*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
158*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
159*c0909341SAndroid Build Coastguard Worker const int w, const int h,
160*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
161*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
162*c0909341SAndroid Build Coastguard Worker const int w, const int h, const int strength,
163*c0909341SAndroid Build Coastguard Worker const int bitdepth_max);
164*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
165*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
166*c0909341SAndroid Build Coastguard Worker const int32_t *a, const int16_t *b,
167*c0909341SAndroid Build Coastguard Worker const int w, const int h);
168*c0909341SAndroid Build Coastguard Worker
169*c0909341SAndroid Build Coastguard Worker /* filter with a 5x5 box (radius=2) */
dav1d_sgr_filter2_neon(int16_t * tmp,const pixel * src,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,const int w,const int h,const int strength,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)170*c0909341SAndroid Build Coastguard Worker static void dav1d_sgr_filter2_neon(int16_t *tmp,
171*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t stride,
172*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4], const pixel *lpf,
173*c0909341SAndroid Build Coastguard Worker const int w, const int h, const int strength,
174*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges
175*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX)
176*c0909341SAndroid Build Coastguard Worker {
177*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
178*c0909341SAndroid Build Coastguard Worker int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
179*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
180*c0909341SAndroid Build Coastguard Worker int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
181*c0909341SAndroid Build Coastguard Worker
182*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
183*c0909341SAndroid Build Coastguard Worker if (edges & LR_HAVE_TOP)
184*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
185*c0909341SAndroid Build Coastguard Worker NULL, lpf, stride, w, 2, edges);
186*c0909341SAndroid Build Coastguard Worker
187*c0909341SAndroid Build Coastguard Worker if (edges & LR_HAVE_BOTTOM)
188*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
189*c0909341SAndroid Build Coastguard Worker NULL, lpf + 6 * PXSTRIDE(stride),
190*c0909341SAndroid Build Coastguard Worker stride, w, 2, edges);
191*c0909341SAndroid Build Coastguard Worker
192*c0909341SAndroid Build Coastguard Worker dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
193*c0909341SAndroid Build Coastguard Worker dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
194*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
195*c0909341SAndroid Build Coastguard Worker }
196*c0909341SAndroid Build Coastguard Worker
197*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
198*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t src_stride,
199*c0909341SAndroid Build Coastguard Worker const int16_t *t1, const int w, const int h,
200*c0909341SAndroid Build Coastguard Worker const int wt HIGHBD_DECL_SUFFIX);
201*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
202*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t src_stride,
203*c0909341SAndroid Build Coastguard Worker const int16_t *t1, const int16_t *t2,
204*c0909341SAndroid Build Coastguard Worker const int w, const int h,
205*c0909341SAndroid Build Coastguard Worker const int16_t wt[2] HIGHBD_DECL_SUFFIX);
206*c0909341SAndroid Build Coastguard Worker
/*
 * Self-guided restoration using only the 5x5 (radius=2) pass.
 *
 * The 5x5 filter output for the w x h area at dst is computed into a stack
 * buffer using strength params->sgr.s0, then combined with dst in place by
 * the weighted1 kernel using weight params->sgr.w0.
 */
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, flt, 64 * 384,);

    dav1d_sgr_filter2_neon(flt, dst, stride, left, lpf, w, h,
                           params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);

    // dst serves as both the source and the destination of the blend.
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, flt, w, h,
                                  params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
219*c0909341SAndroid Build Coastguard Worker
/*
 * Self-guided restoration using only the 3x3 (radius=1) pass.
 *
 * The 3x3 filter output for the w x h area at dst is computed into a stack
 * buffer using strength params->sgr.s1, then combined with dst in place by
 * the weighted1 kernel using weight params->sgr.w1.
 */
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, flt, 64 * 384,);

    dav1d_sgr_filter1_neon(flt, dst, stride, left, lpf, w, h,
                           params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);

    // dst serves as both the source and the destination of the blend.
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, flt, w, h,
                                  params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
232*c0909341SAndroid Build Coastguard Worker
/*
 * Self-guided restoration combining the 5x5 and the 3x3 passes.
 *
 * Both filters are run over the still-unmodified dst (5x5 with strength
 * params->sgr.s0, 3x3 with params->sgr.s1), each into its own stack buffer.
 * The weighted2 kernel then combines the two outputs with dst in place using
 * the weight pair { params->sgr.w0, params->sgr.w1 }.
 */
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, flt5, 64 * 384,);
    ALIGN_STK_16(int16_t, flt3, 64 * 384,);

    dav1d_sgr_filter2_neon(flt5, dst, stride, left, lpf, w, h,
                           params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
    dav1d_sgr_filter1_neon(flt3, dst, stride, left, lpf, w, h,
                           params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);

    // Weight order matches the buffer order: [0] for 5x5, [1] for 3x3.
    const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
    BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride, flt5, flt3,
                                  w, h, wt HIGHBD_TAIL_SUFFIX);
}
249*c0909341SAndroid Build Coastguard Worker
250*c0909341SAndroid Build Coastguard Worker #else
/*
 * Rotate the first n entries of two parallel pointer tables one slot to the
 * left: entry i receives the old entry i + 1, and the old first entry ends up
 * in slot n - 1. Only the pointers are moved; the pointed-to rows are not
 * touched.
 *
 * sumsq_ptrs, sum_ptrs: tables with at least n entries each.
 * n:                    number of entries to rotate. Values below 2 leave both
 *                       tables untouched (a rotation of zero or one entries is
 *                       the identity); without the guard n <= 0 would write to
 *                       slot n - 1, i.e. in front of the tables.
 */
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    if (n < 2)
        return;

    int32_t *const first_sumsq = sumsq_ptrs[0];
    int16_t *const first_sum = sum_ptrs[0];

    for (int i = 1; i < n; i++) {
        sumsq_ptrs[i - 1] = sumsq_ptrs[i];
        sum_ptrs[i - 1] = sum_ptrs[i];
    }

    sumsq_ptrs[n - 1] = first_sumsq;
    sum_ptrs[n - 1] = first_sum;
}
/*
 * Rotate two parallel 5-entry pointer tables two slots to the left:
 * entries 2, 3, 4 move to slots 0, 1, 2 and the old entries 0, 1 are
 * re-appended, in the same order, at slots 3, 4.
 */
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    // Remember the two leading entries before they get overwritten.
    int32_t *const head_sumsq0 = sumsq_ptrs[0];
    int32_t *const head_sumsq1 = sumsq_ptrs[1];
    int16_t *const head_sum0 = sum_ptrs[0];
    int16_t *const head_sum1 = sum_ptrs[1];

    sumsq_ptrs[0] = sumsq_ptrs[2];
    sum_ptrs[0] = sum_ptrs[2];
    sumsq_ptrs[1] = sumsq_ptrs[3];
    sum_ptrs[1] = sum_ptrs[3];
    sumsq_ptrs[2] = sumsq_ptrs[4];
    sum_ptrs[2] = sum_ptrs[4];

    sumsq_ptrs[3] = head_sumsq0;
    sum_ptrs[3] = head_sum0;
    sumsq_ptrs[4] = head_sumsq1;
    sum_ptrs[4] = head_sum1;
}
277*c0909341SAndroid Build Coastguard Worker
/* Rotate the 3-entry A/B row-pointer tables one slot to the left. */
static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs)
{
    const int num_rows = 3;

    rotate(A_ptrs, B_ptrs, num_rows);
}
281*c0909341SAndroid Build Coastguard Worker
/* Rotate the 2-entry A/B row-pointer tables one slot to the left (a swap). */
static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs)
{
    const int num_rows = 2;

    rotate(A_ptrs, B_ptrs, num_rows);
}
285*c0909341SAndroid Build Coastguard Worker
/* Rotate the 4-entry A/B row-pointer tables one slot to the left. */
static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs)
{
    const int num_rows = 4;

    rotate(A_ptrs, B_ptrs, num_rows);
}
289*c0909341SAndroid Build Coastguard Worker
290*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
291*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4],
292*c0909341SAndroid Build Coastguard Worker const pixel *src, const int w,
293*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
294*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
295*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4],
296*c0909341SAndroid Build Coastguard Worker const pixel *src, const int w,
297*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
298*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
299*c0909341SAndroid Build Coastguard Worker int32_t *sumsq5, int16_t *sum5,
300*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4],
301*c0909341SAndroid Build Coastguard Worker const pixel *src, const int w,
302*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges);
303*c0909341SAndroid Build Coastguard Worker
304*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
305*c0909341SAndroid Build Coastguard Worker int32_t *AA, int16_t *BB,
306*c0909341SAndroid Build Coastguard Worker const int w, const int s,
307*c0909341SAndroid Build Coastguard Worker const int bitdepth_max);
308*c0909341SAndroid Build Coastguard Worker void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
309*c0909341SAndroid Build Coastguard Worker int32_t *AA, int16_t *BB,
310*c0909341SAndroid Build Coastguard Worker const int w, const int s,
311*c0909341SAndroid Build Coastguard Worker const int bitdepth_max);
312*c0909341SAndroid Build Coastguard Worker
313*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
314*c0909341SAndroid Build Coastguard Worker int32_t **A_ptrs, int16_t **B_ptrs,
315*c0909341SAndroid Build Coastguard Worker const int w, const int w1
316*c0909341SAndroid Build Coastguard Worker HIGHBD_DECL_SUFFIX);
317*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
318*c0909341SAndroid Build Coastguard Worker int32_t **A_ptrs, int16_t **B_ptrs,
319*c0909341SAndroid Build Coastguard Worker const int w, const int h,
320*c0909341SAndroid Build Coastguard Worker const int w1 HIGHBD_DECL_SUFFIX);
321*c0909341SAndroid Build Coastguard Worker
322*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
323*c0909341SAndroid Build Coastguard Worker const ptrdiff_t src_stride,
324*c0909341SAndroid Build Coastguard Worker int32_t **A_ptrs,
325*c0909341SAndroid Build Coastguard Worker int16_t **B_ptrs,
326*c0909341SAndroid Build Coastguard Worker const int w, const int h);
327*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
328*c0909341SAndroid Build Coastguard Worker const ptrdiff_t src_stride,
329*c0909341SAndroid Build Coastguard Worker int32_t **A_ptrs, int16_t **B_ptrs,
330*c0909341SAndroid Build Coastguard Worker const int w, const int h);
331*c0909341SAndroid Build Coastguard Worker void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
332*c0909341SAndroid Build Coastguard Worker const pixel *src, const ptrdiff_t src_stride,
333*c0909341SAndroid Build Coastguard Worker const int16_t *t1, const int16_t *t2,
334*c0909341SAndroid Build Coastguard Worker const int w, const int h,
335*c0909341SAndroid Build Coastguard Worker const int16_t wt[2] HIGHBD_DECL_SUFFIX);
336*c0909341SAndroid Build Coastguard Worker
/*
 * Vertical step of the 3x3 box filter (box3_v + calc_ab1 in one kernel call).
 *
 * Forwards the row-pointer tables, the output rows and w/s/bitdepth_max to
 * dav1d_sgr_box3_vert_neon, then rotates the 3-entry sumsq/sum tables one
 * slot to the left so the former first row ends up in the last slot.
 */
static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max)
{
    dav1d_sgr_box3_vert_neon(sumsq, sum,
                             sumsq_out, sum_out,
                             w, s, bitdepth_max);

    // Advance the row window by one row.
    rotate(sumsq, sum, 3);
}
344*c0909341SAndroid Build Coastguard Worker
/*
 * Vertical step of the 5x5 box filter (box5_v + calc_ab2 in one kernel call).
 *
 * Forwards the row-pointer tables, the output rows and w/s/bitdepth_max to
 * dav1d_sgr_box5_vert_neon, then rotates the 5-entry sumsq/sum tables two
 * slots to the left.
 */
static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max)
{
    dav1d_sgr_box5_vert_neon(sumsq, sum,
                             sumsq_out, sum_out,
                             w, s, bitdepth_max);

    // Advance the row window by two rows.
    rotate5_x2(sumsq, sum);
}
352*c0909341SAndroid Build Coastguard Worker
/*
 * Combined horizontal + vertical step of the 3x3 box filter for one new row.
 *
 * The horizontal kernel writes the sums for the row at src into the rows that
 * slot 2 of the sumsq/sum tables currently points to; the vertical step then
 * produces AA/BB from the tables and rotates them (see sgr_box3_vert_neon).
 */
static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
                             int32_t *AA, int16_t *BB,
                             const pixel (*left)[4],
                             const pixel *src, const int w,
                             const int s,
                             const enum LrEdgeFlags edges,
                             const int bitdepth_max)
{
    int32_t *const new_sumsq_row = sumsq[2];
    int16_t *const new_sum_row = sum[2];

    BF(dav1d_sgr_box3_row_h, neon)(new_sumsq_row, new_sum_row,
                                   left, src, w, edges);
    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}
363*c0909341SAndroid Build Coastguard Worker
364*c0909341SAndroid Build Coastguard Worker
/*
 * Finish one output row with the finish_weighted1 kernel.
 *
 * Calls the kernel on the row *dst currently points to, then advances *dst by
 * one row (PXSTRIDE(stride) pixels) and rotates the 3-entry A/B pointer
 * tables one slot to the left.
 */
static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
                             const int w1 HIGHBD_DECL_SUFFIX)
{
    pixel *const row = *dst;

    BF(dav1d_sgr_finish_weighted1, neon)(row, A_ptrs, B_ptrs,
                                         w, w1 HIGHBD_TAIL_SUFFIX);

    *dst = row + PXSTRIDE(stride);
    rotate_ab_3(A_ptrs, B_ptrs);
}
373*c0909341SAndroid Build Coastguard Worker
/*
 * Produce up to two output rows of the 5x5 self-guided filter.
 *
 * Calls the weighted2 finishing kernel starting at *dst, then advances *dst
 * and rotates the A/B pointer arrays with rotate_ab_2().
 *
 *   dst       in/out: pointer to the first output row; advanced on return
 *   stride    picture stride, passed to the kernel and used for the advance
 *   A_ptrs,
 *   B_ptrs    row-pointer arrays consumed by the kernel and then rotated
 *   w         row width
 *   h         row count forwarded to the kernel (callers pass 2 or 1)
 *   w1        filter weight forwarded to the kernel (callers pass sgr.w0)
 *
 * Note that *dst is always advanced by two rows, independent of `h`; the
 * only h == 1 caller in this file returns immediately afterwards.
 */
static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs,
                             const int w, const int h, const int w1
                             HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    *dst += 2*PXSTRIDE(stride);
    rotate_ab_2(A_ptrs, B_ptrs);
}
383*c0909341SAndroid Build Coastguard Worker
/*
 * Finish `h` output rows of the mixed (5x5 + 3x3) self-guided filter.
 *
 * The 5x5 and 3x3 filter outputs are first computed into two temporary
 * int16_t buffers (tmp5 / tmp3), then blended into the picture by the
 * weighted2 kernel using the weight pair { w0, w1 }. The picture rows at
 * *dst are passed to that kernel twice (with `stride` each time), i.e. the
 * operation is done in place.
 *
 *   dst        in/out: pointer to the first output row; advanced by h rows
 *   stride     picture stride
 *   A5_ptrs,
 *   B5_ptrs    row pointers for the 5x5 pass; rotated with rotate_ab_2()
 *   A3_ptrs,
 *   B3_ptrs    row pointers for the 3x3 pass; rotated with rotate_ab_4()
 *   w, h       width and number of rows to produce
 *   w0, w1     weights; stored as int16_t before being handed to the kernel
 */
static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
                                int32_t **A5_ptrs, int16_t **B5_ptrs,
                                int32_t **A3_ptrs, int16_t **B3_ptrs,
                                const int w, const int h,
                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
    // Row pitch of the temporary buffers; each buffer holds two such rows.
#define FILTER_OUT_STRIDE 384
    ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);

    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
                                             A5_ptrs, B5_ptrs, w, h);
    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
                                             A3_ptrs, B3_ptrs, w, h);
    const int16_t wt[2] = { w0, w1 };
    BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    rotate_ab_4(A3_ptrs, B3_ptrs);
}
404*c0909341SAndroid Build Coastguard Worker
405*c0909341SAndroid Build Coastguard Worker
/*
 * Self-guided restoration filter, 3x3 box variant, processed row by row.
 *
 * Three rows of horizontal box sums (sumsq/sum) and three rows of A/B
 * coefficients are kept in stack buffers and accessed through small arrays
 * of row pointers. Each input row goes through sgr_box3_hv_neon() (or
 * sgr_box3_vert_neon() when only padding is needed), and each output row is
 * produced by sgr_finish1_neon(), which also advances `dst`.
 *
 *   dst     picture rows, filtered in place (`src` starts as an alias)
 *   stride  picture stride
 *   left    per-row left-edge pixels, advanced one entry per source row
 *   lpf     rows above the unit; the rows below are read from
 *           lpf + 6 * PXSTRIDE(stride)
 *   w, h    size of the unit; `h` is counted down as rows are consumed
 *   params  supplies sgr.s1 (box pass) and sgr.w1 (finish pass)
 *   edges   LR_HAVE_TOP / LR_HAVE_BOTTOM select real rows vs. duplicated
 *           rows for vertical padding; also forwarded to the row kernels
 *
 * The early exits taken when `h` runs out during the prologue (vert_1 /
 * vert_2) pad by duplicating the last row and do not consult
 * LR_HAVE_BOTTOM.
 */
static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Row pitch of the intermediate buffers. Deliberately left defined after
    // this function: the other sgr_filter_*_neon functions below use it too.
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
    // *_rows are the fixed buffer rows; *_ptrs is the current window order.
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    // Captured before lpf is advanced below.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        // Two rows above the unit come from lpf (no left-edge data: NULL).
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        // First source row.
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Second source row.
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;
    } else {
        // No rows above: all three window slots alias rows[0], so the first
        // source row is used as its own top padding.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Start using the remaining buffer rows as real rows arrive.
        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    // Steady state: consume one source row, emit one output row.
    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the unit come from lpf_bottom; each yields one of the
    // two remaining output rows.
    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Two output rows still pending and no more input: duplicate the last
    // row as bottom padding, then fall through to do it once more.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    // Final output row, again padded by duplicating the last row.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Input ran out after a single source row: build one padded A/B row
    // without emitting output, then emit the single output row.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_3(A_ptrs, B_ptrs);
    goto output_1;
}
553*c0909341SAndroid Build Coastguard Worker
/*
 * Self-guided restoration filter, 5x5 box variant, processed two rows at a
 * time.
 *
 * Five rows of horizontal box sums (sumsq/sum) and two rows of A/B
 * coefficients are kept in stack buffers and accessed through arrays of row
 * pointers. Source rows are fed in pairs through the box5 row_h kernel; after
 * each pair sgr_box5_vert_neon() produces one A/B row and sgr_finish2_neon()
 * emits output rows and advances `dst`.
 *
 *   dst     picture rows, filtered in place (`src` starts as an alias)
 *   stride  picture stride
 *   left    per-row left-edge pixels, advanced one entry per source row
 *   lpf     rows above the unit; the rows below are read from
 *           lpf + 6 * PXSTRIDE(stride)
 *   w, h    size of the unit; `h` is counted down as rows are consumed
 *   params  supplies sgr.s0 (box pass) and sgr.w0 (finish pass)
 *   edges   LR_HAVE_TOP / LR_HAVE_BOTTOM select real rows vs. duplicated
 *           rows for vertical padding; also forwarded to the row kernel
 *
 * Exit labels: `odd` handles a source row left over without a partner,
 * `vert_1` / `vert_2` handle input running out during the prologue or a
 * missing bottom edge, `output_1` emits a single final row.
 */
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
    // *_rows are the fixed buffer rows; *_ptrs is the current window order.
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    // Captured before lpf is advanced below.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // Slots [0] and [1] alias rows[0], so the first lpf row is used
        // twice in the first 5-row window.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        // Two rows above the unit come from lpf (no left-edge data: NULL).
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        // First source row.
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        // Second source row completes the first window.
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        // No rows above: every window slot aliases rows[0], so the first
        // source row is used as its own top padding.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // This branch already emits its first pair of output rows here.
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    // Steady state: consume two source rows, emit two output rows.
    do {
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the unit come from lpf_bottom.
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    // Emit the final pair of output rows.
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A_ptrs, B_ptrs);

    goto output_1;
}
767*c0909341SAndroid Build Coastguard Worker
sgr_filter_mix_neon(pixel * dst,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,const int w,int h,const LooprestorationParams * const params,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)768*c0909341SAndroid Build Coastguard Worker static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
769*c0909341SAndroid Build Coastguard Worker const pixel (*left)[4], const pixel *lpf,
770*c0909341SAndroid Build Coastguard Worker const int w, int h,
771*c0909341SAndroid Build Coastguard Worker const LooprestorationParams *const params,
772*c0909341SAndroid Build Coastguard Worker const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
773*c0909341SAndroid Build Coastguard Worker {
774*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
775*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
776*c0909341SAndroid Build Coastguard Worker int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
777*c0909341SAndroid Build Coastguard Worker int16_t *sum5_ptrs[5], *sum5_rows[5];
778*c0909341SAndroid Build Coastguard Worker for (int i = 0; i < 5; i++) {
779*c0909341SAndroid Build Coastguard Worker sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
780*c0909341SAndroid Build Coastguard Worker sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
781*c0909341SAndroid Build Coastguard Worker }
782*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
783*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
784*c0909341SAndroid Build Coastguard Worker int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
785*c0909341SAndroid Build Coastguard Worker int16_t *sum3_ptrs[3], *sum3_rows[3];
786*c0909341SAndroid Build Coastguard Worker for (int i = 0; i < 3; i++) {
787*c0909341SAndroid Build Coastguard Worker sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
788*c0909341SAndroid Build Coastguard Worker sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
789*c0909341SAndroid Build Coastguard Worker }
790*c0909341SAndroid Build Coastguard Worker
791*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
792*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
793*c0909341SAndroid Build Coastguard Worker int32_t *A5_ptrs[2];
794*c0909341SAndroid Build Coastguard Worker int16_t *B5_ptrs[2];
795*c0909341SAndroid Build Coastguard Worker for (int i = 0; i < 2; i++) {
796*c0909341SAndroid Build Coastguard Worker A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
797*c0909341SAndroid Build Coastguard Worker B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
798*c0909341SAndroid Build Coastguard Worker }
799*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
800*c0909341SAndroid Build Coastguard Worker ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
801*c0909341SAndroid Build Coastguard Worker int32_t *A3_ptrs[4];
802*c0909341SAndroid Build Coastguard Worker int16_t *B3_ptrs[4];
803*c0909341SAndroid Build Coastguard Worker for (int i = 0; i < 4; i++) {
804*c0909341SAndroid Build Coastguard Worker A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
805*c0909341SAndroid Build Coastguard Worker B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
806*c0909341SAndroid Build Coastguard Worker }
807*c0909341SAndroid Build Coastguard Worker const pixel *src = dst;
808*c0909341SAndroid Build Coastguard Worker const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
809*c0909341SAndroid Build Coastguard Worker
810*c0909341SAndroid Build Coastguard Worker if (edges & LR_HAVE_TOP) {
811*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[0] = sumsq5_rows[0];
812*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[1] = sumsq5_rows[0];
813*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[2] = sumsq5_rows[1];
814*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_rows[2];
815*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_rows[3];
816*c0909341SAndroid Build Coastguard Worker sum5_ptrs[0] = sum5_rows[0];
817*c0909341SAndroid Build Coastguard Worker sum5_ptrs[1] = sum5_rows[0];
818*c0909341SAndroid Build Coastguard Worker sum5_ptrs[2] = sum5_rows[1];
819*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_rows[2];
820*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_rows[3];
821*c0909341SAndroid Build Coastguard Worker
822*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[0] = sumsq3_rows[0];
823*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[1] = sumsq3_rows[1];
824*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_rows[2];
825*c0909341SAndroid Build Coastguard Worker sum3_ptrs[0] = sum3_rows[0];
826*c0909341SAndroid Build Coastguard Worker sum3_ptrs[1] = sum3_rows[1];
827*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_rows[2];
828*c0909341SAndroid Build Coastguard Worker
829*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
830*c0909341SAndroid Build Coastguard Worker sumsq5_rows[0], sum5_rows[0],
831*c0909341SAndroid Build Coastguard Worker NULL, lpf, w, edges);
832*c0909341SAndroid Build Coastguard Worker lpf += PXSTRIDE(stride);
833*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
834*c0909341SAndroid Build Coastguard Worker sumsq5_rows[1], sum5_rows[1],
835*c0909341SAndroid Build Coastguard Worker NULL, lpf, w, edges);
836*c0909341SAndroid Build Coastguard Worker
837*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
838*c0909341SAndroid Build Coastguard Worker sumsq5_rows[2], sum5_rows[2],
839*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
840*c0909341SAndroid Build Coastguard Worker left++;
841*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
842*c0909341SAndroid Build Coastguard Worker
843*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
844*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
845*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
846*c0909341SAndroid Build Coastguard Worker
847*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
848*c0909341SAndroid Build Coastguard Worker goto vert_1;
849*c0909341SAndroid Build Coastguard Worker
850*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
851*c0909341SAndroid Build Coastguard Worker sumsq5_rows[3], sum5_rows[3],
852*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
853*c0909341SAndroid Build Coastguard Worker left++;
854*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
855*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
856*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
857*c0909341SAndroid Build Coastguard Worker rotate_ab_2(A5_ptrs, B5_ptrs);
858*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
859*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
860*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
861*c0909341SAndroid Build Coastguard Worker
862*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
863*c0909341SAndroid Build Coastguard Worker goto vert_2;
864*c0909341SAndroid Build Coastguard Worker
865*c0909341SAndroid Build Coastguard Worker // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
866*c0909341SAndroid Build Coastguard Worker // one of them to point at the previously unused rows[4].
867*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_rows[4];
868*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_rows[4];
869*c0909341SAndroid Build Coastguard Worker } else {
870*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[0] = sumsq5_rows[0];
871*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[1] = sumsq5_rows[0];
872*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[2] = sumsq5_rows[0];
873*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_rows[0];
874*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_rows[0];
875*c0909341SAndroid Build Coastguard Worker sum5_ptrs[0] = sum5_rows[0];
876*c0909341SAndroid Build Coastguard Worker sum5_ptrs[1] = sum5_rows[0];
877*c0909341SAndroid Build Coastguard Worker sum5_ptrs[2] = sum5_rows[0];
878*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_rows[0];
879*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_rows[0];
880*c0909341SAndroid Build Coastguard Worker
881*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[0] = sumsq3_rows[0];
882*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[1] = sumsq3_rows[0];
883*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_rows[0];
884*c0909341SAndroid Build Coastguard Worker sum3_ptrs[0] = sum3_rows[0];
885*c0909341SAndroid Build Coastguard Worker sum3_ptrs[1] = sum3_rows[0];
886*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_rows[0];
887*c0909341SAndroid Build Coastguard Worker
888*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
889*c0909341SAndroid Build Coastguard Worker sumsq5_rows[0], sum5_rows[0],
890*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
891*c0909341SAndroid Build Coastguard Worker left++;
892*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
893*c0909341SAndroid Build Coastguard Worker
894*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
895*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
896*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
897*c0909341SAndroid Build Coastguard Worker
898*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
899*c0909341SAndroid Build Coastguard Worker goto vert_1;
900*c0909341SAndroid Build Coastguard Worker
901*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_rows[1];
902*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_rows[1];
903*c0909341SAndroid Build Coastguard Worker
904*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_rows[1];
905*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_rows[1];
906*c0909341SAndroid Build Coastguard Worker
907*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
908*c0909341SAndroid Build Coastguard Worker sumsq5_rows[1], sum5_rows[1],
909*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
910*c0909341SAndroid Build Coastguard Worker left++;
911*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
912*c0909341SAndroid Build Coastguard Worker
913*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
914*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
915*c0909341SAndroid Build Coastguard Worker rotate_ab_2(A5_ptrs, B5_ptrs);
916*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
917*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
918*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
919*c0909341SAndroid Build Coastguard Worker
920*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
921*c0909341SAndroid Build Coastguard Worker goto vert_2;
922*c0909341SAndroid Build Coastguard Worker
923*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_rows[2];
924*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_rows[3];
925*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_rows[2];
926*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_rows[3];
927*c0909341SAndroid Build Coastguard Worker
928*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_rows[2];
929*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_rows[2];
930*c0909341SAndroid Build Coastguard Worker
931*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
932*c0909341SAndroid Build Coastguard Worker sumsq5_rows[2], sum5_rows[2],
933*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
934*c0909341SAndroid Build Coastguard Worker left++;
935*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
936*c0909341SAndroid Build Coastguard Worker
937*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
938*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
939*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
940*c0909341SAndroid Build Coastguard Worker
941*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
942*c0909341SAndroid Build Coastguard Worker goto odd;
943*c0909341SAndroid Build Coastguard Worker
944*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
945*c0909341SAndroid Build Coastguard Worker sumsq5_rows[3], sum5_rows[3],
946*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
947*c0909341SAndroid Build Coastguard Worker left++;
948*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
949*c0909341SAndroid Build Coastguard Worker
950*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
951*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
952*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
953*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
954*c0909341SAndroid Build Coastguard Worker sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
955*c0909341SAndroid Build Coastguard Worker w, 2, params->sgr.w0, params->sgr.w1
956*c0909341SAndroid Build Coastguard Worker HIGHBD_TAIL_SUFFIX);
957*c0909341SAndroid Build Coastguard Worker
958*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
959*c0909341SAndroid Build Coastguard Worker goto vert_2;
960*c0909341SAndroid Build Coastguard Worker
961*c0909341SAndroid Build Coastguard Worker // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
962*c0909341SAndroid Build Coastguard Worker // one of them to point at the previously unused rows[4].
963*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_rows[4];
964*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_rows[4];
965*c0909341SAndroid Build Coastguard Worker }
966*c0909341SAndroid Build Coastguard Worker
967*c0909341SAndroid Build Coastguard Worker do {
968*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
969*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3], sum5_ptrs[3],
970*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
971*c0909341SAndroid Build Coastguard Worker left++;
972*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
973*c0909341SAndroid Build Coastguard Worker
974*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
975*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
976*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
977*c0909341SAndroid Build Coastguard Worker
978*c0909341SAndroid Build Coastguard Worker if (--h <= 0)
979*c0909341SAndroid Build Coastguard Worker goto odd;
980*c0909341SAndroid Build Coastguard Worker
981*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
982*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4], sum5_ptrs[4],
983*c0909341SAndroid Build Coastguard Worker left, src, w, edges);
984*c0909341SAndroid Build Coastguard Worker left++;
985*c0909341SAndroid Build Coastguard Worker src += PXSTRIDE(stride);
986*c0909341SAndroid Build Coastguard Worker
987*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
988*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
989*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
990*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
991*c0909341SAndroid Build Coastguard Worker sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
992*c0909341SAndroid Build Coastguard Worker w, 2, params->sgr.w0, params->sgr.w1
993*c0909341SAndroid Build Coastguard Worker HIGHBD_TAIL_SUFFIX);
994*c0909341SAndroid Build Coastguard Worker } while (--h > 0);
995*c0909341SAndroid Build Coastguard Worker
996*c0909341SAndroid Build Coastguard Worker if (!(edges & LR_HAVE_BOTTOM))
997*c0909341SAndroid Build Coastguard Worker goto vert_2;
998*c0909341SAndroid Build Coastguard Worker
999*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
1000*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3], sum5_ptrs[3],
1001*c0909341SAndroid Build Coastguard Worker NULL, lpf_bottom, w, edges);
1002*c0909341SAndroid Build Coastguard Worker lpf_bottom += PXSTRIDE(stride);
1003*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1004*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1005*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
1006*c0909341SAndroid Build Coastguard Worker
1007*c0909341SAndroid Build Coastguard Worker BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
1008*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4], sum5_ptrs[4],
1009*c0909341SAndroid Build Coastguard Worker NULL, lpf_bottom, w, edges);
1010*c0909341SAndroid Build Coastguard Worker
1011*c0909341SAndroid Build Coastguard Worker output_2:
1012*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1013*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
1014*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1015*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1016*c0909341SAndroid Build Coastguard Worker sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1017*c0909341SAndroid Build Coastguard Worker w, 2, params->sgr.w0, params->sgr.w1
1018*c0909341SAndroid Build Coastguard Worker HIGHBD_TAIL_SUFFIX);
1019*c0909341SAndroid Build Coastguard Worker return;
1020*c0909341SAndroid Build Coastguard Worker
1021*c0909341SAndroid Build Coastguard Worker vert_2:
1022*c0909341SAndroid Build Coastguard Worker // Duplicate the last row twice more
1023*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_ptrs[2];
1024*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_ptrs[2];
1025*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_ptrs[2];
1026*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_ptrs[2];
1027*c0909341SAndroid Build Coastguard Worker
1028*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_ptrs[1];
1029*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_ptrs[1];
1030*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1031*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1032*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
1033*c0909341SAndroid Build Coastguard Worker
1034*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_ptrs[1];
1035*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_ptrs[1];
1036*c0909341SAndroid Build Coastguard Worker
1037*c0909341SAndroid Build Coastguard Worker goto output_2;
1038*c0909341SAndroid Build Coastguard Worker
1039*c0909341SAndroid Build Coastguard Worker odd:
1040*c0909341SAndroid Build Coastguard Worker // Copy the last row as padding once
1041*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_ptrs[3];
1042*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_ptrs[3];
1043*c0909341SAndroid Build Coastguard Worker
1044*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_ptrs[1];
1045*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_ptrs[1];
1046*c0909341SAndroid Build Coastguard Worker
1047*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1048*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
1049*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1050*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1051*c0909341SAndroid Build Coastguard Worker sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1052*c0909341SAndroid Build Coastguard Worker w, 2, params->sgr.w0, params->sgr.w1
1053*c0909341SAndroid Build Coastguard Worker HIGHBD_TAIL_SUFFIX);
1054*c0909341SAndroid Build Coastguard Worker
1055*c0909341SAndroid Build Coastguard Worker output_1:
1056*c0909341SAndroid Build Coastguard Worker // Duplicate the last row twice more
1057*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[3] = sumsq5_ptrs[2];
1058*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_ptrs[2];
1059*c0909341SAndroid Build Coastguard Worker sum5_ptrs[3] = sum5_ptrs[2];
1060*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_ptrs[2];
1061*c0909341SAndroid Build Coastguard Worker
1062*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_ptrs[1];
1063*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_ptrs[1];
1064*c0909341SAndroid Build Coastguard Worker
1065*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1066*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
1067*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1068*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1069*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
1070*c0909341SAndroid Build Coastguard Worker // Output only one row
1071*c0909341SAndroid Build Coastguard Worker sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1072*c0909341SAndroid Build Coastguard Worker w, 1, params->sgr.w0, params->sgr.w1
1073*c0909341SAndroid Build Coastguard Worker HIGHBD_TAIL_SUFFIX);
1074*c0909341SAndroid Build Coastguard Worker return;
1075*c0909341SAndroid Build Coastguard Worker
1076*c0909341SAndroid Build Coastguard Worker vert_1:
1077*c0909341SAndroid Build Coastguard Worker // Copy the last row as padding once
1078*c0909341SAndroid Build Coastguard Worker sumsq5_ptrs[4] = sumsq5_ptrs[3];
1079*c0909341SAndroid Build Coastguard Worker sum5_ptrs[4] = sum5_ptrs[3];
1080*c0909341SAndroid Build Coastguard Worker
1081*c0909341SAndroid Build Coastguard Worker sumsq3_ptrs[2] = sumsq3_ptrs[1];
1082*c0909341SAndroid Build Coastguard Worker sum3_ptrs[2] = sum3_ptrs[1];
1083*c0909341SAndroid Build Coastguard Worker
1084*c0909341SAndroid Build Coastguard Worker sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1085*c0909341SAndroid Build Coastguard Worker w, params->sgr.s0, BITDEPTH_MAX);
1086*c0909341SAndroid Build Coastguard Worker rotate_ab_2(A5_ptrs, B5_ptrs);
1087*c0909341SAndroid Build Coastguard Worker sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1088*c0909341SAndroid Build Coastguard Worker w, params->sgr.s1, BITDEPTH_MAX);
1089*c0909341SAndroid Build Coastguard Worker rotate_ab_4(A3_ptrs, B3_ptrs);
1090*c0909341SAndroid Build Coastguard Worker
1091*c0909341SAndroid Build Coastguard Worker goto output_1;
1092*c0909341SAndroid Build Coastguard Worker }
1093*c0909341SAndroid Build Coastguard Worker
1094*c0909341SAndroid Build Coastguard Worker #endif
1095*c0909341SAndroid Build Coastguard Worker
1096*c0909341SAndroid Build Coastguard Worker
loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext * const c,int bpc)1097*c0909341SAndroid Build Coastguard Worker static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
1098*c0909341SAndroid Build Coastguard Worker const unsigned flags = dav1d_get_cpu_flags();
1099*c0909341SAndroid Build Coastguard Worker
1100*c0909341SAndroid Build Coastguard Worker if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
1101*c0909341SAndroid Build Coastguard Worker
1102*c0909341SAndroid Build Coastguard Worker #if ARCH_AARCH64
1103*c0909341SAndroid Build Coastguard Worker c->wiener[0] = BF(dav1d_wiener_filter7, neon);
1104*c0909341SAndroid Build Coastguard Worker c->wiener[1] = BF(dav1d_wiener_filter5, neon);
1105*c0909341SAndroid Build Coastguard Worker #else
1106*c0909341SAndroid Build Coastguard Worker c->wiener[0] = c->wiener[1] = wiener_filter_neon;
1107*c0909341SAndroid Build Coastguard Worker #endif
1108*c0909341SAndroid Build Coastguard Worker if (BITDEPTH == 8 || bpc == 10) {
1109*c0909341SAndroid Build Coastguard Worker c->sgr[0] = sgr_filter_5x5_neon;
1110*c0909341SAndroid Build Coastguard Worker c->sgr[1] = sgr_filter_3x3_neon;
1111*c0909341SAndroid Build Coastguard Worker c->sgr[2] = sgr_filter_mix_neon;
1112*c0909341SAndroid Build Coastguard Worker }
1113*c0909341SAndroid Build Coastguard Worker }
1114