xref: /aosp_15_r20/external/libdav1d/src/loongarch/looprestoration_tmpl.c (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1 /*
2  * Copyright © 2023, VideoLAN and dav1d authors
3  * Copyright © 2023, Loongson Technology Corporation Limited
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "src/loongarch/looprestoration.h"
29 
30 #if BITDEPTH == 8
31 
32 #define REST_UNIT_STRIDE (400)
33 
34 void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
35                                     uint8_t *tmp_ptr,
36                                     const int16_t filterh[8],
37                                     const int w, const int h);
38 
39 void BF(dav1d_wiener_filter_h, lasx)(int32_t *hor_ptr,
40                                      uint8_t *tmp_ptr,
41                                      const int16_t filterh[8],
42                                      const int w, const int h);
43 
44 void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
45                                     const ptrdiff_t p_stride,
46                                     const int32_t *hor,
47                                     const int16_t filterv[8],
48                                     const int w, const int h);
49 
50 void BF(dav1d_wiener_filter_v, lasx)(uint8_t *p,
51                                      const ptrdiff_t p_stride,
52                                      const int32_t *hor,
53                                      const int16_t filterv[8],
54                                      const int w, const int h);
55 
56 // This function refers to the function in the ppc/looprestoration_init_tmpl.c.
padding(uint8_t * dst,const uint8_t * p,const ptrdiff_t stride,const uint8_t (* left)[4],const uint8_t * lpf,int unit_w,const int stripe_h,const enum LrEdgeFlags edges)57 static inline void padding(uint8_t *dst, const uint8_t *p,
58                            const ptrdiff_t stride, const uint8_t (*left)[4],
59                            const uint8_t *lpf, int unit_w, const int stripe_h,
60                            const enum LrEdgeFlags edges)
61 {
62     const int have_left = !!(edges & LR_HAVE_LEFT);
63     const int have_right = !!(edges & LR_HAVE_RIGHT);
64 
65     // Copy more pixels if we don't have to pad them
66     unit_w += 3 * have_left + 3 * have_right;
67     uint8_t *dst_l = dst + 3 * !have_left;
68     p -= 3 * have_left;
69     lpf -= 3 * have_left;
70 
71     if (edges & LR_HAVE_TOP) {
72         // Copy previous loop filtered rows
73         const uint8_t *const above_1 = lpf;
74         const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
75         pixel_copy(dst_l, above_1, unit_w);
76         pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
77         pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
78     } else {
79         // Pad with first row
80         pixel_copy(dst_l, p, unit_w);
81         pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
82         pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
83         if (have_left) {
84             pixel_copy(dst_l, &left[0][1], 3);
85             pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
86             pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
87         }
88     }
89 
90     uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
91     if (edges & LR_HAVE_BOTTOM) {
92         // Copy next loop filtered rows
93         const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
94         const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
95         pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
96         pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
97         pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
98     } else {
99         // Pad with last row
100         const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
101         pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
102         pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
103         pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
104         if (have_left) {
105             pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
106             pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
107             pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
108         }
109     }
110 
111     // Inner UNIT_WxSTRIPE_H
112     for (int j = 0; j < stripe_h; j++) {
113         pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
114         dst_tl += REST_UNIT_STRIDE;
115         p += PXSTRIDE(stride);
116     }
117 
118     if (!have_right) {
119         uint8_t *pad = dst_l + unit_w;
120         uint8_t *row_last = &dst_l[unit_w - 1];
121         // Pad 3x(STRIPE_H+6) with last column
122         for (int j = 0; j < stripe_h + 6; j++) {
123             pixel_set(pad, *row_last, 3);
124             pad += REST_UNIT_STRIDE;
125             row_last += REST_UNIT_STRIDE;
126         }
127     }
128 
129     if (!have_left) {
130         // Pad 3x(STRIPE_H+6) with first column
131         for (int j = 0; j < stripe_h + 6; j++) {
132             pixel_set(dst, *dst_l, 3);
133             dst += REST_UNIT_STRIDE;
134             dst_l += REST_UNIT_STRIDE;
135         }
136     } else {
137         dst += 3 * REST_UNIT_STRIDE;
138         for (int j = 0; j < stripe_h; j++) {
139             pixel_copy(dst, &left[j][1], 3);
140             dst += REST_UNIT_STRIDE;
141         }
142     }
143 }
144 
145 // This function refers to the function in the ppc/looprestoration_init_tmpl.c.
146 
147 // FIXME Could split into luma and chroma specific functions,
148 // (since first and last tops are always 0 for chroma)
149 // FIXME Could implement a version that requires less temporary memory
150 // (should be possible to implement with only 6 rows of temp storage)
dav1d_wiener_filter_lsx(uint8_t * p,const ptrdiff_t p_stride,const uint8_t (* const left)[4],const uint8_t * lpf,const int w,const int h,const LooprestorationParams * const params,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)151 void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
152                               const uint8_t (*const left)[4],
153                               const uint8_t *lpf,
154                               const int w, const int h,
155                               const LooprestorationParams *const params,
156                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
157 {
158     const int16_t (*const filter)[8] = params->filter;
159 
160     // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
161     // of padding above and below
162     ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
163     padding(tmp, p, p_stride, left, lpf, w, h, edges);
164     ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
165 
166     BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
167     BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
168 }
169 
// Wiener filter entry point, LASX variant. Same sequence as the LSX entry
// point: pad the stripe, run the horizontal pass over h + 6 padded rows with
// params->filter[0], then the vertical pass with params->filter[1].
void dav1d_wiener_filter_lasx(uint8_t *p, const ptrdiff_t p_stride,
                              const uint8_t (*const left)[4],
                              const uint8_t *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
    // of padding above and below
    ALIGN_STK_16(uint8_t, tmp, (64 + 3 + 3) * REST_UNIT_STRIDE,);
    ALIGN_STK_16(int32_t, hor, (64 + 3 + 3) * REST_UNIT_STRIDE + 64,);

    const int16_t *const taps_h = params->filter[0];
    const int16_t *const taps_v = params->filter[1];

    padding(tmp, p, p_stride, left, lpf, w, h, edges);

    BF(dav1d_wiener_filter_h, lasx)(hor, tmp, taps_h, w, h + 6);
    BF(dav1d_wiener_filter_v, lasx)(p, p_stride, hor, taps_v, w, h);
}
188 
189 void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
190                               const int w, const int h);
191 void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
192                               const int w, const int h);
193 
194 void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
195                                   const int w, const int h, const int w1);
196 void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
197                                   int32_t *sumsq, int16_t *sum,
198                                   const int w, const int h);
199 void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
200                                    int16_t *dst, int w1,
201                                    const int w, const int h);
202 
203 void BF(dav1d_boxsum3_h, lasx)(int32_t *sumsq, int16_t *sum, pixel *src,
204                                const int w, const int h);
205 void BF(dav1d_boxsum3_sgf_h, lasx)(int32_t *sumsq, int16_t *sum,
206                                    const int w, const int h, const int w1);
207 void BF(dav1d_boxsum3_sgf_v, lasx)(int16_t *dst, uint8_t *tmp,
208                                    int32_t *sumsq, int16_t *sum,
209                                    const int w, const int h);
210 
// Runs the horizontal and then the vertical 3x3 box-sum pass (both LSX) over
// the padded area, i.e. with both dimensions enlarged by 6.
static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    const int padded_w = w + 6;
    const int padded_h = h + 6;

    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, padded_w, padded_h);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, padded_w, padded_h);
}
217 
// Runs the horizontal 3x3 box-sum pass (LASX) and then the vertical pass over
// the padded area, i.e. with both dimensions enlarged by 6.
static inline void boxsum3_lasx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    const int padded_w = w + 6;
    const int padded_h = h + 6;

    BF(dav1d_boxsum3_h, lasx)(sumsq, sum, src, padded_w, padded_h);
    // The vertical pass is the LSX routine: this file declares no LASX one.
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, padded_w, padded_h);
}
224 
// SGR filter using 3x3 box sums, LSX variant.
//
// Steps: pad the stripe into tmp, accumulate the box sums into sumsq/sum,
// run the two sgf passes (the first one parameterised by params->sgr.s1) to
// produce dst, then call the finish routine with p, p_stride, dst and
// params->sgr.w1.
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Scratch storage, all on the stack.
    ALIGN_STK_16(uint8_t, tmp, (64 + 3 + 3) * REST_UNIT_STRIDE,);
    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
    coef dst[64 * 384];

    padding(tmp, p, p_stride, left, lpf, w, h, edges);

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);

    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}
244 
// SGR filter using 3x3 box sums, LASX variant.
//
// Steps: pad the stripe into tmp, accumulate the box sums into sumsq/sum,
// run the two LASX sgf passes (the first one parameterised by
// params->sgr.s1) to produce dst, then call the finish routine with p,
// p_stride, dst and params->sgr.w1.
void dav1d_sgr_filter_3x3_lasx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Scratch storage, all on the stack.
    ALIGN_STK_16(uint8_t, tmp, (64 + 3 + 3) * REST_UNIT_STRIDE,);
    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
    coef dst[64 * 384];

    padding(tmp, p, p_stride, left, lpf, w, h, edges);

    boxsum3_lasx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lasx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lasx)(dst, tmp, sumsq, sum, w, h);

    // The finish step is the LSX routine: this file declares no LASX one.
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}
264 
265 void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
266                               const uint8_t *const src,
267                               const int w, const int h);
268 
269 void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
270                               const int w, const int h);
271 
272 void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
273                                   const int w, const int h,
274                                   const unsigned s);
275 
276 void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
277                                   int32_t *sumsq, int16_t *sum,
278                                   const int w, const int h);
279 
280 void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
281                                    const int16_t *dst0, const int16_t *dst1,
282                                    const int w0, const int w1,
283                                    const int w, const int h);
284 
// Runs the horizontal and then the vertical 5x5 box-sum pass (both LSX) over
// the padded area, i.e. with both dimensions enlarged by 6.
static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    const int padded_w = w + 6;
    const int padded_h = h + 6;

    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, padded_w, padded_h);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, padded_w, padded_h);
}
291 
// SGR filter using 5x5 box sums, LSX variant.
//
// Steps: pad the stripe into tmp, accumulate the box sums into sumsq/sum,
// run the two sgf passes (the first one parameterised by params->sgr.s0) to
// produce dst, then call the finish routine with p, p_stride, dst and
// params->sgr.w0.
void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Scratch storage, all on the stack.
    ALIGN_STK_16(uint8_t, tmp, (64 + 3 + 3) * REST_UNIT_STRIDE,);
    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
    coef dst[64 * 384];

    padding(tmp, p, p_stride, left, lpf, w, h, edges);

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);

    // The finish routine is the one named for the 3x3 path; it is reused
    // here with the w0 weight.
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}
311 
// Mixed SGR filter (5x5 and 3x3 box sums combined), LSX variant.
//
// The stripe is padded once into tmp. The 5x5 pass (params->sgr.s0) fills
// dst0 and the 3x3 pass (params->sgr.s1) fills dst1; both intermediates are
// then passed to the mix finish routine together with params->sgr.w0 and
// params->sgr.w1.
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Scratch storage, all on the stack. A single sumsq0/sum0 pair serves
    // both passes: the 3x3 pass reuses it after the 5x5 pass has written dst0.
    ALIGN_STK_16(uint8_t, tmp, (64 + 3 + 3) * REST_UNIT_STRIDE,);
    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    padding(tmp, p, p_stride, left, lpf, w, h, edges);

    // 5x5 pass -> dst0
    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    // 3x3 pass -> dst1
    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                   params->sgr.w1, w, h);
}
338 #endif
339