/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/looprestoration.h"

#if BITDEPTH == 8

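// Row stride of the intermediate stripe buffers: a restoration unit is at
// most 384 pixels wide, and 3 pixels of padding on either side make 390;
// 400 presumably leaves some slack for alignment.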
#define REST_UNIT_STRIDE (400)

void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
                                    uint8_t *tmp_ptr,
                                    const int16_t filterh[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_h, lasx)(int32_t *hor_ptr,
                                     uint8_t *tmp_ptr,
                                     const int16_t filterh[8],
                                     const int w, const int h);

void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
                                    const ptrdiff_t p_stride,
                                    const int32_t *hor,
                                    const int16_t filterv[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_v, lasx)(uint8_t *p,
                                     const ptrdiff_t p_stride,
                                     const int32_t *hor,
                                     const int16_t filterv[8],
                                     const int w, const int h);

// This function is adapted from the one in ppc/looprestoration_init_tmpl.c.
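//
// padding() builds a (stripe_h + 6)-row working copy of the stripe in `dst`,
// REST_UNIT_STRIDE bytes per row, with a 3-pixel border on every side so the
// SIMD filter kernels can read out of bounds unconditionally. Depending on
// `edges`, the border comes from the previously loop-filtered rows (`lpf`),
// the saved left columns (`left`), or is replicated from the outermost
// row/column.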
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}

// This function is adapted from the one in ppc/looprestoration_init_tmpl.c.

// FIXME Could split into luma- and chroma-specific functions
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
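// Both entry points below follow the same two-pass structure: pad the stripe
// into `tmp`, run the horizontal filter taps over all h + 6 padded rows into
// the intermediate `hor` buffer, then run the vertical taps over `hor` and
// write the final h rows back to `p`.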
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 rows,
    // plus 3 rows of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
    BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
}

void dav1d_wiener_filter_lasx(uint8_t *p, const ptrdiff_t p_stride,
                              const uint8_t (*const left)[4],
                              const uint8_t *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 rows,
    // plus 3 rows of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    BF(dav1d_wiener_filter_h, lasx)(hor, tmp, filter[0], w, h + 6);
    BF(dav1d_wiener_filter_v, lasx)(p, p_stride, hor, filter[1], w, h);
}

void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
                              const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
                                   int16_t *dst, int w1,
                                   const int w, const int h);

void BF(dav1d_boxsum3_h, lasx)(int32_t *sumsq, int16_t *sum, pixel *src,
                               const int w, const int h);
void BF(dav1d_boxsum3_sgf_h, lasx)(int32_t *sumsq, int16_t *sum,
                                   const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lasx)(int16_t *dst, uint8_t *tmp,
                                   int32_t *sumsq, int16_t *sum,
                                   const int w, const int h);

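// boxsum3 computes 3x3 box sums (`sum`) and sums of squares (`sumsq`) over
// the whole padded (w + 6) x (h + 6) stripe, split into a horizontal and a
// vertical pass.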
static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

static inline void boxsum3_lasx(int32_t *sumsq, coef *sum, pixel *src,
                                const int w, const int h)
{
    BF(dav1d_boxsum3_h, lasx)(sumsq, sum, src, w + 6, h + 6);
    // No LASX vertical pass is declared above; the LSX one is used instead.
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

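// The 3x3 self-guided filter: 3x3 box sums over the padded stripe, an sgf_h
// pass that turns the sums into guided-filter coefficients using strength s1,
// an sgf_v pass producing the filtered stripe in `dst`, and a finish pass
// that blends `dst` into the output `p` with weight w1.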
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8,);
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16,);

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

void dav1d_sgr_filter_3x3_lasx(pixel *p, const ptrdiff_t p_stride,
                               const pixel (*const left)[4],
                               const pixel *lpf,
                               const int w, const int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8,);
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16,);

    boxsum3_lasx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lasx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lasx)(dst, tmp, sumsq, sum, w, h);
    // No LASX finish pass is declared above; the LSX one is used instead.
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
                              const uint8_t *const src,
                              const int w, const int h);

void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h,
                                  const unsigned s);

void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);

void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
                                   const int16_t *dst0, const int16_t *dst1,
                                   const int w0, const int w1,
                                   const int w, const int h);

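// boxsum5 is the 5x5 counterpart of boxsum3 above, again computing box sums
// and sums of squares over the padded (w + 6) x (h + 6) stripe in separate
// horizontal and vertical passes.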
static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}

void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8,);
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16,);

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    // The single-source finish pass is shared with the 3x3 filter; here it
    // applies weight w0.
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}

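// The mix filter applies both the 5x5 (s0/w0) and 3x3 (s1/w1) self-guided
// passes to the same padded stripe, reusing a single sumsq/sum scratch pair
// for both, and combines the two intermediate results in one weighted finish
// pass.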
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8,);
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16,);

    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                  params->sgr.w1, w, h);
}
#endif // BITDEPTH == 8