/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
// In the comments, let RefTable denote the original, reference table.
const x_by_x_tables
// RangeMins
//
// Min(RefTable[i*8:i*8+8])
// First two values are zeroed.
//
// Lookup using RangeMins[(x >> 3)]
        .byte 0,  0, 11,  8,  6,  5,  5,  4,  4,  3,  3,  3,  2,  2,  2,  2
        .byte 2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0

// DiffMasks
//
// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
// RefTable changes at that particular index.
// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
//
// Lookup using DiffMasks[(x >> 3)]
        .byte 0x00, 0x00, 0xD4, 0x44
        .byte 0x42, 0x04, 0x00, 0x00
        .byte 0x00, 0x80, 0x00, 0x00
        .byte 0x04, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x40, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x02
// Binary form:
// 0b00000000, 0b00000000, 0b11010100, 0b01000100
// 0b01000010, 0b00000100, 0b00000000, 0b00000000
// 0b00000000, 0b10000000, 0b00000000, 0b00000000
// 0b00000100, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b01000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000010

// RefLo
//
// RefTable[0:16]
//      i.e. the first 16 elements of the original table.
// Added to the sum obtained from the rest of the LUT logic so that the first 16 entries of RefTable are covered as well.
//
// Lookup using RefLo[x] (tbl will replace x > 15 with 0)
        .byte 255, 128,  85,  64,  51,  43,  37,  32,  28,  26,  23,  21,  20,  18,  17,  16

// Pseudo assembly
//
// hi_bits = x >> 3
// tbl             ref,    {RefLo}, x
// tbl             diffs,  {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
// tbl             min,    {RangeMins[0:16], RangeMins[16:32]}, hi_bits
// lo_bits = x & 0x7
// diffs = diffs << lo_bits
// ref = ref + min
// integral = popcnt(diffs)
// ref = ref + integral
// return ref
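//
// Worked example (illustrative only; all values are taken from the tables
// above): for x = 20,
//   hi_bits  = 20 >> 3        = 2
//   lo_bits  = 20 & 0x7       = 4
//   ref      = RefLo[20]      = 0    (tbl yields 0 for indices > 15)
//   min      = RangeMins[2]   = 11
//   diffs    = DiffMasks[2]   = 0xD4; 0xD4 << 4 = 0x40 (kept to 8 bits)
//   integral = popcnt(0x40)   = 1
//   ref      = 0 + 11 + 1     = 12, the reconstructed RefTable[20]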
endconst

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
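//
// Approximate scalar sketch of what one output element of the loop below
// computes (informal, illustrative names; rounding shown loosely as
// round_shift):
//
//   a = sumsq[0][i] + sumsq[1][i] + sumsq[2][i]  // 3-row vertical sums
//   b = sum[0][i]   + sum[1][i]   + sum[2][i]
//   a2 = round_shift(a, 2*bitdepth_min_8)
//   b2 = round_shift(b, bitdepth_min_8)
//   p  = imax(a2*9 - b2*b2, 0)                   // n = 9 for the 3x3 box
//   z  = imin(round_shift(p*s, 20), 255)
//   x  = RefTable[z]                             // via the LUTs above
//   AA[i] = round_shift(x*b*455, 12)             // 455 ~= (1 << 12) / 9
//   BB[i] = 256 - x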
function sgr_box3_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2
        clz             w9,  w6        // bitdepth_max
        dup             v28.4s,   w5   // strength

        ldp             x5,  x6,  [x0]
        ldr             x0,       [x0, #16]
        ldp             x7,  x8,  [x1]
        ldr             x1,       [x1, #16]

        movi            v31.4s,   #9   // n

        sub             w9,  w9,  #24  // -bitdepth_min_8
        movrel          x12, x_by_x_tables
        mov             w13, #455      // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
        movi            v22.16b, #0x7
        ldr             q23, [x12, #64] // RefLo
        dup             v6.8h,    w9   // -bitdepth_min_8
        saddl           v7.4s,    v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,   #1, lsl #8
        dup             v30.4s,   w13  // one_by_x

        ld1             {v8.4s,  v9.4s,  v10.4s, v11.4s}, [x5], #64
        ld1             {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        ld1             {v20.8h, v21.8h}, [x8], #32
        ld1             {v0.8h,  v1.8h},  [x7], #32
1:
        ld1             {v2.8h,  v3.8h},   [x1], #32
        add             v8.4s,   v8.4s,   v12.4s
        add             v9.4s,   v9.4s,   v13.4s
        add             v10.4s,  v10.4s,  v14.4s
        add             v11.4s,  v11.4s,  v15.4s
        add             v0.8h,   v0.8h,   v20.8h
        add             v1.8h,   v1.8h,   v21.8h

        add             v16.4s,  v16.4s,  v8.4s
        add             v17.4s,  v17.4s,  v9.4s
        add             v18.4s,  v18.4s,  v10.4s
        add             v19.4s,  v19.4s,  v11.4s
        add             v4.8h,   v2.8h,   v0.8h
        add             v5.8h,   v3.8h,   v1.8h

        srshl           v16.4s,  v16.4s,  v7.4s
        srshl           v17.4s,  v17.4s,  v7.4s
        srshl           v18.4s,  v18.4s,  v7.4s
        srshl           v19.4s,  v19.4s,  v7.4s
        srshl           v9.8h,   v4.8h,   v6.8h
        srshl           v13.8h,  v5.8h,   v6.8h
        mul             v16.4s,  v16.4s,  v31.4s // a * n
        mul             v17.4s,  v17.4s,  v31.4s // a * n
        mul             v18.4s,  v18.4s,  v31.4s // a * n
        mul             v19.4s,  v19.4s,  v31.4s // a * n
        umull           v8.4s,   v9.4h,   v9.4h  // b * b
        umull2          v9.4s,   v9.8h,   v9.8h  // b * b
        umull           v12.4s,  v13.4h,  v13.4h // b * b
        umull2          v13.4s,  v13.8h,  v13.8h // b * b
        uqsub           v16.4s,  v16.4s,  v8.4s  // imax(a * n - b * b, 0)
        uqsub           v17.4s,  v17.4s,  v9.4s  // imax(a * n - b * b, 0)
        uqsub           v18.4s,  v18.4s,  v12.4s // imax(a * n - b * b, 0)
        uqsub           v19.4s,  v19.4s,  v13.4s // imax(a * n - b * b, 0)
        mul             v16.4s,  v16.4s,  v28.4s // p * s
        mul             v17.4s,  v17.4s,  v28.4s // p * s
        mul             v18.4s,  v18.4s,  v28.4s // p * s
        mul             v19.4s,  v19.4s,  v28.4s // p * s
        uqshrn          v16.4h,  v16.4s,  #16
        uqshrn2         v16.8h,  v17.4s,  #16
        uqshrn          v18.4h,  v18.4s,  #16
        uqshrn2         v18.8h,  v19.4s,  #16
        uqrshrn         v1.8b,   v16.8h,  #4     // imin(z, 255)
        uqrshrn2        v1.16b,  v18.8h,  #4     // imin(z, 255)

        ld1             {v16.4s, v17.4s}, [x0], #32
        subs            w4,  w4,  #16

        ushr            v0.16b,  v1.16b,  #3
        ld1             {v8.4s,  v9.4s}, [x5], #32
        tbl             v2.16b,  {v26.16b, v27.16b}, v0.16b // DiffMasks
        tbl             v0.16b,  {v24.16b, v25.16b}, v0.16b // RangeMins
        tbl             v3.16b,  {v23.16b}, v1.16b          // RefLo
        and             v1.16b,  v1.16b,   v22.16b
        ld1             {v12.4s, v13.4s}, [x6], #32
        ushl            v1.16b,  v2.16b,  v1.16b
        ld1             {v20.8h, v21.8h}, [x8], #32
        add             v3.16b,  v3.16b,  v0.16b
        cnt             v1.16b,  v1.16b
        ld1             {v18.4s, v19.4s}, [x0], #32
        add             v3.16b,  v3.16b,  v1.16b
        ld1             {v10.4s, v11.4s}, [x5], #32
        uxtl            v0.8h,   v3.8b           // x
        uxtl2           v1.8h,   v3.16b          // x

        ld1             {v14.4s, v15.4s}, [x6], #32

        umull           v2.4s,   v0.4h,   v4.4h // x * BB[i]
        umull2          v3.4s,   v0.8h,   v4.8h // x * BB[i]
        umull           v4.4s,   v1.4h,   v5.4h // x * BB[i]
        umull2          v5.4s,   v1.8h,   v5.8h // x * BB[i]
        sub             v0.8h,   v29.8h,  v0.8h // 256 - x
        sub             v1.8h,   v29.8h,  v1.8h // 256 - x
        mul             v2.4s,   v2.4s,  v30.4s // x * BB[i] * sgr_one_by_x
        mul             v3.4s,   v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
        mul             v5.4s,   v5.4s,  v30.4s // x * BB[i] * sgr_one_by_x
        st1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v0.8h, v1.8h}, [x7], #32
        srshr           v2.4s,   v2.4s,  #12    // AA[i]
        srshr           v3.4s,   v3.4s,  #12    // AA[i]
        srshr           v4.4s,   v4.4s,  #12    // AA[i]
        srshr           v5.4s,   v5.4s,  #12    // AA[i]

        st1             {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
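//
// Same approach as sgr_box3_vert_neon above, but summing five rows of
// sumsq/sum instead of three, with n = 25 and one_by_x = 164
// (~(1 << 12) / 25, vs 455 ~= (1 << 12) / 9 for the box3 variant), and
// producing 8 output elements per loop iteration instead of 16.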
function sgr_box5_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]

        add             w4,  w4,  #2
        clz             w15, w6        // bitdepth_max
        dup             v28.4s,   w5   // strength

        ldp             x5,  x6,  [x0]
        ldp             x7,  x8,  [x0, #16]
        ldr             x0,       [x0, #32]
        ldp             x9,  x10, [x1]
        ldp             x11, x12, [x1, #16]
        ldr             x1,       [x1, #32]

        movi            v31.4s,   #25   // n

        sub             w15, w15, #24  // -bitdepth_min_8
        movrel          x13, x_by_x_tables
        movi            v30.4s,  #164  // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
        dup             v6.8h,   w15  // -bitdepth_min_8
        movi            v19.8b,  #0x7
        ldr             q18, [x13, #64] // RefLo
        saddl           v7.4s,   v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,  #1, lsl #8

        ld1             {v8.4s,  v9.4s},  [x5], #32
        ld1             {v10.4s, v11.4s}, [x6], #32
        ld1             {v12.4s, v13.4s}, [x7], #32
        ld1             {v16.4s, v17.4s}, [x8], #32
        ld1             {v20.8h},         [x9], #16
        ld1             {v21.8h},         [x10], #16
        ld1             {v22.8h},         [x11], #16
        ld1             {v23.8h},         [x12], #16
        ld1             {v0.4s,  v1.4s},  [x0], #32
        ld1             {v2.8h},          [x1], #16

1:
        add             v8.4s,   v8.4s,   v10.4s
        add             v9.4s,   v9.4s,   v11.4s
        add             v12.4s,  v12.4s,  v16.4s
        add             v13.4s,  v13.4s,  v17.4s

        add             v20.8h,  v20.8h,  v21.8h
        add             v22.8h,  v22.8h,  v23.8h

        add             v0.4s,   v0.4s,   v8.4s
        add             v1.4s,   v1.4s,   v9.4s
        add             v2.8h,   v2.8h,   v20.8h

        add             v0.4s,   v0.4s,   v12.4s
        add             v1.4s,   v1.4s,   v13.4s
        add             v2.8h,   v2.8h,   v22.8h

        subs            w4,  w4,  #8

        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v1.4s,   v1.4s,   v7.4s
        srshl           v4.8h,   v2.8h,   v6.8h
        mul             v0.4s,   v0.4s,   v31.4s // a * n
        mul             v1.4s,   v1.4s,   v31.4s // a * n
        umull           v3.4s,   v4.4h,   v4.4h  // b * b
        umull2          v4.4s,   v4.8h,   v4.8h  // b * b
        uqsub           v0.4s,   v0.4s,   v3.4s  // imax(a * n - b * b, 0)
        uqsub           v1.4s,   v1.4s,   v4.4s  // imax(a * n - b * b, 0)
        mul             v0.4s,   v0.4s,   v28.4s // p * s
        mul             v1.4s,   v1.4s,   v28.4s // p * s
        ld1             {v8.4s,  v9.4s},  [x5], #32
        uqshrn          v0.4h,   v0.4s,   #16
        uqshrn2         v0.8h,   v1.4s,   #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,   v0.8h,   #4     // imin(z, 255)

        ld1             {v12.4s, v13.4s}, [x7], #32

        ushr            v1.8b,   v0.8b,  #3
        ld1             {v16.4s, v17.4s}, [x8], #32
        tbl             v5.8b,   {v26.16b, v27.16b}, v1.8b // DiffMasks
        tbl             v1.8b,   {v24.16b, v25.16b}, v1.8b // RangeMins
        tbl             v4.8b,   {v18.16b}, v0.8b          // RefLo
        and             v0.8b,   v0.8b,  v19.8b
        ld1             {v20.8h},         [x9], #16
        ushl            v5.8b,   v5.8b,  v0.8b
        add             v4.8b,   v4.8b,  v1.8b
        ld1             {v21.8h},         [x10], #16
        cnt             v5.8b,   v5.8b
        ld1             {v22.8h},         [x11], #16
        add             v5.8b,   v4.8b,  v5.8b
        ld1             {v23.8h},         [x12], #16
        uxtl            v5.8h,   v5.8b           // x

        ld1             {v0.4s,  v1.4s},  [x0], #32
        umull           v3.4s,   v5.4h,   v2.4h  // x * BB[i]
        umull2          v4.4s,   v5.8h,   v2.8h  // x * BB[i]
        mul             v3.4s,   v3.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,   v3.4s,   #12    // AA[i]
        srshr           v4.4s,   v4.4s,   #12    // AA[i]
        sub             v5.8h,   v29.8h,  v5.8h  // 256 - x
        ld1             {v2.8h},          [x1], #16

        st1             {v3.4s, v4.4s}, [x2], #32
        st1             {v5.8h}, [x3], #16
        b.gt            1b

        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc