/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
//                int bitdepth_max);
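// In the 16 bpc build, pixel is uint16_t and coef is int32_t (hence the .4s
// coefficient loads below); bitdepth_max arrives in w4 and is the largest
// valid pixel value. Most helpers in this file materialize the 10-bit
// maximum 0x3ff directly, i.e. these paths are written for 10-bit content.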

// Most of the functions use the following register layout:
// x0-x3  external parameters
// x4     function pointer to first transform
// x5     function pointer to second transform
// x6     output parameter for helper function
// x7     input parameter for helper function
// x8     input stride for helper function
// x9-x12 scratch variables for helper functions
// x13    pointer to list of eob thresholds
// x14    return pointer for helper function
// x15    return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms
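// Leaving v8-v15 unused is deliberate: AAPCS64 designates the low 64 bits of
// those registers callee-saved, so by avoiding them the functions can skip
// stack spills entirely.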

const idct_coeffs, align=4
        // idct4
        .int            2896, 2896*8*(1<<16), 1567, 3784
        // idct8
        .int            799, 4017, 3406, 2276
        // idct16
        .int            401, 4076, 3166, 2598
        .int            1931, 3612, 3920, 1189
        // idct32
        .int            201, 4091, 3035, 2751
        .int            1751, 3703, 3857, 1380
        .int            995, 3973, 3513, 2106
        .int            2440, 3290, 4052, 601
endconst
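// The coefficients are cosine/sine butterfly factors in 12-bit fixed point,
// round(4096*cos(theta))/round(4096*sin(theta)) pairs; e.g. for idct4,
// 2896 = round(4096/sqrt(2)) and (1567, 3784) = round(4096*(cos, sin)(3*pi/8)).
// Values written as c*8*(1<<16) are pre-positioned for sqrdmulh, which then
// yields a rounded x*c >> 12 per lane without a separate shift instruction.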

const idct64_coeffs, align=4
        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
        .int            4076, 401, 4017, 799

        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
        .int            -3166, -2598, -799, -4017

        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
        .int            3612, 1931, 2276, 3406

        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
        .int            -3920, -1189, -3406, -2276
endconst

const iadst4_coeffs, align=4
        .int            1321, 3803, 2482, 3344
endconst

const iadst8_coeffs, align=4
        .int            4076, 401, 3612, 1931
        .int            2598, 3166, 1189, 3920
        // idct_coeffs
        .int            2896, 0, 1567, 3784
endconst

const iadst16_coeffs, align=4
        .int            4091, 201, 3973, 995
        .int            3703, 1751, 3290, 2440
        .int            2751, 3035, 2106, 3513
        .int            1380, 3857, 601, 4052
endconst

.macro mul_mla d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mla             \d\().4s, \s1\().4s, \c1
.endm

.macro mul_mls d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mls             \d\().4s, \s1\().4s, \c1
.endm
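// mul_mla/mul_mls form the two halves of a butterfly rotation: per 32-bit
// lane they compute d = s0*c0 + s1*c1 and d = s0*c0 - s1*c1, and callers
// follow up with srshr #12 to drop the 12-bit fixed-point scale. A scalar
// sketch of the pair (illustrative C, not part of the build):
//
//   int32_t mul_mla(int32_t s0, int32_t s1, int32_t c0, int32_t c1) {
//       return s0 * c0 + s1 * c1; // caller rounds: (d + 2048) >> 12
//   }
//   int32_t mul_mls(int32_t s0, int32_t s1, int32_t c0, int32_t c1) {
//       return s0 * c0 - s1 * c1;
//   }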

.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
        sqrdmulh        \r0\sz,  \r0\sz,  \c
        sqrdmulh        \r1\sz,  \r1\sz,  \c
        sqrdmulh        \r2\sz,  \r2\sz,  \c
        sqrdmulh        \r3\sz,  \r3\sz,  \c
.ifnb \r4
        sqrdmulh        \r4\sz,  \r4\sz,  \c
        sqrdmulh        \r5\sz,  \r5\sz,  \c
        sqrdmulh        \r6\sz,  \r6\sz,  \c
        sqrdmulh        \r7\sz,  \r7\sz,  \c
.endif
.endm
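// With c = 2896*8*(1<<16), as the rectangular transforms below pass in, the
// sqrdmulh scales each lane by a rounded 2896/4096 = 1/sqrt(2), folding the
// extra sqrt(2) normalization of Nx2N/2NxN blocks into the coefficient load.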

.macro smin_4s r0, r1, r2
        smin            \r0\().4s, \r1\().4s, \r2\().4s
.endm
.macro smax_4s r0, r1, r2
        smax            \r0\().4s, \r1\().4s, \r2\().4s
.endm

.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
        ld1             {\load},  [\src], x1
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #\shiftbits
.endif
.ifnb \addsrc
        usqadd          \adddst, \addsrc
.endif
.ifnb \min
        smin            \min,  \min,  v7.8h
.endif
.ifnb \store
        st1             {\store},  [\dst], x1
.endif
.endm
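// Each load_add_store invocation advances one stage of a software pipeline,
// keeping several rows in flight at once: load a row of pixels, round-shift
// a row of coefficients, saturating-add onto the pixels (usqadd clamps the
// signed addend at 0), clamp to the bitdepth maximum kept in v7, and store.
// Per row this amounts to (a scalar sketch in illustrative C, not part of
// the build):
//
//   for (int i = 0; i < 8; i++) {
//       int px = dst[i] + ((coef[i] + (1 << (shiftbits - 1))) >> shiftbits);
//       dst[i] = px < 0 ? 0 : px > bitdepth_max ? bitdepth_max : px;
//   }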
.macro load_add_store_8x16 dst, src
        mov             \src, \dst
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h,  v16.8h,       ,      ,       ,       ,  \dst, \src
        load_add_store  v3.8h,  v17.8h,       ,      ,       ,       ,  \dst, \src
        load_add_store  v4.8h,  v18.8h, v16.8h, v2.8h,       ,       ,  \dst, \src
        load_add_store  v5.8h,  v19.8h, v17.8h, v3.8h,  v2.8h,       ,  \dst, \src
        load_add_store  v16.8h, v20.8h, v18.8h, v4.8h,  v3.8h,  v2.8h,  \dst, \src
        load_add_store  v17.8h, v21.8h, v19.8h, v5.8h,  v4.8h,  v3.8h,  \dst, \src
        load_add_store  v18.8h, v22.8h, v20.8h, v16.8h, v5.8h,  v4.8h,  \dst, \src
        load_add_store  v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h,  \dst, \src
        load_add_store  v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
        load_add_store  v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
        load_add_store  v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
        load_add_store  v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
        load_add_store  v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
        load_add_store  v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
        load_add_store  v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
        load_add_store  v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
        load_add_store        ,       , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
        load_add_store        ,       , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
        load_add_store        ,       ,       ,       , v27.8h, v26.8h, \dst, \src
        load_add_store        ,       ,       ,       ,       , v27.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
        mov             \src, \dst
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h,  v16.8h,       ,      ,       ,       ,  \dst, \src, \shiftbits
        load_add_store  v3.8h,  v17.8h,       ,      ,       ,       ,  \dst, \src, \shiftbits
        load_add_store  v4.8h,  v18.8h, v16.8h, v2.8h,       ,       ,  \dst, \src, \shiftbits
        load_add_store  v5.8h,  v19.8h, v17.8h, v3.8h,  v2.8h,       ,  \dst, \src, \shiftbits
        load_add_store  v16.8h, v20.8h, v18.8h, v4.8h,  v3.8h,  v2.8h,  \dst, \src, \shiftbits
        load_add_store  v17.8h, v21.8h, v19.8h, v5.8h,  v4.8h,  v3.8h,  \dst, \src, \shiftbits
        load_add_store  v18.8h, v22.8h, v20.8h, v16.8h, v5.8h,  v4.8h,  \dst, \src, \shiftbits
        load_add_store  v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h,  \dst, \src, \shiftbits
        load_add_store        ,       , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store        ,       , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store        ,       ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store        ,       ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
        mov             \src, \dst
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h,       ,      ,      ,      , \dst, \src, \shiftbits
        load_add_store  v3.8h, v17.8h,       ,      ,      ,      , \dst, \src, \shiftbits
        load_add_store  v4.8h, v18.8h, v16.8h, v2.8h,      ,      , \dst, \src, \shiftbits
        load_add_store  v5.8h, v19.8h, v17.8h, v3.8h, v2.8h,      , \dst, \src, \shiftbits
        load_add_store       ,       , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
        load_add_store       ,       , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,       ,      , v5.8h, v4.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,       ,      ,      , v5.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
.ifnb \load
        ld1             {\load}[0],  [\src], x1
.endif
.ifnb \inssrc
        ins             \insdst\().d[1],   \inssrc\().d[0]
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #4
.endif
.ifnb \load
        ld1             {\load}[1],  [\src], x1
.endif
.ifnb \addsrc
        usqadd          \adddst, \addsrc
.endif
.ifnb \store
        st1             {\store}[0],  [\dst], x1
.endif
.ifnb \min
        smin            \min,  \min,  v7.8h
.endif
.ifnb \store
        st1             {\store}[1],  [\dst], x1
.endif
.endm
.macro load_add_store_4x16 dst, src
        mov             \src, \dst
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d,  v17, v16,       ,       ,      ,       ,      ,  \dst, \src
        load_add_store4 v1.d,  v19, v18,       ,       ,      ,       ,      ,  \dst, \src
        load_add_store4 v2.d,  v21, v20, v16.8h,       ,      ,       ,      ,  \dst, \src
        load_add_store4 v3.d,  v23, v22, v18.8h, v16.8h, v0.8h,       ,      ,  \dst, \src
        load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h,  v0.8h,      ,  \dst, \src
        load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h,  v1.8h,  v0.d,  \dst, \src
        load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h,  v2.8h,  v1.d,  \dst, \src
        load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h,  v2.d,  \dst, \src
        load_add_store4      ,    ,    , v28.8h, v26.8h, v19.8h, v17.8h, v3.d,  \dst, \src
        load_add_store4      ,    ,    , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
        load_add_store4      ,    ,    ,       , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
        load_add_store4      ,    ,    ,       ,      ,        , v23.8h, v21.d, \dst, \src
        load_add_store4      ,    ,    ,       ,      ,        ,       , v23.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
        mov             \src, \dst
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d, v17, v16,       ,       ,      ,      ,     , \dst, \src
        load_add_store4 v1.d, v19, v18,       ,       ,      ,      ,     , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h,       ,      ,      ,     , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h,      ,     , \dst, \src
        load_add_store4     ,    ,    , v20.8h, v18.8h, v1.8h, v0.8h,     , \dst, \src
        load_add_store4     ,    ,    , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
        load_add_store4     ,    ,    ,       , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
        load_add_store4     ,    ,    ,       ,       ,      , v3.8h, v2.d, \dst, \src
        load_add_store4     ,    ,    ,       ,       ,      ,      , v3.d, \dst, \src
.endm

.macro idct_dc w, h, shift
        cbnz            w3,  1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v0.2s,   w16
        sqrdmulh        v20.4s,  v16.4s,  v0.s[0]
        str             wzr, [x2]
.if (\w == 2*\h) || (2*\w == \h)
        sqrdmulh        v20.4s,  v20.4s,  v0.s[0]
.endif
.if \shift > 0
        sqrshrn         v16.4h,  v20.4s,  #\shift
        sqrshrn2        v16.8h,  v20.4s,  #\shift
.else
        sqxtn           v16.4h,  v20.4s
        sqxtn2          v16.8h,  v20.4s
.endif
        sqrdmulh        v16.8h,  v16.8h,  v0.h[1]
        srshr           v16.8h,  v16.8h,  #4
        mov             w4,  #\h
        b               idct_dc_w\w\()_neon
1:
.endm
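// idct_dc implements the eob == 0 fast path: only the DC coefficient is
// nonzero (it is also cleared here with str wzr), so the transform collapses
// to adding one rounded value to every pixel. Each sqrdmulh with 2896*8
// scales by 1/sqrt(2); rectangular blocks (w == 2*h or h == 2*w) take one
// extra such scale. The final srshr #4 matches the second-pass downshift of
// the full transforms, after which w4 carries the row count into the
// width-specific add/clamp/store loop below.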

function idct_dc_w4_neon
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        subs            w4,  w4,  #4
        ld1             {v1.d}[1], [x0], x1
        usqadd          v0.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #2
        usqadd          v1.8h,   v16.8h
        smin            v0.8h,   v0.8h,   v31.8h
        st1             {v0.d}[0], [x0], x1
        smin            v1.8h,   v1.8h,   v31.8h
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w8_neon
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h}, [x0], x1
        subs            w4,  w4,  #4
        ld1             {v1.8h}, [x0], x1
        usqadd          v0.8h,   v16.8h
        ld1             {v2.8h}, [x0], x1
        usqadd          v1.8h,   v16.8h
        ld1             {v3.8h}, [x0], x1
        usqadd          v2.8h,   v16.8h
        usqadd          v3.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #2
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        st1             {v0.8h}, [x0], x1
        smin            v2.8h,   v2.8h,   v31.8h
        st1             {v1.8h}, [x0], x1
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w16_neon
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h}, [x0], x1
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x0], x1
        usqadd          v0.8h,   v16.8h
        usqadd          v1.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #1
        usqadd          v2.8h,   v16.8h
        usqadd          v3.8h,   v16.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w32_neon
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w4,  w4,  #1
        usqadd          v0.8h,   v16.8h
        usqadd          v1.8h,   v16.8h
        usqadd          v2.8h,   v16.8h
        usqadd          v3.8h,   v16.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w64_neon
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
        sub             x1,  x1,  #64
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        subs            w4,  w4,  #1
        usqadd          v0.8h,   v16.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
        usqadd          v1.8h,   v16.8h
        sub             x0,  x0,  #64
        usqadd          v2.8h,   v16.8h
        usqadd          v3.8h,   v16.8h
        usqadd          v4.8h,   v16.8h
        usqadd          v5.8h,   v16.8h
        usqadd          v6.8h,   v16.8h
        usqadd          v7.8h,   v16.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        smin            v6.8h,   v6.8h,   v31.8h
        smin            v7.8h,   v7.8h,   v31.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

.macro iwht4
        add             v16.4s,  v16.4s,  v17.4s
        sub             v21.4s,  v18.4s,  v19.4s
        sub             v20.4s,  v16.4s,  v21.4s
        sshr            v20.4s,  v20.4s,  #1
        sub             v18.4s,  v20.4s,  v17.4s
        sub             v17.4s,  v20.4s,  v19.4s
        add             v19.4s,  v21.4s,  v18.4s
        sub             v16.4s,  v16.4s,  v17.4s
.endm
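// This is the lifting form of the inverse 4-point Walsh-Hadamard transform,
// applied in place on v16-v19. A scalar sketch (illustrative C, not part of
// the build), with in0-in3 arriving in v16-v19:
//
//   int t0 = in0 + in1;         // v16
//   int t2 = in2 - in3;         // v21
//   int t4 = (t0 - t2) >> 1;    // v20
//   out2 = t4 - in1;            // v18
//   out1 = t4 - in3;            // v17
//   out3 = t2 + out2;           // v19
//   out0 = t0 - out1;           // v16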

.macro idct_4 r0, r1, r2, r3
        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2]
        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
        srshr           v6.4s,  v6.4s,  #12
        srshr           v2.4s,  v2.4s,  #12
        srshr           v7.4s,  v4.4s,  #12
        srshr           v3.4s,  v3.4s,  #12
        sqadd           \r0\().4s,  v2.4s,   v6.4s
        sqsub           \r3\().4s,  v2.4s,   v6.4s
        sqadd           \r1\().4s,  v3.4s,   v7.4s
        sqsub           \r2\().4s,  v3.4s,   v7.4s
.endm
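// idct_4 is the 4-point inverse DCT butterfly, with v0 holding the idct4
// row of idct_coeffs. Per lane it computes (a scalar sketch in illustrative
// C, not part of the build):
//
//   int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
//   int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
//   int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
//   int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
//   out0 = t0 + t3; out1 = t1 + t2; out2 = t1 - t2; out3 = t0 - t3;
//
// with the final additions/subtractions done saturating (sqadd/sqsub).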

function inv_dct_4s_x4_neon
        AARCH64_VALID_CALL_TARGET
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
        idct_4          v16, v17, v18, v19
        ret
endfunc

.macro iadst_4x4 o0, o1, o2, o3
        movrel          x16, iadst4_coeffs
        ld1             {v0.4s}, [x16]

        sub             v3.4s,   v16.4s,  v18.4s
        mul             v4.4s,   v16.4s,  v0.s[0]
        mla             v4.4s,   v18.4s,  v0.s[1]
        mla             v4.4s,   v19.4s,  v0.s[2]
        mul             v7.4s,   v17.4s,  v0.s[3]
        add             v3.4s,   v3.4s,   v19.4s
        mul             v5.4s,   v16.4s,  v0.s[2]
        mls             v5.4s,   v18.4s,  v0.s[0]
        mls             v5.4s,   v19.4s,  v0.s[1]

        add             \o3\().4s, v4.4s,     v5.4s
        mul             \o2\().4s, v3.4s,     v0.s[3]
        add             \o0\().4s, v4.4s,     v7.4s
        add             \o1\().4s, v5.4s,     v7.4s
        sub             \o3\().4s, \o3\().4s, v7.4s

        srshr           \o0\().4s, \o0\().4s, #12
        srshr           \o2\().4s, \o2\().4s, #12
        srshr           \o1\().4s, \o1\().4s, #12
        srshr           \o3\().4s, \o3\().4s, #12
.endm

function inv_adst_4s_x4_neon
        AARCH64_VALID_CALL_TARGET
        iadst_4x4       v16, v17, v18, v19
        ret
endfunc

function inv_flipadst_4s_x4_neon
        AARCH64_VALID_CALL_TARGET
        iadst_4x4       v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x4_neon
        AARCH64_VALID_CALL_TARGET
        movz            w16, #(5793-4096)*8, lsl #16
        dup             v0.2s,   w16
        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
        sqadd           v16.4s,  v16.4s,  v4.4s
        sqadd           v17.4s,  v17.4s,  v5.4s
        sqadd           v18.4s,  v18.4s,  v6.4s
        sqadd           v19.4s,  v19.4s,  v7.4s
        ret
endfunc
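// 5793 is round(4096*sqrt(2)), the 4-point identity transform scale. Since
// sqrdmulh can only apply factors below 1.0, the sqrt(2) ≈ 1.414 scale is
// split as x + sqrdmulh(x, (5793-4096)*8*(1<<16)) = x + x*1697/4096.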

function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
        mov             x15, x30
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        sshr            v16.4s,  v16.4s,  #2
        sshr            v17.4s,  v17.4s,  #2
        sshr            v18.4s,  v18.4s,  #2
        sshr            v19.4s,  v19.4s,  #2

        iwht4

        st1             {v30.4s, v31.4s}, [x2], #32
        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23

        iwht4

        ld1             {v0.d}[0], [x0], x1
        sqxtn           v16.4h,  v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqxtn2          v16.8h,  v17.4s
        ld1             {v1.d}[0], [x0], x1
        sqxtn           v18.4h,  v18.4s
        ld1             {v1.d}[1], [x0], x1
        sqxtn2          v18.8h,  v19.4s

        b               L(itx_4x4_end)
endfunc

// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers
// x0-x4  external parameters
// x5     function pointer to first transform
// x6     function pointer to second transform
function inv_txfm_add_4x4_neon
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        blr             x5

        st1             {v30.4s, v31.4s}, [x2], #32
        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23

        blr             x6

        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        ld1             {v1.d}[0], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        srshr           v16.8h,  v16.8h,  #4
        srshr           v18.8h,  v18.8h,  #4

L(itx_4x4_end):
        dup             v31.8h,  w4
        sub             x0,  x0,  x1, lsl #2
        usqadd          v0.8h,   v16.8h
        usqadd          v1.8h,   v18.8h
        smin            v0.8h,   v0.8h,   v31.8h
        st1             {v0.d}[0], [x0], x1
        smin            v1.8h,   v1.8h,   v31.8h
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1

        ret             x15
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        cbnz            w3,  1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v4.2s,   w16
        str             wzr, [x2]
        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
        ld1             {v0.d}[0], [x0], x1
        sqxtn           v20.4h,  v16.4s
        sqxtn2          v20.8h,  v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqrdmulh        v20.8h,  v20.8h,  v4.h[1]
        ld1             {v1.d}[0], [x0], x1
        srshr           v16.8h,  v20.8h,  #4
        ld1             {v1.d}[1], [x0], x1
        srshr           v18.8h,  v20.8h,  #4
        movi            v30.8h,  #0
        b               L(itx_4x4_end)
1:
.endif
        adr             x5,  inv_\txfm1\()_4s_x4_neon
        movrel          x6,  X(inv_\txfm2\()_4h_x4_neon)
        b               inv_txfm_add_4x4_neon
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct

def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
        idct_4          \r0, \r2, \r4, \r6

        movi            v5.4s,  #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
        mvni            v4.4s,  #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, \r0, \r2, \r4, \r6
        smin_4s         \r, \r, v5
.endr
.irp r, \r0, \r2, \r4, \r6
        smax_4s         \r, \r, v4
.endr

        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
        mul_mla         v3,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
        srshr           \r1\().4s, v2.4s,  #12           // t4a
        srshr           \r7\().4s, v3.4s,  #12           // t7a
        srshr           \r3\().4s, v6.4s,  #12           // t5a
        srshr           \r5\().4s, v7.4s,  #12           // t6a

        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a

.irp r, v2, \r1, v3, \r3
        smin_4s         \r, \r, v5
.endr
.irp r, v2, \r1, v3, \r3
        smax_4s         \r, \r, v4
.endr

        mul_mls         v7,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
        srshr           v7.4s,  v7.4s,  #12              // t5
        srshr           v6.4s,  v6.4s,  #12              // t6

        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
        sqadd           \r1\().4s,  \r2\().4s,  v6.4s    // out1
        sqsub           v6.4s,      \r2\().4s,  v6.4s    // out6
        sqadd           \r2\().4s,  \r4\().4s,  v7.4s    // out2
        sqsub           \r5\().4s,  \r4\().4s,  v7.4s    // out5
        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
        mov             \r6\().16b, v6.16b               // out6
.endm
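// idct_8 decomposes into the 4-point even half (idct_4 on r0/r2/r4/r6) and
// an odd half built from two rotations with (799, 4017) and (3406, 2276)
// from v1, plus a final 2896 (1/sqrt(2)) butterfly for t5/t6. Intermediates
// are clipped to [-(1<<17), (1<<17)-1]; as the comments above note, that
// range is ~(~bdmax << 7) for bdmax == 0x3ff.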

function inv_dct_4s_x8_neon
        AARCH64_VALID_CALL_TARGET
        movrel          x16, idct_coeffs
        ld1             {v0.4s, v1.4s}, [x16]
        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        movrel          x16, iadst8_coeffs
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
        srshr           v16.4s, v2.4s,  #12  // t0a
        srshr           v23.4s, v4.4s,  #12  // t1a
        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
        srshr           v18.4s, v6.4s,  #12  // t2a
        srshr           v21.4s, v2.4s,  #12  // t3a
        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
        srshr           v20.4s, v4.4s,  #12  // t4a
        srshr           v19.4s, v6.4s,  #12  // t5a
        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
        srshr           v22.4s, v2.4s,  #12  // t6a
        srshr           v17.4s, v4.4s,  #12  // t7a

        ld1             {v0.4s}, [x16]

        movi            v1.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff

        sqadd           v2.4s,   v16.4s,  v20.4s // t0
        sqsub           v3.4s,   v16.4s,  v20.4s // t4
        mvni            v20.4s,  #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
        sqadd           v4.4s,   v23.4s,  v19.4s // t1
        sqsub           v5.4s,   v23.4s,  v19.4s // t5
        sqadd           v6.4s,   v18.4s,  v22.4s // t2
        sqsub           v7.4s,   v18.4s,  v22.4s // t6
        sqadd           v18.4s,  v21.4s,  v17.4s // t3
        sqsub           v19.4s,  v21.4s,  v17.4s // t7

.irp r, v2, v3, v4, v5, v6, v7, v18, v19
        smin_4s         \r, \r, v1
.endr
.irp r, v2, v3, v4, v5, v6, v7, v18, v19
        smax_4s         \r, \r, v20
.endr

        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]

        srshr           v3.4s,  v16.4s, #12  // t4a
        srshr           v5.4s,  v20.4s, #12  // t5a

        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]

        srshr           v7.4s,  v22.4s, #12  // t6a
        srshr           v19.4s, v16.4s, #12  // t7a

        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
        sqsub           v2.4s,     v2.4s, v6.4s  // t2
        sqadd           \o7\().4s, v4.4s, v18.4s // out7
        sqsub           v4.4s,     v4.4s, v18.4s // t3

        mvni            v18.4s,  #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000

        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
        sqsub           v3.4s,     v3.4s, v7.4s  // t6
        sqadd           \o6\().4s, v5.4s, v19.4s // out6
        sqsub           v5.4s,     v5.4s, v19.4s // t7

        // Not clipping the output registers, as they will be downshifted and
        // narrowed afterwards anyway.
.irp r, v2, v4, v3, v5
        smin_4s         \r, \r, v1
.endr
.irp r, v2, v4, v3, v5
        smax_4s         \r, \r, v18
.endr

        sqneg           \o7\().4s, \o7\().4s     // out7
        sqneg           \o1\().4s, \o1\().4s     // out1

        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
        srshr           v2.4s,  v18.4s, #12 // out3
        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
        srshr           v3.4s,  v20.4s, #12 // out5
        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)

        sqneg           \o3\().4s, v2.4s     // out3
        sqneg           \o5\().4s, v3.4s     // out5
.endm

function inv_adst_4s_x8_neon
        AARCH64_VALID_CALL_TARGET
        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

function inv_flipadst_4s_x8_neon
        AARCH64_VALID_CALL_TARGET
        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x8_neon
        AARCH64_VALID_CALL_TARGET
        sqshl           v16.4s,  v16.4s,  #1
        sqshl           v17.4s,  v17.4s,  #1
        sqshl           v18.4s,  v18.4s,  #1
        sqshl           v19.4s,  v19.4s,  #1
        sqshl           v20.4s,  v20.4s,  #1
        sqshl           v21.4s,  v21.4s,  #1
        sqshl           v22.4s,  v22.4s,  #1
        sqshl           v23.4s,  v23.4s,  #1
        ret
endfunc

function inv_txfm_add_8x8_neon
        movi            v31.4s,  #0

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},     [x6]
        st1             {v31.4s}, [x6], x11
.endr

        blr             x4

        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        sqrshrn2        v24.8h,  v20.4s,  #1
        sqrshrn2        v25.8h,  v21.4s,  #1
        sqrshrn2        v26.8h,  v22.4s,  #1
        sqrshrn2        v27.8h,  v23.4s,  #1

        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i,  #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},     [x2]
        st1             {v31.4s}, [x2], x11
.endr

        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1

        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23

        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        mov             v23.16b, v27.16b

        blr             x5

        load_add_store_8x8 x0, x7
        ret             x15
endfunc
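// inv_txfm_add_8x8_neon runs the first pass twice, on the left and right
// 4-column halves of the 32-bit coefficients. The right half (columns 4-7)
// only needs computing when the eob in w3 reaches the threshold in w13;
// below it, those outputs are known to be zero and the registers are simply
// cleared. Both halves are narrowed to 16 bit with a rounding 1-bit
// downshift before the second (8h) pass.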

.macro def_fn_8x8 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         8,   8,   1
.endif
        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon)
        mov             w13, #\eob_half
        adr             x4,  inv_\txfm1\()_4s_x8_neon
        b               inv_txfm_add_8x8_neon
endfunc
.endm

def_fn_8x8 dct, dct, 10
def_fn_8x8 identity, identity, 10
def_fn_8x8 dct, adst, 10
def_fn_8x8 dct, flipadst, 10
def_fn_8x8 dct, identity, 4
def_fn_8x8 adst, dct, 10
def_fn_8x8 adst, adst, 10
def_fn_8x8 adst, flipadst, 10
def_fn_8x8 flipadst, dct, 10
def_fn_8x8 flipadst, adst, 10
def_fn_8x8 flipadst, flipadst, 10
def_fn_8x8 identity, dct, 4
def_fn_8x8 adst, identity, 4
def_fn_8x8 flipadst, identity, 4
def_fn_8x8 identity, adst, 4
def_fn_8x8 identity, flipadst, 4

function inv_txfm_add_8x4_neon
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        blr             x4

        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        sqxtn           v20.4h,  v20.4s
        sqxtn           v21.4h,  v21.4s
        sqxtn           v22.4h,  v22.4s
        sqxtn           v23.4h,  v23.4s

        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]

        blr             x5

        load_add_store_8x4 x0, x7
        ret             x15
endfunc

function inv_txfm_add_4x8_neon
        movz            w16, #2896*8, lsl #16
        movi            v31.4s,  #0
        dup             v30.2s,  w16

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},     [x6]
        st1             {v31.4s}, [x6], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v20.4h,  v16.4s
        sqxtn           v21.4h,  v17.4s
        sqxtn           v22.4h,  v18.4s
        sqxtn           v23.4h,  v19.4s
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7

        b               2f

1:
.irp i, v20, v21, v22, v23
        movi            \i\().4h, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},     [x2]
        st1             {v31.4s}, [x2], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7

        blr             x5

        load_add_store_4x8 x0, x7
        ret             x15
endfunc

.macro def_fn_48 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  0
.endif
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
.if \w == 4
        mov             w13, #\eob_half
.endif
        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon)
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct, 13
def_fn_48 \w, \h, identity, identity, 13
def_fn_48 \w, \h, dct, adst, 13
def_fn_48 \w, \h, dct, flipadst, 13
def_fn_48 \w, \h, dct, identity, 4
def_fn_48 \w, \h, adst, dct, 13
def_fn_48 \w, \h, adst, adst, 13
def_fn_48 \w, \h, adst, flipadst, 13
def_fn_48 \w, \h, flipadst, dct, 13
def_fn_48 \w, \h, flipadst, adst, 13
def_fn_48 \w, \h, flipadst, flipadst, 13
def_fn_48 \w, \h, identity, dct, 16
def_fn_48 \w, \h, adst, identity, 4
def_fn_48 \w, \h, flipadst, identity, 4
def_fn_48 \w, \h, identity, adst, 16
def_fn_48 \w, \h, identity, flipadst, 16
.endm
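// The eob_half thresholds say how many coefficients can be present before
// the second 4-column half of the first pass has nonzero input; only the
// 4-wide layout consumes the threshold (via w13 in inv_txfm_add_4x8_neon),
// while inv_txfm_add_8x4_neon always loads all 32 coefficients in one go.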
999
1000def_fns_48 4, 8
1001def_fns_48 8, 4
1002
1003
1004function inv_dct_4s_x16_neon
1005        AARCH64_VALID_CALL_TARGET
1006        movrel          x16, idct_coeffs
1007        ld1             {v0.4s, v1.4s}, [x16], #32
1008
1009        idct_8          v16, v18, v20, v22, v24, v26, v28, v30
1010
1011        // idct_8 leaves the row_clip_max/min constants in v5 and v4
1012.irp r, v16, v18, v20, v22, v24, v26, v28, v30
1013        smin            \r\().4s,  \r\().4s,  v5.4s
1014.endr
1015.irp r, v16, v18, v20, v22, v24, v26, v28, v30
1016        smax            \r\().4s,  \r\().4s,  v4.4s
1017.endr
1018
1019        ld1             {v0.4s, v1.4s}, [x16]
1020        sub             x16, x16, #32
1021
1022        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
1023        mul_mla         v3,  v17, v31, v0.s[1], v0.s[0] // -> t15a
1024        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
1025        srshr           v17.4s, v2.4s,  #12             // t8a
1026        srshr           v31.4s, v3.4s,  #12             // t15a
1027        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
1028        mul_mls         v3,  v21, v27, v1.s[0], v1.s[1] // -> t10a
1029        srshr           v23.4s, v6.4s,  #12             // t9a
1030        srshr           v25.4s, v2.4s,  #12             // t14a
1031        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
1032        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
1033        srshr           v21.4s, v3.4s,  #12             // t10a
1034        srshr           v27.4s, v6.4s,  #12             // t13a
1035        mul_mla         v3,  v29, v19, v1.s[3], v1.s[2] // -> t12a
1036        srshr           v19.4s, v2.4s,  #12             // t11a
1037        srshr           v29.4s, v3.4s,  #12             // t12a
1038
1039        ld1             {v0.4s}, [x16]
1040
1041        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
1042        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
1043        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
1044        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
1045        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
1046        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
1047        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
1048        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
1049
1050.irp r, v2, v17, v3, v31, v23, v19, v25, v29
1051        smin            \r\().4s,  \r\().4s,  v5.4s
1052.endr
1053.irp r, v2, v17, v3, v31, v23, v19, v25, v29
1054        smax            \r\().4s,  \r\().4s,  v4.4s
1055.endr
1056
1057        mul_mls         v7,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
1058        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
1059        srshr           v21.4s, v7.4s,  #12             // t9a
1060        srshr           v27.4s, v6.4s,  #12             // t14a
1061
1062        mul_mls         v7,  v29, v23, v0.s[2], v0.s[3] // -> t13a
1063        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
1064        srshr           v29.4s, v7.4s,  #12             // t13a
1065        neg             v6.4s,   v6.4s
1066        srshr           v23.4s, v6.4s,  #12             // t10a
1067
1068        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
1069        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
1070        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
1071        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
1072        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
1073        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
1074        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
1075        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
1076
1077.irp r, v2, v17, v3, v31, v19, v21, v25, v27
1078        smin            \r\().4s,  \r\().4s,  v5.4s
1079.endr
1080.irp r, v2, v17, v3, v31, v19, v21, v25, v27
1081        smax            \r\().4s,  \r\().4s,  v4.4s
1082.endr
1083
1084        mul_mls         v7,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
1085        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
1086        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
1087
1088        srshr           v7.4s,  v7.4s,  #12   // t11
1089        srshr           v6.4s,  v6.4s,  #12   // t12
1090        mul_mla         v3,  v25, v21, v0.s[0], v0.s[0] // -> t13a
1091        srshr           v2.4s,  v2.4s,  #12   // t10a
1092        srshr           v3.4s,  v3.4s,  #12   // t13a
1093
1094        sqadd           v1.4s,   v16.4s,  v31.4s  // out0
1095        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
1096        mov             v16.16b, v1.16b
1097        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
1098        sqsub           v1.4s,   v30.4s,  v17.4s  // out8
1099        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
1100        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
1101        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
1102        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
1103        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
1104        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
1105        sqadd           v19.4s,  v22.4s,  v6.4s   // out3
1106        sqsub           v28.4s,  v22.4s,  v6.4s   // out12
1107        sqadd           v20.4s,  v24.4s,  v7.4s   // out4
1108        sqsub           v27.4s,  v24.4s,  v7.4s   // out11
1109        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
1110        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
1111        mov             v24.16b, v1.16b
1112        mov             v22.16b, v3.16b
1113
1114        ret
1115endfunc
1116
1117.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
1118        movrel          x16, iadst16_coeffs
1119        ld1             {v0.4s, v1.4s}, [x16], #32
1120
1121        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
1122        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
1123        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
1124        srshr           v16.4s, v2.4s,  #12             // t0
1125        srshr           v31.4s, v4.4s,  #12             // t1
1126        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
1127        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
1128        srshr           v18.4s, v6.4s,  #12             // t2
1129        srshr           v29.4s, v2.4s,  #12             // t3
1130        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
1131        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
1132        srshr           v20.4s, v4.4s,  #12             // t4
1133        srshr           v27.4s, v6.4s,  #12             // t5
1134        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
1135        ld1             {v0.4s, v1.4s}, [x16]
1136        movrel          x16, idct_coeffs
1137        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
1138        srshr           v22.4s, v2.4s,  #12             // t6
1139        srshr           v25.4s, v4.4s,  #12             // t7
1140        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
1141        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
1142        srshr           v23.4s, v6.4s,  #12             // t8
1143        srshr           v24.4s, v2.4s,  #12             // t9
1144        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
1145        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
1146        srshr           v21.4s, v4.4s,  #12             // t10
1147        srshr           v26.4s, v6.4s,  #12             // t11
1148        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
1149        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
1150        srshr           v19.4s, v2.4s,  #12             // t12
1151        srshr           v28.4s, v4.4s,  #12             // t13
1152        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
1153        srshr           v17.4s, v6.4s,  #12             // t14
1154        srshr           v30.4s, v2.4s,  #12             // t15
1155
1156        ld1             {v0.4s, v1.4s}, [x16]
1157
1158        movi            v5.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff
1159        mvni            v7.4s,   #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000
1160
1161        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
1162        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
1163        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
1164        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
1165        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
1166        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
1167        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
1168        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
1169        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
1170        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
1171        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
1172        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
1173        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
1174        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
1175        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
1176        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
1177
1178.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
1179        smin_4s         \r, \r, v5
1180.endr
1181.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
1182        smax_4s         \r, \r, v7
1183.endr
1184
1185        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
1186        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
1187        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
1188        srshr           v17.4s, v4.4s,  #12             // t8
1189        srshr           v30.4s, v6.4s,  #12             // t9
1190        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
1191        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
1192        srshr           v18.4s, v2.4s,  #12             // t10
1193        srshr           v29.4s, v4.4s,  #12             // t11
1194        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
1195        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
1196        srshr           v27.4s, v6.4s,  #12             // t12
1197        srshr           v20.4s, v2.4s,  #12             // t13
1198        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
1199        srshr           v25.4s, v4.4s,  #12             // t14
1200        srshr           v22.4s, v6.4s,  #12             // t15
1201
1202        sqsub           v2.4s,   v16.4s,  v21.4s // t4
1203        sqadd           v16.4s,  v16.4s,  v21.4s // t0
1204        sqsub           v3.4s,   v31.4s,  v26.4s // t5
1205        sqadd           v31.4s,  v31.4s,  v26.4s // t1
1206        sqadd           v21.4s,  v23.4s,  v19.4s // t2
1207        sqsub           v23.4s,  v23.4s,  v19.4s // t6
1208        sqadd           v26.4s,  v24.4s,  v28.4s // t3
1209        sqsub           v24.4s,  v24.4s,  v28.4s // t7
1210        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
1211        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
1212        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
1213        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
1214        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
1215        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
1216        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
1217        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
1218
1219.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
1220        smin_4s         \r, \r, v5
1221.endr
1222.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
1223        smax_4s         \r, \r, v7
1224.endr
1225
1226        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
1227        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
1228        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
        srshr           v22.4s, v4.4s,  #12             // t4a
        srshr           v25.4s, v6.4s,  #12             // t5a
        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
        srshr           v24.4s, v2.4s,  #12             // t6a
        srshr           v23.4s, v4.4s,  #12             // t7a
        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
        srshr           v17.4s, v6.4s,  #12             // t12
        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
        srshr           v29.4s, v2.4s,  #12             // t13
        srshr           v30.4s, v4.4s,  #12             // t14
        srshr           v18.4s, v6.4s,  #12             // t15

        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
.ifc \o0, v16
        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
.else
        sqadd           v4.4s,      v16.4s,  v21.4s // out0
        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
        mov             \o0\().16b, v4.16b
.endif

        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
        sqsub           v26.4s,     v17.4s,  v30.4s // t14a

        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
        sqsub           v27.4s,     v19.4s,  v27.4s // t10
        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
        sqsub           v20.4s,     v28.4s,  v20.4s // t11

        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
        sqsub           v22.4s,     v22.4s,  v24.4s // t6
        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
        sqsub           v23.4s,     v25.4s,  v23.4s // t7

        // Not clipping the output registers, as they will be downshifted and
        // narrowed afterwards anyway.
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
        smin_4s         \r, \r, v5
.endr
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
        smax_4s         \r, \r, v7
.endr

        sqneg           \o15\().4s, \o15\().4s      // out15
        sqneg           \o13\().4s, \o13\().4s      // out13
        sqneg           \o1\().4s,  \o1\().4s       // out1
        sqneg           \o3\().4s,  \o3\().4s       // out3

        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)

        srshr           v24.4s, v24.4s, #12             // out8
        srshr           v4.4s,  v4.4s,  #12             // out7
        srshr           v5.4s,  v6.4s,  #12             // out5
        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
        srshr           v26.4s, v6.4s,  #12             // out10

        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)

        srshr           \o4\().4s,   v2.4s,  #12        // out4
        srshr           v6.4s,       v6.4s,  #12        // out11
        srshr           v7.4s,       v21.4s, #12        // out9
        srshr           \o6\().4s,   v22.4s, #12        // out6

.ifc \o8, v23
        mov             \o8\().16b,  v24.16b
        mov             \o10\().16b, v26.16b
.endif

        sqneg           \o7\().4s,   v4.4s // out7
        sqneg           \o5\().4s,   v5.4s // out5
        sqneg           \o11\().4s,  v6.4s // out11
        sqneg           \o9\().4s,   v7.4s // out9
.endm
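
// Note on the last stage above: v0.s[0] is assumed to still hold 2896
// (~4096/sqrt(2)) at this point, so out4-out11 are each formed as
// (a +/- b) * 2896 >> 12, i.e. a butterfly followed by a multiply by
// 1/sqrt(2) in Q12 fixed point, with srshr providing the rounding.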

function inv_adst_4s_x16_neon
        AARCH64_VALID_CALL_TARGET
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        ret
endfunc

function inv_flipadst_4s_x16_neon
        AARCH64_VALID_CALL_TARGET
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x16_neon
        AARCH64_VALID_CALL_TARGET
        movz            w16, #2*(5793-4096)*8, lsl #16
        dup             v0.2s,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
        sqadd           v\i\().4s,  v\i\().4s,  v2.4s
.endr
        ret
endfunc
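
// The identity16 transform scales by 2*5793/4096 ~= 2*sqrt(2); it is
// computed above as 2*x (two saturating adds) plus
// sqrdmulh(x, 2*(5793-4096)*8 << 16), which contributes x*1697/2048,
// giving x*5793/2048 in total, with saturation at every step.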

.macro identity_4x16_shift1 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s,   \i,      \c
        srshr           v3.4s,   v3.4s,   #1
        sqadd           \i,      \i,      v3.4s
.endr
.endm

.macro identity_4x16 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s,   \i,      \c
        sqadd           \i,      \i,      \i
        sqadd           \i,      \i,      v3.4s
.endr
.endm
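
// Both macros apply the same identity16 multiplier: with
// \c == 2*(5793-4096)*8 << 16, identity_4x16 yields x*5793/2048, while
// identity_4x16_shift1 folds the usual ">> 1" output shift into the
// multiply, yielding x + x*1697/4096 == x*5793/4096.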

.macro def_horz_16 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x4_neon
        mov             x14, x30
        movi            v7.4s,  #0
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.endif
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        blr             x4
        sqrshrn         v16.4h,  v16.4s,  #\shift
        sqrshrn         v17.4h,  v17.4s,  #\shift
        sqrshrn         v18.4h,  v18.4s,  #\shift
        sqrshrn         v19.4h,  v19.4s,  #\shift
        sqrshrn2        v16.8h,  v20.4s,  #\shift
        sqrshrn2        v17.8h,  v21.4s,  #\shift
        sqrshrn2        v18.8h,  v22.4s,  #\shift
        sqrshrn2        v19.8h,  v23.4s,  #\shift
        sqrshrn         v20.4h,  v24.4s,  #\shift
        sqrshrn         v21.4h,  v25.4s,  #\shift
        sqrshrn         v22.4h,  v26.4s,  #\shift
        sqrshrn         v23.4h,  v27.4s,  #\shift
        sqrshrn2        v20.8h,  v28.4s,  #\shift
        sqrshrn2        v21.8h,  v29.4s,  #\shift
        sqrshrn2        v22.8h,  v30.4s,  #\shift
        sqrshrn2        v23.8h,  v31.4s,  #\shift
.if \scale
        b               L(horz_16x4_epilog)
.else
L(horz_16x4_epilog):
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7

.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
        st1             {\i}, [x6], #16
.endr

        ret             x14
.endif
endfunc
.endm

def_horz_16 scale=1, shift=1, suffix=_scale
def_horz_16 scale=0, shift=2
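
// The scale=1 instantiation above ends with a branch to
// L(horz_16x4_epilog), reusing the transpose/store epilog that is only
// emitted by the scale=0 instantiation; the two variants otherwise
// differ in the 2896/4096 input prescaling and in the downshift applied
// after the transform.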

function inv_txfm_add_vert_8x16_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5
        load_add_store_8x16 x6, x7
        ret             x14
endfunc

function inv_txfm_add_16x16_neon
        mov             x15, x30
        sub             sp,  sp,  #512
        ldrh            w12, [x13], #2
.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f
1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b
3:
.irp i, 0, 8
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #512
        ret             x15
endfunc
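
// The 16x16 transform works on 16x4 strips: for each strip past the
// first, the eob (w3) is compared against the next threshold from the
// list at x13, and once it falls below, the remaining strips of the
// intermediate buffer are zero-filled (at 1:/2:) instead of being
// transformed.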

const eob_16x16
        .short 10, 36, 78, 256
endconst

const eob_16x16_identity
        .short 4, 8, 12, 256
endconst

.macro def_fn_16x16 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16,  16,  2
.endif
        adr             x4,  inv_\txfm1\()_4s_x16_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_16x16
.else
        movrel          x13, eob_16x16_identity
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_16x16_identity
.else
        movrel          x13, eob_16x16
.endif
.endif
        b               inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct
def_fn_16x16 identity, identity
def_fn_16x16 dct, adst
def_fn_16x16 dct, flipadst
def_fn_16x16 dct, identity
def_fn_16x16 adst, dct
def_fn_16x16 adst, adst
def_fn_16x16 adst, flipadst
def_fn_16x16 flipadst, dct
def_fn_16x16 flipadst, adst
def_fn_16x16 flipadst, flipadst
def_fn_16x16 identity, dct

function inv_txfm_add_16x4_neon
        mov             x15, x30
        movi            v4.4s,  #0

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], #16
.endr

        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        mov             x6,  x0
        load_add_store_8x4 x6, x7

        sqrshrn         v16.4h,  v24.4s,  #1
        sqrshrn         v17.4h,  v25.4s,  #1
        sqrshrn         v18.4h,  v26.4s,  #1
        sqrshrn         v19.4h,  v27.4s,  #1
        sqrshrn2        v16.8h,  v28.4s,  #1
        sqrshrn2        v17.8h,  v29.4s,  #1
        sqrshrn2        v18.8h,  v30.4s,  #1
        sqrshrn2        v19.8h,  v31.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        add             x6,  x0,  #16
        load_add_store_8x4 x6, x7

        ret             x15
endfunc
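
// 16x4: the first transform handles all 16 columns at once as 16 4s
// vectors; the result is then narrowed to 8h, and the second transform
// plus add/store are run separately on the left and right 8-column
// halves.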

function inv_txfm_add_4x16_neon
        ldrh            w12, [x13, #4]
        mov             x15, x30

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v28.4h,  v16.4s,  #1
        sqrshrn         v29.4h,  v17.4s,  #1
        sqrshrn         v30.4h,  v18.4s,  #1
        sqrshrn         v31.4h,  v19.4s,  #1
        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v28.4h, v29.4h, v30.4h, v31.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v24.4h, v25.4h, v26.4h, v27.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v20.4h,  v16.4s,  #1
        sqrshrn         v21.4h,  v17.4s,  #1
        sqrshrn         v22.4h,  v18.4s,  #1
        sqrshrn         v23.4h,  v19.4s,  #1
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        movi            \i,  #0
.endr
2:

        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x2]
        st1             {v2.4s}, [x2], x11
.endr
        blr             x4
        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7

        blr             x5

        load_add_store_4x16 x0, x6

        ret             x15
endfunc
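
// 4x16: the first pass is run in four groups of four rows, starting
// with the group that is only nonzero at the highest eob values; each
// group is transformed only if the eob threshold for it is met, and its
// narrowed output registers are zeroed otherwise. The last group is
// always transformed.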

const eob_4x16
        .short 13, 29, 45, 64
endconst

const eob_4x16_identity1
        .short 16, 32, 48, 64
endconst

const eob_4x16_identity2
        .short 4, 8, 12, 64
endconst

.macro def_fn_416 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.if \w == 4
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_4x16
.else
        movrel          x13, eob_4x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_4x16_identity2
.else
        movrel          x13, eob_4x16
.endif
.endif
.else
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct
def_fn_416 \w, \h, identity, identity
def_fn_416 \w, \h, dct, adst
def_fn_416 \w, \h, dct, flipadst
def_fn_416 \w, \h, dct, identity
def_fn_416 \w, \h, adst, dct
def_fn_416 \w, \h, adst, adst
def_fn_416 \w, \h, adst, flipadst
def_fn_416 \w, \h, flipadst, dct
def_fn_416 \w, \h, flipadst, adst
def_fn_416 \w, \h, flipadst, flipadst
def_fn_416 \w, \h, identity, dct
def_fn_416 \w, \h, adst, identity
def_fn_416 \w, \h, flipadst, identity
def_fn_416 \w, \h, identity, adst
def_fn_416 \w, \h, identity, flipadst
.endm

def_fns_416 4, 16
def_fns_416 16, 4


function inv_txfm_add_16x8_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        movi            v4.4s,  #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        sqrshrn         v12.4h,  v24.4s,  #1
        sqrshrn         v13.4h,  v25.4s,  #1
        sqrshrn         v14.4h,  v26.4s,  #1
        sqrshrn         v15.4h,  v27.4s,  #1
        sqrshrn2        v12.8h,  v28.4s,  #1
        sqrshrn2        v13.8h,  v29.4s,  #1
        sqrshrn2        v14.8h,  v30.4s,  #1
        sqrshrn2        v15.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5

        b               2f
1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
        movi            \i,  #0
.endr
2:
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16

        movi            v4.4s,  #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1

        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        sqrshrn         v8.4h,   v24.4s,  #1
        sqrshrn         v9.4h,   v25.4s,  #1
        sqrshrn         v10.4h,  v26.4s,  #1
        sqrshrn         v11.4h,  v27.4s,  #1
        sqrshrn2        v8.8h,   v28.4s,  #1
        sqrshrn2        v9.8h,   v29.4s,  #1
        sqrshrn2        v10.8h,  v30.4s,  #1
        sqrshrn2        v11.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5

        blr             x5

        mov             x6,  x0
        load_add_store_8x8 x6, x7

        mov             v16.16b, v8.16b
        mov             v17.16b, v9.16b
        mov             v18.16b, v10.16b
        mov             v19.16b, v11.16b
        mov             v20.16b, v12.16b
        mov             v21.16b, v13.16b
        mov             v22.16b, v14.16b
        mov             v23.16b, v15.16b

        blr             x5

        add             x0,  x0,  #16
        load_add_store_8x8 x0, x7

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret             x15
endfunc
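
// 16x8: one half of the coefficients (the part only needed at larger
// eob values) is transformed first and parked in v8-v15 as 8h while the
// other half goes through the first pass; since v8-v15 are callee-saved
// in the AAPCS64, their d halves are spilled on entry and restored
// before returning.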

function inv_txfm_add_8x16_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x20]!
        stp             d10, d11, [sp, #0x10]
        ldrh            w12, [x13, #4]

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v28.4h,  v16.4s,  #1
        sqrshrn         v29.4h,  v17.4s,  #1
        sqrshrn         v30.4h,  v18.4s,  #1
        sqrshrn         v31.4h,  v19.4s,  #1
        sqrshrn2        v28.8h,  v20.4s,  #1
        sqrshrn2        v29.8h,  v21.4s,  #1
        sqrshrn2        v30.8h,  v22.4s,  #1
        sqrshrn2        v31.8h,  v23.4s,  #1
        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5

        b               2f

1:
.irp i, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        sqrshrn2        v24.8h,  v20.4s,  #1
        sqrshrn2        v25.8h,  v21.4s,  #1
        sqrshrn2        v26.8h,  v22.4s,  #1
        sqrshrn2        v27.8h,  v23.4s,  #1
        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5

        b               2f

1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h
        movi            \i,  #0
.endr

2:
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5

        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        blr             x5

        load_add_store_8x16 x0, x6

        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x20

        ret             x15
endfunc
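
// 8x16 follows the same idea as 16x8, but only one group of narrowed
// results (v8-v11) has to survive a later transform call, so only
// d8-d11 are saved here.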

const eob_8x16
        .short 10, 43, 75, 128
endconst

const eob_8x16_identity1
        .short 4, 64, 96, 128
endconst

const eob_8x16_identity2
        .short 4, 8, 12, 128
endconst

.macro def_fn_816 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_8x16
.else
        movrel          x13, eob_8x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_8x16_identity2
.else
        movrel          x13, eob_8x16
.endif
.endif
.if \h == 8
        ldrh            w13, [x13]
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct
def_fn_816 \w, \h, identity, identity
def_fn_816 \w, \h, dct, adst
def_fn_816 \w, \h, dct, flipadst
def_fn_816 \w, \h, dct, identity
def_fn_816 \w, \h, adst, dct
def_fn_816 \w, \h, adst, adst
def_fn_816 \w, \h, adst, flipadst
def_fn_816 \w, \h, flipadst, dct
def_fn_816 \w, \h, flipadst, adst
def_fn_816 \w, \h, flipadst, flipadst
def_fn_816 \w, \h, identity, dct
def_fn_816 \w, \h, adst, identity
def_fn_816 \w, \h, flipadst, identity
def_fn_816 \w, \h, identity, adst
def_fn_816 \w, \h, identity, flipadst
.endm

def_fns_816 8, 16
def_fns_816 16, 8

function inv_dct32_odd_4s_x16_neon
        movrel          x16, idct_coeffs, 4*16
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
        srshr           v16.4s, v2.4s,  #12             // t16a
        srshr           v31.4s, v4.4s,  #12             // t31a
        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
        srshr           v24.4s, v6.4s,  #12             // t17a
        srshr           v23.4s, v2.4s,  #12             // t30a
        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
        srshr           v20.4s, v4.4s,  #12             // t18a
        srshr           v27.4s, v6.4s,  #12             // t29a
        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
        ld1             {v0.4s, v1.4s}, [x16]
        sub             x16, x16, #4*24
        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
        srshr           v28.4s, v2.4s,  #12             // t19a
        srshr           v19.4s, v4.4s,  #12             // t28a
        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
        srshr           v18.4s, v6.4s,  #12             // t20a
        srshr           v29.4s, v2.4s,  #12             // t27a
        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
        srshr           v26.4s, v4.4s,  #12             // t21a
        srshr           v21.4s, v6.4s,  #12             // t26a
        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
        srshr           v22.4s, v2.4s,  #12             // t22a
        srshr           v25.4s, v4.4s,  #12             // t25a
        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
        srshr           v30.4s, v6.4s,  #12             // t23a
        srshr           v17.4s, v2.4s,  #12             // t24a

        ld1             {v0.4s, v1.4s}, [x16]

        movi            v5.4s,   #1, msl #16     // row_clip_max = ~(~bdmax << 7), 0x1ffff
        mvni            v4.4s,   #1, msl #16     // row_clip_min = (~bdmax << 7), 0xfffe0000

        sqsub           v2.4s,   v16.4s,  v24.4s // t17
        sqadd           v16.4s,  v16.4s,  v24.4s // t16
        sqsub           v3.4s,   v31.4s,  v23.4s // t30
        sqadd           v31.4s,  v31.4s,  v23.4s // t31
        sqsub           v24.4s,  v28.4s,  v20.4s // t18
        sqadd           v28.4s,  v28.4s,  v20.4s // t19
        sqadd           v23.4s,  v18.4s,  v26.4s // t20
        sqsub           v18.4s,  v18.4s,  v26.4s // t21
        sqsub           v20.4s,  v30.4s,  v22.4s // t22
        sqadd           v30.4s,  v30.4s,  v22.4s // t23
        sqadd           v26.4s,  v17.4s,  v25.4s // t24
        sqsub           v17.4s,  v17.4s,  v25.4s // t25
        sqsub           v22.4s,  v29.4s,  v21.4s // t26
        sqadd           v29.4s,  v29.4s,  v21.4s // t27
        sqadd           v25.4s,  v19.4s,  v27.4s // t28
        sqsub           v19.4s,  v19.4s,  v27.4s // t29

.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
        smin            \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
        smax            \r\().4s, \r\().4s, v4.4s
.endr

        mul_mls         v7,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
        srshr           v21.4s, v7.4s,  #12             // t17a
        srshr           v27.4s, v6.4s,  #12             // t30a
        neg             v2.4s,   v2.4s                  // -> t18a
        mul_mls         v7,  v19, v24, v1.s[0], v1.s[1] // -> t29a
        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
        srshr           v19.4s, v2.4s,  #12             // t18a
        srshr           v24.4s, v7.4s,  #12             // t29a
        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
        mul_mla         v7,  v17, v20, v1.s[3], v1.s[2] // -> t22a
        srshr           v22.4s, v6.4s,  #12             // t21a
        srshr           v18.4s, v2.4s,  #12             // t26a
        neg             v7.4s,   v7.4s                  // -> t22a
        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
        srshr           v17.4s, v7.4s,  #12             // t22a
        srshr           v20.4s, v6.4s,  #12             // t25a

        sqsub           v2.4s,   v27.4s,  v24.4s // t29
        sqadd           v27.4s,  v27.4s,  v24.4s // t30
        sqsub           v3.4s,   v21.4s,  v19.4s // t18
        sqadd           v21.4s,  v21.4s,  v19.4s // t17
        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
        sqsub           v28.4s,  v17.4s,  v22.4s // t21
        sqadd           v17.4s,  v17.4s,  v22.4s // t22
        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
        sqadd           v22.4s,  v20.4s,  v18.4s // t25
        sqsub           v20.4s,  v20.4s,  v18.4s // t26
        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
        sqadd           v31.4s,  v31.4s,  v25.4s // t31a

.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
        smin            \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
        smax            \r\().4s, \r\().4s, v4.4s
.endr

        mul_mls         v7,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
        srshr           v18.4s, v7.4s,  #12             // t18a
        srshr           v25.4s, v6.4s,  #12             // t29a
        mul_mla         v7,  v29, v24, v0.s[3], v0.s[2] // -> t28
        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
        srshr           v29.4s, v2.4s,  #12             // t19
        srshr           v24.4s, v7.4s,  #12             // t28
        neg             v6.4s,   v6.4s                  // -> t20
        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
        mul_mla         v7,  v20, v28, v0.s[3], v0.s[2] // -> t21a
        srshr           v26.4s, v6.4s,  #12             // t20
        srshr           v19.4s, v2.4s,  #12             // t27
        neg             v7.4s,   v7.4s                  // -> t21a
        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
        srshr           v20.4s, v7.4s,  #12             // t21a
        srshr           v28.4s, v6.4s,  #12             // t26a

        sqsub           v2.4s,   v16.4s,  v30.4s // t23
        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
        sqsub           v3.4s,   v31.4s,  v23.4s // t24
        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
        sqsub           v27.4s,  v18.4s,  v20.4s // t21
        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
        sqadd           v7.4s,   v29.4s,  v26.4s // t19a = out19
        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
        sqsub           v25.4s,  v25.4s,  v28.4s // t26
        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
        mov             v19.16b, v7.16b          // out19

.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
        smin            \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
        smax            \r\().4s, \r\().4s, v4.4s
.endr

        mul_mls         v7,  v24, v26, v0.s[0], v0.s[0] // -> t20
        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
        srshr           v20.4s, v7.4s,  #12             // t20
        srshr           v22.4s, v6.4s,  #12             // t27

        mul_mla         v7,  v25, v27, v0.s[0], v0.s[0] // -> t26a
        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
        mov             v27.16b,  v22.16b               // t27
        srshr           v26.4s, v7.4s,  #12             // t26a

        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
        mul_mla         v7,  v21, v23, v0.s[0], v0.s[0] // -> t25
        srshr           v21.4s, v6.4s,  #12             // t21a
        srshr           v22.4s, v24.4s, #12             // t22
        srshr           v25.4s, v7.4s,  #12             // t25

        mul_mls         v7,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
        srshr           v23.4s, v7.4s,  #12             // t23a
        srshr           v24.4s, v6.4s,  #12             // t24a

        ret
endfunc
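
// inv_dct32_odd_4s_x16_neon computes the odd half (t16..t31) of a
// 32-point idct from the 16 odd-index inputs in v16-v31, clamping the
// intermediate values to the row clip range kept in v5/v4 after each
// butterfly stage.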

.macro def_horz_32 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_dct_32x4_neon
        mov             x14, x30
        movi            v7.4s,  #0
        lsl             x8,  x8,  #1
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.endif

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct_4s_x16_neon

        // idct_16 leaves the row_clip_max/min constants in v5 and v4
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        smin_4s         \r, \r, v5
.endr
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        smax_4s         \r, \r, v4
.endr

        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5

.macro store1 r0, r1, r2, r3
        st1             {\r0}, [x6], #16
        st1             {\r1}, [x6], #16
        st1             {\r2}, [x6], #16
        st1             {\r3}, [x6], #16
.endm
        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s
        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
.purgem store1
        sub             x6,  x6,  #64*4

        movi            v7.4s,  #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct32_odd_4s_x16_neon
        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
.macro store2 r0, r1, r2, r3, shift
        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
        sqsub           v4.4s,   v0.4s,   \r0
        sqadd           v0.4s,   v0.4s,   \r0
        sqsub           v5.4s,   v1.4s,   \r1
        sqadd           v1.4s,   v1.4s,   \r1
        sqsub           v6.4s,   v2.4s,   \r2
        sqadd           v2.4s,   v2.4s,   \r2
        sqsub           v7.4s,   v3.4s,   \r3
        sqadd           v3.4s,   v3.4s,   \r3
        sqrshrn         v0.4h,   v0.4s,   #\shift
        sqrshrn2        v0.8h,   v1.4s,   #\shift
        sqrshrn         v1.4h,   v2.4s,   #\shift
        sqrshrn2        v1.8h,   v3.4s,   #\shift
        sqrshrn         v2.4h,   v7.4s,   #\shift
        sqrshrn2        v2.8h,   v6.4s,   #\shift
        sqrshrn         v3.4h,   v5.4s,   #\shift
        sqrshrn2        v3.8h,   v4.4s,   #\shift
        st1             {v0.8h, v1.8h}, [x6], #32
        rev64           v2.8h,   v2.8h
        rev64           v3.8h,   v3.8h
        st1             {v2.8h, v3.8h}, [x6], #32
.endm

        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
.purgem store2
        ret             x14
endfunc
.endm

def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale
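
// The 32x4 horizontal pass runs inv_dct_4s_x16_neon on the even input
// rows (x8 is doubled on entry) and stores the transposed result to the
// intermediate buffer, then loads the odd rows, runs
// inv_dct32_odd_4s_x16_neon, and lets store2 apply the final even/odd
// butterflies, storing the second half reversed (rev64 plus swapped
// narrowing order) so the outputs land in ascending order.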

function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8,  x8,  #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4

        bl              X(inv_dct_8h_x16_neon)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        sub             x7,  x7,  x8, lsr #1
        bl              X(inv_dct32_odd_8h_x16_neon)

        neg             x9,  x8
        mov             x10, x6
        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7],    \stride
        ld1             {v2.8h}, [x10],   x1
        ld1             {v6.8h}, [x7],    \stride
        ld1             {v3.8h}, [x10],   x1
        \op             v5.8h,   v5.8h,   \r0
        ld1             {v7.8h}, [x7],    \stride
        ld1             {v4.8h}, [x10],   x1
        srshr           v5.8h,   v5.8h,   #4
        \op             v6.8h,   v6.8h,   \r1
        usqadd          v2.8h,   v5.8h
        srshr           v6.8h,   v6.8h,   #4
        \op             v7.8h,   v7.8h,   \r2
        ld1             {v5.8h}, [x7],    \stride
        usqadd          v3.8h,   v6.8h
        smin            v2.8h,   v2.8h,   v1.8h
        srshr           v7.8h,   v7.8h,   #4
        \op             v5.8h,   v5.8h,   \r3
        st1             {v2.8h}, [x6],    x1
        ld1             {v2.8h}, [x10],   x1
        usqadd          v4.8h,   v7.8h
        smin            v3.8h,   v3.8h,   v1.8h
        srshr           v5.8h,   v5.8h,   #4
        st1             {v3.8h}, [x6],    x1
        usqadd          v2.8h,   v5.8h
        smin            v4.8h,   v4.8h,   v1.8h
        st1             {v4.8h}, [x6],    x1
        smin            v2.8h,   v2.8h,   v1.8h
        st1             {v2.8h}, [x6],    x1
.endm
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7,  x7,  x8
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        ret             x14
endfunc
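
// The vertical 8x32 pass stores the even idct16 results back into the
// scratch buffer, keeps the odd half in registers, and then combines
// the two: the combine macro walks the buffer forwards with sqadd for
// rows 0-15 and backwards (negative stride in x9) with sqsub for rows
// 16-31, adding the rounded result to the destination with usqadd and
// clamping to the pixel maximum kept in v1.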

const eob_32x32
        .short 10, 36, 78, 136, 210, 300, 406, 1024
endconst

const eob_16x32
        .short 10, 36, 78, 151, 215, 279, 343, 512
endconst

const eob_16x32_shortside
        .short 10, 36, 78, 512
endconst

const eob_8x32
        .short 10, 43, 75, 107, 139, 171, 203, 256
endconst

function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
        movi            v0.8h,  #0
        movi            v1.8h,  #0
        movrel          x13, eob_32x32, 2

        mov             x8,  #4*32
1:
        mov             w9,  #0
        movrel          x12, eob_32x32, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc
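
// The 32x32 identity transform needs no intermediate buffer: it walks
// the coefficients in 8x8 blocks, narrows with sqxtn, transposes and
// adds each block straight to the destination with a final rounding
// shift of 2. Since it works on 8x8 blocks, it reads every other entry
// of eob_32x32 (offset 2, stride 4), per block within a row of blocks
// (x12) and per row of blocks (x13).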

.macro shift_16_regs op, shift
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        \op             \i,  \i,  #\shift
.endr
.endm

.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movz            w16, #2896*8, lsl #16
        movz            w17, #2*(5793-4096)*8, lsl #16
        movi            v0.4s,   #0
        movi            v1.4s,   #0
        movrel          x13, eob_16x32\hshort, 2

        mov             x8,  #4*\h
1:
        mov             w9,  #0
        movrel          x12, eob_16x32\wshort, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        dup             v2.2s,   w16
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        mov             v2.s[1], w17
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31

.if \w == 16
        // 16x32
        identity_4x16_shift1 v2.s[1]
.else
        // 32x16
        shift_16_regs   sqshl, 1
        identity_4x16   v2.s[1]
.endif
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #16
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc
.endm

def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside
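
// For 16x32 the identity16 stage absorbs the 1-bit downshift via
// identity_4x16_shift1; for 32x16 the coefficients are doubled first
// (sqshl by 1) and the plain identity_4x16 is used. v2.s[0] carries the
// 2896*8 input scale and v2.s[1] the 2*(5793-4096)*8 identity
// multiplier, set up interleaved with the loads above.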

.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movi            v0.4s,  #0
        movi            v1.4s,  #0
        // Working on 8x8 blocks, read every other entry from eob_8x32
        movrel          x13, eob_8x32, 2

        mov             w8,  #4*\h
1:
        // Working on 8x8 blocks, read every other entry from eob_8x32
        ldrh            w12, [x13], #4
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8

.if \w == 8
        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn2        v16.8h,  v17.4s,  #1
        sqrshrn         v17.4h,  v18.4s,  #1
        sqrshrn2        v17.8h,  v19.4s,  #1
        sqrshrn         v18.4h,  v20.4s,  #1
        sqrshrn2        v18.8h,  v21.4s,  #1
        sqrshrn         v19.4h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        sqrshrn         v20.4h,  v24.4s,  #1
        sqrshrn2        v20.8h,  v25.4s,  #1
        sqrshrn         v21.4h,  v26.4s,  #1
        sqrshrn2        v21.8h,  v27.4s,  #1
        sqrshrn         v22.4h,  v28.4s,  #1
        sqrshrn2        v22.8h,  v29.4s,  #1
        sqrshrn         v23.4h,  v30.4s,  #1
        sqrshrn2        v23.8h,  v31.4s,  #1
.else
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5


        cmp             w3,  w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        sub             x2,  x2,  x8, lsl #3
        add             x2,  x2,  #4*8
.else
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
        idct_dc         32,  32,  2

        mov             x15, x30
        sub             sp,  sp,  #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  sp,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #2048
        ret             x15
endfunc
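
// 32x32 dct: eight horizontal 32x4 passes write 16-bit intermediates
// into a 2048 byte stack buffer (with eob-driven zero-filling of strips
// that hold no coefficients), followed by four vertical 8x32 passes
// that add the result to the destination.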
2686
2687function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
2688        idct_dc         16,  32,  1
2689
2690        mov             x15, x30
2691        sub             sp,  sp,  #1024
2692        movrel          x13, eob_16x32
2693        ldrh            w12, [x13], #2
2694        adr             x4,  inv_dct_4s_x16_neon
2695
2696.irp i, 0, 4, 8, 12, 16, 20, 24, 28
2697        add             x6,  sp,  #(\i*16*2)
2698        add             x7,  x2,  #(\i*4)
2699.if \i > 0
2700        mov             w8,  #(32 - \i)
2701        cmp             w3,  w12
2702        b.lt            1f
2703.if \i < 28
2704        ldrh            w12, [x13], #2
2705.endif
2706.endif
2707        mov             x8,  #4*32
2708        bl              inv_txfm_horz_scale_16x4_neon
2709.endr
2710        b               3f
2711
27121:
2713        movi            v4.8h,  #0
2714        movi            v5.8h,  #0
2715        movi            v6.8h,  #0
2716        movi            v7.8h,  #0
27172:
2718        subs            w8,  w8,  #4
2719.rept 2
2720        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
2721.endr
2722        b.gt            2b
2723
27243:
2725.irp i, 0, 8
2726        add             x6,  x0,  #(\i*2)
2727        add             x7,  sp,  #(\i*2)
2728        mov             x8,  #16*2
2729        bl              inv_txfm_add_vert_dct_8x32_neon
2730.endr
2731
2732        add             sp,  sp,  #1024
2733        ret             x15
2734endfunc
2735
function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
        idct_dc         32,  16,  1

        mov             x15, x30
        sub             sp,  sp,  #1024

        movrel          x13, eob_16x32
        movrel          x5,  X(inv_dct_8h_x16_neon)
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8,  #4*16
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #1024
        ret             x15
endfunc

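// 8x32 inverse DCT: the horizontal 8-point first pass is done inline,
// four rows at a time (clearing the input as it is read, and rounding
// by 2), while the vertical 32-point second pass is shared with the
// other *x32 sizes.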
function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
        idct_dc         8,   32, 2

        mov             x15, x30
        sub             sp,  sp,  #512

        movrel          x13, eob_8x32

        movi            v28.4s,  #0
        mov             x8,  #4*32
        mov             w9,  #32
        mov             x6,  sp
        mov             x7,  x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().4s}, [x7]
        st1             {v28.4s}, [x7], x8
.endr
        ldrh            w12, [x13], #2
        sub             w9,  w9,  #4
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  #4*4

        bl              inv_dct_4s_x8_neon

        sqrshrn         v16.4h,  v16.4s,  #2
        sqrshrn         v17.4h,  v17.4s,  #2
        sqrshrn         v18.4h,  v18.4s,  #2
        sqrshrn         v19.4h,  v19.4s,  #2
        sqrshrn2        v16.8h,  v20.4s,  #2
        sqrshrn2        v17.8h,  v21.4s,  #2
        sqrshrn2        v18.8h,  v22.4s,  #2
        sqrshrn2        v19.8h,  v23.4s,  #2

        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        cmp             w3,  w12
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64

        b.ge            1b
        cbz             w9,  3f

        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
2:
        subs            w9,  w9,  #4
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64
        b.gt            2b

3:
        mov             x6,  x0
        mov             x7,  sp
        mov             x8,  #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp,  sp,  #512
        ret             x15
endfunc

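// 32x8 inverse DCT: a 32-point horizontal first pass into a 512 byte
// stack buffer, then an inline 8-point vertical second pass, eight
// columns at a time.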
function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
        idct_dc         32,  8,   2

        mov             x15, x30
        sub             sp,  sp,  #512

.irp i, 0, 4
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        cmp             w3,  #10
        b.lt            1f
.endif
        mov             x8,  #8*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               2f

1:
        movi            v4.8h,   #0
        movi            v5.8h,   #0
        movi            v6.8h,   #0
        movi            v7.8h,   #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr

2:
        mov             x8,  #2*32
        mov             w9,  #0
1:
        add             x6,  x0,  x9, lsl #1
        add             x7,  sp,  x9, lsl #1 // sp + (column * 2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9,  w9,  #8

        bl              X(inv_dct_8h_x8_neon)

        cmp             w9,  #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp,  sp,  #512
        ret             x15
endfunc

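// First stage of the 64-point inverse DCT input butterflies: takes
// four input rows in v16-v19, multiplies them with coefficients loaded
// from x17 (idct64_coeffs) and writes eight intermediate rows to x6.
// v4/v5 must hold the row clip min/max constants.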
function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

        ld1             {v0.4s, v1.4s}, [x17], #32

        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a

        ld1             {v0.4s}, [x17], #16

        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
        sqadd           v31.4s,  v23.4s,  v22.4s        // t63

.irp r, v24, v25, v26, v27, v28, v29, v30, v31
        smin_4s         \r, \r, v5
.endr
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
        smax_4s         \r, \r, v4
.endr

        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
        mul_mls         v7,  v29, v26, v0.s[1], v0.s[0] // -> t61a
        neg             v2.4s,   v2.4s                  // t34a
        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
        srshr           v26.4s, v2.4s,  #12             // t34a
        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
        srshr           v29.4s, v7.4s,  #12             // t61a
        srshr           v25.4s, v6.4s,  #12             // t33a
        srshr           v30.4s, v2.4s,  #12             // t62a

        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
        sqadd           v22.4s,  v30.4s,  v29.4s        // t62

.irp r, v16, v19, v17, v18, v20, v23, v21, v22
        smin_4s         \r, \r, v5
.endr
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
        smax_4s         \r, \r, v4
.endr

        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
        mul_mls         v7,  v21, v18, v0.s[3], v0.s[2] // -> t34a
        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
        srshr           v21.4s, v2.4s,  #12             // t61a
        srshr           v18.4s, v7.4s,  #12             // t34a
        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
        srshr           v20.4s, v6.4s,  #12             // t60
        srshr           v19.4s, v2.4s,  #12             // t35

        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64

        ret
endfunc

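// Second stage of the 64-point inverse DCT butterflies: combines the
// intermediate rows pairwise, with x6 walking upwards and x9 walking
// downwards until the two pointers meet. v4/v5 must hold the row clip
// min/max constants.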
function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #4*4*0]  // t32a
        ldr             q17, [x9, #4*4*8]  // t39a
        ldr             q18, [x9, #4*4*0]  // t63a
        ldr             q19, [x6, #4*4*8]  // t56a
        ldr             q20, [x6, #4*4*16] // t40a
        ldr             q21, [x9, #4*4*24] // t47a
        ldr             q22, [x9, #4*4*16] // t55a
        ldr             q23, [x6, #4*4*24] // t48a

        sqadd           v24.4s,  v16.4s, v17.4s         // t32
        sqsub           v25.4s,  v16.4s, v17.4s         // t39
        sqadd           v26.4s,  v18.4s, v19.4s         // t63
        sqsub           v27.4s,  v18.4s, v19.4s         // t56
        sqsub           v28.4s,  v21.4s, v20.4s         // t40
        sqadd           v29.4s,  v21.4s, v20.4s         // t47
        sqadd           v30.4s,  v23.4s, v22.4s         // t48
        sqsub           v31.4s,  v23.4s, v22.4s         // t55

.irp r, v24, v25, v26, v27, v28, v29, v30, v31
        smin_4s         \r, \r, v5
.endr
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
        smax_4s         \r, \r, v4
.endr

        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
        mul_mls         v7,  v27, v25, v0.s[2], v0.s[3] // -> t39a
        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
        srshr           v25.4s, v2.4s,  #12             // t56a
        srshr           v27.4s, v7.4s,  #12             // t39a
        neg             v6.4s,   v6.4s                  // t40a
        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
        srshr           v31.4s, v6.4s,  #12             // t40a
        srshr           v28.4s, v2.4s,  #12             // t55a

        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
        sqadd           v22.4s,  v25.4s,  v28.4s        // t56

.irp r, v16, v19, v17, v18, v20, v23, v21, v22
        smin_4s         \r, \r, v5
.endr
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
        smax_4s         \r, \r, v4
.endr

        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
        mul_mla         v7,  v21, v18, v0.s[0], v0.s[0] // -> t55a
        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
        srshr           v18.4s, v2.4s,  #12             // t40a
        srshr           v21.4s, v7.4s,  #12             // t55a
        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
        srshr           v19.4s, v6.4s,  #12             // t47
        srshr           v20.4s, v2.4s,  #12             // t48

        str             q16, [x6, #4*4*0]  // t32a
        str             q17, [x9, #4*4*0]  // t39
        str             q18, [x6, #4*4*8]  // t40a
        str             q19, [x9, #4*4*8]  // t47
        str             q20, [x6, #4*4*16] // t48
        str             q21, [x9, #4*4*16] // t55a
        str             q22, [x6, #4*4*24] // t56
        str             q23, [x9, #4*4*24] // t63a

        add             x6,  x6,  #4*4
        sub             x9,  x9,  #4*4
        cmp             x6,  x9
        b.lt            1b
        ret
endfunc

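// Helper macros for the 64-point DCT functions below: load8 loads
// v16-v23 (optionally clearing the input as it is read), store16
// spills v16-v31 to memory, clear_upper8 zeroes v24-v31, and the
// various *_if macros emit their instruction only if the given
// condition is set at assembly time.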
.macro load8 src, strd, zero, clear
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
.if \clear
        ld1             {\i}, [\src]
        st1             {\zero}, [\src], \strd
.else
        ld1             {\i}, [\src], \strd
.endif
.endr
.endm

.macro store16 dst
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        st1             {\i}, [\dst], #16
.endr
.endm

.macro clear_upper8
.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        movi            \i,  #0
.endr
.endm

.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm

.macro movz16dup_if reg, gpr, val, cond
.if \cond
        movz            \gpr, \val, lsl #16
        dup             \reg, \gpr
.endif
.endm

.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm

.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm

.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm

.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm

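// Define a function inv_txfm_dct\suffix\()_4s_x64_neon that computes a
// 64-point inverse DCT for four columns: a 16-point DCT on every
// fourth input, the odd half of a 32-point DCT on the remaining even
// inputs, and the dct64 step1/step2 butterflies on the odd inputs.
// x7 points to the input coefficients and x8 is the input stride in
// bytes; the 64 output rows are written to the stack at sp. The clear
// variant zeroes the input buffer as it is consumed, and the scale
// variant applies the 1/sqrt(2) rectangular scaling.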
.macro def_dct64_func suffix, clear=0, scale=0
function inv_txfm_dct\suffix\()_4s_x64_neon
        mov             x14, x30
        mov             x6,  sp
        lsl             x8,  x8,  #2

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct_4s_x16_neon

        // idct_16 leaves the row_clip_max/min constants in v5 and v4
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        smin_4s         \r, \r, v5
.endr
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        smax_4s         \r, \r, v4
.endr

        store16         x6

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        lsr             x8,  x8,  #1
        sub             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct32_odd_4s_x16_neon

        add             x10, x6,  #16*15
        sub             x6,  x6,  #16*16

        mov             x9,  #-16

        movi            v1.4s,  #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
        mvni            v0.4s,  #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000

.macro store_addsub r0, r1, r2, r3
        ld1             {v2.4s}, [x6], #16
        ld1             {v3.4s}, [x6], #16
        sqadd           v6.4s,  v2.4s,  \r0
        sqsub           \r0,    v2.4s,  \r0
        ld1             {v4.4s}, [x6], #16
        sqadd           v7.4s,  v3.4s,  \r1
        sqsub           \r1,    v3.4s,  \r1
        smin            v6.4s,  v6.4s,  v1.4s
        smin            \r0,    \r0,    v1.4s
        ld1             {v5.4s}, [x6], #16
        sqadd           v2.4s,  v4.4s,  \r2
        sub             x6,  x6,  #16*4
        smax            v6.4s,  v6.4s,  v0.4s
        smax            \r0,    \r0,    v0.4s
        sqsub           \r2,    v4.4s,  \r2
        smin            v7.4s,  v7.4s,  v1.4s
        smin            \r1,    \r1,    v1.4s
        st1             {v6.4s}, [x6], #16
        st1             {\r0},   [x10], x9
        smin            v2.4s,  v2.4s,  v1.4s
        smin            \r2,    \r2,    v1.4s
        smax            v7.4s,  v7.4s,  v0.4s
        smax            \r1,    \r1,    v0.4s
        sqadd           v3.4s,  v5.4s,  \r3
        sqsub           \r3,    v5.4s,  \r3
        smax            v2.4s,  v2.4s,  v0.4s
        smax            \r2,    \r2,    v0.4s
        smin            v3.4s,  v3.4s,  v1.4s
        smin            \r3,    \r3,    v1.4s
        st1             {v7.4s}, [x6], #16
        st1             {\r1},   [x10], x9
        smax            v3.4s,  v3.4s,  v0.4s
        smax            \r3,    \r3,    v0.4s
        st1             {v2.4s}, [x6], #16
        st1             {\r2},   [x10], x9
        st1             {v3.4s}, [x6], #16
        st1             {\r3},   [x10], x9
.endm
        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
.purgem store_addsub

        add             x6,  x6,  #4*4*16

        movrel          x17, idct64_coeffs
        movi            v5.4s,  #1, msl #16  // row_clip_max = ~(~bdmax << 7), 0x1ffff
        mvni            v4.4s,  #1, msl #16  // row_clip_min = (~bdmax << 7), 0xfffe0000
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        add             x9,  x7,  x8, lsl #4 // offset 16
        add             x10, x7,  x8, lsl #3 // offset 8
        sub             x9,  x9,  x8         // offset 15
        sub             x11, x10, x8         // offset 7
        ld1             {v16.4s}, [x7]  // in1  (offset 0)
        ld1             {v17.4s}, [x9]  // in31 (offset 15)
        ld1             {v18.4s}, [x10] // in17 (offset 8)
        ld1             {v19.4s}, [x11] // in15 (offset 7)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        add             x7,  x7,  x8, lsl #2 // offset 4
        sub             x9,  x9,  x8, lsl #2 // offset 11
        sub             x10, x7,  x8         // offset 3
        add             x11, x9,  x8         // offset 12
        ld1             {v16.4s}, [x10] // in7  (offset 3)
        ld1             {v17.4s}, [x11] // in25 (offset 12)
        ld1             {v18.4s}, [x9]  // in23 (offset 11)
        ld1             {v19.4s}, [x7]  // in9  (offset 4)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        sub             x10, x10, x8, lsl #1 // offset 1
        sub             x9,  x9,  x8, lsl #1 // offset 9
        add             x7,  x7,  x8         // offset 5
        add             x11, x11, x8         // offset 13
        ldr             q16, [x10, x8] // in5  (offset 2)
        ldr             q17, [x11]     // in27 (offset 13)
        ldr             q18, [x9,  x8] // in21 (offset 10)
        ldr             q19, [x7]      // in11 (offset 5)
        stroff_if       q7,  [x10, x8], \clear
        str_if          q7,  [x11],     \clear
        stroff_if       q7,  [x9,  x8], \clear
        str_if          q7,  [x7],      \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        ldr             q16, [x10]     // in3  (offset 1)
        ldr             q17, [x11, x8] // in29 (offset 14)
        ldr             q18, [x9]      // in19 (offset 9)
        ldr             q19, [x7,  x8] // in13 (offset 6)
        str_if          q7,  [x10],     \clear
        stroff_if       q7,  [x11, x8], \clear
        str_if          q7,  [x9],      \clear
        stroff_if       q7,  [x7,  x8], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon

        sub             x6,  x6,  #4*4*32
        add             x9,  x6,  #4*4*7

        bl              inv_dct64_step2_neon

        ret             x14
endfunc
.endm

def_dct64_func _clear, clear=1
def_dct64_func _clear_scale, clear=1, scale=1


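// Horizontal pass helper for the 64-point DCT: reads the 64x4 output
// of inv_txfm_dct*_4s_x64_neon from the stack, finishes the final
// add/sub butterflies against the mirrored rows, applies the rounding
// shift held in w12, narrows to int16 and writes a transposed 4x64
// strip to x6.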
function inv_txfm_horz_dct_64x4_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #4*4*(64 - 4)
        add             x9,  x6,  #2*56
        mov             x10, #2*64
        mov             x11, #-4*4*4

        dup             v7.4s,  w12
1:
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5

.macro store_addsub src0, src1, src2, src3
        sqsub           v1.4s,   \src0,   \src1
        sqadd           v0.4s,   \src0,   \src1
        sqsub           v3.4s,   \src2,   \src3
        srshl           v1.4s,   v1.4s,   v7.4s
        sqadd           v2.4s,   \src2,   \src3
        srshl           v3.4s,   v3.4s,   v7.4s
        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v2.4s,   v2.4s,   v7.4s
        sqxtn           v3.4h,   v3.4s
        sqxtn2          v3.8h,   v1.4s
        sqxtn           v0.4h,   v0.4s
        sqxtn2          v0.8h,   v2.4s
        rev64           v3.8h,   v3.8h
        st1             {v0.8h},  [x6], x10
        st1             {v3.8h},  [x9], x10
.endm
        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
.purgem store_addsub
        sub             x6,  x6,  x10, lsl #2
        sub             x9,  x9,  x10, lsl #2
        add             x6,  x6,  #16
        sub             x9,  x9,  #16

        cmp             x7,  x8
        b.lt            1b
        ret             x14
endfunc

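// Vertical pass helper for the 64-point DCT: reads an 8x64 int16
// column strip from the stack, finishes the final butterflies between
// mirrored rows (walking them from both ends), rounds by 4 and
// accumulates the result into the destination, clamped to the 10 bit
// pixel maximum (0x3ff).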
function inv_txfm_add_vert_dct_8x64_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #2*8*(64 - 4)
        add             x9,  x6,  x1, lsl #6
        sub             x9,  x9,  x1
        neg             x10, x1
        mov             x11, #-2*8*4

1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11

        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
        ld1             {v0.8h}, [x6], x1
        ld1             {v1.8h}, [x9], x10
        sqadd           v4.8h,   \src0,   \src1
        ld1             {v2.8h}, [x6]
        sqsub           \src0,   \src0,   \src1
        ld1             {v3.8h}, [x9]
        sqadd           v5.8h,   \src2,   \src3
        sqsub           \src2,   \src2,   \src3
        sub             x6,  x6,  x1
        sub             x9,  x9,  x10
        srshr           v4.8h,   v4.8h,   #4
        srshr           v5.8h,   v5.8h,   #4
        srshr           \src0,   \src0,   #4
        usqadd          v0.8h,   v4.8h
        srshr           \src2,   \src2,   #4
        usqadd          v1.8h,   \src0
        usqadd          v2.8h,   v5.8h
        smin            v0.8h,   v0.8h,   v7.8h
        usqadd          v3.8h,   \src2
        smin            v1.8h,   v1.8h,   v7.8h
        st1             {v0.8h}, [x6], x1
        smin            v2.8h,   v2.8h,   v7.8h
        st1             {v1.8h}, [x9], x10
        smin            v3.8h,   v3.8h,   v7.8h
        st1             {v2.8h}, [x6], x1
        st1             {v3.8h}, [x9], x10
.endm
        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
.purgem add_dest_addsub
        cmp             x7,  x8
        b.lt            1b

        ret             x14
endfunc

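// 64x64 inverse DCT: each first-pass strip runs the clearing 64-point
// transform and the 64x4 horizontal helper with a rounding shift of 2,
// using eob thresholds from eob_32x32 (only the top left 32x32 quadrant
// of a 64-point transform carries coefficients); the second pass runs
// the 8h 64-point DCT and adds eight columns at a time.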
function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
        idct_dc         64,  64,  2

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #64*32*2
        ret             x15
endfunc

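// 64x32 inverse DCT: like 64x64, but the first pass uses the scaling
// variant of the 64-point transform with a rounding shift of 1, and
// the second pass is the shared 32-point vertical helper.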
function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
        idct_dc         64,  32,  1

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-1 // shift
        bl              inv_txfm_dct_clear_scale_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  x5,  #64*32*2
        ret             x15
endfunc

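// 32x64 inverse DCT: a scaled 32-point horizontal first pass, then the
// 64-point vertical second pass over eight columns at a time.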
function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
        idct_dc         32,  64,  1

        mov             x15, x30

        sub_sp          32*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x7,  x5,  #(\i*2)
        mov             x8,  #32*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #32*32*2
        ret             x15
endfunc

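// 64x16 inverse DCT: the clearing 64-point horizontal transform with a
// rounding shift of 2, then a 16-point vertical second pass.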
function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
        idct_dc         64,  16,  2

        mov             x15, x30

        sub_sp          64*16*2+64*4*4
        add             x4,  sp, #64*4*4

        movrel          x13, eob_16x32

.irp i, 0, 4, 8, 12
        add             x6,  x4,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x4,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        movrel          x5,  X(inv_dct_8h_x16_neon)
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x4,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  x4,  #64*16*2
        ret             x15
endfunc

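// 16x64 inverse DCT: a 16-point horizontal first pass, then the
// 64-point vertical second pass over eight columns at a time.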
function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
        idct_dc         16,  64,  2

        mov             x15, x30

        sub_sp          16*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2

        adr             x4,  inv_dct_4s_x16_neon
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x7,  x5,  #(\i*2)
        mov             x8,  #16*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #16*32*2
        ret             x15
endfunc
