/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
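// For the 8 bpc build, pixel is uint8_t and coef is int16_t. As a rough
// scalar sketch of the add/store epilogues below (illustration only; the
// clip_u8 helper is hypothetical, not part of the build):
//   dst[x] = clip_u8(dst[x] + ((c + (1 << (shiftbits - 1))) >> shiftbits));
// where c is one inversely transformed coefficient and shiftbits is
// usually 4.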

// Most of the functions use the following register layout:
// x0-x3  external parameters
// x4     function pointer to first transform
// x5     function pointer to second transform
// x6     output parameter for helper function
// x7     input parameter for helper function
// x8     input stride for helper function
// x9-x12 scratch variables for helper functions
// x13    pointer to list of eob thresholds
// x14    return pointer for helper function
// x15    return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms

// Potential further optimizations that are left unimplemented for now:
// - Trying to keep multiplication coefficients in registers across multiple
//   transform functions. (The register layout is designed to potentially
//   allow this.)
// - Using a simplified version of the transforms themselves for cases where
//   we know a significant number of inputs are zero. E.g. if the eob value
//   indicates that only a quarter of the input values are set, for idct16
//   and up, a significant amount of calculation can be skipped, at the cost
//   of more code duplication and special casing.

const idct_coeffs, align=4
        // idct4
        .short          2896, 2896*8, 1567, 3784
        // idct8
        .short          799, 4017, 3406, 2276
        // idct16
        .short          401, 4076, 3166, 2598
        .short          1931, 3612, 3920, 1189
        // idct32
        .short          201, 4091, 3035, 2751
        .short          1751, 3703, 3857, 1380
        .short          995, 3973, 3513, 2106
        .short          2440, 3290, 4052, 601
endconst
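
// The entries above are the AV1 "cos128" constants, i.e.
// round(4096*cos(k*pi/128)) for the odd angles each stage needs; e.g.
// 2896 = round(4096/sqrt(2)) and 1567/3784 = round(4096*sin/cos(pi/8)).
// The pre-scaled 2896*8 entry is meant for sqrdmulh, which computes
// (2*a*b + 0x8000) >> 16, so multiplying by 2896*8 that way is an
// effective (a*2896 + 2048) >> 12.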

const idct64_coeffs, align=4
        .short          101*8, 4095*8, 2967*8, -2824*8
        .short          1660*8, 3745*8, 3822*8, -1474*8
        .short          4076, 401, 4017, 799
        .short          0, 0, 0, 0

        .short          4036*8, -700*8, 2359*8, 3349*8
        .short          3461*8, -2191*8, 897*8, 3996*8
        .short          -3166, -2598, -799, -4017
        .short          0, 0, 0, 0

        .short          501*8, 4065*8, 3229*8, -2520*8
        .short          2019*8, 3564*8, 3948*8, -1092*8
        .short          3612, 1931, 2276, 3406
        .short          0, 0, 0, 0

        .short          4085*8, -301*8, 2675*8, 3102*8
        .short          3659*8, -1842*8, 1285*8, 3889*8
        .short          -3920, -1189, -3406, -2276
        .short          0, 0, 0, 0
endconst

const iadst4_coeffs, align=4
        // .h[4] and .h[5] (3344 and 0) together also read as .s[2] == 3344
        .short          1321, 3803, 2482, 3344, 3344, 0
endconst

const iadst8_coeffs, align=4
        .short          4076, 401, 3612, 1931
        .short          2598, 3166, 1189, 3920
        // idct_coeffs
        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
endconst

const iadst16_coeffs, align=4
        .short          4091, 201, 3973, 995
        .short          3703, 1751, 3290, 2440
        .short          2751, 3035, 2106, 3513
        .short          1380, 3857, 601, 4052
endconst
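// smull_smlal/smull_smlsl compute, per lane, the widened 16x16 -> 32 bit
// sums d = s0*c0 + s1*c1 and differences d = s0*c0 - s1*c1; with
// sz == .8h the high halves of the vectors are produced into \d1.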
.macro smull_smlal d0, d1, s0, s1, c0, c1, sz
        smull           \d0\().4s, \s0\().4h, \c0
        smlal           \d0\().4s, \s1\().4h, \c1
.ifc \sz, .8h
        smull2          \d1\().4s, \s0\().8h, \c0
        smlal2          \d1\().4s, \s1\().8h, \c1
.endif
.endm

.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz
        smull           \d0\().4s, \s0\().4h, \c0
        smlsl           \d0\().4s, \s1\().4h, \c1
.ifc \sz, .8h
        smull2          \d1\().4s, \s0\().8h, \c0
        smlsl2          \d1\().4s, \s1\().8h, \c1
.endif
.endm
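// Narrow a 32 bit intermediate back to 16 bit with rounding right shift
// and saturation; with sz == .8h, \s0/\s1 hold the low/high halves.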
.macro sqrshrn_sz d0, s0, s1, shift, sz
        sqrshrn         \d0\().4h, \s0\().4s, \shift
.ifc \sz, .8h
        sqrshrn2        \d0\().8h, \s1\().4s, \shift
.endif
.endm
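// Scale four (or, if \r4 is given, eight) registers by the constant \c
// via sqrdmulh, an effective per-lane (x*c*2 + 0x8000) >> 16.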
.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
        sqrdmulh        \r0\sz,  \r0\sz,  \c
        sqrdmulh        \r1\sz,  \r1\sz,  \c
        sqrdmulh        \r2\sz,  \r2\sz,  \c
        sqrdmulh        \r3\sz,  \r3\sz,  \c
.ifnb \r4
        sqrdmulh        \r4\sz,  \r4\sz,  \c
        sqrdmulh        \r5\sz,  \r5\sz,  \c
        sqrdmulh        \r6\sz,  \r6\sz,  \c
        sqrdmulh        \r7\sz,  \r7\sz,  \c
.endif
.endm
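// Software pipelined add/store: each load_add_store invocation advances
// several rows through the ld1 -> srshr -> uaddw -> sqxtun -> st1 stages
// at once, so loads stay well ahead of the stores that recycle the same
// registers.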
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
.ifnb \load
        ld1             {\load},  [\src], x1
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #\shiftbits
.endif
.ifnb \addsrc
        uaddw           \adddst, \adddst, \addsrc
.endif
.ifnb \narrowsrc
        sqxtun          \narrowdst, \narrowsrc
.endif
.ifnb \store
        st1             {\store},  [\dst], x1
.endif
.endm
.macro load_add_store_8x16 dst, src
        mov             \src, \dst
        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
        load_add_store  v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
        load_add_store  v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
        load_add_store  v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
        load_add_store  v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
        load_add_store  v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
        load_add_store  v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
        load_add_store  v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
        load_add_store  v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
        load_add_store       ,       , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src
        load_add_store       ,       , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
        load_add_store       ,       ,      ,       , v31.8h, v5.8b, v4.8b, \dst, \src
        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
        mov             \src, \dst
        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src, \shiftbits
        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src, \shiftbits
        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src, \shiftbits
        load_add_store  v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
        load_add_store  v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
        load_add_store  v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
        load_add_store  v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
        load_add_store       ,       , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits
        load_add_store       ,       , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       ,       ,      , v3.8b, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src
        mov             \src, \dst
        load_add_store  v2.8b, v16.8h,      ,       ,       ,      ,      , \dst, \src
        load_add_store  v3.8b, v17.8h,      ,       ,       ,      ,      , \dst, \src
        load_add_store  v4.8b, v18.8h, v2.8b, v16.8h,       ,      ,      , \dst, \src
        load_add_store  v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b,      , \dst, \src
        load_add_store       ,       , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
        load_add_store       ,       , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
        load_add_store       ,       ,      ,       , v19.8h, v5.8b, v4.8b, \dst, \src
        load_add_store       ,       ,      ,       ,       ,      , v5.8b, \dst, \src
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
.ifnb \load
        ld1             {\load}[0],  [\src], x1
.endif
.ifnb \inssrc
        ins             \insdst\().d[1],   \inssrc\().d[0]
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #4
.endif
.ifnb \load
        ld1             {\load}[1],  [\src], x1
.endif
.ifnb \addsrc
        uaddw           \adddst, \adddst, \addsrc
.endif
.ifnb \store
        st1             {\store}[0],  [\dst], x1
.endif
.ifnb \narrowsrc
        sqxtun          \narrowdst, \narrowsrc
.endif
.ifnb \store
        st1             {\store}[1],  [\dst], x1
.endif
.endm
.macro load_add_store_4x16 dst, src
        mov             \src, \dst
        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
        load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
        load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
        load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
        load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
        load_add_store4     ,    ,    , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src
        load_add_store4     ,    ,    , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
        load_add_store4     ,    ,    ,       , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v7.8b, v6.s, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v7.s, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
        mov             \src, \dst
        load_add_store4 v0.s, v17, v16,       ,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v1.s, v19, v18,       ,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v2.s, v21, v20, v16.8h,      ,       ,       ,      ,     , \dst, \src
        load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h,       ,      ,     , \dst, \src
        load_add_store4     ,    ,    , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b,     , \dst, \src
        load_add_store4     ,    ,    , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
        load_add_store4     ,    ,    ,       , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v3.8b, v2.s, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       ,      , v3.s, \dst, \src
.endm
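// DC-only fast path: when eob (w3) is 0, only coeff[0] is nonzero, and the
// whole transform collapses into adding one constant to every pixel.
// Roughly, as a scalar sketch of the rounding sequence below:
//   dc = (coeff[0] * 2896 + 2048) >> 12;               // 1/sqrt(2) scale
//   if (w == 2*h || 2*w == h) dc = (dc * 2896 + 2048) >> 12;
//   if (shift > 0)            dc = (dc + (1 << (shift - 1))) >> shift;
//   dc = (dc * 2896 + 2048) >> 12;
//   dc = (dc + 8) >> 4;
// after which the idct_dc_w* helpers add dc to all w*h pixels.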
.macro idct_dc w, h, shift
        cbnz            w3,  1f
        mov             w16, #2896*8
        ld1r            {v16.8h}, [x2]
        dup             v0.4h,   w16
        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
        strh            wzr, [x2]
.if (\w == 2*\h) || (2*\w == \h)
        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
.endif
.if \shift > 0
        srshr           v16.8h,  v16.8h,  #\shift
.endif
        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]
        srshr           v16.8h,  v16.8h,  #4
        mov             w4,  #\h
        b               idct_dc_w\w\()_neon
1:
.endm

function idct_dc_w4_neon
1:
        ld1             {v0.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v1.s}[1], [x0], x1
        subs            w4,  w4,  #4
        sub             x0,  x0,  x1, lsl #2
        uaddw           v0.8h,   v16.8h,  v0.8b
        sqxtun          v0.8b,   v0.8h
        uaddw           v1.8h,   v16.8h,  v1.8b
        st1             {v0.s}[0], [x0], x1
        sqxtun          v1.8b,   v1.8h
        st1             {v0.s}[1], [x0], x1
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w8_neon
1:
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        uaddw           v20.8h,  v16.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        sub             x0,  x0,  x1, lsl #2
        subs            w4,  w4,  #4
        uaddw           v21.8h,  v16.8h, v1.8b
        sqxtun          v0.8b,   v20.8h
        uaddw           v22.8h,  v16.8h, v2.8b
        sqxtun          v1.8b,   v21.8h
        uaddw           v23.8h,  v16.8h, v3.8b
        st1             {v0.8b}, [x0], x1
        sqxtun          v2.8b,   v22.8h
        st1             {v1.8b}, [x0], x1
        sqxtun          v3.8b,   v23.8h
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w16_neon
1:
        ld1             {v0.16b}, [x0], x1
        ld1             {v1.16b}, [x0], x1
        ld1             {v2.16b}, [x0], x1
        subs            w4,  w4,  #4
        uaddw           v20.8h,  v16.8h, v0.8b
        uaddw2          v21.8h,  v16.8h, v0.16b
        ld1             {v3.16b}, [x0], x1
        uaddw           v22.8h,  v16.8h, v1.8b
        uaddw2          v23.8h,  v16.8h, v1.16b
        sub             x0,  x0,  x1, lsl #2
        uaddw           v24.8h,  v16.8h, v2.8b
        uaddw2          v25.8h,  v16.8h, v2.16b
        sqxtun          v0.8b,   v20.8h
        sqxtun2         v0.16b,  v21.8h
        uaddw           v26.8h,  v16.8h, v3.8b
        uaddw2          v27.8h,  v16.8h, v3.16b
        sqxtun          v1.8b,   v22.8h
        sqxtun2         v1.16b,  v23.8h
        sqxtun          v2.8b,   v24.8h
        sqxtun2         v2.16b,  v25.8h
        st1             {v0.16b}, [x0], x1
        sqxtun          v3.8b,   v26.8h
        sqxtun2         v3.16b,  v27.8h
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w32_neon
1:
        ld1             {v0.16b, v1.16b},  [x0], x1
        subs            w4,  w4,  #2
        uaddw           v20.8h,  v16.8h, v0.8b
        uaddw2          v21.8h,  v16.8h, v0.16b
        ld1             {v2.16b, v3.16b},  [x0]
        uaddw           v22.8h,  v16.8h, v1.8b
        uaddw2          v23.8h,  v16.8h, v1.16b
        sub             x0,  x0,  x1
        uaddw           v24.8h,  v16.8h, v2.8b
        uaddw2          v25.8h,  v16.8h, v2.16b
        sqxtun          v0.8b,   v20.8h
        sqxtun2         v0.16b,  v21.8h
        uaddw           v26.8h,  v16.8h, v3.8b
        uaddw2          v27.8h,  v16.8h, v3.16b
        sqxtun          v1.8b,   v22.8h
        sqxtun2         v1.16b,  v23.8h
        sqxtun          v2.8b,   v24.8h
        sqxtun2         v2.16b,  v25.8h
        st1             {v0.16b, v1.16b},  [x0], x1
        sqxtun          v3.8b,   v26.8h
        sqxtun2         v3.16b,  v27.8h
        st1             {v2.16b, v3.16b},  [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w64_neon
1:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0]
        subs            w4,  w4,  #1
        uaddw           v20.8h,  v16.8h, v0.8b
        uaddw2          v21.8h,  v16.8h, v0.16b
        uaddw           v22.8h,  v16.8h, v1.8b
        uaddw2          v23.8h,  v16.8h, v1.16b
        uaddw           v24.8h,  v16.8h, v2.8b
        uaddw2          v25.8h,  v16.8h, v2.16b
        sqxtun          v0.8b,   v20.8h
        sqxtun2         v0.16b,  v21.8h
        uaddw           v26.8h,  v16.8h, v3.8b
        uaddw2          v27.8h,  v16.8h, v3.16b
        sqxtun          v1.8b,   v22.8h
        sqxtun2         v1.16b,  v23.8h
        sqxtun          v2.8b,   v24.8h
        sqxtun2         v2.16b,  v25.8h
        sqxtun          v3.8b,   v26.8h
        sqxtun2         v3.16b,  v27.8h
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0], x1
        b.gt            1b
        ret
endfunc

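// Inverse 4-point Walsh-Hadamard transform on v16-v19, used for the
// lossless wht_wht 4x4 blocks.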
.macro iwht4
        add             v16.4h,  v16.4h,  v17.4h
        sub             v21.4h,  v18.4h,  v19.4h
        sub             v20.4h,  v16.4h,  v21.4h
        sshr            v20.4h,  v20.4h,  #1
        sub             v18.4h,  v20.4h,  v17.4h
        sub             v17.4h,  v20.4h,  v19.4h
        add             v19.4h,  v21.4h,  v18.4h
        sub             v16.4h,  v16.4h,  v17.4h
.endm

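// 4-point inverse DCT on \r0-\r3: \r1/\r3 are rotated by (1567, 3784)
// (the pi/8 sin/cos pair) and \r0/\r2 go through the 2896 (1/sqrt(2))
// butterfly, each product rounded back to 16 bit via sqrshrn_sz #12.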
.macro idct_4 r0, r1, r2, r3, sz
        smull_smlal     v6,  v7,  \r1, \r3, v0.h[3], v0.h[2], \sz
        smull_smlsl     v4,  v5,  \r1, \r3, v0.h[2], v0.h[3], \sz
        smull_smlal     v2,  v3,  \r0, \r2, v0.h[0], v0.h[0], \sz
        sqrshrn_sz      v6,  v6,  v7,  #12, \sz
        sqrshrn_sz      v7,  v4,  v5,  #12, \sz
        smull_smlsl     v4,  v5,  \r0, \r2, v0.h[0], v0.h[0], \sz
        sqrshrn_sz      v2,  v2,  v3,  #12, \sz
        sqrshrn_sz      v3,  v4,  v5,  #12, \sz
        sqadd           \r0\sz,  v2\sz,   v6\sz
        sqsub           \r3\sz,  v2\sz,   v6\sz
        sqadd           \r1\sz,  v3\sz,   v7\sz
        sqsub           \r2\sz,  v3\sz,   v7\sz
.endm

function inv_dct_4h_x4_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.4h}, [x16]
        idct_4          v16, v17, v18, v19, .4h
        ret
endfunc

function inv_dct_8h_x4_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.4h}, [x16]
        idct_4          v16, v17, v18, v19, .8h
        ret
endfunc

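// 4-point inverse ADST using the iadst4_coeffs constants; passing the
// output registers in reverse order yields flipadst.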
.macro iadst_4x4 o0, o1, o2, o3
        movrel          x16, iadst4_coeffs
        ld1             {v0.8h}, [x16]

        ssubl           v3.4s,   v16.4h,  v18.4h
        smull           v4.4s,   v16.4h,  v0.h[0]
        smlal           v4.4s,   v18.4h,  v0.h[1]
        smlal           v4.4s,   v19.4h,  v0.h[2]
        smull           v7.4s,   v17.4h,  v0.h[3]
        saddw           v3.4s,   v3.4s,   v19.4h
        smull           v5.4s,   v16.4h,  v0.h[2]
        smlsl           v5.4s,   v18.4h,  v0.h[0]
        smlsl           v5.4s,   v19.4h,  v0.h[1]

        add             \o3\().4s, v4.4s,     v5.4s
        mul             \o2\().4s, v3.4s,     v0.s[2]
        add             \o0\().4s, v4.4s,     v7.4s
        add             \o1\().4s, v5.4s,     v7.4s
        sub             \o3\().4s, \o3\().4s, v7.4s

        sqrshrn         \o0\().4h, \o0\().4s, #12
        sqrshrn         \o2\().4h, \o2\().4s, #12
        sqrshrn         \o1\().4h, \o1\().4s, #12
        sqrshrn         \o3\().4h, \o3\().4s, #12
.endm

function inv_adst_4h_x4_neon, export=1
        iadst_4x4       v16, v17, v18, v19
        ret
endfunc

function inv_flipadst_4h_x4_neon, export=1
        iadst_4x4       v19, v18, v17, v16
        ret
endfunc

.macro iadst_8x4 o0, o1, o2, o3
        movrel          x16, iadst4_coeffs
        ld1             {v0.8h}, [x16]

        ssubl           v2.4s,   v16.4h,  v18.4h
        ssubl2          v3.4s,   v16.8h,  v18.8h
        smull           v4.4s,   v16.4h,  v0.h[0]
        smlal           v4.4s,   v18.4h,  v0.h[1]
        smlal           v4.4s,   v19.4h,  v0.h[2]
        smull2          v5.4s,   v16.8h,  v0.h[0]
        smlal2          v5.4s,   v18.8h,  v0.h[1]
        smlal2          v5.4s,   v19.8h,  v0.h[2]
        saddw           v2.4s,   v2.4s,   v19.4h
        saddw2          v3.4s,   v3.4s,   v19.8h
        smull           v6.4s,   v16.4h,  v0.h[2]
        smlsl           v6.4s,   v18.4h,  v0.h[0]
        smlsl           v6.4s,   v19.4h,  v0.h[1]
        smull2          v7.4s,   v16.8h,  v0.h[2]
        smlsl2          v7.4s,   v18.8h,  v0.h[0]
        smlsl2          v7.4s,   v19.8h,  v0.h[1]

        mul             v18.4s,  v2.4s,   v0.s[2]
        mul             v19.4s,  v3.4s,   v0.s[2]

        smull           v2.4s,   v17.4h,  v0.h[3]
        smull2          v3.4s,   v17.8h,  v0.h[3]

        add             v16.4s,  v4.4s,   v2.4s // out0
        add             v17.4s,  v5.4s,   v3.4s

        add             v4.4s,   v4.4s,   v6.4s // out3
        add             v5.4s,   v5.4s,   v7.4s

        add             v6.4s,   v6.4s,   v2.4s // out1
        add             v7.4s,   v7.4s,   v3.4s

        sub             v4.4s,   v4.4s,   v2.4s // out3
        sub             v5.4s,   v5.4s,   v3.4s

        sqrshrn         v18.4h,  v18.4s, #12
        sqrshrn2        v18.8h,  v19.4s, #12

        sqrshrn         \o0\().4h, v16.4s, #12
        sqrshrn2        \o0\().8h, v17.4s, #12

.ifc \o2, v17
        mov             v17.16b,   v18.16b
.endif

        sqrshrn         \o1\().4h, v6.4s,  #12
        sqrshrn2        \o1\().8h, v7.4s,  #12

        sqrshrn         \o3\().4h, v4.4s,  #12
        sqrshrn2        \o3\().8h, v5.4s,  #12
.endm

function inv_adst_8h_x4_neon, export=1
        iadst_8x4       v16, v17, v18, v19
        ret
endfunc

function inv_flipadst_8h_x4_neon, export=1
        iadst_8x4       v19, v18, v17, v16
        ret
endfunc

function inv_identity_4h_x4_neon, export=1
        mov             w16, #(5793-4096)*8
        dup             v0.4h,   w16
        sqrdmulh        v4.4h,   v16.4h,  v0.h[0]
        sqrdmulh        v5.4h,   v17.4h,  v0.h[0]
        sqrdmulh        v6.4h,   v18.4h,  v0.h[0]
        sqrdmulh        v7.4h,   v19.4h,  v0.h[0]
        sqadd           v16.4h,  v16.4h,  v4.4h
        sqadd           v17.4h,  v17.4h,  v5.4h
        sqadd           v18.4h,  v18.4h,  v6.4h
        sqadd           v19.4h,  v19.4h,  v7.4h
        ret
endfunc

function inv_identity_8h_x4_neon, export=1
        mov             w16, #(5793-4096)*8
        dup             v0.4h,   w16
        sqrdmulh        v4.8h,   v16.8h,  v0.h[0]
        sqrdmulh        v5.8h,   v17.8h,  v0.h[0]
        sqrdmulh        v6.8h,   v18.8h,  v0.h[0]
        sqrdmulh        v7.8h,   v19.8h,  v0.h[0]
        sqadd           v16.8h,  v16.8h,  v4.8h
        sqadd           v17.8h,  v17.8h,  v5.8h
        sqadd           v18.8h,  v18.8h,  v6.8h
        sqadd           v19.8h,  v19.8h,  v7.8h
        ret
endfunc

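// 4-point identity scale (5793/4096, roughly sqrt(2)) with a >> 1
// downshift folded into the srhadd.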
.macro identity_8x4_shift1 r0, r1, r2, r3, c
.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
        sqrdmulh        v2.8h,  \i,  \c
        srhadd          \i,     \i,  v2.8h
.endr
.endm

function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
        mov             x15, x30
        movi            v31.8h,  #0
        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
        st1             {v31.8h}, [x2], #16

        sshr            v16.4h,  v16.4h,  #2
        sshr            v17.4h,  v17.4h,  #2
        sshr            v18.4h,  v18.4h,  #2
        sshr            v19.4h,  v19.4h,  #2

        iwht4

        st1             {v31.8h}, [x2], #16
        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23

        iwht4

        ld1             {v0.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        ld1             {v1.s}[0], [x0], x1
        ld1             {v1.s}[1], [x0], x1

        b               L(itx_4x4_end)
endfunc

function inv_txfm_add_4x4_neon
        movi            v31.8h,  #0
        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
        st1             {v31.8h}, [x2], #16

        blr             x4

        st1             {v31.8h}, [x2], #16
        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23

        blr             x5

        ld1             {v0.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        ld1             {v1.s}[0], [x0], x1
        ld1             {v1.s}[1], [x0], x1
        srshr           v16.8h,  v16.8h,  #4
        srshr           v18.8h,  v18.8h,  #4

L(itx_4x4_end):
        sub             x0,  x0,  x1, lsl #2
        uaddw           v16.8h,  v16.8h,  v0.8b
        sqxtun          v0.8b,   v16.8h
        uaddw           v18.8h,  v18.8h,  v1.8b
        st1             {v0.s}[0], [x0], x1
        sqxtun          v1.8b,   v18.8h
        st1             {v0.s}[1], [x0], x1
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x0], x1

        ret             x15
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        cbnz            w3,  1f
        mov             w16, #2896*8
        ld1r            {v16.8h}, [x2]
        dup             v4.8h,   w16
        strh            wzr, [x2]
        sqrdmulh        v16.8h,  v16.8h,  v4.h[0]
        ld1             {v0.s}[0], [x0], x1
        sqrdmulh        v20.8h,  v16.8h,  v4.h[0]
        ld1             {v0.s}[1], [x0], x1
        srshr           v16.8h,  v20.8h,  #4
        ld1             {v1.s}[0], [x0], x1
        srshr           v18.8h,  v20.8h,  #4
        ld1             {v1.s}[1], [x0], x1
        b               L(itx_4x4_end)
1:
.endif
        adr             x4,  inv_\txfm1\()_4h_x4_neon
        adr             x5,  inv_\txfm2\()_4h_x4_neon
        b               inv_txfm_add_4x4_neon
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct

def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

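// 8-point inverse DCT: idct_4 on the even inputs, plus rotations of the
// odd inputs by the (799, 4017) and (3406, 2276) coefficient pairs and a
// final butterfly stage.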
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb
        idct_4          \r0, \r2, \r4, \r6, \sz

        smull_smlsl     v2,  v3,  \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
        smull_smlal     v4,  v5,  \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
        smull_smlsl     v6,  v7,  \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
        sqrshrn_sz      \r1, v2,  v3,  #12, \sz                   // t4a
        sqrshrn_sz      \r7, v4,  v5,  #12, \sz                   // t7a
        smull_smlal     v2,  v3,  \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
        sqrshrn_sz      \r3, v6,  v7,  #12, \sz                   // t5a
        sqrshrn_sz      \r5, v2,  v3,  #12, \sz                   // t6a

        sqadd           v2\sz,   \r1\sz,  \r3\sz // t4
        sqsub           \r1\sz,  \r1\sz,  \r3\sz // t5a
        sqadd           v3\sz,   \r7\sz,  \r5\sz // t7
        sqsub           \r3\sz,  \r7\sz,  \r5\sz // t6a

        smull_smlsl     v4,  v5,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
        smull_smlal     v6,  v7,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // t5
        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // t6

        sqsub           \r7\sz,  \r0\sz,  v3\sz // out7
        sqadd           \r0\sz,  \r0\sz,  v3\sz // out0
        sqadd           \r1\sz,  \r2\sz,  v5\sz // out1
        sqsub           v6\sz,   \r2\sz,  v5\sz // out6
        sqadd           \r2\sz,  \r4\sz,  v4\sz // out2
        sqsub           \r5\sz,  \r4\sz,  v4\sz // out5
        sqadd           \r3\sz,  \r6\sz,  v2\sz // out3
        sqsub           \r4\sz,  \r6\sz,  v2\sz // out4
        mov             \r6\szb, v6\szb         // out6
.endm

function inv_dct_8h_x8_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.8h}, [x16]
        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
        ret
endfunc

function inv_dct_4h_x8_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.8h}, [x16]
        idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
        ret
endfunc

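// 8-point inverse ADST built from the iadst8_coeffs rotations; as with
// the 4-point version, reversed output arguments give flipadst.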
.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz
        movrel          x16, iadst8_coeffs
        ld1             {v0.8h, v1.8h}, [x16]

        smull_smlal     v2,  v3,  v23, v16, v0.h[0], v0.h[1], \sz
        smull_smlsl     v4,  v5,  v23, v16, v0.h[1], v0.h[0], \sz
        smull_smlal     v6,  v7,  v21, v18, v0.h[2], v0.h[3], \sz
        sqrshrn_sz      v16, v2,  v3,  #12, \sz  // t0a
        sqrshrn_sz      v23, v4,  v5,  #12, \sz  // t1a
        smull_smlsl     v2,  v3,  v21, v18, v0.h[3], v0.h[2], \sz
        smull_smlal     v4,  v5,  v19, v20, v0.h[4], v0.h[5], \sz
        sqrshrn_sz      v18, v6,  v7,  #12, \sz  // t2a
        sqrshrn_sz      v21, v2,  v3,  #12, \sz  // t3a
        smull_smlsl     v6,  v7,  v19, v20, v0.h[5], v0.h[4], \sz
        smull_smlal     v2,  v3,  v17, v22, v0.h[6], v0.h[7], \sz
        sqrshrn_sz      v20, v4,  v5,  #12, \sz  // t4a
        sqrshrn_sz      v19, v6,  v7,  #12, \sz  // t5a
        smull_smlsl     v4,  v5,  v17, v22, v0.h[7], v0.h[6], \sz
        sqrshrn_sz      v22, v2,  v3,  #12, \sz  // t6a
        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t7a

        sqadd           v2\sz,   v16\sz,  v20\sz // t0
        sqsub           v3\sz,   v16\sz,  v20\sz // t4
        sqadd           v4\sz,   v23\sz,  v19\sz // t1
        sqsub           v5\sz,   v23\sz,  v19\sz // t5
        sqadd           v6\sz,   v18\sz,  v22\sz // t2
        sqsub           v7\sz,   v18\sz,  v22\sz // t6
        sqadd           v18\sz,  v21\sz,  v17\sz // t3
        sqsub           v19\sz,  v21\sz,  v17\sz // t7

        smull_smlal     v16, v17, v3,  v5,  v1.h[3], v1.h[2], \sz
        smull_smlsl     v20, v21, v3,  v5,  v1.h[2], v1.h[3], \sz
        smull_smlsl     v22, v23, v19, v7,  v1.h[3], v1.h[2], \sz

        sqrshrn_sz      v3,  v16, v17, #12, \sz  // t4a
        sqrshrn_sz      v5,  v20, v21, #12, \sz  // t5a

        smull_smlal     v16, v17, v19, v7,  v1.h[2], v1.h[3], \sz

        sqrshrn_sz      v7,  v22, v23, #12, \sz  // t6a
        sqrshrn_sz      v19, v16, v17, #12, \sz  // t7a

        sqadd           \o0\()\sz, v2\sz, v6\sz  // out0
        sqsub           v2\sz,     v2\sz, v6\sz  // t2
        sqadd           \o7\()\sz, v4\sz, v18\sz // out7
        sqsub           v4\sz,     v4\sz, v18\sz // t3
        sqneg           \o7\()\sz, \o7\()\sz     // out7

        sqadd           \o1\()\sz, v3\sz, v7\sz  // out1
        sqsub           v3\sz,     v3\sz, v7\sz  // t6
        sqadd           \o6\()\sz, v5\sz, v19\sz // out6
        sqsub           v5\sz,     v5\sz, v19\sz // t7
        sqneg           \o1\()\sz, \o1\()\sz     // out1

        smull_smlal     v18, v19, v2,  v4,  v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
        smull_smlsl     v6,  v7,  v2,  v4,  v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
        smull_smlsl     v20, v21, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
        sqrshrn_sz      v2,  v18, v19, #12, \sz // out3
        smull_smlal     v18, v19, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
        sqrshrn_sz      v3,  v20, v21, #12, \sz // out5
        sqrshrn_sz      \o2, v18, v19, #12, \sz // out2 (v18 or v21)
        sqrshrn_sz      \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)

        sqneg           \o3\()\sz, v2\sz     // out3
        sqneg           \o5\()\sz, v3\sz     // out5
.endm

function inv_adst_8h_x8_neon, export=1
        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .8h
        ret
endfunc

function inv_flipadst_8h_x8_neon, export=1
        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .8h
        ret
endfunc

function inv_adst_4h_x8_neon, export=1
        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .4h
        ret
endfunc

function inv_flipadst_4h_x8_neon, export=1
        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .4h
        ret
endfunc

function inv_identity_8h_x8_neon, export=1
        sqshl           v16.8h,  v16.8h,  #1
        sqshl           v17.8h,  v17.8h,  #1
        sqshl           v18.8h,  v18.8h,  #1
        sqshl           v19.8h,  v19.8h,  #1
        sqshl           v20.8h,  v20.8h,  #1
        sqshl           v21.8h,  v21.8h,  #1
        sqshl           v22.8h,  v22.8h,  #1
        sqshl           v23.8h,  v23.8h,  #1
        ret
endfunc

function inv_identity_4h_x8_neon, export=1
        sqshl           v16.4h,  v16.4h,  #1
        sqshl           v17.4h,  v17.4h,  #1
        sqshl           v18.4h,  v18.4h,  #1
        sqshl           v19.4h,  v19.4h,  #1
        sqshl           v20.4h,  v20.4h,  #1
        sqshl           v21.4h,  v21.4h,  #1
        sqshl           v22.4h,  v22.4h,  #1
        sqshl           v23.4h,  v23.4h,  #1
        ret
endfunc

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_neon
        movi            v28.8h,  #0
        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]

.ifc \variant, identity_
        // The identity transform's sqshl #1 and the downshift srshr #1
        // would cancel out, so both can be skipped here.

        b               L(itx_8x8_epilog)
.else
        blr             x4

        srshr           v16.8h,  v16.8h,  #1
        srshr           v17.8h,  v17.8h,  #1
        srshr           v18.8h,  v18.8h,  #1
        srshr           v19.8h,  v19.8h,  #1
        srshr           v20.8h,  v20.8h,  #1
        srshr           v21.8h,  v21.8h,  #1
        srshr           v22.8h,  v22.8h,  #1
        srshr           v23.8h,  v23.8h,  #1

L(itx_8x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        blr             x5

        load_add_store_8x8 x0, x7
        ret             x15
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         8,   8,   1
.endif
        adr             x5,  inv_\txfm2\()_8h_x8_neon
.ifc \txfm1, identity
        b               inv_txfm_identity_add_8x8_neon
.else
        adr             x4,  inv_\txfm1\()_8h_x8_neon
        b               inv_txfm_add_8x8_neon
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

function inv_txfm_add_8x4_neon
        movi            v30.8h,  #0
        movi            v31.8h,  #0
        mov             w16, #2896*8
        dup             v0.4h,   w16
        ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
        st1             {v30.8h,v31.8h}, [x2], #32
        ld1             {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
        st1             {v30.8h,v31.8h}, [x2]

        scale_input     .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23

        blr             x4

        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]

        blr             x5

        load_add_store_8x4 x0, x7
        ret             x15
endfunc

function inv_txfm_add_4x8_neon
        movi            v28.8h,  #0
        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
        mov             w16, #2896*8
        dup             v0.4h,   w16
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]

        scale_input     .8h, v0.h[0], v16, v17, v18, v19

        blr             x4

        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        ins             v20.d[0], v16.d[1]
        ins             v21.d[0], v17.d[1]
        ins             v22.d[0], v18.d[1]
        ins             v23.d[0], v19.d[1]

        blr             x5

        load_add_store_4x8 x0, x7
        ret             x15
endfunc

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  0
.endif
        adr             x4,  inv_\txfm1\()_\h\()h_x\w\()_neon
        adr             x5,  inv_\txfm2\()_\w\()h_x\h\()_neon
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4


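// 16-point inverse DCT: idct_8 on the even inputs plus eight rotations of
// the odd inputs, using the full idct_coeffs table loaded into v0-v1.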
.macro idct_16 sz, szb
        idct_8          v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb

        smull_smlsl     v2,  v3,  v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
        smull_smlal     v4,  v5,  v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
        smull_smlsl     v6,  v7,  v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
        sqrshrn_sz      v17, v2,  v3,  #12, \sz                   // t8a
        sqrshrn_sz      v31, v4,  v5,  #12, \sz                   // t15a
        smull_smlal     v2,  v3,  v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
        smull_smlsl     v4,  v5,  v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t9a
        sqrshrn_sz      v25, v2,  v3,  #12, \sz                   // t14a
        smull_smlal     v6,  v7,  v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
        smull_smlsl     v2,  v3,  v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t10a
        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t13a
        smull_smlal     v4,  v5,  v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
        sqrshrn_sz      v19, v2,  v3,  #12, \sz                   // t11a
        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t12a

        sqsub           v2\sz,   v17\sz,  v23\sz  // t9
        sqadd           v17\sz,  v17\sz,  v23\sz  // t8
        sqsub           v3\sz,   v31\sz,  v25\sz  // t14
        sqadd           v31\sz,  v31\sz,  v25\sz  // t15
        sqsub           v23\sz,  v19\sz,  v21\sz  // t10
        sqadd           v19\sz,  v19\sz,  v21\sz  // t11
        sqadd           v25\sz,  v29\sz,  v27\sz  // t12
        sqsub           v29\sz,  v29\sz,  v27\sz  // t13

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[2], v0.h[3], \sz // -> t9a
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[3], v0.h[2], \sz // -> t14a
        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t9a
        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t14a

        smull_smlsl     v4,  v5,  v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
        smull_smlal     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t13a
        neg             v6.4s,   v6.4s
.ifc \sz, .8h
        neg             v7.4s,   v7.4s
.endif
        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t10a

        sqsub           v2\sz,   v17\sz,  v19\sz  // t11a
        sqadd           v17\sz,  v17\sz,  v19\sz  // t8a
        sqsub           v3\sz,   v31\sz,  v25\sz  // t12a
        sqadd           v31\sz,  v31\sz,  v25\sz  // t15a
        sqadd           v19\sz,  v21\sz,  v23\sz  // t9
        sqsub           v21\sz,  v21\sz,  v23\sz  // t10
        sqsub           v25\sz,  v27\sz,  v29\sz  // t13
        sqadd           v27\sz,  v27\sz,  v29\sz  // t14

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t11
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t12
        smull_smlsl     v2,  v3,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a

        sqrshrn_sz      v4,  v4,  v5,  #12, \sz   // t11
        sqrshrn_sz      v5,  v6,  v7,  #12, \sz   // t12
        smull_smlal     v6,  v7,  v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
        sqrshrn_sz      v2,  v2,  v3,  #12, \sz   // t10a
        sqrshrn_sz      v3,  v6,  v7,  #12, \sz   // t13a

        sqadd           v6\sz,   v16\sz,  v31\sz  // out0
        sqsub           v31\sz,  v16\sz,  v31\sz  // out15
        mov             v16\szb, v6\szb
        sqadd           v23\sz,  v30\sz,  v17\sz  // out7
        sqsub           v7\sz,   v30\sz,  v17\sz  // out8
        sqadd           v17\sz,  v18\sz,  v27\sz  // out1
        sqsub           v30\sz,  v18\sz,  v27\sz  // out14
        sqadd           v18\sz,  v20\sz,  v3\sz   // out2
        sqsub           v29\sz,  v20\sz,  v3\sz   // out13
        sqadd           v3\sz,   v28\sz,  v19\sz  // out6
        sqsub           v25\sz,  v28\sz,  v19\sz  // out9
        sqadd           v19\sz,  v22\sz,  v5\sz   // out3
        sqsub           v28\sz,  v22\sz,  v5\sz   // out12
        sqadd           v20\sz,  v24\sz,  v4\sz   // out4
        sqsub           v27\sz,  v24\sz,  v4\sz   // out11
        sqadd           v21\sz,  v26\sz,  v2\sz   // out5
        sqsub           v26\sz,  v26\sz,  v2\sz   // out10
        mov             v24\szb, v7\szb
        mov             v22\szb, v3\szb
.endm

function inv_dct_8h_x16_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.8h, v1.8h}, [x16]
        idct_16         .8h, .16b
        ret
endfunc

function inv_dct_4h_x16_neon, export=1
        movrel          x16, idct_coeffs
        ld1             {v0.8h, v1.8h}, [x16]
        idct_16         .4h, .8b
        ret
endfunc

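// 16-point inverse ADST; the 16 output order arguments double as the
// flipadst implementation, and for one of the two orders out8/out10 are
// moved into place at the end (the .ifc \o8, v23 block).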
1130.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
1131        movrel          x16, iadst16_coeffs
1132        ld1             {v0.8h, v1.8h}, [x16]
1133        movrel          x16, idct_coeffs
1134
1135        smull_smlal     v2,  v3,  v31, v16, v0.h[0], v0.h[1], \sz // -> t0
1136        smull_smlsl     v4,  v5,  v31, v16, v0.h[1], v0.h[0], \sz // -> t1
1137        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t2
1138        sqrshrn_sz      v16, v2,  v3,  #12, \sz   // t0
1139        sqrshrn_sz      v31, v4,  v5,  #12, \sz   // t1
1140        smull_smlsl     v2,  v3,  v29, v18, v0.h[3], v0.h[2], \sz // -> t3
1141        smull_smlal     v4,  v5,  v27, v20, v0.h[4], v0.h[5], \sz // -> t4
1142        sqrshrn_sz      v18, v6,  v7,  #12, \sz   // t2
1143        sqrshrn_sz      v29, v2,  v3,  #12, \sz   // t3
1144        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t5
1145        smull_smlal     v2,  v3,  v25, v22, v0.h[6], v0.h[7], \sz // -> t6
1146        sqrshrn_sz      v20, v4,  v5,  #12, \sz   // t4
1147        sqrshrn_sz      v27, v6,  v7,  #12, \sz   // t5
1148        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t7
1149        smull_smlal     v6,  v7,  v23, v24, v1.h[0], v1.h[1], \sz // -> t8
1150        sqrshrn_sz      v22, v2,  v3,  #12, \sz   // t6
1151        sqrshrn_sz      v25, v4,  v5,  #12, \sz   // t7
1152        smull_smlsl     v2,  v3,  v23, v24, v1.h[1], v1.h[0], \sz // -> t9
1153        smull_smlal     v4,  v5,  v21, v26, v1.h[2], v1.h[3], \sz // -> t10
1154        sqrshrn_sz      v23, v6,  v7,  #12, \sz   // t8
1155        sqrshrn_sz      v24, v2,  v3,  #12, \sz   // t9
1156        smull_smlsl     v6,  v7,  v21, v26, v1.h[3], v1.h[2], \sz // -> t11
1157        smull_smlal     v2,  v3,  v19, v28, v1.h[4], v1.h[5], \sz // -> t12
1158        sqrshrn_sz      v21, v4,  v5,  #12, \sz   // t10
1159        sqrshrn_sz      v26, v6,  v7,  #12, \sz   // t11
1160        smull_smlsl     v4,  v5,  v19, v28, v1.h[5], v1.h[4], \sz // -> t13
1161        smull_smlal     v6,  v7,  v17, v30, v1.h[6], v1.h[7], \sz // -> t14
1162        sqrshrn_sz      v19, v2,  v3,  #12, \sz   // t12
1163        sqrshrn_sz      v28, v4,  v5,  #12, \sz   // t13
1164        smull_smlsl     v2,  v3,  v17, v30, v1.h[7], v1.h[6], \sz // -> t15
1165        sqrshrn_sz      v17, v6,  v7,  #12, \sz   // t14
1166        sqrshrn_sz      v30, v2,  v3,  #12, \sz   // t15
1167
1168        ld1             {v0.8h}, [x16]
1169
1170        sqsub           v2\sz,   v16\sz,  v23\sz // t8a
1171        sqadd           v16\sz,  v16\sz,  v23\sz // t0a
1172        sqsub           v3\sz,   v31\sz,  v24\sz // t9a
1173        sqadd           v31\sz,  v31\sz,  v24\sz // t1a
1174        sqadd           v23\sz,  v18\sz,  v21\sz // t2a
1175        sqsub           v18\sz,  v18\sz,  v21\sz // t10a
1176        sqadd           v24\sz,  v29\sz,  v26\sz // t3a
1177        sqsub           v29\sz,  v29\sz,  v26\sz // t11a
1178        sqadd           v21\sz,  v20\sz,  v19\sz // t4a
1179        sqsub           v20\sz,  v20\sz,  v19\sz // t12a
1180        sqadd           v26\sz,  v27\sz,  v28\sz // t5a
1181        sqsub           v27\sz,  v27\sz,  v28\sz // t13a
1182        sqadd           v19\sz,  v22\sz,  v17\sz // t6a
1183        sqsub           v22\sz,  v22\sz,  v17\sz // t14a
1184        sqadd           v28\sz,  v25\sz,  v30\sz // t7a
1185        sqsub           v25\sz,  v25\sz,  v30\sz // t15a
1186
1187        smull_smlal     v4,  v5,  v2,  v3,  v0.h[5], v0.h[4], \sz // -> t8
1188        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[4], v0.h[5], \sz // -> t9
1189        smull_smlal     v2,  v3,  v18, v29, v0.h[7], v0.h[6], \sz // -> t10
1190        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t8
1191        sqrshrn_sz      v30, v6,  v7,  #12, \sz  // t9
1192        smull_smlsl     v4,  v5,  v18, v29, v0.h[6], v0.h[7], \sz // -> t11
1193        smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t12
1194        sqrshrn_sz      v18, v2,  v3,  #12, \sz  // t10
1195        sqrshrn_sz      v29, v4,  v5,  #12, \sz  // t11
1196        smull_smlal     v2,  v3,  v27, v20, v0.h[4], v0.h[5], \sz // -> t13
1197        smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t14
1198        sqrshrn_sz      v27, v6,  v7,  #12, \sz  // t12
1199        sqrshrn_sz      v20, v2,  v3,  #12, \sz  // t13
1200        smull_smlal     v6,  v7,  v25, v22, v0.h[6], v0.h[7], \sz // -> t15
1201        sqrshrn_sz      v25, v4,  v5,  #12, \sz  // t14
1202        sqrshrn_sz      v22, v6,  v7,  #12, \sz  // t15
1203
1204        sqsub           v2\sz,   v16\sz,  v21\sz // t4
1205        sqadd           v16\sz,  v16\sz,  v21\sz // t0
1206        sqsub           v3\sz,   v31\sz,  v26\sz // t5
1207        sqadd           v31\sz,  v31\sz,  v26\sz // t1
1208        sqadd           v21\sz,  v23\sz,  v19\sz // t2
1209        sqsub           v23\sz,  v23\sz,  v19\sz // t6
1210        sqadd           v26\sz,  v24\sz,  v28\sz // t3
1211        sqsub           v24\sz,  v24\sz,  v28\sz // t7
1212        sqadd           v19\sz,  v17\sz,  v27\sz // t8a
        sqsub           v17\sz,  v17\sz,  v27\sz // t12a
        sqadd           v28\sz,  v30\sz,  v20\sz // t9a
        sqsub           v30\sz,  v30\sz,  v20\sz // t13a
        sqadd           v27\sz,  v18\sz,  v25\sz // t10a
        sqsub           v18\sz,  v18\sz,  v25\sz // t14a
        sqadd           v20\sz,  v29\sz,  v22\sz // t11a
        sqsub           v29\sz,  v29\sz,  v22\sz // t15a

        smull_smlal     v4,  v5,  v2,  v3,  v0.h[3], v0.h[2], \sz // -> t4a
        smull_smlsl     v6,  v7,  v2,  v3,  v0.h[2], v0.h[3], \sz // -> t5a
        smull_smlsl     v2,  v3,  v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
        sqrshrn_sz      v22, v4,  v5,  #12, \sz // t4a
        sqrshrn_sz      v25, v6,  v7,  #12, \sz // t5a
        smull_smlal     v4,  v5,  v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
        smull_smlal     v6,  v7,  v17, v30, v0.h[3], v0.h[2], \sz // -> t12
        sqrshrn_sz      v24, v2,  v3,  #12, \sz // t6a
        sqrshrn_sz      v23, v4,  v5,  #12, \sz // t7a
        smull_smlsl     v2,  v3,  v17, v30, v0.h[2], v0.h[3], \sz // -> t13
        smull_smlsl     v4,  v5,  v29, v18, v0.h[3], v0.h[2], \sz // -> t14
        sqrshrn_sz      v17, v6,  v7,  #12, \sz // t12
        smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t15
        sqrshrn_sz      v29, v2,  v3,  #12, \sz // t13
        sqrshrn_sz      v30, v4,  v5,  #12, \sz // t14
        sqrshrn_sz      v18, v6,  v7,  #12, \sz // t15

        sqsub           v2\sz,   v16\sz,  v21\sz // t2a
.ifc \o0, v16
        sqadd           \o0\sz,  v16\sz,  v21\sz // out0
        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
        sqadd           \o15\sz, v31\sz,  v26\sz // out15
.else
        sqadd           v4\sz,   v16\sz,  v21\sz // out0
        sqsub           v21\sz,  v31\sz,  v26\sz // t3a
        sqadd           \o15\sz, v31\sz,  v26\sz // out15
        mov             \o0\szb, v4\szb
.endif
        sqneg           \o15\sz, \o15\sz         // out15

        sqsub           v3\sz,   v29\sz,  v18\sz // t15a
        sqadd           \o13\sz, v29\sz,  v18\sz // out13
        sqadd           \o2\sz,  v17\sz,  v30\sz // out2
        sqsub           v26\sz,  v17\sz,  v30\sz // t14a
        sqneg           \o13\sz, \o13\sz         // out13

        sqadd           \o1\sz,  v19\sz,  v27\sz // out1
        sqsub           v27\sz,  v19\sz,  v27\sz // t10
        sqadd           \o14\sz, v28\sz,  v20\sz // out14
        sqsub           v20\sz,  v28\sz,  v20\sz // t11
        sqneg           \o1\sz,  \o1\sz          // out1

        sqadd           \o3\sz,  v22\sz,  v24\sz // out3
        sqsub           v22\sz,  v22\sz,  v24\sz // t6
        sqadd           \o12\sz, v25\sz,  v23\sz // out12
        sqsub           v23\sz,  v25\sz,  v23\sz // t7
        sqneg           \o3\sz,  \o3\sz          // out3

        smull_smlsl     v24, v25, v2,  v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
        smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
        smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)

        sqrshrn_sz      v24, v24, v25, #12, \sz // out8
        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // out7
        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // out5
        smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
        smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
        sqrshrn_sz      v26, v6,  v7,  #12, \sz // out10

        smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)

        sqrshrn_sz      \o4, v2,  v3,  #12, \sz // out4
        sqrshrn_sz      v6,  v6,  v7,  #12, \sz // out11
        sqrshrn_sz      v7,  v21, v25, #12, \sz // out9
        sqrshrn_sz      \o6, v22, v23, #12, \sz // out6

.ifc \o8, v23
        mov             \o8\szb,  v24\szb
        mov             \o10\szb, v26\szb
.endif

        sqneg           \o7\sz,  v4\sz // out7
        sqneg           \o5\sz,  v5\sz // out5
        sqneg           \o11\sz, v6\sz // out11
        sqneg           \o9\sz,  v7\sz // out9
.endm
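// Note: in the final butterfly stage above, v0.h[0] is expected to hold
// 2896 (reloaded from idct_coeffs earlier in the macro), so each
// smull_smlal/smull_smlsl pair followed by sqrshrn #12 computes
// (a +/- b) * 2896/4096 ~= (a +/- b)/sqrt(2).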

function inv_adst_8h_x16_neon, export=1
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
        ret
endfunc

function inv_flipadst_8h_x16_neon, export=1
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
        ret
endfunc

function inv_adst_4h_x16_neon, export=1
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
        ret
endfunc

function inv_flipadst_4h_x16_neon, export=1
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
        ret
endfunc

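// The 16-point identity transform scales by 2*sqrt(2). This is decomposed
// as 2*x + x*(2*sqrt(2) - 2): with sqrt(2) ~= 5793/4096, the fractional
// part is 2*(5793-4096)/4096, and the extra *8 below lines the Q12
// constant up with the Q15 fixed-point position that sqrdmulh uses.
// Rough scalar sketch of one element (illustration only):
//   t = sqrdmulh(x, 2*(5793-4096)*8)   // x * (2*sqrt(2) - 2)
//   y = sat(sat(x + x) + t)            // x * 2*sqrt(2), saturated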
function inv_identity_8h_x16_neon, export=1
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.8h,      v\i\().8h,  v0.h[0]
        sqadd           v\i\().8h,  v\i\().8h,  v\i\().8h
        sqadd           v\i\().8h,  v\i\().8h,  v2.8h
.endr
        ret
endfunc

function inv_identity_4h_x16_neon, export=1
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4h,      v\i\().4h,  v0.h[0]
        sqadd           v\i\().4h,  v\i\().4h,  v\i\().4h
        sqadd           v\i\().4h,  v\i\().4h,  v2.4h
.endr
        ret
endfunc

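// Identity helpers with the inter-pass downshift folded in; \c is expected
// to hold 2*(5793-4096)*8. identity_8x16_shift2 yields x*2*sqrt(2) >> 2
// (sshr #1 plus a rounding half-add), identity_8x16_shift1 yields
// x*2*sqrt(2) >> 1 (srshr #1 plus a saturating add).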
.macro identity_8x16_shift2 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h,   \i,      \c
        sshr            v2.8h,   v2.8h,   #1
        srhadd          \i,      \i,      v2.8h
.endr
.endm

.macro identity_8x16_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h,   \i,      \c
        srshr           v2.8h,   v2.8h,   #1
        sqadd           \i,      \i,      v2.8h
.endr
.endm

.macro identity_8x8_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h,   \i,      \c
        srshr           v2.8h,   v2.8h,   #1
        sqadd           \i,      \i,      v2.8h
.endr
.endm

.macro identity_8x8 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h,   \i,      \c
        sqadd           \i,      \i,      \i
        sqadd           \i,      \i,      v2.8h
.endr
.endm

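// Horizontal 16x8 pass: loads 16 vectors of coefficients from x7 (zeroing
// the coefficient buffer as it goes), optionally scales by 1/sqrt(2)
// (2896/4096 in Q12) for rectangular transform sizes, runs the transform
// from x4 (or the inlined identity), downshifts, then transposes the two
// 8x8 halves and stores them interleaved to the intermediate buffer at x6.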
.macro def_horz_16 scale=0, identity=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x8_neon
        AARCH64_VALID_CALL_TARGET
        mov             x14, x30
        movi            v7.8h,  #0
.if \identity
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.elseif \scale
        mov             w16, #2896*8
        dup             v0.4h,   w16
.endif
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
.if \scale
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
.if \identity
        identity_8x16_shift2 v0.h[0]
        b               L(horz_16x8_epilog)
.else
        blr             x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        srshr           \i,  \i,  #\shift
.endr
.if \shift == 1
        b               L(horz_16x8_epilog)
.else
L(horz_16x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5

.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
        st1             {\i}, [x6], #16
.endr

        ret             x14
.endif
.endif
endfunc
.endm

def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
def_horz_16 scale=0, identity=0, shift=2

function inv_txfm_add_vert_8x16_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5
        load_add_store_8x16 x6, x7
        ret             x14
endfunc

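// Two-pass 16x16 transform using a 512 byte (16*16*2) scratch buffer on
// the stack. If the eob (w3) is below the threshold in w13, the second
// half of the coefficients is known to be entirely zero, so that half of
// the scratch buffer is zero-filled instead of transformed.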
function inv_txfm_add_16x16_neon
        mov             x15, x30
        sub             sp,  sp,  #512
        mov             x8,  #16*2
.irp i, 0, 8
        add             x6,  sp,  #(\i*16*2)
.if \i == 8
        cmp             w3,  w13
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*2)
        blr             x9
.endr
        b               2f
1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
2:
.irp i, 0, 8
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #512
        ret             x15
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16,  16,  2
.endif
.ifc \txfm1, identity
        adr             x9,  inv_txfm_horz_identity_16x8_neon
.else
        adr             x9,  inv_txfm_horz_16x8_neon
        adr             x4,  inv_\txfm1\()_8h_x16_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x16_neon
        mov             x13, #\eob_half
        b               inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

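// 16x4 and 4x16 transforms: the 4-point dimension is handled with 4h
// (64 bit) vectors, and two 4h halves are packed into/unpacked from the
// d-lanes of 8h registers with ins, so the 16-point dimension can reuse
// the 8h transform functions on full vectors.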
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon
        mov             x15, x30
        movi            v4.8h,  #0

.ifc \variant, identity_
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr
.irp i, v16.d, v17.d, v18.d, v19.d
        ld1             {\i}[1], [x2]
        st1             {v4.4h}, [x2], #8
.endr
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr
.irp i, v20.d, v21.d, v22.d, v23.d
        ld1             {\i}[1], [x2]
        st1             {v4.4h}, [x2], #8
.endr

        identity_8x16_shift1 v0.h[0]

        b               L(itx_16x4_epilog)
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
        ld1             {\i},    [x2]
        st1             {v4.4h}, [x2], #8
.endr

        blr             x4

        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        srshr           \i,  \i,  #1
.endr

        ins             v24.d[1], v28.d[0]
        ins             v25.d[1], v29.d[0]
        ins             v26.d[1], v30.d[0]
        ins             v27.d[1], v31.d[0]
        srshr           v20.8h,  v24.8h,  #1
        srshr           v21.8h,  v25.8h,  #1
        srshr           v22.8h,  v26.8h,  #1
        srshr           v23.8h,  v27.8h,  #1

L(itx_16x4_epilog):
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        mov             x6,  x0
        load_add_store_8x4 x6, x7

        transpose_4x8h_mov v20, v21, v22, v23, v2,  v3,  v4,  v5, v16, v17, v18, v19
        blr             x5
        add             x6,  x0,  #8
        load_add_store_8x4 x6, x7

        ret             x15
.endif
endfunc

function inv_txfm_\variant\()add_4x16_neon
        mov             x15, x30
        movi            v2.8h,   #0

        mov             x11, #32
        cmp             w3,  w13
        b.lt            1f

        add             x6,  x2,  #16
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\i},    [x6]
        st1             {v2.8h}, [x6], x11
.endr
        mov             w16, #(5793-4096)*8
        dup             v0.4h,   w16
        identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        ld1             {\i},    [x6]
        st1             {v2.8h}, [x6], x11
.endr
        blr             x4
        srshr           v24.8h,  v16.8h,  #1
        srshr           v25.8h,  v17.8h,  #1
        srshr           v26.8h,  v18.8h,  #1
        srshr           v27.8h,  v19.8h,  #1
.endif
        transpose_4x8h  v24, v25, v26, v27, v4,  v5,  v6,  v7
        ins             v28.d[0], v24.d[1]
        ins             v29.d[0], v25.d[1]
        ins             v30.d[0], v26.d[1]
        ins             v31.d[0], v27.d[1]

        b               2f
1:
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
        movi            \i,  #0
.endr
2:
        movi            v2.8h,   #0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        ld1             {\i},    [x2]
        st1             {v2.8h}, [x2], x11
.endr
.ifc \variant, identity_
        mov             w16, #(5793-4096)*8
        dup             v0.4h,   w16
        identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]

        b               L(itx_4x16_epilog)
.else
        blr             x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
        srshr           \i,  \i,  #1
.endr
L(itx_4x16_epilog):
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        ins             v20.d[0], v16.d[1]
        ins             v21.d[0], v17.d[1]
        ins             v22.d[0], v18.d[1]
        ins             v23.d[0], v19.d[1]

        blr             x5

        load_add_store_4x16 x0, x6

        ret             x15
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.if \w == 4
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
        mov             w13, #\eob_half
.else
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
        b               inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b               inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4


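// 8x16 and 16x8 transforms: rectangular sizes with a 2:1 aspect ratio
// scale the input by 1/sqrt(2) (2896/4096 in Q12). For the identity first
// pass, the identity's shl #1 and the inter-pass srshr #1 downshift cancel
// out, so both are skipped.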
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_16x8_neon
        mov             x15, x30
        movi            v4.8h,  #0
        mov             w16, #2896*8
        dup             v0.4h,   w16

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i},    [x2]
        st1             {v4.8h}, [x2], #16
.endr

        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.ifc \variant, identity_
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h,   w16
        identity_8x16_shift1 v0.h[0]

        b               L(itx_16x8_epilog)
.else
        blr             x4

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        srshr           \i,  \i,  #1
.endr

L(itx_16x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3

        blr             x5

        mov             x6,  x0
        load_add_store_8x8 x6, x7

        transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23

        blr             x5

        add             x0,  x0,  #8
        load_add_store_8x8 x0, x7

        ret             x15
.endif
endfunc

function inv_txfm_\variant\()add_8x16_neon
        mov             x15, x30
        movi            v4.8h,   #0
        mov             w16, #2896*8
        dup             v0.4h,   w16
        mov             x11, #32

        cmp             w3,  w13
        b.lt            1f

        add             x6,  x2,  #16
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i},    [x6]
        st1             {v4.8h}, [x6], x11
.endr
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
        // The identity shl #1 and downshift srshr #1 cancel out
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i},    [x6]
        st1             {v4.8h}, [x6], x11
.endr
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        srshr           v24.8h,  v16.8h,  #1
        srshr           v25.8h,  v17.8h,  #1
        srshr           v26.8h,  v18.8h,  #1
        srshr           v27.8h,  v19.8h,  #1
        srshr           v28.8h,  v20.8h,  #1
        srshr           v29.8h,  v21.8h,  #1
        srshr           v30.8h,  v22.8h,  #1
        srshr           v31.8h,  v23.8h,  #1
.endif
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr

2:
        movi            v4.8h,   #0
        mov             w16, #2896*8
        dup             v0.4h,   w16

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i},    [x2]
        st1             {v4.8h}, [x2], x11
.endr
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
        // The identity shl #1 and downshift srshr #1 cancel out

        b               L(itx_8x16_epilog)
.else
        blr             x4

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        srshr           \i,  \i,  #1
.endr

L(itx_8x16_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3

        blr             x5

        load_add_store_8x16 x0, x6

        ret             x15
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.ifnc \txfm1, identity
        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
.endif
        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
        mov             x13, #\eob_half
.endif
.ifc \txfm1, identity
        b               inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b               inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8

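// Computes the odd half of a 32-point DCT, i.e. the 16 odd-indexed inputs
// held in v16-v31; callers combine its output with inv_dct_8h_x16_neon run
// on the even-indexed half.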
function inv_dct32_odd_8h_x16_neon, export=1
        movrel          x16, idct_coeffs, 2*16
        ld1             {v0.8h, v1.8h}, [x16]
        sub             x16, x16, #2*16

        smull_smlsl     v2,  v3,  v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
        smull_smlal     v4,  v5,  v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
        smull_smlsl     v6,  v7,  v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
        sqrshrn_sz      v16, v2,  v3,  #12, .8h                   // t16a
        sqrshrn_sz      v31, v4,  v5,  #12, .8h                   // t31a
        smull_smlal     v2,  v3,  v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
        smull_smlsl     v4,  v5,  v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
        sqrshrn_sz      v24, v6,  v7,  #12, .8h                   // t17a
        sqrshrn_sz      v23, v2,  v3,  #12, .8h                   // t30a
        smull_smlal     v6,  v7,  v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
        smull_smlsl     v2,  v3,  v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t18a
        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t29a
        smull_smlal     v4,  v5,  v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
        smull_smlsl     v6,  v7,  v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
        sqrshrn_sz      v28, v2,  v3,  #12, .8h                   // t19a
        sqrshrn_sz      v19, v4,  v5,  #12, .8h                   // t28a
        smull_smlal     v2,  v3,  v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
        smull_smlsl     v4,  v5,  v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
        sqrshrn_sz      v18, v6,  v7,  #12, .8h                   // t20a
        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t27a
        smull_smlal     v6,  v7,  v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
        smull_smlsl     v2,  v3,  v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
        sqrshrn_sz      v26, v4,  v5,  #12, .8h                   // t21a
        sqrshrn_sz      v21, v6,  v7,  #12, .8h                   // t26a
        smull_smlal     v4,  v5,  v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
        smull_smlsl     v6,  v7,  v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
        sqrshrn_sz      v22, v2,  v3,  #12, .8h                   // t22a
        sqrshrn_sz      v25, v4,  v5,  #12, .8h                   // t25a
        smull_smlal     v2,  v3,  v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
        sqrshrn_sz      v30, v6,  v7,  #12, .8h                   // t23a
        sqrshrn_sz      v17, v2,  v3,  #12, .8h                   // t24a

        ld1             {v0.8h}, [x16]

        sqsub           v2.8h,   v16.8h,  v24.8h // t17
        sqadd           v16.8h,  v16.8h,  v24.8h // t16
        sqsub           v3.8h,   v31.8h,  v23.8h // t30
        sqadd           v31.8h,  v31.8h,  v23.8h // t31
        sqsub           v24.8h,  v28.8h,  v20.8h // t18
        sqadd           v28.8h,  v28.8h,  v20.8h // t19
        sqadd           v23.8h,  v18.8h,  v26.8h // t20
        sqsub           v18.8h,  v18.8h,  v26.8h // t21
        sqsub           v20.8h,  v30.8h,  v22.8h // t22
        sqadd           v30.8h,  v30.8h,  v22.8h // t23
        sqadd           v26.8h,  v17.8h,  v25.8h // t24
        sqsub           v17.8h,  v17.8h,  v25.8h // t25
        sqsub           v22.8h,  v29.8h,  v21.8h // t26
        sqadd           v29.8h,  v29.8h,  v21.8h // t27
        sqadd           v25.8h,  v19.8h,  v27.8h // t28
        sqsub           v19.8h,  v19.8h,  v27.8h // t29

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[4], v0.h[5], .8h // -> t17a
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[5], v0.h[4], .8h // -> t30a
        smull_smlal     v2,  v3,  v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
        sqrshrn_sz      v21, v4,  v5,  #12, .8h                   // t17a
        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t30a
        neg             v2.4s,   v2.4s                            // -> t18a
        neg             v3.4s,   v3.4s                            // -> t18a
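        // The t18a rotation uses a negated coefficient; negating both
        // 32-bit accumulator halves before the narrowing sqrshrn applies
        // the sign without any extra multiplies (same pattern below for
        // t22a, t20 and t21a).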
        smull_smlsl     v4,  v5,  v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
        smull_smlsl     v6,  v7,  v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t18a
        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t29a
        smull_smlal     v2,  v3,  v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
        smull_smlal     v4,  v5,  v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
        sqrshrn_sz      v22, v6,  v7,  #12, .8h                   // t21a
        sqrshrn_sz      v18, v2,  v3,  #12, .8h                   // t26a
        neg             v4.4s,   v4.4s                            // -> t22a
        neg             v5.4s,   v5.4s                            // -> t22a
        smull_smlsl     v6,  v7,  v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
        sqrshrn_sz      v17, v4,  v5,  #12, .8h                   // t22a
        sqrshrn_sz      v20, v6,  v7,  #12, .8h                   // t25a

        sqsub           v2.8h,   v27.8h,  v24.8h // t29
        sqadd           v27.8h,  v27.8h,  v24.8h // t30
        sqsub           v3.8h,   v21.8h,  v19.8h // t18
        sqadd           v21.8h,  v21.8h,  v19.8h // t17
        sqsub           v24.8h,  v16.8h,  v28.8h // t19a
        sqadd           v16.8h,  v16.8h,  v28.8h // t16a
        sqsub           v19.8h,  v30.8h,  v23.8h // t20a
        sqadd           v30.8h,  v30.8h,  v23.8h // t23a
        sqsub           v28.8h,  v17.8h,  v22.8h // t21
        sqadd           v17.8h,  v17.8h,  v22.8h // t22
        sqadd           v23.8h,  v26.8h,  v29.8h // t24a
        sqsub           v26.8h,  v26.8h,  v29.8h // t27a
        sqadd           v22.8h,  v20.8h,  v18.8h // t25
        sqsub           v20.8h,  v20.8h,  v18.8h // t26
        sqsub           v29.8h,  v31.8h,  v25.8h // t28a
        sqadd           v31.8h,  v31.8h,  v25.8h // t31a

        smull_smlsl     v4,  v5,  v2,  v3,  v0.h[2], v0.h[3], .8h // -> t18a
        smull_smlal     v6,  v7,  v2,  v3,  v0.h[3], v0.h[2], .8h // -> t29a
        smull_smlsl     v2,  v3,  v29, v24, v0.h[2], v0.h[3], .8h // -> t19
        sqrshrn_sz      v18, v4,  v5,  #12, .8h                   // t18a
        sqrshrn_sz      v25, v6,  v7,  #12, .8h                   // t29a
        smull_smlal     v4,  v5,  v29, v24, v0.h[3], v0.h[2], .8h // -> t28
        smull_smlal     v6,  v7,  v26, v19, v0.h[3], v0.h[2], .8h // -> t20
        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t19
        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t28
        neg             v6.4s,   v6.4s                            // -> t20
        neg             v7.4s,   v7.4s                            // -> t20
        smull_smlsl     v2,  v3,  v26, v19, v0.h[2], v0.h[3], .8h // -> t27
        smull_smlal     v4,  v5,  v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
        sqrshrn_sz      v26, v6,  v7,  #12, .8h                   // t20
        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t27
        neg             v4.4s,   v4.4s                            // -> t21a
        neg             v5.4s,   v5.4s                            // -> t21a
        smull_smlsl     v6,  v7,  v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t21a
        sqrshrn_sz      v28, v6,  v7,  #12, .8h                   // t26a

        sqsub           v2.8h,   v16.8h,  v30.8h // t23
        sqadd           v16.8h,  v16.8h,  v30.8h // t16 = out16
        sqsub           v3.8h,   v31.8h,  v23.8h // t24
        sqadd           v31.8h,  v31.8h,  v23.8h // t31 = out31
        sqsub           v23.8h,  v21.8h,  v17.8h // t22a
        sqadd           v17.8h,  v21.8h,  v17.8h // t17a = out17
        sqadd           v30.8h,  v27.8h,  v22.8h // t30a = out30
        sqsub           v21.8h,  v27.8h,  v22.8h // t25a
        sqsub           v27.8h,  v18.8h,  v20.8h // t21
        sqadd           v18.8h,  v18.8h,  v20.8h // t18 = out18
        sqadd           v4.8h,   v29.8h,  v26.8h // t19a = out19
        sqsub           v26.8h,  v29.8h,  v26.8h // t20a
        sqadd           v29.8h,  v25.8h,  v28.8h // t29 = out29
        sqsub           v25.8h,  v25.8h,  v28.8h // t26
        sqadd           v28.8h,  v24.8h,  v19.8h // t28a = out28
        sqsub           v24.8h,  v24.8h,  v19.8h // t27a
        mov             v19.16b, v4.16b          // out19

        smull_smlsl     v4,  v5,  v24, v26, v0.h[0], v0.h[0], .8h // -> t20
        smull_smlal     v6,  v7,  v24, v26, v0.h[0], v0.h[0], .8h // -> t27
        sqrshrn_sz      v20, v4,  v5,  #12, .8h   // t20
        sqrshrn_sz      v22, v6,  v7,  #12, .8h   // t27

        smull_smlal     v4,  v5,  v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
        smull_smlsl     v6,  v7,  v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
        mov             v27.16b,  v22.16b         // t27
        sqrshrn_sz      v26, v4,  v5,  #12, .8h   // t26a

        smull_smlsl     v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
        smull_smlal     v4,  v5,  v21, v23, v0.h[0], v0.h[0], .8h // -> t25
        sqrshrn_sz      v21, v6,  v7,  #12, .8h   // t21a
        sqrshrn_sz      v22, v24, v25, #12, .8h   // t22
        sqrshrn_sz      v25, v4,  v5,  #12, .8h   // t25

        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t23a
        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t24a
        sqrshrn_sz      v23, v4,  v5,  #12, .8h   // t23a
        sqrshrn_sz      v24, v6,  v7,  #12, .8h   // t24a

        ret
endfunc

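// Horizontal 32x8 pass: the even-indexed coefficients (loaded with a
// doubled stride) go through inv_dct_8h_x16_neon and are stored with gaps,
// the odd-indexed ones through inv_dct32_odd_8h_x16_neon; the two halves
// are then combined with add/sub, with rev64+ext used to store the
// subtracted half in reversed order.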
.macro def_horz_32 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_dct_32x8_neon
        mov             x14, x30
        movi            v7.8h,  #0
        lsl             x8,  x8,  #1
.if \scale
        mov             w16, #2896*8
        dup             v0.4h,   w16
.endif

.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1
.if \scale
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct_8h_x16_neon
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5

.macro store1 r0, r1
        st1             {\r0}, [x6], #16
        st1             {\r1}, [x6], #16
        add             x6,  x6,  #32
.endm
        store1          v16.8h,  v24.8h
        store1          v17.8h,  v25.8h
        store1          v18.8h,  v26.8h
        store1          v19.8h,  v27.8h
        store1          v20.8h,  v28.8h
        store1          v21.8h,  v29.8h
        store1          v22.8h,  v30.8h
        store1          v23.8h,  v31.8h
.purgem store1
        sub             x6,  x6,  #64*8

        movi            v7.8h,  #0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in v0.h[1]
        scale_input     .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct32_odd_8h_x16_neon
        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
        transpose_8x8h  v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
.macro store2 r0, r1, shift
        ld1             {v4.8h, v5.8h}, [x6]
        sqsub           v7.8h,   v4.8h,   \r0
        sqsub           v6.8h,   v5.8h,   \r1
        sqadd           v4.8h,   v4.8h,   \r0
        sqadd           v5.8h,   v5.8h,   \r1
        rev64           v6.8h,   v6.8h
        rev64           v7.8h,   v7.8h
        srshr           v4.8h,   v4.8h,   #\shift
        srshr           v5.8h,   v5.8h,   #\shift
        srshr           v6.8h,   v6.8h,   #\shift
        srshr           v7.8h,   v7.8h,   #\shift
        ext             v6.16b,  v6.16b,  v6.16b,  #8
        st1             {v4.8h, v5.8h}, [x6], #32
        ext             v7.16b,  v7.16b,  v7.16b,  #8
        st1             {v6.8h, v7.8h}, [x6], #32
.endm

        store2          v31.8h,  v23.8h, \shift
        store2          v30.8h,  v22.8h, \shift
        store2          v29.8h,  v21.8h, \shift
        store2          v28.8h,  v20.8h, \shift
        store2          v27.8h,  v19.8h, \shift
        store2          v26.8h,  v18.8h, \shift
        store2          v25.8h,  v17.8h, \shift
        store2          v24.8h,  v16.8h, \shift
.purgem store2
        ret             x14
endfunc
.endm

def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale

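// Vertical 8x32 pass: reads the even and odd halves of a column strip
// from the intermediate buffer and transforms them; the combine macro
// below then adds (top 16 rows) or subtracts (bottom 16 rows, walked
// backwards via the negated stride in x9) the odd-half registers, rounds
// with srshr #4 and accumulates onto the destination pixels.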
function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8,  x8,  #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4

        bl              inv_dct_8h_x16_neon

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        sub             x7,  x7,  x8, lsr #1
        bl              inv_dct32_odd_8h_x16_neon

        neg             x9,  x8
        mov             x10, x6
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7],    \stride
        ld1             {v2.8b}, [x10],   x1
        ld1             {v6.8h}, [x7],    \stride
        ld1             {v3.8b}, [x10],   x1
        \op             v5.8h,   v5.8h,   \r0
        ld1             {v7.8h}, [x7],    \stride
        ld1             {v4.8b}, [x10],   x1
        srshr           v5.8h,   v5.8h,   #4
        \op             v6.8h,   v6.8h,   \r1
        uaddw           v5.8h,   v5.8h,   v2.8b
        srshr           v6.8h,   v6.8h,   #4
        \op             v7.8h,   v7.8h,   \r2
        sqxtun          v2.8b,   v5.8h
        ld1             {v5.8h}, [x7],    \stride
        uaddw           v6.8h,   v6.8h,   v3.8b
        srshr           v7.8h,   v7.8h,   #4
        \op             v5.8h,   v5.8h,   \r3
        st1             {v2.8b}, [x6],    x1
        ld1             {v2.8b}, [x10],   x1
        sqxtun          v3.8b,   v6.8h
        uaddw           v7.8h,   v7.8h,   v4.8b
        srshr           v5.8h,   v5.8h,   #4
        st1             {v3.8b}, [x6],    x1
        sqxtun          v4.8b,   v7.8h
        uaddw           v5.8h,   v5.8h,   v2.8b
        st1             {v4.8b}, [x6],    x1
        sqxtun          v2.8b,   v5.8h
        st1             {v2.8b}, [x6],    x1
.endm
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7,  x7,  x8
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        ret             x14
endfunc

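// eob thresholds: the functions below compare the eob (w3) against these
// entries; once w3 falls below an entry, the remaining 8-coefficient
// strips are known to be all zero and are skipped or zero-filled instead
// of transformed.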
const eob_32x32
        .short 36, 136, 300, 1024
endconst

const eob_16x32
        .short 36, 151, 279, 512
endconst

const eob_16x32_shortside
        .short 36, 512
endconst

const eob_8x32
        .short 43, 107, 171, 256
endconst

function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
        movi            v0.8h,  #0
        movrel          x13, eob_32x32

        mov             x8,  #2*32
1:
        mov             w9,  #0
        movrel          x12, eob_32x32
2:
        add             w9,  w9,  #8
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v0.8h}, [x2], x8
.endr
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #2*8
        b               1b
9:
        ret
endfunc

.macro shift_8_regs op, shift
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        \op             \i,  \i,  #\shift
.endr
.endm

.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        mov             w16, #2896*8
        mov             w17, #2*(5793-4096)*8
        dup             v1.4h,   w16
        movi            v0.8h,   #0
        mov             v1.h[1], w17
        movrel          x13, eob_16x32\hshort

        mov             x8,  #2*\h
1:
        mov             w9,  #0
        movrel          x12, eob_16x32\wshort
2:
        add             w9,  w9,  #8
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8
.endr
        scale_input     .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23

.if \w == 16
        // 16x32
        identity_8x8_shift1 v1.h[1]
.else
        // 32x16
        shift_8_regs    sqshl, 1
        identity_8x8    v1.h[1]
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #2*8
        b               1b
9:
        ret
endfunc
.endm

def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside

.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        movi            v0.8h,  #0
        movrel          x13, eob_8x32

        mov             w8,  #2*\h
1:
        ldrh            w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8
.endr

.if \w == 8
        // 8x32
        shift_8_regs    srshr, 1
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        cmp             w3,  w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        sub             x2,  x2,  x8, lsl #3
        add             x2,  x2,  #2*8
.else
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

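// 32x32 dct: first pass runs strip by strip into a 2048 byte (32*32*2)
// stack buffer, zero-filling 8-line strips whose coefficients lie entirely
// beyond the eob (per the eob_32x32 thresholds), followed by a vertical
// second pass over 8-column slices.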
function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
        idct_dc         32,  32,  2

        mov             x15, x30
        sub             sp,  sp,  #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 8, 16, 24
        add             x6,  sp,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_horz_dct_32x8_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #2048
        ret             x15
endfunc

function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
        idct_dc         16,  32,  1

        mov             x15, x30
        sub             sp,  sp,  #1024
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4,  inv_dct_8h_x16_neon

.irp i, 0, 8, 16, 24
        add             x6,  sp,  #(\i*16*2)
        add             x7,  x2,  #(\i*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8,  #2*32
        bl              inv_txfm_horz_scale_16x8_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #8
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #1024
        ret             x15
endfunc

function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
        idct_dc         32,  16,  1

        mov             x15, x30
        sub             sp,  sp,  #1024

        adr             x5,  inv_dct_8h_x16_neon

.irp i, 0, 8
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  #36
        b.lt            1f
.endif
        mov             x8,  #2*16
        bl              inv_txfm_horz_scale_dct_32x8_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        mov             x8,  #32*2
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i)
        add             x7,  sp,  #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #1024
        ret             x15
endfunc

function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
        idct_dc         8,   32, 2

        mov             x15, x30
        sub             sp,  sp,  #512

        movrel          x13, eob_8x32

        movi            v28.8h,  #0
        mov             x8,  #2*32
        mov             w9,  #32
        mov             x6,  sp
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v28.8h}, [x2], x8
.endr
        ldrh            w12, [x13], #2
        sub             x2,  x2,  x8, lsl #3
        sub             w9,  w9,  #8
        add             x2,  x2,  #2*8

        bl              inv_dct_8h_x8_neon

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        srshr           v\i\().8h,  v\i\().8h,  #2
.endr

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        cmp             w3,  w12
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64

        b.ge            1b
        cbz             w9,  3f

        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
2:
        subs            w9,  w9,  #8
.rept 2
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
.endr
        b.gt            2b

3:
        mov             x6,  x0
        mov             x7,  sp
        mov             x8,  #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp,  sp,  #512
        ret             x15
endfunc

function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
        idct_dc         32,  8,   2

        mov             x15, x30
        sub             sp,  sp,  #512

        mov             x6,  sp
        mov             x7,  x2
        mov             x8,  #8*2
        bl              inv_txfm_horz_dct_32x8_neon

        mov             x8,  #2*32
        mov             w9,  #0
1:
        add             x6,  x0,  x9
        add             x7,  sp,  x9, lsl #1 // #(\i*2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9,  w9,  #8

        bl              inv_dct_8h_x8_neon

        cmp             w9,  #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp,  sp,  #512
        ret             x15
endfunc

function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

        ld1             {v0.8h, v1.8h}, [x17], #32

        sqrdmulh        v23.8h,  v16.8h,  v0.h[1]   // t63a
        sqrdmulh        v16.8h,  v16.8h,  v0.h[0]   // t32a
        sqrdmulh        v22.8h,  v17.8h,  v0.h[2]   // t62a
        sqrdmulh        v17.8h,  v17.8h,  v0.h[3]   // t33a
        sqrdmulh        v21.8h,  v18.8h,  v0.h[5]   // t61a
        sqrdmulh        v18.8h,  v18.8h,  v0.h[4]   // t34a
        sqrdmulh        v20.8h,  v19.8h,  v0.h[6]   // t60a
        sqrdmulh        v19.8h,  v19.8h,  v0.h[7]   // t35a

        sqadd           v24.8h,  v16.8h,  v17.8h    // t32
        sqsub           v25.8h,  v16.8h,  v17.8h    // t33
        sqsub           v26.8h,  v19.8h,  v18.8h    // t34
        sqadd           v27.8h,  v19.8h,  v18.8h    // t35
        sqadd           v28.8h,  v20.8h,  v21.8h    // t60
        sqsub           v29.8h,  v20.8h,  v21.8h    // t61
        sqsub           v30.8h,  v23.8h,  v22.8h    // t62
        sqadd           v31.8h,  v23.8h,  v22.8h    // t63

        smull_smlal     v2,  v3,  v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
        smull_smlsl     v4,  v5,  v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
        neg             v2.4s,   v2.4s              // t34a
        neg             v3.4s,   v3.4s              // t34a
        smull_smlsl     v6,  v7,  v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
        sqrshrn_sz      v26, v2,  v3,  #12, .8h     // t34a
        smull_smlal     v2,  v3,  v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
        sqrshrn_sz      v29, v4,  v5,  #12, .8h     // t61a
        sqrshrn_sz      v25, v6,  v7,  #12, .8h     // t33a
        sqrshrn_sz      v30, v2,  v3,  #12, .8h     // t62a

        sqadd           v16.8h,  v24.8h,  v27.8h    // t32a
        sqsub           v19.8h,  v24.8h,  v27.8h    // t35a
        sqadd           v17.8h,  v25.8h,  v26.8h    // t33
        sqsub           v18.8h,  v25.8h,  v26.8h    // t34
        sqsub           v20.8h,  v31.8h,  v28.8h    // t60a
        sqadd           v23.8h,  v31.8h,  v28.8h    // t63a
        sqsub           v21.8h,  v30.8h,  v29.8h    // t61
        sqadd           v22.8h,  v30.8h,  v29.8h    // t62

        smull_smlal     v2,  v3,  v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
        smull_smlsl     v4,  v5,  v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
        smull_smlal     v6,  v7,  v20, v19, v1.h[2], v1.h[3], .8h // -> t60
        sqrshrn_sz      v21, v2,  v3,  #12, .8h     // t61a
        sqrshrn_sz      v18, v4,  v5,  #12, .8h     // t34a
        smull_smlsl     v2,  v3,  v20, v19, v1.h[3], v1.h[2], .8h // -> t35
        sqrshrn_sz      v20, v6,  v7,  #12, .8h     // t60
        sqrshrn_sz      v19, v2,  v3,  #12, .8h     // t35

        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64

        ret
endfunc

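// Second step of the 64-point DCT tail: walks the t32..t63 values stored
// by step1 from both ends (x6 ascending, x9 descending), combining
// mirrored pairs and applying the remaining rotations from idct_coeffs.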
function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4h}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #2*8*0]  // t32a
        ldr             q17, [x9, #2*8*8]  // t39a
        ldr             q18, [x9, #2*8*0]  // t63a
        ldr             q19, [x6, #2*8*8]  // t56a
        ldr             q20, [x6, #2*8*16] // t40a
        ldr             q21, [x9, #2*8*24] // t47a
        ldr             q22, [x9, #2*8*16] // t55a
        ldr             q23, [x6, #2*8*24] // t48a

        sqadd           v24.8h,  v16.8h, v17.8h // t32
        sqsub           v25.8h,  v16.8h, v17.8h // t39
        sqadd           v26.8h,  v18.8h, v19.8h // t63
        sqsub           v27.8h,  v18.8h, v19.8h // t56
        sqsub           v28.8h,  v21.8h, v20.8h // t40
        sqadd           v29.8h,  v21.8h, v20.8h // t47
        sqadd           v30.8h,  v23.8h, v22.8h // t48
        sqsub           v31.8h,  v23.8h, v22.8h // t55

        smull_smlal     v2,  v3,  v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
        smull_smlsl     v4,  v5,  v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
        smull_smlal     v6,  v7,  v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
        sqrshrn_sz      v25, v2,  v3,  #12, .8h     // t56a
        sqrshrn_sz      v27, v4,  v5,  #12, .8h     // t39a
        neg             v6.4s,   v6.4s              // t40a
        neg             v7.4s,   v7.4s              // t40a
        smull_smlsl     v2,  v3,  v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
        sqrshrn_sz      v31, v6,  v7,  #12, .8h     // t40a
        sqrshrn_sz      v28, v2,  v3,  #12, .8h     // t55a

        sqadd           v16.8h,  v24.8h,  v29.8h    // t32a
        sqsub           v19.8h,  v24.8h,  v29.8h    // t47a
        sqadd           v17.8h,  v27.8h,  v31.8h    // t39
        sqsub           v18.8h,  v27.8h,  v31.8h    // t40
        sqsub           v20.8h,  v26.8h,  v30.8h    // t48a
        sqadd           v23.8h,  v26.8h,  v30.8h    // t63a
        sqsub           v21.8h,  v25.8h,  v28.8h    // t55
        sqadd           v22.8h,  v25.8h,  v28.8h    // t56

        smull_smlsl     v2,  v3,  v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
        smull_smlal     v4,  v5,  v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
        smull_smlsl     v6,  v7,  v20, v19, v0.h[0], v0.h[0], .8h // -> t47
        sqrshrn_sz      v18, v2,  v3,  #12, .8h     // t40a
        sqrshrn_sz      v21, v4,  v5,  #12, .8h     // t55a
        smull_smlal     v2,  v3,  v20, v19, v0.h[0], v0.h[0], .8h // -> t48
        sqrshrn_sz      v19, v6,  v7,  #12, .8h     // t47
        sqrshrn_sz      v20, v2,  v3,  #12, .8h     // t48

        str             q16, [x6, #2*8*0]  // t32a
        str             q17, [x9, #2*8*0]  // t39
        str             q18, [x6, #2*8*8]  // t40a
        str             q19, [x9, #2*8*8]  // t47
        str             q20, [x6, #2*8*16] // t48
        str             q21, [x9, #2*8*16] // t55a
        str             q22, [x6, #2*8*24] // t56
        str             q23, [x9, #2*8*24] // t63a

        add             x6,  x6,  #2*8
        sub             x9,  x9,  #2*8
        cmp             x6,  x9
        b.lt            1b
        ret
endfunc

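// Conditional helper macros: each emits its instruction(s) only when the
// \cond/\clear/\scale argument is set, letting def_dct64_func below build
// the plain, clear and scale variants of the dct64 first pass from a
// single macro body.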
2698.macro load8 src, strd, zero, clear
2699.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
2700.if \clear
2701        ld1             {\i}, [\src]
2702        st1             {\zero}, [\src], \strd
2703.else
2704        ld1             {\i}, [\src], \strd
2705.endif
2706.endr
2707.endm
2708
2709.macro store16 dst
2710.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
2711        st1             {\i}, [\dst], #16
2712.endr
2713.endm
2714
.macro clear_upper8
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr
.endm

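// Conditional helper macros; each one emits its instruction only if \cond
// is set, which lets the plain, _clear and _clear_scale variants of the
// dct64 function below be generated from a single macro body.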
.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm

.macro movdup_if reg, gpr, val, cond
.if \cond
        mov             \gpr, \val
        dup             \reg, \gpr
.endif
.endm

.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm

.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm

.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm

.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm

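// Define a function inv_txfm_dct\suffix\()_8h_x64_neon, doing one pass of
// a 64-point DCT over 8 lanes; input is read from x7 with stride x8 and
// the 64 intermediate rows are written to the stack at x6. The transform
// is decomposed into a dct16 of the inputs at multiples of 4, a dct32-odd
// part of the inputs at offsets 2 (mod 4), and four dct64_step1 rounds
// over the odd inputs. Only the first 32 input rows can be nonzero for
// 64-point transforms, so the upper input registers are simply cleared.
// With \clear set, the input buffer is zeroed as it is consumed; with
// \scale set, the inputs are prescaled by 2896/4096 (i.e. 1/sqrt(2)), as
// used for rectangular transforms.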
.macro def_dct64_func suffix, clear=0, scale=0
function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
        mov             x14, x30
        mov             x6,  sp
        lsl             x8,  x8,  #2

        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        load8           x7,  x8,  v7.8h, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct_8h_x16_neon

        store16         x6

        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        load8           x7,  x8,  v7.8h, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        lsr             x8,  x8,  #1
        sub             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct32_odd_8h_x16_neon

        add             x10, x6,  #16*15
        sub             x6,  x6,  #16*16

        mov             x9,  #-16

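// Butterfly the dct32-odd outputs in v16-v31 with the dct16 results that
// were stored at x6: sums are written forwards from x6, differences
// backwards from x10.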
.macro store_addsub r0, r1, r2, r3
        ld1             {v2.8h}, [x6], #16
        ld1             {v3.8h}, [x6], #16
        sqadd           v6.8h,  v2.8h,  \r0
        sqsub           \r0,    v2.8h,  \r0
        ld1             {v4.8h}, [x6], #16
        sqadd           v7.8h,  v3.8h,  \r1
        sqsub           \r1,    v3.8h,  \r1
        ld1             {v5.8h}, [x6], #16
        sqadd           v2.8h,  v4.8h,  \r2
        sub             x6,  x6,  #16*4
        sqsub           \r2,    v4.8h,  \r2
        st1             {v6.8h}, [x6], #16
        st1             {\r0},   [x10], x9
        sqadd           v3.8h,  v5.8h,  \r3
        sqsub           \r3,    v5.8h,  \r3
        st1             {v7.8h}, [x6], #16
        st1             {\r1},   [x10], x9
        st1             {v2.8h}, [x6], #16
        st1             {\r2},   [x10], x9
        st1             {v3.8h}, [x6], #16
        st1             {\r3},   [x10], x9
.endm
        store_addsub    v31.8h, v30.8h, v29.8h, v28.8h
        store_addsub    v27.8h, v26.8h, v25.8h, v24.8h
        store_addsub    v23.8h, v22.8h, v21.8h, v20.8h
        store_addsub    v19.8h, v18.8h, v17.8h, v16.8h
.purgem store_addsub

        add             x6,  x6,  #2*8*16

        movrel          x17, idct64_coeffs
        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        add             x9,  x7,  x8, lsl #4 // offset 16
        add             x10, x7,  x8, lsl #3 // offset 8
        sub             x9,  x9,  x8         // offset 15
        sub             x11, x10, x8         // offset 7
        ld1             {v16.8h}, [x7]  // in1  (offset 0)
        ld1             {v17.8h}, [x9]  // in31 (offset 15)
        ld1             {v18.8h}, [x10] // in17 (offset 8)
        ld1             {v19.8h}, [x11] // in15 (offset 7)
        st1_if          {v7.8h}, [x7],  \clear
        st1_if          {v7.8h}, [x9],  \clear
        st1_if          {v7.8h}, [x10], \clear
        st1_if          {v7.8h}, [x11], \clear
        scale_if        \scale, v0.h[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        add             x7,  x7,  x8, lsl #2 // offset 4
        sub             x9,  x9,  x8, lsl #2 // offset 11
        sub             x10, x7,  x8         // offset 3
        add             x11, x9,  x8         // offset 12
        ld1             {v16.8h}, [x10] // in7  (offset 3)
        ld1             {v17.8h}, [x11] // in25 (offset 12)
        ld1             {v18.8h}, [x9]  // in23 (offset 11)
        ld1             {v19.8h}, [x7]  // in9  (offset 4)
        st1_if          {v7.8h}, [x7],  \clear
        st1_if          {v7.8h}, [x9],  \clear
        st1_if          {v7.8h}, [x10], \clear
        st1_if          {v7.8h}, [x11], \clear
        scale_if        \scale, v0.h[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        sub             x10, x10, x8, lsl #1 // offset 1
        sub             x9,  x9,  x8, lsl #1 // offset 9
        add             x7,  x7,  x8         // offset 5
        add             x11, x11, x8         // offset 13
        ldr             q16, [x10, x8] // in5  (offset 2)
        ldr             q17, [x11]     // in27 (offset 13)
        ldr             q18, [x9,  x8] // in21 (offset 10)
        ldr             q19, [x7]      // in11 (offset 5)
        stroff_if       q7,  [x10, x8], \clear
        str_if          q7,  [x11],     \clear
        stroff_if       q7,  [x9,  x8], \clear
        str_if          q7,  [x7],      \clear
        scale_if        \scale, v0.h[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movdup_if       v0.4h, w16, #2896*8, \scale
        movi_if         v7.8h,  #0, \clear
        ldr             q16, [x10]     // in3  (offset 1)
        ldr             q17, [x11, x8] // in29 (offset 14)
        ldr             q18, [x9]      // in19 (offset 9)
        ldr             q19, [x7,  x8] // in13 (offset 6)
        str_if          q7,  [x10],     \clear
        stroff_if       q7,  [x11, x8], \clear
        str_if          q7,  [x9],      \clear
        stroff_if       q7,  [x7,  x8], \clear
        scale_if        \scale, v0.h[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon

        sub             x6,  x6,  #2*8*32
        add             x9,  x6,  #2*8*7

        bl              inv_dct64_step2_neon

        ret             x14
endfunc
.endm

def_dct64_func
def_dct64_func _clear, clear=1
def_dct64_func _clear_scale, clear=1, scale=1


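// Finish the horizontal pass of an 8-row slice of a 64-point DCT: combine
// the two mirrored halves of the dct64 intermediate from the stack, apply
// the rounding shift given in w12 (a negative shift amount, i.e. a
// rounding right shift), transpose, and store 8 rows of 64 coefficients
// to x6.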
function inv_txfm_horz_dct_64x8_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #2*8*(64 - 4)
        add             x9,  x6,  #2*56
        mov             x10, #2*64
        mov             x11, #-2*8*4

        dup             v7.8h,  w12
1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5

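// Butterfly four pairs of transposed vectors and round via the shift in
// v7; sums are stored row by row at x6, and the differences, with their
// elements reversed, at the mirrored columns at x9.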
.macro store_addsub src0, src1, src2, src3
        sqsub           v1.8h,   \src0,   \src1
        sqadd           v0.8h,   \src0,   \src1
        sqsub           v3.8h,   \src2,   \src3
        srshl           v1.8h,   v1.8h,   v7.8h
        sqadd           v2.8h,   \src2,   \src3
        srshl           v0.8h,   v0.8h,   v7.8h
        srshl           v3.8h,   v3.8h,   v7.8h
        rev64           v1.8h,   v1.8h
        srshl           v2.8h,   v2.8h,   v7.8h
        rev64           v3.8h,   v3.8h
        ext             v1.16b,  v1.16b,  v1.16b,  #8
        st1             {v0.8h},  [x6], x10
        ext             v3.16b,  v3.16b,  v3.16b,  #8
        st1             {v1.8h},  [x9], x10
        st1             {v2.8h},  [x6], x10
        st1             {v3.8h},  [x9], x10
.endm
        store_addsub    v16.8h,  v31.8h,  v17.8h,  v30.8h
        store_addsub    v18.8h,  v29.8h,  v19.8h,  v28.8h
        store_addsub    v20.8h,  v27.8h,  v21.8h,  v26.8h
        store_addsub    v22.8h,  v25.8h,  v23.8h,  v24.8h
.purgem store_addsub
        sub             x6,  x6,  x10, lsl #3
        sub             x9,  x9,  x10, lsl #3
        add             x6,  x6,  #16
        sub             x9,  x9,  #16

        cmp             x7,  x8
        b.lt            1b
        ret             x14
endfunc

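// Apply the vertical half of a 64-point DCT to 8 pixel columns: combine
// the two mirrored halves of the transform from the stack, round by 4
// bits, and add the result to the destination at x6 with stride x1.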
function inv_txfm_add_vert_dct_8x64_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #2*8*(64 - 4)
        add             x9,  x6,  x1, lsl #6
        sub             x9,  x9,  x1
        neg             x10, x1
        mov             x11, #-2*8*4

1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11

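// Butterfly two pairs of vectors, round by 4 bits, add them to destination
// rows read from x6 (walking downwards) and x9 (walking upwards), and
// store the saturated results back.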
.macro add_dest_addsub src0, src1, src2, src3
        ld1             {v0.8b}, [x6], x1
        ld1             {v1.8b}, [x9], x10
        sqadd           v4.8h,   \src0,   \src1
        ld1             {v2.8b}, [x6]
        sqsub           v5.8h,   \src0,   \src1
        ld1             {v3.8b}, [x9]
        sqadd           v6.8h,   \src2,   \src3
        sqsub           v7.8h,   \src2,   \src3
        sub             x6,  x6,  x1
        sub             x9,  x9,  x10
        srshr           v4.8h,   v4.8h,   #4
        srshr           v5.8h,   v5.8h,   #4
        srshr           v6.8h,   v6.8h,   #4
        uaddw           v4.8h,   v4.8h,   v0.8b
        srshr           v7.8h,   v7.8h,   #4
        uaddw           v5.8h,   v5.8h,   v1.8b
        uaddw           v6.8h,   v6.8h,   v2.8b
        sqxtun          v0.8b,   v4.8h
        uaddw           v7.8h,   v7.8h,   v3.8b
        sqxtun          v1.8b,   v5.8h
        st1             {v0.8b}, [x6], x1
        sqxtun          v2.8b,   v6.8h
        st1             {v1.8b}, [x9], x10
        sqxtun          v3.8b,   v7.8h
        st1             {v2.8b}, [x6], x1
        st1             {v3.8b}, [x9], x10
.endm
        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
.purgem add_dest_addsub
        cmp             x7,  x8
        b.lt            1b

        ret             x14
endfunc

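// 64x64 inverse DCT: the first pass transforms 8 rows at a time into a
// 64x32 coefficient buffer on the stack (slices whose input rows lie
// entirely past the eob threshold are just zero-filled), the second pass
// transforms 8 columns at a time and adds the result to the destination.
// Only a 32x32 region of input coefficients exists for 64-point
// transforms.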
function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
        idct_dc         64,  64,  2

        mov             x15, x30

        sub_sp          64*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_32x32

.irp i, 0, 8, 16, 24
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_8h_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x8_neon
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_dct_8h_x64_neon
        add             x6,  x0,  #(\i)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #64*32*2
        ret             x15
endfunc

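// 64x32: same structure as 64x64, but the horizontal pass uses the _scale
// variant (rectangular transforms prescale by 1/sqrt(2)) with a 1-bit
// intermediate shift, and the vertical pass is a 32-point DCT.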
function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
        idct_dc         64,  32,  1

        mov             x15, x30

        sub_sp          64*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_32x32

.irp i, 0, 8, 16, 24
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        mov             x12, #-1 // shift
        bl              inv_txfm_dct_clear_scale_8h_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x8_neon
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i)
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  x5,  #64*32*2
        ret             x15
endfunc

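// 32x64: a scaled 32-point horizontal pass into a 32x32 buffer, followed
// by a 64-point vertical pass on 8 columns at a time.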
function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
        idct_dc         32,  64,  1

        mov             x15, x30

        sub_sp          32*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 8, 16, 24
        add             x6,  x5,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_horz_scale_dct_32x8_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x7,  x5,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_dct_8h_x64_neon
        add             x6,  x0,  #(\i)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #32*32*2
        ret             x15
endfunc

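// 64x16: a 64-point horizontal pass (two 8-row slices) with a 2-bit
// intermediate shift, followed by a 16-point vertical pass.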
function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
        idct_dc         64,  16,  2

        mov             x15, x30

        sub_sp          64*16*2+64*8*2
        add             x4,  sp, #64*8*2

        movrel          x13, eob_16x32

.irp i, 0, 8
        add             x6,  x4,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #16*2
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_8h_x64_neon
        add             x6,  x4,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x8_neon
.if \i < 8
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        adr             x5,  inv_dct_8h_x16_neon
        mov             x8,  #64*2
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i)
        add             x7,  x4,  #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  x4,  #64*16*2
        ret             x15
endfunc

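// 16x64: a 16-point horizontal pass into a 16x32 buffer, followed by a
// 64-point vertical pass on two slices of 8 columns.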
function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
        idct_dc         16,  64,  2

        mov             x15, x30

        sub_sp          16*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2

        adr             x4,  inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24
        add             x6,  x5,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_horz_16x8_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #8
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x7,  x5,  #(\i*2)
        mov             x8,  #16*2
        bl              inv_txfm_dct_8h_x64_neon
        add             x6,  x0,  #(\i)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #16*32*2
        ret             x15
endfunc