/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have got the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);

// Most of the functions use the following register layout:
// x0-x3   external parameters
// x4      function pointer to first transform
// x5      function pointer to second transform
// x6      output parameter for helper function
// x7      input parameter for helper function
// x8      input stride for helper function
// x9-x12  scratch variables for helper functions
// x13     pointer to list of eob thresholds
// x14     return pointer for helper function
// x15     return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms

// Potential further optimizations that are left unimplemented for now:
// - Trying to keep multiplication coefficients in registers across multiple
//   transform functions. (The register layout is designed to potentially
//   allow this.)
// - Use a simplified version of the transforms themselves for cases where
//   we know a significant number of inputs are zero. E.g. if the eob value
//   indicates only a quarter of input values are set, for idct16 and up,
//   a significant amount of calculation can be skipped, at the cost of more
//   code duplication and special casing.
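// A short sketch of the fixed-point arithmetic used throughout (an added
// explanatory note, not a normative description): the butterfly helpers
// below (smull_smlal/smull_smlsl followed by sqrshrn_sz with #12) compute,
// per 16 bit lane and ignoring saturation, roughly
//     t = (a*c0 +/- b*c1 + 2048) >> 12
// i.e. a widening multiply-accumulate into 32 bits followed by a rounding,
// narrowing shift back to 16 bits, with the coefficients c0/c1 taken from
// the idct_coeffs/iadst*_coeffs tables below. The final add/store stage
// (the load_add_store* macros) then does, per pixel, approximately
//     dst[x] = clip_u8(dst[x] + ((res + 8) >> 4))
// where the rounding downshift (srshr, usually by 4) matches the scaling
// applied by the row/column transforms.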
61 62const idct_coeffs, align=4 63 // idct4 64 .short 2896, 2896*8, 1567, 3784 65 // idct8 66 .short 799, 4017, 3406, 2276 67 // idct16 68 .short 401, 4076, 3166, 2598 69 .short 1931, 3612, 3920, 1189 70 // idct32 71 .short 201, 4091, 3035, 2751 72 .short 1751, 3703, 3857, 1380 73 .short 995, 3973, 3513, 2106 74 .short 2440, 3290, 4052, 601 75endconst 76 77const idct64_coeffs, align=4 78 .short 101*8, 4095*8, 2967*8, -2824*8 79 .short 1660*8, 3745*8, 3822*8, -1474*8 80 .short 4076, 401, 4017, 799 81 .short 0, 0, 0, 0 82 83 .short 4036*8, -700*8, 2359*8, 3349*8 84 .short 3461*8, -2191*8, 897*8, 3996*8 85 .short -3166, -2598, -799, -4017 86 .short 0, 0, 0, 0 87 88 .short 501*8, 4065*8, 3229*8, -2520*8 89 .short 2019*8, 3564*8, 3948*8, -1092*8 90 .short 3612, 1931, 2276, 3406 91 .short 0, 0, 0, 0 92 93 .short 4085*8, -301*8, 2675*8, 3102*8 94 .short 3659*8, -1842*8, 1285*8, 3889*8 95 .short -3920, -1189, -3406, -2276 96 .short 0, 0, 0, 0 97endconst 98 99const iadst4_coeffs, align=4 100 // .h[4-5] can be interpreted as .s[2] 101 .short 1321, 3803, 2482, 3344, 3344, 0 102endconst 103 104const iadst8_coeffs, align=4 105 .short 4076, 401, 3612, 1931 106 .short 2598, 3166, 1189, 3920 107 // idct_coeffs 108 .short 2896, 0, 1567, 3784, 0, 0, 0, 0 109endconst 110 111const iadst16_coeffs, align=4 112 .short 4091, 201, 3973, 995 113 .short 3703, 1751, 3290, 2440 114 .short 2751, 3035, 2106, 3513 115 .short 1380, 3857, 601, 4052 116endconst 117 118.macro smull_smlal d0, d1, s0, s1, c0, c1, sz 119 smull \d0\().4s, \s0\().4h, \c0 120 smlal \d0\().4s, \s1\().4h, \c1 121.ifc \sz, .8h 122 smull2 \d1\().4s, \s0\().8h, \c0 123 smlal2 \d1\().4s, \s1\().8h, \c1 124.endif 125.endm 126 127.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz 128 smull \d0\().4s, \s0\().4h, \c0 129 smlsl \d0\().4s, \s1\().4h, \c1 130.ifc \sz, .8h 131 smull2 \d1\().4s, \s0\().8h, \c0 132 smlsl2 \d1\().4s, \s1\().8h, \c1 133.endif 134.endm 135 136.macro sqrshrn_sz d0, s0, s1, shift, sz 137 sqrshrn \d0\().4h, \s0\().4s, \shift 138.ifc \sz, .8h 139 sqrshrn2 \d0\().8h, \s1\().4s, \shift 140.endif 141.endm 142 143.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 144 sqrdmulh \r0\sz, \r0\sz, \c 145 sqrdmulh \r1\sz, \r1\sz, \c 146 sqrdmulh \r2\sz, \r2\sz, \c 147 sqrdmulh \r3\sz, \r3\sz, \c 148.ifnb \r4 149 sqrdmulh \r4\sz, \r4\sz, \c 150 sqrdmulh \r5\sz, \r5\sz, \c 151 sqrdmulh \r6\sz, \r6\sz, \c 152 sqrdmulh \r7\sz, \r7\sz, \c 153.endif 154.endm 155 156.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 157.ifnb \load 158 ld1 {\load}, [\src], x1 159.endif 160.ifnb \shift 161 srshr \shift, \shift, #\shiftbits 162.endif 163.ifnb \addsrc 164 uaddw \adddst, \adddst, \addsrc 165.endif 166.ifnb \narrowsrc 167 sqxtun \narrowdst, \narrowsrc 168.endif 169.ifnb \store 170 st1 {\store}, [\dst], x1 171.endif 172.endm 173.macro load_add_store_8x16 dst, src 174 mov \src, \dst 175 load_add_store v2.8b, v16.8h, , , , , , \dst, \src 176 load_add_store v3.8b, v17.8h, , , , , , \dst, \src 177 load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src 178 load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src 179 load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src 180 load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src 181 load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src 182 load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src 183 load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, 
v7.8b, v6.8b, \dst, \src 184 load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src 185 load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src 186 load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src 187 load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src 188 load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src 189 load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src 190 load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src 191 load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src 192 load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src 193 load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src 194 load_add_store , , , , , , v5.8b, \dst, \src 195.endm 196.macro load_add_store_8x8 dst, src, shiftbits=4 197 mov \src, \dst 198 load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits 199 load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits 200 load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits 201 load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits 202 load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits 203 load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits 204 load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits 205 load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits 206 load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits 207 load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits 208 load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits 209 load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits 210.endm 211.macro load_add_store_8x4 dst, src 212 mov \src, \dst 213 load_add_store v2.8b, v16.8h, , , , , , \dst, \src 214 load_add_store v3.8b, v17.8h, , , , , , \dst, \src 215 load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src 216 load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src 217 load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src 218 load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src 219 load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src 220 load_add_store , , , , , , v5.8b, \dst, \src 221.endm 222.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src 223.ifnb \load 224 ld1 {\load}[0], [\src], x1 225.endif 226.ifnb \inssrc 227 ins \insdst\().d[1], \inssrc\().d[0] 228.endif 229.ifnb \shift 230 srshr \shift, \shift, #4 231.endif 232.ifnb \load 233 ld1 {\load}[1], [\src], x1 234.endif 235.ifnb \addsrc 236 uaddw \adddst, \adddst, \addsrc 237.endif 238.ifnb \store 239 st1 {\store}[0], [\dst], x1 240.endif 241.ifnb \narrowsrc 242 sqxtun \narrowdst, \narrowsrc 243.endif 244.ifnb \store 245 st1 {\store}[1], [\dst], x1 246.endif 247.endm 248.macro load_add_store_4x16 dst, src 249 mov \src, \dst 250 load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src 251 load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src 252 load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src 253 load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src 254 load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src 255 load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, 
v1.8b, v0.s, \dst, \src 256 load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src 257 load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src 258 load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src 259 load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src 260 load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src 261 load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src 262 load_add_store4 , , , , , , , , v7.s, \dst, \src 263.endm 264.macro load_add_store_4x8 dst, src 265 mov \src, \dst 266 load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src 267 load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src 268 load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src 269 load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src 270 load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src 271 load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src 272 load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src 273 load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src 274 load_add_store4 , , , , , , , , v3.s, \dst, \src 275.endm 276 277.macro idct_dc w, h, shift 278 cbnz w3, 1f 279 mov w16, #2896*8 280 ld1r {v16.8h}, [x2] 281 dup v0.4h, w16 282 sqrdmulh v16.8h, v16.8h, v0.h[0] 283 strh wzr, [x2] 284.if (\w == 2*\h) || (2*\w == \h) 285 sqrdmulh v16.8h, v16.8h, v0.h[0] 286.endif 287.if \shift > 0 288 srshr v16.8h, v16.8h, #\shift 289.endif 290 sqrdmulh v16.8h, v16.8h, v0.h[0] 291 srshr v16.8h, v16.8h, #4 292 mov w4, #\h 293 b idct_dc_w\w\()_neon 2941: 295.endm 296 297function idct_dc_w4_neon 2981: 299 ld1 {v0.s}[0], [x0], x1 300 ld1 {v0.s}[1], [x0], x1 301 ld1 {v1.s}[0], [x0], x1 302 ld1 {v1.s}[1], [x0], x1 303 subs w4, w4, #4 304 sub x0, x0, x1, lsl #2 305 uaddw v0.8h, v16.8h, v0.8b 306 sqxtun v0.8b, v0.8h 307 uaddw v1.8h, v16.8h, v1.8b 308 st1 {v0.s}[0], [x0], x1 309 sqxtun v1.8b, v1.8h 310 st1 {v0.s}[1], [x0], x1 311 st1 {v1.s}[0], [x0], x1 312 st1 {v1.s}[1], [x0], x1 313 b.gt 1b 314 ret 315endfunc 316 317function idct_dc_w8_neon 3181: 319 ld1 {v0.8b}, [x0], x1 320 ld1 {v1.8b}, [x0], x1 321 ld1 {v2.8b}, [x0], x1 322 uaddw v20.8h, v16.8h, v0.8b 323 ld1 {v3.8b}, [x0], x1 324 sub x0, x0, x1, lsl #2 325 subs w4, w4, #4 326 uaddw v21.8h, v16.8h, v1.8b 327 sqxtun v0.8b, v20.8h 328 uaddw v22.8h, v16.8h, v2.8b 329 sqxtun v1.8b, v21.8h 330 uaddw v23.8h, v16.8h, v3.8b 331 st1 {v0.8b}, [x0], x1 332 sqxtun v2.8b, v22.8h 333 st1 {v1.8b}, [x0], x1 334 sqxtun v3.8b, v23.8h 335 st1 {v2.8b}, [x0], x1 336 st1 {v3.8b}, [x0], x1 337 b.gt 1b 338 ret 339endfunc 340 341function idct_dc_w16_neon 3421: 343 ld1 {v0.16b}, [x0], x1 344 ld1 {v1.16b}, [x0], x1 345 ld1 {v2.16b}, [x0], x1 346 subs w4, w4, #4 347 uaddw v20.8h, v16.8h, v0.8b 348 uaddw2 v21.8h, v16.8h, v0.16b 349 ld1 {v3.16b}, [x0], x1 350 uaddw v22.8h, v16.8h, v1.8b 351 uaddw2 v23.8h, v16.8h, v1.16b 352 sub x0, x0, x1, lsl #2 353 uaddw v24.8h, v16.8h, v2.8b 354 uaddw2 v25.8h, v16.8h, v2.16b 355 sqxtun v0.8b, v20.8h 356 sqxtun2 v0.16b, v21.8h 357 uaddw v26.8h, v16.8h, v3.8b 358 uaddw2 v27.8h, v16.8h, v3.16b 359 sqxtun v1.8b, v22.8h 360 sqxtun2 v1.16b, v23.8h 361 sqxtun v2.8b, v24.8h 362 sqxtun2 v2.16b, v25.8h 363 st1 {v0.16b}, [x0], x1 364 sqxtun v3.8b, v26.8h 365 sqxtun2 v3.16b, v27.8h 366 st1 {v1.16b}, [x0], x1 367 st1 {v2.16b}, [x0], x1 368 st1 {v3.16b}, [x0], x1 369 b.gt 1b 370 ret 371endfunc 372 373function idct_dc_w32_neon 3741: 375 ld1 {v0.16b, v1.16b}, 
[x0], x1 376 subs w4, w4, #2 377 uaddw v20.8h, v16.8h, v0.8b 378 uaddw2 v21.8h, v16.8h, v0.16b 379 ld1 {v2.16b, v3.16b}, [x0] 380 uaddw v22.8h, v16.8h, v1.8b 381 uaddw2 v23.8h, v16.8h, v1.16b 382 sub x0, x0, x1 383 uaddw v24.8h, v16.8h, v2.8b 384 uaddw2 v25.8h, v16.8h, v2.16b 385 sqxtun v0.8b, v20.8h 386 sqxtun2 v0.16b, v21.8h 387 uaddw v26.8h, v16.8h, v3.8b 388 uaddw2 v27.8h, v16.8h, v3.16b 389 sqxtun v1.8b, v22.8h 390 sqxtun2 v1.16b, v23.8h 391 sqxtun v2.8b, v24.8h 392 sqxtun2 v2.16b, v25.8h 393 st1 {v0.16b, v1.16b}, [x0], x1 394 sqxtun v3.8b, v26.8h 395 sqxtun2 v3.16b, v27.8h 396 st1 {v2.16b, v3.16b}, [x0], x1 397 b.gt 1b 398 ret 399endfunc 400 401function idct_dc_w64_neon 4021: 403 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] 404 subs w4, w4, #1 405 uaddw v20.8h, v16.8h, v0.8b 406 uaddw2 v21.8h, v16.8h, v0.16b 407 uaddw v22.8h, v16.8h, v1.8b 408 uaddw2 v23.8h, v16.8h, v1.16b 409 uaddw v24.8h, v16.8h, v2.8b 410 uaddw2 v25.8h, v16.8h, v2.16b 411 sqxtun v0.8b, v20.8h 412 sqxtun2 v0.16b, v21.8h 413 uaddw v26.8h, v16.8h, v3.8b 414 uaddw2 v27.8h, v16.8h, v3.16b 415 sqxtun v1.8b, v22.8h 416 sqxtun2 v1.16b, v23.8h 417 sqxtun v2.8b, v24.8h 418 sqxtun2 v2.16b, v25.8h 419 sqxtun v3.8b, v26.8h 420 sqxtun2 v3.16b, v27.8h 421 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 422 b.gt 1b 423 ret 424endfunc 425 426.macro iwht4 427 add v16.4h, v16.4h, v17.4h 428 sub v21.4h, v18.4h, v19.4h 429 sub v20.4h, v16.4h, v21.4h 430 sshr v20.4h, v20.4h, #1 431 sub v18.4h, v20.4h, v17.4h 432 sub v17.4h, v20.4h, v19.4h 433 add v19.4h, v21.4h, v18.4h 434 sub v16.4h, v16.4h, v17.4h 435.endm 436 437.macro idct_4 r0, r1, r2, r3, sz 438 smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz 439 smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz 440 smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz 441 sqrshrn_sz v6, v6, v7, #12, \sz 442 sqrshrn_sz v7, v4, v5, #12, \sz 443 smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz 444 sqrshrn_sz v2, v2, v3, #12, \sz 445 sqrshrn_sz v3, v4, v5, #12, \sz 446 sqadd \r0\sz, v2\sz, v6\sz 447 sqsub \r3\sz, v2\sz, v6\sz 448 sqadd \r1\sz, v3\sz, v7\sz 449 sqsub \r2\sz, v3\sz, v7\sz 450.endm 451 452function inv_dct_4h_x4_neon, export=1 453 movrel x16, idct_coeffs 454 ld1 {v0.4h}, [x16] 455 idct_4 v16, v17, v18, v19, .4h 456 ret 457endfunc 458 459function inv_dct_8h_x4_neon, export=1 460 movrel x16, idct_coeffs 461 ld1 {v0.4h}, [x16] 462 idct_4 v16, v17, v18, v19, .8h 463 ret 464endfunc 465 466.macro iadst_4x4 o0, o1, o2, o3 467 movrel x16, iadst4_coeffs 468 ld1 {v0.8h}, [x16] 469 470 ssubl v3.4s, v16.4h, v18.4h 471 smull v4.4s, v16.4h, v0.h[0] 472 smlal v4.4s, v18.4h, v0.h[1] 473 smlal v4.4s, v19.4h, v0.h[2] 474 smull v7.4s, v17.4h, v0.h[3] 475 saddw v3.4s, v3.4s, v19.4h 476 smull v5.4s, v16.4h, v0.h[2] 477 smlsl v5.4s, v18.4h, v0.h[0] 478 smlsl v5.4s, v19.4h, v0.h[1] 479 480 add \o3\().4s, v4.4s, v5.4s 481 mul \o2\().4s, v3.4s, v0.s[2] 482 add \o0\().4s, v4.4s, v7.4s 483 add \o1\().4s, v5.4s, v7.4s 484 sub \o3\().4s, \o3\().4s, v7.4s 485 486 sqrshrn \o0\().4h, \o0\().4s, #12 487 sqrshrn \o2\().4h, \o2\().4s, #12 488 sqrshrn \o1\().4h, \o1\().4s, #12 489 sqrshrn \o3\().4h, \o3\().4s, #12 490.endm 491 492function inv_adst_4h_x4_neon, export=1 493 iadst_4x4 v16, v17, v18, v19 494 ret 495endfunc 496 497function inv_flipadst_4h_x4_neon, export=1 498 iadst_4x4 v19, v18, v17, v16 499 ret 500endfunc 501 502.macro iadst_8x4 o0, o1, o2, o3 503 movrel x16, iadst4_coeffs 504 ld1 {v0.8h}, [x16] 505 506 ssubl v2.4s, v16.4h, v18.4h 507 ssubl2 v3.4s, v16.8h, v18.8h 508 smull v4.4s, v16.4h, v0.h[0] 509 
smlal v4.4s, v18.4h, v0.h[1] 510 smlal v4.4s, v19.4h, v0.h[2] 511 smull2 v5.4s, v16.8h, v0.h[0] 512 smlal2 v5.4s, v18.8h, v0.h[1] 513 smlal2 v5.4s, v19.8h, v0.h[2] 514 saddw v2.4s, v2.4s, v19.4h 515 saddw2 v3.4s, v3.4s, v19.8h 516 smull v6.4s, v16.4h, v0.h[2] 517 smlsl v6.4s, v18.4h, v0.h[0] 518 smlsl v6.4s, v19.4h, v0.h[1] 519 smull2 v7.4s, v16.8h, v0.h[2] 520 smlsl2 v7.4s, v18.8h, v0.h[0] 521 smlsl2 v7.4s, v19.8h, v0.h[1] 522 523 mul v18.4s, v2.4s, v0.s[2] 524 mul v19.4s, v3.4s, v0.s[2] 525 526 smull v2.4s, v17.4h, v0.h[3] 527 smull2 v3.4s, v17.8h, v0.h[3] 528 529 add v16.4s, v4.4s, v2.4s // out0 530 add v17.4s, v5.4s, v3.4s 531 532 add v4.4s, v4.4s, v6.4s // out3 533 add v5.4s, v5.4s, v7.4s 534 535 add v6.4s, v6.4s, v2.4s // out1 536 add v7.4s, v7.4s, v3.4s 537 538 sub v4.4s, v4.4s, v2.4s // out3 539 sub v5.4s, v5.4s, v3.4s 540 541 sqrshrn v18.4h, v18.4s, #12 542 sqrshrn2 v18.8h, v19.4s, #12 543 544 sqrshrn \o0\().4h, v16.4s, #12 545 sqrshrn2 \o0\().8h, v17.4s, #12 546 547.ifc \o2, v17 548 mov v17.16b, v18.16b 549.endif 550 551 sqrshrn \o1\().4h, v6.4s, #12 552 sqrshrn2 \o1\().8h, v7.4s, #12 553 554 sqrshrn \o3\().4h, v4.4s, #12 555 sqrshrn2 \o3\().8h, v5.4s, #12 556.endm 557 558function inv_adst_8h_x4_neon, export=1 559 iadst_8x4 v16, v17, v18, v19 560 ret 561endfunc 562 563function inv_flipadst_8h_x4_neon, export=1 564 iadst_8x4 v19, v18, v17, v16 565 ret 566endfunc 567 568function inv_identity_4h_x4_neon, export=1 569 mov w16, #(5793-4096)*8 570 dup v0.4h, w16 571 sqrdmulh v4.4h, v16.4h, v0.h[0] 572 sqrdmulh v5.4h, v17.4h, v0.h[0] 573 sqrdmulh v6.4h, v18.4h, v0.h[0] 574 sqrdmulh v7.4h, v19.4h, v0.h[0] 575 sqadd v16.4h, v16.4h, v4.4h 576 sqadd v17.4h, v17.4h, v5.4h 577 sqadd v18.4h, v18.4h, v6.4h 578 sqadd v19.4h, v19.4h, v7.4h 579 ret 580endfunc 581 582function inv_identity_8h_x4_neon, export=1 583 mov w16, #(5793-4096)*8 584 dup v0.4h, w16 585 sqrdmulh v4.8h, v16.8h, v0.h[0] 586 sqrdmulh v5.8h, v17.8h, v0.h[0] 587 sqrdmulh v6.8h, v18.8h, v0.h[0] 588 sqrdmulh v7.8h, v19.8h, v0.h[0] 589 sqadd v16.8h, v16.8h, v4.8h 590 sqadd v17.8h, v17.8h, v5.8h 591 sqadd v18.8h, v18.8h, v6.8h 592 sqadd v19.8h, v19.8h, v7.8h 593 ret 594endfunc 595 596.macro identity_8x4_shift1 r0, r1, r2, r3, c 597.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h 598 sqrdmulh v2.8h, \i, \c 599 srhadd \i, \i, v2.8h 600.endr 601.endm 602 603function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 604 mov x15, x30 605 movi v31.8h, #0 606 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] 607 st1 {v31.8h}, [x2], #16 608 609 sshr v16.4h, v16.4h, #2 610 sshr v17.4h, v17.4h, #2 611 sshr v18.4h, v18.4h, #2 612 sshr v19.4h, v19.4h, #2 613 614 iwht4 615 616 st1 {v31.8h}, [x2], #16 617 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 618 619 iwht4 620 621 ld1 {v0.s}[0], [x0], x1 622 ld1 {v0.s}[1], [x0], x1 623 ins v16.d[1], v17.d[0] 624 ins v18.d[1], v19.d[0] 625 ld1 {v1.s}[0], [x0], x1 626 ld1 {v1.s}[1], [x0], x1 627 628 b L(itx_4x4_end) 629endfunc 630 631function inv_txfm_add_4x4_neon 632 movi v31.8h, #0 633 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] 634 st1 {v31.8h}, [x2], #16 635 636 blr x4 637 638 st1 {v31.8h}, [x2], #16 639 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 640 641 blr x5 642 643 ld1 {v0.s}[0], [x0], x1 644 ld1 {v0.s}[1], [x0], x1 645 ins v16.d[1], v17.d[0] 646 ins v18.d[1], v19.d[0] 647 ld1 {v1.s}[0], [x0], x1 648 ld1 {v1.s}[1], [x0], x1 649 srshr v16.8h, v16.8h, #4 650 srshr v18.8h, v18.8h, #4 651 652L(itx_4x4_end): 653 sub x0, x0, x1, lsl #2 654 uaddw v16.8h, v16.8h, v0.8b 655 sqxtun v0.8b, v16.8h 656 uaddw 
v18.8h, v18.8h, v1.8b 657 st1 {v0.s}[0], [x0], x1 658 sqxtun v1.8b, v18.8h 659 st1 {v0.s}[1], [x0], x1 660 st1 {v1.s}[0], [x0], x1 661 st1 {v1.s}[1], [x0], x1 662 663 ret x15 664endfunc 665 666.macro def_fn_4x4 txfm1, txfm2 667function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 668 mov x15, x30 669 670.ifc \txfm1\()_\txfm2, dct_dct 671 cbnz w3, 1f 672 mov w16, #2896*8 673 ld1r {v16.8h}, [x2] 674 dup v4.8h, w16 675 strh wzr, [x2] 676 sqrdmulh v16.8h, v16.8h, v4.h[0] 677 ld1 {v0.s}[0], [x0], x1 678 sqrdmulh v20.8h, v16.8h, v4.h[0] 679 ld1 {v0.s}[1], [x0], x1 680 srshr v16.8h, v20.8h, #4 681 ld1 {v1.s}[0], [x0], x1 682 srshr v18.8h, v20.8h, #4 683 ld1 {v1.s}[1], [x0], x1 684 b L(itx_4x4_end) 6851: 686.endif 687 adr x4, inv_\txfm1\()_4h_x4_neon 688 adr x5, inv_\txfm2\()_4h_x4_neon 689 b inv_txfm_add_4x4_neon 690endfunc 691.endm 692 693def_fn_4x4 dct, dct 694def_fn_4x4 identity, identity 695def_fn_4x4 dct, adst 696def_fn_4x4 dct, flipadst 697def_fn_4x4 dct, identity 698def_fn_4x4 adst, dct 699def_fn_4x4 adst, adst 700def_fn_4x4 adst, flipadst 701def_fn_4x4 flipadst, dct 702def_fn_4x4 flipadst, adst 703def_fn_4x4 flipadst, flipadst 704def_fn_4x4 identity, dct 705 706def_fn_4x4 adst, identity 707def_fn_4x4 flipadst, identity 708def_fn_4x4 identity, adst 709def_fn_4x4 identity, flipadst 710 711.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb 712 idct_4 \r0, \r2, \r4, \r6, \sz 713 714 smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a 715 smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a 716 smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a 717 sqrshrn_sz \r1, v2, v3, #12, \sz // t4a 718 sqrshrn_sz \r7, v4, v5, #12, \sz // t7a 719 smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a 720 sqrshrn_sz \r3, v6, v7, #12, \sz // t5a 721 sqrshrn_sz \r5, v2, v3, #12, \sz // t6a 722 723 sqadd v2\sz, \r1\sz, \r3\sz // t4 724 sqsub \r1\sz, \r1\sz, \r3\sz // t5a 725 sqadd v3\sz, \r7\sz, \r5\sz // t7 726 sqsub \r3\sz, \r7\sz, \r5\sz // t6a 727 728 smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 729 smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 730 sqrshrn_sz v4, v4, v5, #12, \sz // t5 731 sqrshrn_sz v5, v6, v7, #12, \sz // t6 732 733 sqsub \r7\sz, \r0\sz, v3\sz // out7 734 sqadd \r0\sz, \r0\sz, v3\sz // out0 735 sqadd \r1\sz, \r2\sz, v5\sz // out1 736 sqsub v6\sz, \r2\sz, v5\sz // out6 737 sqadd \r2\sz, \r4\sz, v4\sz // out2 738 sqsub \r5\sz, \r4\sz, v4\sz // out5 739 sqadd \r3\sz, \r6\sz, v2\sz // out3 740 sqsub \r4\sz, \r6\sz, v2\sz // out4 741 mov \r6\szb, v6\szb // out6 742.endm 743 744function inv_dct_8h_x8_neon, export=1 745 movrel x16, idct_coeffs 746 ld1 {v0.8h}, [x16] 747 idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b 748 ret 749endfunc 750 751function inv_dct_4h_x8_neon, export=1 752 movrel x16, idct_coeffs 753 ld1 {v0.8h}, [x16] 754 idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b 755 ret 756endfunc 757 758.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz 759 movrel x16, iadst8_coeffs 760 ld1 {v0.8h, v1.8h}, [x16] 761 762 smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz 763 smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz 764 smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz 765 sqrshrn_sz v16, v2, v3, #12, \sz // t0a 766 sqrshrn_sz v23, v4, v5, #12, \sz // t1a 767 smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz 768 smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz 769 sqrshrn_sz v18, v6, v7, #12, \sz // t2a 770 sqrshrn_sz v21, v2, v3, #12, \sz // t3a 771 smull_smlsl v6, v7, v19, 
v20, v0.h[5], v0.h[4], \sz 772 smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz 773 sqrshrn_sz v20, v4, v5, #12, \sz // t4a 774 sqrshrn_sz v19, v6, v7, #12, \sz // t5a 775 smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz 776 sqrshrn_sz v22, v2, v3, #12, \sz // t6a 777 sqrshrn_sz v17, v4, v5, #12, \sz // t7a 778 779 sqadd v2\sz, v16\sz, v20\sz // t0 780 sqsub v3\sz, v16\sz, v20\sz // t4 781 sqadd v4\sz, v23\sz, v19\sz // t1 782 sqsub v5\sz, v23\sz, v19\sz // t5 783 sqadd v6\sz, v18\sz, v22\sz // t2 784 sqsub v7\sz, v18\sz, v22\sz // t6 785 sqadd v18\sz, v21\sz, v17\sz // t3 786 sqsub v19\sz, v21\sz, v17\sz // t7 787 788 smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz 789 smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz 790 smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz 791 792 sqrshrn_sz v3, v16, v17, #12, \sz // t4a 793 sqrshrn_sz v5, v20, v21, #12, \sz // t5a 794 795 smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz 796 797 sqrshrn_sz v7, v22, v23, #12, \sz // t6a 798 sqrshrn_sz v19, v16, v17, #12, \sz // t7a 799 800 sqadd \o0\()\sz, v2\sz, v6\sz // out0 801 sqsub v2\sz, v2\sz, v6\sz // t2 802 sqadd \o7\()\sz, v4\sz, v18\sz // out7 803 sqsub v4\sz, v4\sz, v18\sz // t3 804 sqneg \o7\()\sz, \o7\()\sz // out7 805 806 sqadd \o1\()\sz, v3\sz, v7\sz // out1 807 sqsub v3\sz, v3\sz, v7\sz // t6 808 sqadd \o6\()\sz, v5\sz, v19\sz // out6 809 sqsub v5\sz, v5\sz, v19\sz // t7 810 sqneg \o1\()\sz, \o1\()\sz // out1 811 812 smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) 813 smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) 814 smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) 815 sqrshrn_sz v2, v18, v19, #12, \sz // out3 816 smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) 817 sqrshrn_sz v3, v20, v21, #12, \sz // out5 818 sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) 819 sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) 820 821 sqneg \o3\()\sz, v2\sz // out3 822 sqneg \o5\()\sz, v3\sz // out5 823.endm 824 825function inv_adst_8h_x8_neon, export=1 826 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h 827 ret 828endfunc 829 830function inv_flipadst_8h_x8_neon, export=1 831 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h 832 ret 833endfunc 834 835function inv_adst_4h_x8_neon, export=1 836 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h 837 ret 838endfunc 839 840function inv_flipadst_4h_x8_neon, export=1 841 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h 842 ret 843endfunc 844 845function inv_identity_8h_x8_neon, export=1 846 sqshl v16.8h, v16.8h, #1 847 sqshl v17.8h, v17.8h, #1 848 sqshl v18.8h, v18.8h, #1 849 sqshl v19.8h, v19.8h, #1 850 sqshl v20.8h, v20.8h, #1 851 sqshl v21.8h, v21.8h, #1 852 sqshl v22.8h, v22.8h, #1 853 sqshl v23.8h, v23.8h, #1 854 ret 855endfunc 856 857function inv_identity_4h_x8_neon, export=1 858 sqshl v16.4h, v16.4h, #1 859 sqshl v17.4h, v17.4h, #1 860 sqshl v18.4h, v18.4h, #1 861 sqshl v19.4h, v19.4h, #1 862 sqshl v20.4h, v20.4h, #1 863 sqshl v21.4h, v21.4h, #1 864 sqshl v22.4h, v22.4h, #1 865 sqshl v23.4h, v23.4h, #1 866 ret 867endfunc 868 869.macro def_fn_8x8_base variant 870function inv_txfm_\variant\()add_8x8_neon 871 movi v28.8h, #0 872 movi v29.8h, #0 873 movi v30.8h, #0 874 movi v31.8h, #0 875 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] 876 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64 877 ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2] 878 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] 879 880.ifc \variant, identity_ 881 // The identity 
shl #1 and downshift srshr #1 cancel out 882 883 b L(itx_8x8_epilog) 884.else 885 blr x4 886 887 srshr v16.8h, v16.8h, #1 888 srshr v17.8h, v17.8h, #1 889 srshr v18.8h, v18.8h, #1 890 srshr v19.8h, v19.8h, #1 891 srshr v20.8h, v20.8h, #1 892 srshr v21.8h, v21.8h, #1 893 srshr v22.8h, v22.8h, #1 894 srshr v23.8h, v23.8h, #1 895 896L(itx_8x8_epilog): 897 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 898 899 blr x5 900 901 load_add_store_8x8 x0, x7 902 ret x15 903.endif 904endfunc 905.endm 906 907def_fn_8x8_base identity_ 908def_fn_8x8_base 909 910.macro def_fn_8x8 txfm1, txfm2 911function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 912 mov x15, x30 913 914.ifc \txfm1\()_\txfm2, dct_dct 915 idct_dc 8, 8, 1 916.endif 917 adr x5, inv_\txfm2\()_8h_x8_neon 918.ifc \txfm1, identity 919 b inv_txfm_identity_add_8x8_neon 920.else 921 adr x4, inv_\txfm1\()_8h_x8_neon 922 b inv_txfm_add_8x8_neon 923.endif 924endfunc 925.endm 926 927def_fn_8x8 dct, dct 928def_fn_8x8 identity, identity 929def_fn_8x8 dct, adst 930def_fn_8x8 dct, flipadst 931def_fn_8x8 dct, identity 932def_fn_8x8 adst, dct 933def_fn_8x8 adst, adst 934def_fn_8x8 adst, flipadst 935def_fn_8x8 flipadst, dct 936def_fn_8x8 flipadst, adst 937def_fn_8x8 flipadst, flipadst 938def_fn_8x8 identity, dct 939def_fn_8x8 adst, identity 940def_fn_8x8 flipadst, identity 941def_fn_8x8 identity, adst 942def_fn_8x8 identity, flipadst 943 944function inv_txfm_add_8x4_neon 945 movi v30.8h, #0 946 movi v31.8h, #0 947 mov w16, #2896*8 948 dup v0.4h, w16 949 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] 950 st1 {v30.8h,v31.8h}, [x2], #32 951 ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2] 952 st1 {v30.8h,v31.8h}, [x2] 953 954 scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 955 956 blr x4 957 958 transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 959 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 960 ins v16.d[1], v20.d[0] 961 ins v17.d[1], v21.d[0] 962 ins v18.d[1], v22.d[0] 963 ins v19.d[1], v23.d[0] 964 965 blr x5 966 967 load_add_store_8x4 x0, x7 968 ret x15 969endfunc 970 971function inv_txfm_add_4x8_neon 972 movi v28.8h, #0 973 movi v29.8h, #0 974 movi v30.8h, #0 975 movi v31.8h, #0 976 mov w16, #2896*8 977 dup v0.4h, w16 978 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] 979 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] 980 981 scale_input .8h, v0.h[0], v16, v17, v18, v19 982 983 blr x4 984 985 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 986 ins v20.d[0], v16.d[1] 987 ins v21.d[0], v17.d[1] 988 ins v22.d[0], v18.d[1] 989 ins v23.d[0], v19.d[1] 990 991 blr x5 992 993 load_add_store_4x8 x0, x7 994 ret x15 995endfunc 996 997.macro def_fn_48 w, h, txfm1, txfm2 998function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 999 mov x15, x30 1000 1001.ifc \txfm1\()_\txfm2, dct_dct 1002 idct_dc \w, \h, 0 1003.endif 1004 adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon 1005 adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon 1006 b inv_txfm_add_\w\()x\h\()_neon 1007endfunc 1008.endm 1009 1010.macro def_fns_48 w, h 1011def_fn_48 \w, \h, dct, dct 1012def_fn_48 \w, \h, identity, identity 1013def_fn_48 \w, \h, dct, adst 1014def_fn_48 \w, \h, dct, flipadst 1015def_fn_48 \w, \h, dct, identity 1016def_fn_48 \w, \h, adst, dct 1017def_fn_48 \w, \h, adst, adst 1018def_fn_48 \w, \h, adst, flipadst 1019def_fn_48 \w, \h, flipadst, dct 1020def_fn_48 \w, \h, flipadst, adst 1021def_fn_48 \w, \h, flipadst, flipadst 1022def_fn_48 \w, \h, identity, dct 1023def_fn_48 \w, \h, adst, identity 1024def_fn_48 \w, \h, flipadst, identity 1025def_fn_48 \w, \h, identity, 
adst 1026def_fn_48 \w, \h, identity, flipadst 1027.endm 1028 1029def_fns_48 4, 8 1030def_fns_48 8, 4 1031 1032 1033.macro idct_16 sz, szb 1034 idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb 1035 1036 smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a 1037 smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a 1038 smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a 1039 sqrshrn_sz v17, v2, v3, #12, \sz // t8a 1040 sqrshrn_sz v31, v4, v5, #12, \sz // t15a 1041 smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a 1042 smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a 1043 sqrshrn_sz v23, v6, v7, #12, \sz // t9a 1044 sqrshrn_sz v25, v2, v3, #12, \sz // t14a 1045 smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a 1046 smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a 1047 sqrshrn_sz v21, v4, v5, #12, \sz // t10a 1048 sqrshrn_sz v27, v6, v7, #12, \sz // t13a 1049 smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a 1050 sqrshrn_sz v19, v2, v3, #12, \sz // t11a 1051 sqrshrn_sz v29, v4, v5, #12, \sz // t12a 1052 1053 sqsub v2\sz, v17\sz, v23\sz // t9 1054 sqadd v17\sz, v17\sz, v23\sz // t8 1055 sqsub v3\sz, v31\sz, v25\sz // t14 1056 sqadd v31\sz, v31\sz, v25\sz // t15 1057 sqsub v23\sz, v19\sz, v21\sz // t10 1058 sqadd v19\sz, v19\sz, v21\sz // t11 1059 sqadd v25\sz, v29\sz, v27\sz // t12 1060 sqsub v29\sz, v29\sz, v27\sz // t13 1061 1062 smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a 1063 smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a 1064 sqrshrn_sz v21, v4, v5, #12, \sz // t9a 1065 sqrshrn_sz v27, v6, v7, #12, \sz // t14a 1066 1067 smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a 1068 smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a 1069 sqrshrn_sz v29, v4, v5, #12, \sz // t13a 1070 neg v6.4s, v6.4s 1071.ifc \sz, .8h 1072 neg v7.4s, v7.4s 1073.endif 1074 sqrshrn_sz v23, v6, v7, #12, \sz // t10a 1075 1076 sqsub v2\sz, v17\sz, v19\sz // t11a 1077 sqadd v17\sz, v17\sz, v19\sz // t8a 1078 sqsub v3\sz, v31\sz, v25\sz // t12a 1079 sqadd v31\sz, v31\sz, v25\sz // t15a 1080 sqadd v19\sz, v21\sz, v23\sz // t9 1081 sqsub v21\sz, v21\sz, v23\sz // t10 1082 sqsub v25\sz, v27\sz, v29\sz // t13 1083 sqadd v27\sz, v27\sz, v29\sz // t14 1084 1085 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11 1086 smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 1087 smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a 1088 1089 sqrshrn_sz v4, v4, v5, #12, \sz // t11 1090 sqrshrn_sz v5, v6, v7, #12, \sz // t12 1091 smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a 1092 sqrshrn_sz v2, v2, v3, #12, \sz // t10a 1093 sqrshrn_sz v3, v6, v7, #12, \sz // t13a 1094 1095 sqadd v6\sz, v16\sz, v31\sz // out0 1096 sqsub v31\sz, v16\sz, v31\sz // out15 1097 mov v16\szb, v6\szb 1098 sqadd v23\sz, v30\sz, v17\sz // out7 1099 sqsub v7\sz, v30\sz, v17\sz // out8 1100 sqadd v17\sz, v18\sz, v27\sz // out1 1101 sqsub v30\sz, v18\sz, v27\sz // out14 1102 sqadd v18\sz, v20\sz, v3\sz // out2 1103 sqsub v29\sz, v20\sz, v3\sz // out13 1104 sqadd v3\sz, v28\sz, v19\sz // out6 1105 sqsub v25\sz, v28\sz, v19\sz // out9 1106 sqadd v19\sz, v22\sz, v5\sz // out3 1107 sqsub v28\sz, v22\sz, v5\sz // out12 1108 sqadd v20\sz, v24\sz, v4\sz // out4 1109 sqsub v27\sz, v24\sz, v4\sz // out11 1110 sqadd v21\sz, v26\sz, v2\sz // out5 1111 sqsub v26\sz, v26\sz, v2\sz // out10 1112 mov v24\szb, v7\szb 1113 mov v22\szb, v3\szb 1114.endm 1115 1116function 
inv_dct_8h_x16_neon, export=1 1117 movrel x16, idct_coeffs 1118 ld1 {v0.8h, v1.8h}, [x16] 1119 idct_16 .8h, .16b 1120 ret 1121endfunc 1122 1123function inv_dct_4h_x16_neon, export=1 1124 movrel x16, idct_coeffs 1125 ld1 {v0.8h, v1.8h}, [x16] 1126 idct_16 .4h, .8b 1127 ret 1128endfunc 1129 1130.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb 1131 movrel x16, iadst16_coeffs 1132 ld1 {v0.8h, v1.8h}, [x16] 1133 movrel x16, idct_coeffs 1134 1135 smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 1136 smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 1137 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 1138 sqrshrn_sz v16, v2, v3, #12, \sz // t0 1139 sqrshrn_sz v31, v4, v5, #12, \sz // t1 1140 smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 1141 smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 1142 sqrshrn_sz v18, v6, v7, #12, \sz // t2 1143 sqrshrn_sz v29, v2, v3, #12, \sz // t3 1144 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 1145 smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 1146 sqrshrn_sz v20, v4, v5, #12, \sz // t4 1147 sqrshrn_sz v27, v6, v7, #12, \sz // t5 1148 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 1149 smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 1150 sqrshrn_sz v22, v2, v3, #12, \sz // t6 1151 sqrshrn_sz v25, v4, v5, #12, \sz // t7 1152 smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 1153 smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 1154 sqrshrn_sz v23, v6, v7, #12, \sz // t8 1155 sqrshrn_sz v24, v2, v3, #12, \sz // t9 1156 smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 1157 smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 1158 sqrshrn_sz v21, v4, v5, #12, \sz // t10 1159 sqrshrn_sz v26, v6, v7, #12, \sz // t11 1160 smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 1161 smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 1162 sqrshrn_sz v19, v2, v3, #12, \sz // t12 1163 sqrshrn_sz v28, v4, v5, #12, \sz // t13 1164 smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 1165 sqrshrn_sz v17, v6, v7, #12, \sz // t14 1166 sqrshrn_sz v30, v2, v3, #12, \sz // t15 1167 1168 ld1 {v0.8h}, [x16] 1169 1170 sqsub v2\sz, v16\sz, v23\sz // t8a 1171 sqadd v16\sz, v16\sz, v23\sz // t0a 1172 sqsub v3\sz, v31\sz, v24\sz // t9a 1173 sqadd v31\sz, v31\sz, v24\sz // t1a 1174 sqadd v23\sz, v18\sz, v21\sz // t2a 1175 sqsub v18\sz, v18\sz, v21\sz // t10a 1176 sqadd v24\sz, v29\sz, v26\sz // t3a 1177 sqsub v29\sz, v29\sz, v26\sz // t11a 1178 sqadd v21\sz, v20\sz, v19\sz // t4a 1179 sqsub v20\sz, v20\sz, v19\sz // t12a 1180 sqadd v26\sz, v27\sz, v28\sz // t5a 1181 sqsub v27\sz, v27\sz, v28\sz // t13a 1182 sqadd v19\sz, v22\sz, v17\sz // t6a 1183 sqsub v22\sz, v22\sz, v17\sz // t14a 1184 sqadd v28\sz, v25\sz, v30\sz // t7a 1185 sqsub v25\sz, v25\sz, v30\sz // t15a 1186 1187 smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 1188 smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 1189 smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 1190 sqrshrn_sz v17, v4, v5, #12, \sz // t8 1191 sqrshrn_sz v30, v6, v7, #12, \sz // t9 1192 smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 1193 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 1194 sqrshrn_sz v18, v2, v3, #12, \sz // t10 1195 sqrshrn_sz v29, v4, v5, #12, \sz // t11 1196 smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 1197 smull_smlsl v4, v5, 
v25, v22, v0.h[7], v0.h[6], \sz // -> t14 1198 sqrshrn_sz v27, v6, v7, #12, \sz // t12 1199 sqrshrn_sz v20, v2, v3, #12, \sz // t13 1200 smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 1201 sqrshrn_sz v25, v4, v5, #12, \sz // t14 1202 sqrshrn_sz v22, v6, v7, #12, \sz // t15 1203 1204 sqsub v2\sz, v16\sz, v21\sz // t4 1205 sqadd v16\sz, v16\sz, v21\sz // t0 1206 sqsub v3\sz, v31\sz, v26\sz // t5 1207 sqadd v31\sz, v31\sz, v26\sz // t1 1208 sqadd v21\sz, v23\sz, v19\sz // t2 1209 sqsub v23\sz, v23\sz, v19\sz // t6 1210 sqadd v26\sz, v24\sz, v28\sz // t3 1211 sqsub v24\sz, v24\sz, v28\sz // t7 1212 sqadd v19\sz, v17\sz, v27\sz // t8a 1213 sqsub v17\sz, v17\sz, v27\sz // t12a 1214 sqadd v28\sz, v30\sz, v20\sz // t9a 1215 sqsub v30\sz, v30\sz, v20\sz // t13a 1216 sqadd v27\sz, v18\sz, v25\sz // t10a 1217 sqsub v18\sz, v18\sz, v25\sz // t14a 1218 sqadd v20\sz, v29\sz, v22\sz // t11a 1219 sqsub v29\sz, v29\sz, v22\sz // t15a 1220 1221 smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a 1222 smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a 1223 smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a 1224 sqrshrn_sz v22, v4, v5, #12, \sz // t4a 1225 sqrshrn_sz v25, v6, v7, #12, \sz // t5a 1226 smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a 1227 smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 1228 sqrshrn_sz v24, v2, v3, #12, \sz // t6a 1229 sqrshrn_sz v23, v4, v5, #12, \sz // t7a 1230 smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 1231 smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 1232 sqrshrn_sz v17, v6, v7, #12, \sz // t12 1233 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 1234 sqrshrn_sz v29, v2, v3, #12, \sz // t13 1235 sqrshrn_sz v30, v4, v5, #12, \sz // t14 1236 sqrshrn_sz v18, v6, v7, #12, \sz // t15 1237 1238 sqsub v2\sz, v16\sz, v21\sz // t2a 1239.ifc \o0, v16 1240 sqadd \o0\sz, v16\sz, v21\sz // out0 1241 sqsub v21\sz, v31\sz, v26\sz // t3a 1242 sqadd \o15\sz, v31\sz, v26\sz // out15 1243.else 1244 sqadd v4\sz, v16\sz, v21\sz // out0 1245 sqsub v21\sz, v31\sz, v26\sz // t3a 1246 sqadd \o15\sz, v31\sz, v26\sz // out15 1247 mov \o0\szb, v4\szb 1248.endif 1249 sqneg \o15\sz, \o15\sz // out15 1250 1251 sqsub v3\sz, v29\sz, v18\sz // t15a 1252 sqadd \o13\sz, v29\sz, v18\sz // out13 1253 sqadd \o2\sz, v17\sz, v30\sz // out2 1254 sqsub v26\sz, v17\sz, v30\sz // t14a 1255 sqneg \o13\sz, \o13\sz // out13 1256 1257 sqadd \o1\sz, v19\sz, v27\sz // out1 1258 sqsub v27\sz, v19\sz, v27\sz // t10 1259 sqadd \o14\sz, v28\sz, v20\sz // out14 1260 sqsub v20\sz, v28\sz, v20\sz // t11 1261 sqneg \o1\sz, \o1\sz // out1 1262 1263 sqadd \o3\sz, v22\sz, v24\sz // out3 1264 sqsub v22\sz, v22\sz, v24\sz // t6 1265 sqadd \o12\sz, v25\sz, v23\sz // out12 1266 sqsub v23\sz, v25\sz, v23\sz // t7 1267 sqneg \o3\sz, \o3\sz // out3 1268 1269 smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23) 1270 smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) 1271 smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26) 1272 1273 sqrshrn_sz v24, v24, v25, #12, \sz // out8 1274 sqrshrn_sz v4, v4, v5, #12, \sz // out7 1275 sqrshrn_sz v5, v6, v7, #12, \sz // out5 1276 smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21) 1277 smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) 1278 sqrshrn_sz v26, v6, v7, #12, \sz // out10 1279 1280 smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) 
1281 smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) 1282 smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22) 1283 1284 sqrshrn_sz \o4, v2, v3, #12, \sz // out4 1285 sqrshrn_sz v6, v6, v7, #12, \sz // out11 1286 sqrshrn_sz v7, v21, v25, #12, \sz // out9 1287 sqrshrn_sz \o6, v22, v23, #12, \sz // out6 1288 1289.ifc \o8, v23 1290 mov \o8\szb, v24\szb 1291 mov \o10\szb, v26\szb 1292.endif 1293 1294 sqneg \o7\sz, v4\sz // out7 1295 sqneg \o5\sz, v5\sz // out5 1296 sqneg \o11\sz, v6\sz // out11 1297 sqneg \o9\sz, v7\sz // out9 1298.endm 1299 1300function inv_adst_8h_x16_neon, export=1 1301 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b 1302 ret 1303endfunc 1304 1305function inv_flipadst_8h_x16_neon, export=1 1306 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b 1307 ret 1308endfunc 1309 1310function inv_adst_4h_x16_neon, export=1 1311 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b 1312 ret 1313endfunc 1314 1315function inv_flipadst_4h_x16_neon, export=1 1316 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b 1317 ret 1318endfunc 1319 1320function inv_identity_8h_x16_neon, export=1 1321 mov w16, #2*(5793-4096)*8 1322 dup v0.4h, w16 1323.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1324 sqrdmulh v2.8h, v\i\().8h, v0.h[0] 1325 sqadd v\i\().8h, v\i\().8h, v\i\().8h 1326 sqadd v\i\().8h, v\i\().8h, v2.8h 1327.endr 1328 ret 1329endfunc 1330 1331function inv_identity_4h_x16_neon, export=1 1332 mov w16, #2*(5793-4096)*8 1333 dup v0.4h, w16 1334.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1335 sqrdmulh v2.4h, v\i\().4h, v0.h[0] 1336 sqadd v\i\().4h, v\i\().4h, v\i\().4h 1337 sqadd v\i\().4h, v\i\().4h, v2.4h 1338.endr 1339 ret 1340endfunc 1341 1342.macro identity_8x16_shift2 c 1343.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1344 sqrdmulh v2.8h, \i, \c 1345 sshr v2.8h, v2.8h, #1 1346 srhadd \i, \i, v2.8h 1347.endr 1348.endm 1349 1350.macro identity_8x16_shift1 c 1351.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1352 sqrdmulh v2.8h, \i, \c 1353 srshr v2.8h, v2.8h, #1 1354 sqadd \i, \i, v2.8h 1355.endr 1356.endm 1357 1358.macro identity_8x8_shift1 c 1359.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 1360 sqrdmulh v2.8h, \i, \c 1361 srshr v2.8h, v2.8h, #1 1362 sqadd \i, \i, v2.8h 1363.endr 1364.endm 1365 1366.macro identity_8x8 c 1367.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 1368 sqrdmulh v2.8h, \i, \c 1369 sqadd \i, \i, \i 1370 sqadd \i, \i, v2.8h 1371.endr 1372.endm 1373 1374.macro def_horz_16 scale=0, identity=0, shift=2, suffix 1375function inv_txfm_horz\suffix\()_16x8_neon 1376 AARCH64_VALID_CALL_TARGET 1377 mov x14, x30 1378 movi v7.8h, #0 1379.if \identity 1380 mov w16, #2*(5793-4096)*8 1381 dup v0.4h, w16 1382.elseif \scale 1383 mov w16, #2896*8 1384 dup v0.4h, w16 1385.endif 1386.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1387 ld1 {\i}, [x7] 1388 st1 {v7.8h}, [x7], x8 1389.endr 1390.if \scale 1391 scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 1392 scale_input 
.8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 1393.endif 1394.if \identity 1395 identity_8x16_shift2 v0.h[0] 1396 b L(horz_16x8_epilog) 1397.else 1398 blr x4 1399.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1400 srshr \i, \i, #\shift 1401.endr 1402.if \shift == 1 1403 b L(horz_16x8_epilog) 1404.else 1405L(horz_16x8_epilog): 1406 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 1407 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 1408 1409.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h 1410 st1 {\i}, [x6], #16 1411.endr 1412 1413 ret x14 1414.endif 1415.endif 1416endfunc 1417.endm 1418 1419def_horz_16 scale=1, identity=0, shift=1, suffix=_scale 1420def_horz_16 scale=0, identity=1, shift=0, suffix=_identity 1421def_horz_16 scale=0, identity=0, shift=2 1422 1423function inv_txfm_add_vert_8x16_neon 1424 mov x14, x30 1425.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1426 ld1 {v\i\().8h}, [x7], x8 1427.endr 1428 blr x5 1429 load_add_store_8x16 x6, x7 1430 ret x14 1431endfunc 1432 1433function inv_txfm_add_16x16_neon 1434 mov x15, x30 1435 sub sp, sp, #512 1436 mov x8, #16*2 1437.irp i, 0, 8 1438 add x6, sp, #(\i*16*2) 1439.if \i == 8 1440 cmp w3, w13 1441 b.lt 1f 1442.endif 1443 add x7, x2, #(\i*2) 1444 blr x9 1445.endr 1446 b 2f 14471: 1448 movi v4.8h, #0 1449 movi v5.8h, #0 1450 movi v6.8h, #0 1451 movi v7.8h, #0 1452.rept 4 1453 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 1454.endr 14552: 1456.irp i, 0, 8 1457 add x6, x0, #(\i) 1458 add x7, sp, #(\i*2) 1459 bl inv_txfm_add_vert_8x16_neon 1460.endr 1461 1462 add sp, sp, #512 1463 ret x15 1464endfunc 1465 1466.macro def_fn_16x16 txfm1, txfm2, eob_half 1467function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 1468.ifc \txfm1\()_\txfm2, dct_dct 1469 idct_dc 16, 16, 2 1470.endif 1471.ifc \txfm1, identity 1472 adr x9, inv_txfm_horz_identity_16x8_neon 1473.else 1474 adr x9, inv_txfm_horz_16x8_neon 1475 adr x4, inv_\txfm1\()_8h_x16_neon 1476.endif 1477 adr x5, inv_\txfm2\()_8h_x16_neon 1478 mov x13, #\eob_half 1479 b inv_txfm_add_16x16_neon 1480endfunc 1481.endm 1482 1483def_fn_16x16 dct, dct, 36 1484def_fn_16x16 identity, identity, 36 1485def_fn_16x16 dct, adst, 36 1486def_fn_16x16 dct, flipadst, 36 1487def_fn_16x16 dct, identity, 8 1488def_fn_16x16 adst, dct, 36 1489def_fn_16x16 adst, adst, 36 1490def_fn_16x16 adst, flipadst, 36 1491def_fn_16x16 flipadst, dct, 36 1492def_fn_16x16 flipadst, adst, 36 1493def_fn_16x16 flipadst, flipadst, 36 1494def_fn_16x16 identity, dct, 8 1495 1496.macro def_fn_416_base variant 1497function inv_txfm_\variant\()add_16x4_neon 1498 mov x15, x30 1499 movi v4.8h, #0 1500 1501.ifc \variant, identity_ 1502.irp i, v16.4h, v17.4h, v18.4h, v19.4h 1503 ld1 {\i}, [x2] 1504 st1 {v4.4h}, [x2], #8 1505.endr 1506.irp i, v16.d, v17.d, v18.d, v19.d 1507 ld1 {\i}[1], [x2] 1508 st1 {v4.4h}, [x2], #8 1509.endr 1510 mov w16, #2*(5793-4096)*8 1511 dup v0.4h, w16 1512.irp i, v20.4h, v21.4h, v22.4h, v23.4h 1513 ld1 {\i}, [x2] 1514 st1 {v4.4h}, [x2], #8 1515.endr 1516.irp i, v20.d, v21.d, v22.d, v23.d 1517 ld1 {\i}[1], [x2] 1518 st1 {v4.4h}, [x2], #8 1519.endr 1520 1521 identity_8x16_shift1 v0.h[0] 1522 1523 b L(itx_16x4_epilog) 1524.else 1525.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h 1526 ld1 {\i}, [x2] 
1527 st1 {v4.4h}, [x2], #8 1528.endr 1529 1530 blr x4 1531 1532 ins v16.d[1], v20.d[0] 1533 ins v17.d[1], v21.d[0] 1534 ins v18.d[1], v22.d[0] 1535 ins v19.d[1], v23.d[0] 1536.irp i, v16.8h, v17.8h, v18.8h, v19.8h 1537 srshr \i, \i, #1 1538.endr 1539 1540 ins v24.d[1], v28.d[0] 1541 ins v25.d[1], v29.d[0] 1542 ins v26.d[1], v30.d[0] 1543 ins v27.d[1], v31.d[0] 1544 srshr v20.8h, v24.8h, #1 1545 srshr v21.8h, v25.8h, #1 1546 srshr v22.8h, v26.8h, #1 1547 srshr v23.8h, v27.8h, #1 1548 1549L(itx_16x4_epilog): 1550 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 1551 blr x5 1552 mov x6, x0 1553 load_add_store_8x4 x6, x7 1554 1555 transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19 1556 blr x5 1557 add x6, x0, #8 1558 load_add_store_8x4 x6, x7 1559 1560 ret x15 1561.endif 1562endfunc 1563 1564function inv_txfm_\variant\()add_4x16_neon 1565 mov x15, x30 1566 movi v2.8h, #0 1567 1568 mov x11, #32 1569 cmp w3, w13 1570 b.lt 1f 1571 1572 add x6, x2, #16 1573.ifc \variant, identity_ 1574.irp i, v24.8h, v25.8h, v26.8h, v27.8h 1575 ld1 {\i}, [x6] 1576 st1 {v2.8h}, [x6], x11 1577.endr 1578 mov w16, #(5793-4096)*8 1579 dup v0.4h, w16 1580 identity_8x4_shift1 v24, v25, v26, v27, v0.h[0] 1581.else 1582.irp i, v16.8h, v17.8h, v18.8h, v19.8h 1583 ld1 {\i}, [x6] 1584 st1 {v2.8h}, [x6], x11 1585.endr 1586 blr x4 1587 srshr v24.8h, v16.8h, #1 1588 srshr v25.8h, v17.8h, #1 1589 srshr v26.8h, v18.8h, #1 1590 srshr v27.8h, v19.8h, #1 1591.endif 1592 transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7 1593 ins v28.d[0], v24.d[1] 1594 ins v29.d[0], v25.d[1] 1595 ins v30.d[0], v26.d[1] 1596 ins v31.d[0], v27.d[1] 1597 1598 b 2f 15991: 1600.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h 1601 movi \i, #0 1602.endr 16032: 1604 movi v2.8h, #0 1605.irp i, v16.8h, v17.8h, v18.8h, v19.8h 1606 ld1 {\i}, [x2] 1607 st1 {v2.8h}, [x2], x11 1608.endr 1609.ifc \variant, identity_ 1610 mov w16, #(5793-4096)*8 1611 dup v0.4h, w16 1612 identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] 1613 1614 b L(itx_4x16_epilog) 1615.else 1616 blr x4 1617.irp i, v16.8h, v17.8h, v18.8h, v19.8h 1618 srshr \i, \i, #1 1619.endr 1620L(itx_4x16_epilog): 1621 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 1622 ins v20.d[0], v16.d[1] 1623 ins v21.d[0], v17.d[1] 1624 ins v22.d[0], v18.d[1] 1625 ins v23.d[0], v19.d[1] 1626 1627 blr x5 1628 1629 load_add_store_4x16 x0, x6 1630 1631 ret x15 1632.endif 1633endfunc 1634.endm 1635 1636def_fn_416_base identity_ 1637def_fn_416_base 1638 1639.macro def_fn_416 w, h, txfm1, txfm2, eob_half 1640function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 1641.ifc \txfm1\()_\txfm2, dct_dct 1642 idct_dc \w, \h, 1 1643.endif 1644.if \w == 4 1645.ifnc \txfm1, identity 1646 adr x4, inv_\txfm1\()_8h_x\w\()_neon 1647.endif 1648 adr x5, inv_\txfm2\()_4h_x\h\()_neon 1649 mov w13, #\eob_half 1650.else 1651.ifnc \txfm1, identity 1652 adr x4, inv_\txfm1\()_4h_x\w\()_neon 1653.endif 1654 adr x5, inv_\txfm2\()_8h_x\h\()_neon 1655.endif 1656.ifc \txfm1, identity 1657 b inv_txfm_identity_add_\w\()x\h\()_neon 1658.else 1659 b inv_txfm_add_\w\()x\h\()_neon 1660.endif 1661endfunc 1662.endm 1663 1664.macro def_fns_416 w, h 1665def_fn_416 \w, \h, dct, dct, 29 1666def_fn_416 \w, \h, identity, identity, 29 1667def_fn_416 \w, \h, dct, adst, 29 1668def_fn_416 \w, \h, dct, flipadst, 29 1669def_fn_416 \w, \h, dct, identity, 8 1670def_fn_416 \w, \h, adst, dct, 29 1671def_fn_416 \w, \h, adst, adst, 29 1672def_fn_416 \w, \h, adst, flipadst, 29 1673def_fn_416 \w, \h, flipadst, dct, 29 
1674def_fn_416 \w, \h, flipadst, adst, 29 1675def_fn_416 \w, \h, flipadst, flipadst, 29 1676def_fn_416 \w, \h, identity, dct, 32 1677def_fn_416 \w, \h, adst, identity, 8 1678def_fn_416 \w, \h, flipadst, identity, 8 1679def_fn_416 \w, \h, identity, adst, 32 1680def_fn_416 \w, \h, identity, flipadst, 32 1681.endm 1682 1683def_fns_416 4, 16 1684def_fns_416 16, 4 1685 1686 1687.macro def_fn_816_base variant 1688function inv_txfm_\variant\()add_16x8_neon 1689 mov x15, x30 1690 movi v4.8h, #0 1691 mov w16, #2896*8 1692 dup v0.4h, w16 1693 1694.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1695 ld1 {\i}, [x2] 1696 st1 {v4.8h}, [x2], #16 1697.endr 1698 1699 scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 1700 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 1701.ifc \variant, identity_ 1702 mov w16, #2*(5793-4096)*8 1703 dup v0.4h, w16 1704 identity_8x16_shift1 v0.h[0] 1705 1706 b L(itx_16x8_epilog) 1707.else 1708 blr x4 1709 1710.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1711 srshr \i, \i, #1 1712.endr 1713 1714L(itx_16x8_epilog): 1715 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 1716 1717 blr x5 1718 1719 mov x6, x0 1720 load_add_store_8x8 x6, x7 1721 1722 transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23 1723 1724 blr x5 1725 1726 add x0, x0, #8 1727 load_add_store_8x8 x0, x7 1728 1729 ret x15 1730.endif 1731endfunc 1732 1733function inv_txfm_\variant\()add_8x16_neon 1734 mov x15, x30 1735 movi v4.8h, #0 1736 mov w16, #2896*8 1737 dup v0.4h, w16 1738 mov x11, #32 1739 1740 cmp w3, w13 1741 b.lt 1f 1742 1743 add x6, x2, #16 1744.ifc \variant, identity_ 1745.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1746 ld1 {\i}, [x6] 1747 st1 {v4.8h}, [x6], x11 1748.endr 1749 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 1750 // The identity shl #1 and downshift srshr #1 cancel out 1751.else 1752.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 1753 ld1 {\i}, [x6] 1754 st1 {v4.8h}, [x6], x11 1755.endr 1756 scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 1757 blr x4 1758 1759 srshr v24.8h, v16.8h, #1 1760 srshr v25.8h, v17.8h, #1 1761 srshr v26.8h, v18.8h, #1 1762 srshr v27.8h, v19.8h, #1 1763 srshr v28.8h, v20.8h, #1 1764 srshr v29.8h, v21.8h, #1 1765 srshr v30.8h, v22.8h, #1 1766 srshr v31.8h, v23.8h, #1 1767.endif 1768 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 1769 1770 b 2f 1771 17721: 1773.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 1774 movi \i, #0 1775.endr 1776 17772: 1778 movi v4.8h, #0 1779 mov w16, #2896*8 1780 dup v0.4h, w16 1781 1782.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 1783 ld1 {\i}, [x2] 1784 st1 {v4.8h}, [x2], x11 1785.endr 1786 scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 1787.ifc \variant, identity_ 1788 // The identity shl #1 and downshift srshr #1 cancel out 1789 1790 b L(itx_8x16_epilog) 1791.else 1792 blr x4 1793 1794.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 1795 srshr \i, \i, #1 1796.endr 1797 1798L(itx_8x16_epilog): 1799 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 1800 1801 blr x5 1802 1803 load_add_store_8x16 x0, x6 1804 1805 ret x15 1806.endif 1807endfunc 1808.endm 1809 
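// Added note: each *_base macro below is instantiated twice. The identity_
// variant handles an identity first transform inline during the load/scale
// stage (in the 8x16 case the identity shl #1 and the intermediate srshr #1
// downshift simply cancel out), while the plain variant calls the first
// transform through the function pointer in x4.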
1810def_fn_816_base identity_ 1811def_fn_816_base 1812 1813.macro def_fn_816 w, h, txfm1, txfm2, eob_half 1814function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 1815.ifc \txfm1\()_\txfm2, dct_dct 1816 idct_dc \w, \h, 1 1817.endif 1818.ifnc \txfm1, identity 1819 adr x4, inv_\txfm1\()_8h_x\w\()_neon 1820.endif 1821 adr x5, inv_\txfm2\()_8h_x\h\()_neon 1822.if \w == 8 1823 mov x13, #\eob_half 1824.endif 1825.ifc \txfm1, identity 1826 b inv_txfm_identity_add_\w\()x\h\()_neon 1827.else 1828 b inv_txfm_add_\w\()x\h\()_neon 1829.endif 1830endfunc 1831.endm 1832 1833.macro def_fns_816 w, h 1834def_fn_816 \w, \h, dct, dct, 43 1835def_fn_816 \w, \h, identity, identity, 43 1836def_fn_816 \w, \h, dct, adst, 43 1837def_fn_816 \w, \h, dct, flipadst, 43 1838def_fn_816 \w, \h, dct, identity, 8 1839def_fn_816 \w, \h, adst, dct, 43 1840def_fn_816 \w, \h, adst, adst, 43 1841def_fn_816 \w, \h, adst, flipadst, 43 1842def_fn_816 \w, \h, flipadst, dct, 43 1843def_fn_816 \w, \h, flipadst, adst, 43 1844def_fn_816 \w, \h, flipadst, flipadst, 43 1845def_fn_816 \w, \h, identity, dct, 64 1846def_fn_816 \w, \h, adst, identity, 8 1847def_fn_816 \w, \h, flipadst, identity, 8 1848def_fn_816 \w, \h, identity, adst, 64 1849def_fn_816 \w, \h, identity, flipadst, 64 1850.endm 1851 1852def_fns_816 8, 16 1853def_fns_816 16, 8 1854 1855function inv_dct32_odd_8h_x16_neon, export=1 1856 movrel x16, idct_coeffs, 2*16 1857 ld1 {v0.8h, v1.8h}, [x16] 1858 sub x16, x16, #2*16 1859 1860 smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a 1861 smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a 1862 smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a 1863 sqrshrn_sz v16, v2, v3, #12, .8h // t16a 1864 sqrshrn_sz v31, v4, v5, #12, .8h // t31a 1865 smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a 1866 smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a 1867 sqrshrn_sz v24, v6, v7, #12, .8h // t17a 1868 sqrshrn_sz v23, v2, v3, #12, .8h // t30a 1869 smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a 1870 smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a 1871 sqrshrn_sz v20, v4, v5, #12, .8h // t18a 1872 sqrshrn_sz v27, v6, v7, #12, .8h // t29a 1873 smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a 1874 smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a 1875 sqrshrn_sz v28, v2, v3, #12, .8h // t19a 1876 sqrshrn_sz v19, v4, v5, #12, .8h // t28a 1877 smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a 1878 smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a 1879 sqrshrn_sz v18, v6, v7, #12, .8h // t20a 1880 sqrshrn_sz v29, v2, v3, #12, .8h // t27a 1881 smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a 1882 smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a 1883 sqrshrn_sz v26, v4, v5, #12, .8h // t21a 1884 sqrshrn_sz v21, v6, v7, #12, .8h // t26a 1885 smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a 1886 smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a 1887 sqrshrn_sz v22, v2, v3, #12, .8h // t22a 1888 sqrshrn_sz v25, v4, v5, #12, .8h // t25a 1889 smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a 1890 sqrshrn_sz v30, v6, v7, #12, .8h // t23a 1891 sqrshrn_sz v17, v2, v3, #12, .8h // t24a 1892 1893 ld1 {v0.8h}, [x16] 1894 1895 sqsub v2.8h, v16.8h, v24.8h // t17 1896 sqadd v16.8h, v16.8h, v24.8h // t16 1897 sqsub v3.8h, v31.8h, v23.8h // t30 1898 sqadd v31.8h, v31.8h, v23.8h // t31 1899 sqsub v24.8h, v28.8h, v20.8h // t18 1900 sqadd 
v28.8h, v28.8h, v20.8h // t19 1901 sqadd v23.8h, v18.8h, v26.8h // t20 1902 sqsub v18.8h, v18.8h, v26.8h // t21 1903 sqsub v20.8h, v30.8h, v22.8h // t22 1904 sqadd v30.8h, v30.8h, v22.8h // t23 1905 sqadd v26.8h, v17.8h, v25.8h // t24 1906 sqsub v17.8h, v17.8h, v25.8h // t25 1907 sqsub v22.8h, v29.8h, v21.8h // t26 1908 sqadd v29.8h, v29.8h, v21.8h // t27 1909 sqadd v25.8h, v19.8h, v27.8h // t28 1910 sqsub v19.8h, v19.8h, v27.8h // t29 1911 1912 smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a 1913 smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a 1914 smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a 1915 sqrshrn_sz v21, v4, v5, #12, .8h // t17a 1916 sqrshrn_sz v27, v6, v7, #12, .8h // t30a 1917 neg v2.4s, v2.4s // -> t18a 1918 neg v3.4s, v3.4s // -> t18a 1919 smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a 1920 smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a 1921 sqrshrn_sz v19, v2, v3, #12, .8h // t18a 1922 sqrshrn_sz v24, v4, v5, #12, .8h // t29a 1923 smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a 1924 smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a 1925 sqrshrn_sz v22, v6, v7, #12, .8h // t21a 1926 sqrshrn_sz v18, v2, v3, #12, .8h // t26a 1927 neg v4.4s, v4.4s // -> t22a 1928 neg v5.4s, v5.4s // -> t22a 1929 smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a 1930 sqrshrn_sz v17, v4, v5, #12, .8h // t22a 1931 sqrshrn_sz v20, v6, v7, #12, .8h // t25a 1932 1933 sqsub v2.8h, v27.8h, v24.8h // t29 1934 sqadd v27.8h, v27.8h, v24.8h // t30 1935 sqsub v3.8h, v21.8h, v19.8h // t18 1936 sqadd v21.8h, v21.8h, v19.8h // t17 1937 sqsub v24.8h, v16.8h, v28.8h // t19a 1938 sqadd v16.8h, v16.8h, v28.8h // t16a 1939 sqsub v19.8h, v30.8h, v23.8h // t20a 1940 sqadd v30.8h, v30.8h, v23.8h // t23a 1941 sqsub v28.8h, v17.8h, v22.8h // t21 1942 sqadd v17.8h, v17.8h, v22.8h // t22 1943 sqadd v23.8h, v26.8h, v29.8h // t24a 1944 sqsub v26.8h, v26.8h, v29.8h // t27a 1945 sqadd v22.8h, v20.8h, v18.8h // t25 1946 sqsub v20.8h, v20.8h, v18.8h // t26 1947 sqsub v29.8h, v31.8h, v25.8h // t28a 1948 sqadd v31.8h, v31.8h, v25.8h // t31a 1949 1950 smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a 1951 smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a 1952 smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 1953 sqrshrn_sz v18, v4, v5, #12, .8h // t18a 1954 sqrshrn_sz v25, v6, v7, #12, .8h // t29a 1955 smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 1956 smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 1957 sqrshrn_sz v29, v2, v3, #12, .8h // t19 1958 sqrshrn_sz v24, v4, v5, #12, .8h // t28 1959 neg v6.4s, v6.4s // -> t20 1960 neg v7.4s, v7.4s // -> t20 1961 smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 1962 smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a 1963 sqrshrn_sz v26, v6, v7, #12, .8h // t20 1964 sqrshrn_sz v19, v2, v3, #12, .8h // t27 1965 neg v4.4s, v4.4s // -> t21a 1966 neg v5.4s, v5.4s // -> t21a 1967 smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a 1968 sqrshrn_sz v20, v4, v5, #12, .8h // t21a 1969 sqrshrn_sz v28, v6, v7, #12, .8h // t26a 1970 1971 sqsub v2.8h, v16.8h, v30.8h // t23 1972 sqadd v16.8h, v16.8h, v30.8h // t16 = out16 1973 sqsub v3.8h, v31.8h, v23.8h // t24 1974 sqadd v31.8h, v31.8h, v23.8h // t31 = out31 1975 sqsub v23.8h, v21.8h, v17.8h // t22a 1976 sqadd v17.8h, v21.8h, v17.8h // t17a = out17 1977 sqadd v30.8h, v27.8h, v22.8h // t30a = out30 1978 sqsub v21.8h, v27.8h, v22.8h // t25a 
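        // (out20-out27 are produced further below by rotating the remaining
        // middle terms by 2896/4096 ~= 1/sqrt(2); v0.h[0] still holds 2896 as
        // loaded from idct_coeffs above.)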
1979 sqsub v27.8h, v18.8h, v20.8h // t21 1980 sqadd v18.8h, v18.8h, v20.8h // t18 = out18 1981 sqadd v4.8h, v29.8h, v26.8h // t19a = out19 1982 sqsub v26.8h, v29.8h, v26.8h // t20a 1983 sqadd v29.8h, v25.8h, v28.8h // t29 = out29 1984 sqsub v25.8h, v25.8h, v28.8h // t26 1985 sqadd v28.8h, v24.8h, v19.8h // t28a = out28 1986 sqsub v24.8h, v24.8h, v19.8h // t27a 1987 mov v19.16b, v4.16b // out19 1988 1989 smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 1990 smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 1991 sqrshrn_sz v20, v4, v5, #12, .8h // t20 1992 sqrshrn_sz v22, v6, v7, #12, .8h // t27 1993 1994 smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a 1995 smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a 1996 mov v27.16b, v22.16b // t27 1997 sqrshrn_sz v26, v4, v5, #12, .8h // t26a 1998 1999 smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 2000 smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 2001 sqrshrn_sz v21, v6, v7, #12, .8h // t21a 2002 sqrshrn_sz v22, v24, v25, #12, .8h // t22 2003 sqrshrn_sz v25, v4, v5, #12, .8h // t25 2004 2005 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a 2006 smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a 2007 sqrshrn_sz v23, v4, v5, #12, .8h // t23a 2008 sqrshrn_sz v24, v6, v7, #12, .8h // t24a 2009 2010 ret 2011endfunc 2012 2013.macro def_horz_32 scale=0, shift=2, suffix 2014function inv_txfm_horz\suffix\()_dct_32x8_neon 2015 mov x14, x30 2016 movi v7.8h, #0 2017 lsl x8, x8, #1 2018.if \scale 2019 mov w16, #2896*8 2020 dup v0.4h, w16 2021.endif 2022 2023.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 2024 ld1 {\i}, [x7] 2025 st1 {v7.8h}, [x7], x8 2026.endr 2027 sub x7, x7, x8, lsl #4 2028 add x7, x7, x8, lsr #1 2029.if \scale 2030 scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 2031 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 2032.endif 2033 bl inv_dct_8h_x16_neon 2034 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2035 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 2036 2037.macro store1 r0, r1 2038 st1 {\r0}, [x6], #16 2039 st1 {\r1}, [x6], #16 2040 add x6, x6, #32 2041.endm 2042 store1 v16.8h, v24.8h 2043 store1 v17.8h, v25.8h 2044 store1 v18.8h, v26.8h 2045 store1 v19.8h, v27.8h 2046 store1 v20.8h, v28.8h 2047 store1 v21.8h, v29.8h 2048 store1 v22.8h, v30.8h 2049 store1 v23.8h, v31.8h 2050.purgem store1 2051 sub x6, x6, #64*8 2052 2053 movi v7.8h, #0 2054.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 2055 ld1 {\i}, [x7] 2056 st1 {v7.8h}, [x7], x8 2057.endr 2058.if \scale 2059 // This relies on the fact that the idct also leaves the right coeff in v0.h[1] 2060 scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 2061 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 2062.endif 2063 bl inv_dct32_odd_8h_x16_neon 2064 transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 2065 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 2066.macro store2 r0, r1, shift 2067 ld1 {v4.8h, v5.8h}, [x6] 2068 sqsub v7.8h, v4.8h, \r0 2069 sqsub v6.8h, v5.8h, \r1 2070 sqadd v4.8h, v4.8h, \r0 2071 sqadd v5.8h, v5.8h, \r1 2072 rev64 v6.8h, v6.8h 2073 rev64 v7.8h, v7.8h 2074 srshr v4.8h, v4.8h, #\shift 2075 srshr v5.8h, v5.8h, #\shift 2076 srshr v6.8h, v6.8h, #\shift 2077 srshr v7.8h, 
v7.8h, #\shift 2078 ext v6.16b, v6.16b, v6.16b, #8 2079 st1 {v4.8h, v5.8h}, [x6], #32 2080 ext v7.16b, v7.16b, v7.16b, #8 2081 st1 {v6.8h, v7.8h}, [x6], #32 2082.endm 2083 2084 store2 v31.8h, v23.8h, \shift 2085 store2 v30.8h, v22.8h, \shift 2086 store2 v29.8h, v21.8h, \shift 2087 store2 v28.8h, v20.8h, \shift 2088 store2 v27.8h, v19.8h, \shift 2089 store2 v26.8h, v18.8h, \shift 2090 store2 v25.8h, v17.8h, \shift 2091 store2 v24.8h, v16.8h, \shift 2092.purgem store2 2093 ret x14 2094endfunc 2095.endm 2096 2097def_horz_32 scale=0, shift=2 2098def_horz_32 scale=1, shift=1, suffix=_scale 2099 2100function inv_txfm_add_vert_dct_8x32_neon 2101 mov x14, x30 2102 lsl x8, x8, #1 2103 2104.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2105 ld1 {v\i\().8h}, [x7], x8 2106.endr 2107 sub x7, x7, x8, lsl #4 2108 2109 bl inv_dct_8h_x16_neon 2110 2111.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2112 st1 {v\i\().8h}, [x7], x8 2113.endr 2114 sub x7, x7, x8, lsl #4 2115 add x7, x7, x8, lsr #1 2116 2117.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2118 ld1 {v\i\().8h}, [x7], x8 2119.endr 2120 sub x7, x7, x8, lsl #4 2121 sub x7, x7, x8, lsr #1 2122 bl inv_dct32_odd_8h_x16_neon 2123 2124 neg x9, x8 2125 mov x10, x6 2126.macro combine r0, r1, r2, r3, op, stride 2127 ld1 {v5.8h}, [x7], \stride 2128 ld1 {v2.8b}, [x10], x1 2129 ld1 {v6.8h}, [x7], \stride 2130 ld1 {v3.8b}, [x10], x1 2131 \op v5.8h, v5.8h, \r0 2132 ld1 {v7.8h}, [x7], \stride 2133 ld1 {v4.8b}, [x10], x1 2134 srshr v5.8h, v5.8h, #4 2135 \op v6.8h, v6.8h, \r1 2136 uaddw v5.8h, v5.8h, v2.8b 2137 srshr v6.8h, v6.8h, #4 2138 \op v7.8h, v7.8h, \r2 2139 sqxtun v2.8b, v5.8h 2140 ld1 {v5.8h}, [x7], \stride 2141 uaddw v6.8h, v6.8h, v3.8b 2142 srshr v7.8h, v7.8h, #4 2143 \op v5.8h, v5.8h, \r3 2144 st1 {v2.8b}, [x6], x1 2145 ld1 {v2.8b}, [x10], x1 2146 sqxtun v3.8b, v6.8h 2147 uaddw v7.8h, v7.8h, v4.8b 2148 srshr v5.8h, v5.8h, #4 2149 st1 {v3.8b}, [x6], x1 2150 sqxtun v4.8b, v7.8h 2151 uaddw v5.8h, v5.8h, v2.8b 2152 st1 {v4.8b}, [x6], x1 2153 sqxtun v2.8b, v5.8h 2154 st1 {v2.8b}, [x6], x1 2155.endm 2156 combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 2157 combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 2158 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 2159 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 2160 sub x7, x7, x8 2161 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 2162 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 2163 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 2164 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 2165.purgem combine 2166 2167 ret x14 2168endfunc 2169 2170const eob_32x32 2171 .short 36, 136, 300, 1024 2172endconst 2173 2174const eob_16x32 2175 .short 36, 151, 279, 512 2176endconst 2177 2178const eob_16x32_shortside 2179 .short 36, 512 2180endconst 2181 2182const eob_8x32 2183 .short 43, 107, 171, 256 2184endconst 2185 2186function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 2187 movi v0.8h, #0 2188 movrel x13, eob_32x32 2189 2190 mov x8, #2*32 21911: 2192 mov w9, #0 2193 movrel x12, eob_32x32 21942: 2195 add w9, w9, #8 2196.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2197 ld1 {v\i\().8h}, [x2] 2198 st1 {v0.8h}, [x2], x8 2199.endr 2200 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2201 2202 load_add_store_8x8 x0, x7, shiftbits=2 2203 ldrh w11, [x12], #2 2204 sub x0, x0, x1, lsl #3 2205 add x0, x0, #8 2206 cmp w3, w11 2207 b.ge 2b 2208 2209 ldrh w11, [x13], #2 2210 cmp w3, w11 2211 b.lt 9f 2212 2213 sub x0, x0, w9, 
uxtw 2214 add x0, x0, x1, lsl #3 2215 msub x2, x8, x9, x2 2216 add x2, x2, #2*8 2217 b 1b 22189: 2219 ret 2220endfunc 2221 2222.macro shift_8_regs op, shift 2223.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 2224 \op \i, \i, #\shift 2225.endr 2226.endm 2227 2228.macro def_identity_1632 w, h, wshort, hshort 2229function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 2230 mov w16, #2896*8 2231 mov w17, #2*(5793-4096)*8 2232 dup v1.4h, w16 2233 movi v0.8h, #0 2234 mov v1.h[1], w17 2235 movrel x13, eob_16x32\hshort 2236 2237 mov x8, #2*\h 22381: 2239 mov w9, #0 2240 movrel x12, eob_16x32\wshort 22412: 2242 add w9, w9, #8 2243.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 2244 ld1 {\i}, [x2] 2245 st1 {v0.8h}, [x2], x8 2246.endr 2247 scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23 2248 2249.if \w == 16 2250 // 16x32 2251 identity_8x8_shift1 v1.h[1] 2252.else 2253 // 32x16 2254 shift_8_regs sqshl, 1 2255 identity_8x8 v1.h[1] 2256.endif 2257 2258 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2259 2260.if \w == 16 2261 load_add_store_8x8 x0, x7, shiftbits=2 2262.else 2263 load_add_store_8x8 x0, x7, shiftbits=4 2264.endif 2265 ldrh w11, [x12], #2 2266 sub x0, x0, x1, lsl #3 2267 add x0, x0, #8 2268 cmp w3, w11 2269 b.ge 2b 2270 2271 ldrh w11, [x13], #2 2272 cmp w3, w11 2273 b.lt 9f 2274 2275 sub x0, x0, w9, uxtw 2276 add x0, x0, x1, lsl #3 2277 msub x2, x8, x9, x2 2278 add x2, x2, #2*8 2279 b 1b 22809: 2281 ret 2282endfunc 2283.endm 2284 2285def_identity_1632 16, 32, _shortside, 2286def_identity_1632 32, 16, , _shortside 2287 2288.macro def_identity_832 w, h 2289function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 2290 movi v0.8h, #0 2291 movrel x13, eob_8x32 2292 2293 mov w8, #2*\h 22941: 2295 ldrh w12, [x13], #2 2296.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 2297 ld1 {\i}, [x2] 2298 st1 {v0.8h}, [x2], x8 2299.endr 2300 2301.if \w == 8 2302 // 8x32 2303 shift_8_regs srshr, 1 2304.endif 2305 2306 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2307 2308 cmp w3, w12 2309.if \w == 8 2310 load_add_store_8x8 x0, x7, shiftbits=2 2311.else 2312 load_add_store_8x8 x0, x7, shiftbits=3 2313.endif 2314 2315 b.lt 9f 2316.if \w == 8 2317 sub x2, x2, x8, lsl #3 2318 add x2, x2, #2*8 2319.else 2320 sub x0, x0, x1, lsl #3 2321 add x0, x0, #8 2322.endif 2323 b 1b 2324 23259: 2326 ret 2327endfunc 2328.endm 2329 2330def_identity_832 8, 32 2331def_identity_832 32, 8 2332 2333function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 2334 idct_dc 32, 32, 2 2335 2336 mov x15, x30 2337 sub sp, sp, #2048 2338 movrel x13, eob_32x32 2339 ldrh w12, [x13], #2 2340 2341.irp i, 0, 8, 16, 24 2342 add x6, sp, #(\i*32*2) 2343.if \i > 0 2344 mov w8, #(32 - \i) 2345 cmp w3, w12 2346 b.lt 1f 2347.if \i < 24 2348 ldrh w12, [x13], #2 2349.endif 2350.endif 2351 add x7, x2, #(\i*2) 2352 mov x8, #32*2 2353 bl inv_txfm_horz_dct_32x8_neon 2354.endr 2355 b 3f 2356 23571: 2358 movi v4.8h, #0 2359 movi v5.8h, #0 2360 movi v6.8h, #0 2361 movi v7.8h, #0 23622: 2363 subs w8, w8, #4 2364.rept 4 2365 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2366.endr 2367 b.gt 2b 2368 23693: 2370.irp i, 0, 8, 16, 24 2371 add x6, x0, #(\i) 2372 add x7, sp, #(\i*2) 2373 mov x8, #32*2 2374 bl inv_txfm_add_vert_dct_8x32_neon 2375.endr 2376 2377 add sp, sp, #2048 2378 ret x15 2379endfunc 2380 2381function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 2382 idct_dc 16, 32, 1 2383 2384 mov x15, x30 2385 sub sp, sp, 
#1024 2386 movrel x13, eob_16x32 2387 ldrh w12, [x13], #2 2388 adr x4, inv_dct_8h_x16_neon 2389 2390.irp i, 0, 8, 16, 24 2391 add x6, sp, #(\i*16*2) 2392 add x7, x2, #(\i*2) 2393.if \i > 0 2394 mov w8, #(32 - \i) 2395 cmp w3, w12 2396 b.lt 1f 2397.if \i < 24 2398 ldrh w12, [x13], #2 2399.endif 2400.endif 2401 mov x8, #2*32 2402 bl inv_txfm_horz_scale_16x8_neon 2403.endr 2404 b 3f 2405 24061: 2407 movi v4.8h, #0 2408 movi v5.8h, #0 2409 movi v6.8h, #0 2410 movi v7.8h, #0 24112: 2412 subs w8, w8, #8 2413.rept 4 2414 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2415.endr 2416 b.gt 2b 2417 24183: 2419.irp i, 0, 8 2420 add x6, x0, #(\i) 2421 add x7, sp, #(\i*2) 2422 mov x8, #16*2 2423 bl inv_txfm_add_vert_dct_8x32_neon 2424.endr 2425 2426 add sp, sp, #1024 2427 ret x15 2428endfunc 2429 2430function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 2431 idct_dc 32, 16, 1 2432 2433 mov x15, x30 2434 sub sp, sp, #1024 2435 2436 adr x5, inv_dct_8h_x16_neon 2437 2438.irp i, 0, 8 2439 add x6, sp, #(\i*32*2) 2440 add x7, x2, #(\i*2) 2441.if \i > 0 2442 mov w8, #(16 - \i) 2443 cmp w3, #36 2444 b.lt 1f 2445.endif 2446 mov x8, #2*16 2447 bl inv_txfm_horz_scale_dct_32x8_neon 2448.endr 2449 b 3f 2450 24511: 2452 movi v4.8h, #0 2453 movi v5.8h, #0 2454 movi v6.8h, #0 2455 movi v7.8h, #0 24562: 2457 subs w8, w8, #4 2458.rept 4 2459 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2460.endr 2461 b.gt 2b 2462 24633: 2464 mov x8, #32*2 2465.irp i, 0, 8, 16, 24 2466 add x6, x0, #(\i) 2467 add x7, sp, #(\i*2) 2468 bl inv_txfm_add_vert_8x16_neon 2469.endr 2470 2471 add sp, sp, #1024 2472 ret x15 2473endfunc 2474 2475function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 2476 idct_dc 8, 32, 2 2477 2478 mov x15, x30 2479 sub sp, sp, #512 2480 2481 movrel x13, eob_8x32 2482 2483 movi v28.8h, #0 2484 mov x8, #2*32 2485 mov w9, #32 2486 mov x6, sp 24871: 2488.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2489 ld1 {v\i\().8h}, [x2] 2490 st1 {v28.8h}, [x2], x8 2491.endr 2492 ldrh w12, [x13], #2 2493 sub x2, x2, x8, lsl #3 2494 sub w9, w9, #8 2495 add x2, x2, #2*8 2496 2497 bl inv_dct_8h_x8_neon 2498 2499.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2500 srshr v\i\().8h, v\i\().8h, #2 2501.endr 2502 2503 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 2504 2505 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 2506 cmp w3, w12 2507 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 2508 2509 b.ge 1b 2510 cbz w9, 3f 2511 2512 movi v29.8h, #0 2513 movi v30.8h, #0 2514 movi v31.8h, #0 25152: 2516 subs w9, w9, #8 2517.rept 2 2518 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 2519.endr 2520 b.gt 2b 2521 25223: 2523 mov x6, x0 2524 mov x7, sp 2525 mov x8, #8*2 2526 bl inv_txfm_add_vert_dct_8x32_neon 2527 2528 add sp, sp, #512 2529 ret x15 2530endfunc 2531 2532function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 2533 idct_dc 32, 8, 2 2534 2535 mov x15, x30 2536 sub sp, sp, #512 2537 2538 mov x6, sp 2539 mov x7, x2 2540 mov x8, #8*2 2541 bl inv_txfm_horz_dct_32x8_neon 2542 2543 mov x8, #2*32 2544 mov w9, #0 25451: 2546 add x6, x0, x9 2547 add x7, sp, x9, lsl #1 // #(\i*2) 2548 2549.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2550 ld1 {v\i\().8h}, [x7], x8 2551.endr 2552 add w9, w9, #8 2553 2554 bl inv_dct_8h_x8_neon 2555 2556 cmp w9, #32 2557 2558 load_add_store_8x8 x6, x7 2559 2560 b.lt 1b 2561 2562 add sp, sp, #512 2563 ret x15 2564endfunc 2565 2566function inv_dct64_step1_neon 2567 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 2568 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 2569 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 2570 // 
in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 2571 2572 ld1 {v0.8h, v1.8h}, [x17], #32 2573 2574 sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a 2575 sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a 2576 sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a 2577 sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a 2578 sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a 2579 sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a 2580 sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a 2581 sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a 2582 2583 sqadd v24.8h, v16.8h, v17.8h // t32 2584 sqsub v25.8h, v16.8h, v17.8h // t33 2585 sqsub v26.8h, v19.8h, v18.8h // t34 2586 sqadd v27.8h, v19.8h, v18.8h // t35 2587 sqadd v28.8h, v20.8h, v21.8h // t60 2588 sqsub v29.8h, v20.8h, v21.8h // t61 2589 sqsub v30.8h, v23.8h, v22.8h // t62 2590 sqadd v31.8h, v23.8h, v22.8h // t63 2591 2592 smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a 2593 smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a 2594 neg v2.4s, v2.4s // t34a 2595 neg v3.4s, v3.4s // t34a 2596 smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a 2597 sqrshrn_sz v26, v2, v3, #12, .8h // t34a 2598 smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a 2599 sqrshrn_sz v29, v4, v5, #12, .8h // t61a 2600 sqrshrn_sz v25, v6, v7, #12, .8h // t33a 2601 sqrshrn_sz v30, v2, v3, #12, .8h // t62a 2602 2603 sqadd v16.8h, v24.8h, v27.8h // t32a 2604 sqsub v19.8h, v24.8h, v27.8h // t35a 2605 sqadd v17.8h, v25.8h, v26.8h // t33 2606 sqsub v18.8h, v25.8h, v26.8h // t34 2607 sqsub v20.8h, v31.8h, v28.8h // t60a 2608 sqadd v23.8h, v31.8h, v28.8h // t63a 2609 sqsub v21.8h, v30.8h, v29.8h // t61 2610 sqadd v22.8h, v30.8h, v29.8h // t62 2611 2612 smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a 2613 smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a 2614 smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 2615 sqrshrn_sz v21, v2, v3, #12, .8h // t61a 2616 sqrshrn_sz v18, v4, v5, #12, .8h // t34a 2617 smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 2618 sqrshrn_sz v20, v6, v7, #12, .8h // t60 2619 sqrshrn_sz v19, v2, v3, #12, .8h // t35 2620 2621 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 2622 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 2623 2624 ret 2625endfunc 2626 2627function inv_dct64_step2_neon 2628 movrel x16, idct_coeffs 2629 ld1 {v0.4h}, [x16] 26301: 2631 // t32a/33/34a/35/60/61a/62/63a 2632 // t56a/57/58a/59/36/37a/38/39a 2633 // t40a/41/42a/43/52/53a/54/55a 2634 // t48a/49/50a/51/44/45a/46/47a 2635 ldr q16, [x6, #2*8*0] // t32a 2636 ldr q17, [x9, #2*8*8] // t39a 2637 ldr q18, [x9, #2*8*0] // t63a 2638 ldr q19, [x6, #2*8*8] // t56a 2639 ldr q20, [x6, #2*8*16] // t40a 2640 ldr q21, [x9, #2*8*24] // t47a 2641 ldr q22, [x9, #2*8*16] // t55a 2642 ldr q23, [x6, #2*8*24] // t48a 2643 2644 sqadd v24.8h, v16.8h, v17.8h // t32 2645 sqsub v25.8h, v16.8h, v17.8h // t39 2646 sqadd v26.8h, v18.8h, v19.8h // t63 2647 sqsub v27.8h, v18.8h, v19.8h // t56 2648 sqsub v28.8h, v21.8h, v20.8h // t40 2649 sqadd v29.8h, v21.8h, v20.8h // t47 2650 sqadd v30.8h, v23.8h, v22.8h // t48 2651 sqsub v31.8h, v23.8h, v22.8h // t55 2652 2653 smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a 2654 smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a 2655 smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a 2656 sqrshrn_sz v25, v2, v3, #12, .8h // t56a 2657 sqrshrn_sz v27, v4, v5, #12, .8h // t39a 2658 neg v6.4s, v6.4s // t40a 2659 neg v7.4s, v7.4s // t40a 2660 smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a 2661 sqrshrn_sz 
v31, v6, v7, #12, .8h // t40a 2662 sqrshrn_sz v28, v2, v3, #12, .8h // t55a 2663 2664 sqadd v16.8h, v24.8h, v29.8h // t32a 2665 sqsub v19.8h, v24.8h, v29.8h // t47a 2666 sqadd v17.8h, v27.8h, v31.8h // t39 2667 sqsub v18.8h, v27.8h, v31.8h // t40 2668 sqsub v20.8h, v26.8h, v30.8h // t48a 2669 sqadd v23.8h, v26.8h, v30.8h // t63a 2670 sqsub v21.8h, v25.8h, v28.8h // t55 2671 sqadd v22.8h, v25.8h, v28.8h // t56 2672 2673 smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a 2674 smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a 2675 smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 2676 sqrshrn_sz v18, v2, v3, #12, .8h // t40a 2677 sqrshrn_sz v21, v4, v5, #12, .8h // t55a 2678 smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 2679 sqrshrn_sz v19, v6, v7, #12, .8h // t47 2680 sqrshrn_sz v20, v2, v3, #12, .8h // t48 2681 2682 str q16, [x6, #2*8*0] // t32a 2683 str q17, [x9, #2*8*0] // t39 2684 str q18, [x6, #2*8*8] // t40a 2685 str q19, [x9, #2*8*8] // t47 2686 str q20, [x6, #2*8*16] // t48 2687 str q21, [x9, #2*8*16] // t55a 2688 str q22, [x6, #2*8*24] // t56 2689 str q23, [x9, #2*8*24] // t63a 2690 2691 add x6, x6, #2*8 2692 sub x9, x9, #2*8 2693 cmp x6, x9 2694 b.lt 1b 2695 ret 2696endfunc 2697 2698.macro load8 src, strd, zero, clear 2699.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h 2700.if \clear 2701 ld1 {\i}, [\src] 2702 st1 {\zero}, [\src], \strd 2703.else 2704 ld1 {\i}, [\src], \strd 2705.endif 2706.endr 2707.endm 2708 2709.macro store16 dst 2710.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 2711 st1 {\i}, [\dst], #16 2712.endr 2713.endm 2714 2715.macro clear_upper8 2716.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h 2717 movi \i, #0 2718.endr 2719.endm 2720 2721.macro movi_if reg, val, cond 2722.if \cond 2723 movi \reg, \val 2724.endif 2725.endm 2726 2727.macro movdup_if reg, gpr, val, cond 2728.if \cond 2729 mov \gpr, \val 2730 dup \reg, \gpr 2731.endif 2732.endm 2733 2734.macro st1_if regs, dst, cond 2735.if \cond 2736 st1 \regs, \dst 2737.endif 2738.endm 2739 2740.macro str_if reg, dst, cond 2741.if \cond 2742 str \reg, \dst 2743.endif 2744.endm 2745 2746.macro stroff_if reg, dst, dstoff, cond 2747.if \cond 2748 str \reg, \dst, \dstoff 2749.endif 2750.endm 2751 2752.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 2753.if \cond 2754 scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 2755.endif 2756.endm 2757 2758.macro def_dct64_func suffix, clear=0, scale=0 2759function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 2760 mov x14, x30 2761 mov x6, sp 2762 lsl x8, x8, #2 2763 2764 movdup_if v0.4h, w16, #2896*8, \scale 2765 movi_if v7.8h, #0, \clear 2766 load8 x7, x8, v7.8h, \clear 2767 clear_upper8 2768 sub x7, x7, x8, lsl #3 2769 add x7, x7, x8, lsr #1 2770 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 2771 2772 bl inv_dct_8h_x16_neon 2773 2774 store16 x6 2775 2776 movdup_if v0.4h, w16, #2896*8, \scale 2777 movi_if v7.8h, #0, \clear 2778 load8 x7, x8, v7.8h, \clear 2779 clear_upper8 2780 sub x7, x7, x8, lsl #3 2781 lsr x8, x8, #1 2782 sub x7, x7, x8, lsr #1 2783 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 2784 2785 bl inv_dct32_odd_8h_x16_neon 2786 2787 add x10, x6, #16*15 2788 sub x6, x6, #16*16 2789 2790 mov x9, #-16 2791 2792.macro store_addsub r0, r1, r2, r3 2793 ld1 {v2.8h}, [x6], #16 2794 ld1 {v3.8h}, [x6], #16 2795 sqadd v6.8h, v2.8h, \r0 2796 
sqsub \r0, v2.8h, \r0 2797 ld1 {v4.8h}, [x6], #16 2798 sqadd v7.8h, v3.8h, \r1 2799 sqsub \r1, v3.8h, \r1 2800 ld1 {v5.8h}, [x6], #16 2801 sqadd v2.8h, v4.8h, \r2 2802 sub x6, x6, #16*4 2803 sqsub \r2, v4.8h, \r2 2804 st1 {v6.8h}, [x6], #16 2805 st1 {\r0}, [x10], x9 2806 sqadd v3.8h, v5.8h, \r3 2807 sqsub \r3, v5.8h, \r3 2808 st1 {v7.8h}, [x6], #16 2809 st1 {\r1}, [x10], x9 2810 st1 {v2.8h}, [x6], #16 2811 st1 {\r2}, [x10], x9 2812 st1 {v3.8h}, [x6], #16 2813 st1 {\r3}, [x10], x9 2814.endm 2815 store_addsub v31.8h, v30.8h, v29.8h, v28.8h 2816 store_addsub v27.8h, v26.8h, v25.8h, v24.8h 2817 store_addsub v23.8h, v22.8h, v21.8h, v20.8h 2818 store_addsub v19.8h, v18.8h, v17.8h, v16.8h 2819.purgem store_addsub 2820 2821 add x6, x6, #2*8*16 2822 2823 movrel x17, idct64_coeffs 2824 movdup_if v0.4h, w16, #2896*8, \scale 2825 movi_if v7.8h, #0, \clear 2826 add x9, x7, x8, lsl #4 // offset 16 2827 add x10, x7, x8, lsl #3 // offset 8 2828 sub x9, x9, x8 // offset 15 2829 sub x11, x10, x8 // offset 7 2830 ld1 {v16.8h}, [x7] // in1 (offset 0) 2831 ld1 {v17.8h}, [x9] // in31 (offset 15) 2832 ld1 {v18.8h}, [x10] // in17 (offset 8) 2833 ld1 {v19.8h}, [x11] // in15 (offset 7) 2834 st1_if {v7.8h}, [x7], \clear 2835 st1_if {v7.8h}, [x9], \clear 2836 st1_if {v7.8h}, [x10], \clear 2837 st1_if {v7.8h}, [x11], \clear 2838 scale_if \scale, v0.h[0], v16, v17, v18, v19 2839 bl inv_dct64_step1_neon 2840 movdup_if v0.4h, w16, #2896*8, \scale 2841 movi_if v7.8h, #0, \clear 2842 add x7, x7, x8, lsl #2 // offset 4 2843 sub x9, x9, x8, lsl #2 // offset 11 2844 sub x10, x7, x8 // offset 3 2845 add x11, x9, x8 // offset 12 2846 ld1 {v16.8h}, [x10] // in7 (offset 3) 2847 ld1 {v17.8h}, [x11] // in25 (offset 12) 2848 ld1 {v18.8h}, [x9] // in23 (offset 11) 2849 ld1 {v19.8h}, [x7] // in9 (offset 4) 2850 st1_if {v7.8h}, [x7], \clear 2851 st1_if {v7.8h}, [x9], \clear 2852 st1_if {v7.8h}, [x10], \clear 2853 st1_if {v7.8h}, [x11], \clear 2854 scale_if \scale, v0.h[0], v16, v17, v18, v19 2855 bl inv_dct64_step1_neon 2856 movdup_if v0.4h, w16, #2896*8, \scale 2857 movi_if v7.8h, #0, \clear 2858 sub x10, x10, x8, lsl #1 // offset 1 2859 sub x9, x9, x8, lsl #1 // offset 9 2860 add x7, x7, x8 // offset 5 2861 add x11, x11, x8 // offset 13 2862 ldr q16, [x10, x8] // in5 (offset 2) 2863 ldr q17, [x11] // in27 (offset 13) 2864 ldr q18, [x9, x8] // in21 (offset 10) 2865 ldr q19, [x7] // in11 (offset 5) 2866 stroff_if q7, [x10, x8], \clear 2867 str_if q7, [x11], \clear 2868 stroff_if q7, [x9, x8], \clear 2869 str_if q7, [x7], \clear 2870 scale_if \scale, v0.h[0], v16, v17, v18, v19 2871 bl inv_dct64_step1_neon 2872 movdup_if v0.4h, w16, #2896*8, \scale 2873 movi_if v7.8h, #0, \clear 2874 ldr q16, [x10] // in3 (offset 1) 2875 ldr q17, [x11, x8] // in29 (offset 14) 2876 ldr q18, [x9] // in19 (offset 9) 2877 ldr q19, [x7, x8] // in13 (offset 6) 2878 str_if q7, [x10], \clear 2879 stroff_if q7, [x11, x8], \clear 2880 str_if q7, [x9], \clear 2881 stroff_if q7, [x7, x8], \clear 2882 scale_if \scale, v0.h[0], v16, v17, v18, v19 2883 bl inv_dct64_step1_neon 2884 2885 sub x6, x6, #2*8*32 2886 add x9, x6, #2*8*7 2887 2888 bl inv_dct64_step2_neon 2889 2890 ret x14 2891endfunc 2892.endm 2893 2894def_dct64_func 2895def_dct64_func _clear, clear=1 2896def_dct64_func _clear_scale, clear=1, scale=1 2897 2898 2899function inv_txfm_horz_dct_64x8_neon 2900 mov x14, x30 2901 2902 mov x7, sp 2903 add x8, sp, #2*8*(64 - 4) 2904 add x9, x6, #2*56 2905 mov x10, #2*64 2906 mov x11, #-2*8*4 2907 2908 dup v7.8h, w12 29091: 2910 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, 
[x7], #64 2911 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 2912 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 2913 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 2914 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2915 transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 2916 2917.macro store_addsub src0, src1, src2, src3 2918 sqsub v1.8h, \src0, \src1 2919 sqadd v0.8h, \src0, \src1 2920 sqsub v3.8h, \src2, \src3 2921 srshl v1.8h, v1.8h, v7.8h 2922 sqadd v2.8h, \src2, \src3 2923 srshl v0.8h, v0.8h, v7.8h 2924 srshl v3.8h, v3.8h, v7.8h 2925 rev64 v1.8h, v1.8h 2926 srshl v2.8h, v2.8h, v7.8h 2927 rev64 v3.8h, v3.8h 2928 ext v1.16b, v1.16b, v1.16b, #8 2929 st1 {v0.8h}, [x6], x10 2930 ext v3.16b, v3.16b, v3.16b, #8 2931 st1 {v1.8h}, [x9], x10 2932 st1 {v2.8h}, [x6], x10 2933 st1 {v3.8h}, [x9], x10 2934.endm 2935 store_addsub v16.8h, v31.8h, v17.8h, v30.8h 2936 store_addsub v18.8h, v29.8h, v19.8h, v28.8h 2937 store_addsub v20.8h, v27.8h, v21.8h, v26.8h 2938 store_addsub v22.8h, v25.8h, v23.8h, v24.8h 2939.purgem store_addsub 2940 sub x6, x6, x10, lsl #3 2941 sub x9, x9, x10, lsl #3 2942 add x6, x6, #16 2943 sub x9, x9, #16 2944 2945 cmp x7, x8 2946 b.lt 1b 2947 ret x14 2948endfunc 2949 2950function inv_txfm_add_vert_dct_8x64_neon 2951 mov x14, x30 2952 lsl x8, x8, #1 2953 2954 mov x7, sp 2955 add x8, sp, #2*8*(64 - 4) 2956 add x9, x6, x1, lsl #6 2957 sub x9, x9, x1 2958 neg x10, x1 2959 mov x11, #-2*8*4 2960 29611: 2962 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 2963 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 2964 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 2965 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 2966 2967.macro add_dest_addsub src0, src1, src2, src3 2968 ld1 {v0.8b}, [x6], x1 2969 ld1 {v1.8b}, [x9], x10 2970 sqadd v4.8h, \src0, \src1 2971 ld1 {v2.8b}, [x6] 2972 sqsub v5.8h, \src0, \src1 2973 ld1 {v3.8b}, [x9] 2974 sqadd v6.8h, \src2, \src3 2975 sqsub v7.8h, \src2, \src3 2976 sub x6, x6, x1 2977 sub x9, x9, x10 2978 srshr v4.8h, v4.8h, #4 2979 srshr v5.8h, v5.8h, #4 2980 srshr v6.8h, v6.8h, #4 2981 uaddw v4.8h, v4.8h, v0.8b 2982 srshr v7.8h, v7.8h, #4 2983 uaddw v5.8h, v5.8h, v1.8b 2984 uaddw v6.8h, v6.8h, v2.8b 2985 sqxtun v0.8b, v4.8h 2986 uaddw v7.8h, v7.8h, v3.8b 2987 sqxtun v1.8b, v5.8h 2988 st1 {v0.8b}, [x6], x1 2989 sqxtun v2.8b, v6.8h 2990 st1 {v1.8b}, [x9], x10 2991 sqxtun v3.8b, v7.8h 2992 st1 {v2.8b}, [x6], x1 2993 st1 {v3.8b}, [x9], x10 2994.endm 2995 add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h 2996 add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h 2997 add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h 2998 add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h 2999.purgem add_dest_addsub 3000 cmp x7, x8 3001 b.lt 1b 3002 3003 ret x14 3004endfunc 3005 3006function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 3007 idct_dc 64, 64, 2 3008 3009 mov x15, x30 3010 3011 sub_sp 64*32*2+64*8*2 3012 add x5, sp, #64*8*2 3013 3014 movrel x13, eob_32x32 3015 3016.irp i, 0, 8, 16, 24 3017 add x6, x5, #(\i*64*2) 3018.if \i > 0 3019 mov w8, #(32 - \i) 3020 cmp w3, w12 3021 b.lt 1f 3022.endif 3023 add x7, x2, #(\i*2) 3024 mov x8, #32*2 3025 mov x12, #-2 // shift 3026 bl inv_txfm_dct_clear_8h_x64_neon 3027 add x6, x5, #(\i*64*2) 3028 bl inv_txfm_horz_dct_64x8_neon 3029.if \i < 24 3030 ldrh w12, [x13], #2 3031.endif 3032.endr 3033 b 3f 3034 30351: 3036 movi v4.8h, #0 3037 movi v5.8h, #0 3038 movi v6.8h, #0 3039 movi v7.8h, #0 30402: 3041 subs w8, w8, #2 3042.rept 4 3043 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3044.endr 3045 b.gt 2b 3046 30473: 
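        // Second pass: for each of the eight 8-column strips, run the full
        // 64-point DCT down the columns of the temporary buffer (row stride
        // 2*64 bytes) and add the rounded result to the destination.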
3048.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3049 add x7, x5, #(\i*2) 3050 mov x8, #64*2 3051 bl inv_txfm_dct_8h_x64_neon 3052 add x6, x0, #(\i) 3053 bl inv_txfm_add_vert_dct_8x64_neon 3054.endr 3055 3056 add sp, x5, #64*32*2 3057 ret x15 3058endfunc 3059 3060function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 3061 idct_dc 64, 32, 1 3062 3063 mov x15, x30 3064 3065 sub_sp 64*32*2+64*8*2 3066 add x5, sp, #64*8*2 3067 3068 movrel x13, eob_32x32 3069 3070.irp i, 0, 8, 16, 24 3071 add x6, x5, #(\i*64*2) 3072.if \i > 0 3073 mov w8, #(32 - \i) 3074 cmp w3, w12 3075 b.lt 1f 3076.endif 3077 add x7, x2, #(\i*2) 3078 mov x8, #32*2 3079 mov x12, #-1 // shift 3080 bl inv_txfm_dct_clear_scale_8h_x64_neon 3081 add x6, x5, #(\i*64*2) 3082 bl inv_txfm_horz_dct_64x8_neon 3083.if \i < 24 3084 ldrh w12, [x13], #2 3085.endif 3086.endr 3087 b 3f 3088 30891: 3090 movi v4.8h, #0 3091 movi v5.8h, #0 3092 movi v6.8h, #0 3093 movi v7.8h, #0 30942: 3095 subs w8, w8, #2 3096.rept 4 3097 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3098.endr 3099 b.gt 2b 3100 31013: 3102.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3103 add x6, x0, #(\i) 3104 add x7, x5, #(\i*2) 3105 mov x8, #64*2 3106 bl inv_txfm_add_vert_dct_8x32_neon 3107.endr 3108 3109 add sp, x5, #64*32*2 3110 ret x15 3111endfunc 3112 3113function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 3114 idct_dc 32, 64, 1 3115 3116 mov x15, x30 3117 3118 sub_sp 32*32*2+64*8*2 3119 add x5, sp, #64*8*2 3120 3121 movrel x13, eob_32x32 3122 ldrh w12, [x13], #2 3123 3124.irp i, 0, 8, 16, 24 3125 add x6, x5, #(\i*32*2) 3126.if \i > 0 3127 mov w8, #(32 - \i) 3128 cmp w3, w12 3129 b.lt 1f 3130.if \i < 24 3131 ldrh w12, [x13], #2 3132.endif 3133.endif 3134 add x7, x2, #(\i*2) 3135 mov x8, #32*2 3136 bl inv_txfm_horz_scale_dct_32x8_neon 3137.endr 3138 b 3f 3139 31401: 3141 movi v4.8h, #0 3142 movi v5.8h, #0 3143 movi v6.8h, #0 3144 movi v7.8h, #0 31452: 3146 subs w8, w8, #4 3147.rept 4 3148 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3149.endr 3150 b.gt 2b 3151 31523: 3153.irp i, 0, 8, 16, 24 3154 add x7, x5, #(\i*2) 3155 mov x8, #32*2 3156 bl inv_txfm_dct_8h_x64_neon 3157 add x6, x0, #(\i) 3158 bl inv_txfm_add_vert_dct_8x64_neon 3159.endr 3160 3161 add sp, x5, #32*32*2 3162 ret x15 3163endfunc 3164 3165function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 3166 idct_dc 64, 16, 2 3167 3168 mov x15, x30 3169 3170 sub_sp 64*16*2+64*8*2 3171 add x4, sp, #64*8*2 3172 3173 movrel x13, eob_16x32 3174 3175.irp i, 0, 8 3176 add x6, x4, #(\i*64*2) 3177.if \i > 0 3178 mov w8, #(16 - \i) 3179 cmp w3, w12 3180 b.lt 1f 3181.endif 3182 add x7, x2, #(\i*2) 3183 mov x8, #16*2 3184 mov x12, #-2 // shift 3185 bl inv_txfm_dct_clear_8h_x64_neon 3186 add x6, x4, #(\i*64*2) 3187 bl inv_txfm_horz_dct_64x8_neon 3188.if \i < 8 3189 ldrh w12, [x13], #2 3190.endif 3191.endr 3192 b 3f 3193 31941: 3195 movi v4.8h, #0 3196 movi v5.8h, #0 3197 movi v6.8h, #0 3198 movi v7.8h, #0 31992: 3200 subs w8, w8, #2 3201.rept 4 3202 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3203.endr 3204 b.gt 2b 3205 32063: 3207 adr x5, inv_dct_8h_x16_neon 3208 mov x8, #64*2 3209.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3210 add x6, x0, #(\i) 3211 add x7, x4, #(\i*2) 3212 bl inv_txfm_add_vert_8x16_neon 3213.endr 3214 3215 add sp, x4, #64*16*2 3216 ret x15 3217endfunc 3218 3219function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 3220 idct_dc 16, 64, 2 3221 3222 mov x15, x30 3223 3224 sub_sp 16*32*2+64*8*2 3225 add x5, sp, #64*8*2 3226 3227 movrel x13, eob_16x32 3228 ldrh w12, [x13], #2 3229 3230 adr x4, inv_dct_8h_x16_neon 3231.irp i, 0, 8, 16, 
24 3232 add x6, x5, #(\i*16*2) 3233.if \i > 0 3234 mov w8, #(32 - \i) 3235 cmp w3, w12 3236 b.lt 1f 3237.if \i < 24 3238 ldrh w12, [x13], #2 3239.endif 3240.endif 3241 add x7, x2, #(\i*2) 3242 mov x8, #32*2 3243 bl inv_txfm_horz_16x8_neon 3244.endr 3245 b 3f 3246 32471: 3248 movi v4.8h, #0 3249 movi v5.8h, #0 3250 movi v6.8h, #0 3251 movi v7.8h, #0 32522: 3253 subs w8, w8, #8 3254.rept 4 3255 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3256.endr 3257 b.gt 2b 3258 32593: 3260.irp i, 0, 8 3261 add x7, x5, #(\i*2) 3262 mov x8, #16*2 3263 bl inv_txfm_dct_8h_x64_neon 3264 add x6, x0, #(\i) 3265 bl inv_txfm_add_vert_dct_8x64_neon 3266.endr 3267 3268 add sp, x5, #16*32*2 3269 ret x15 3270endfunc 3271
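
// Illustrative sketch (comment only, not part of the build): the 32- and
// 64-point inv_txfm_add_dct_dct functions above share the same two-pass
// structure, where the eob threshold tables (eob_32x32 etc.) decide how many
// 8-row strips of the first pass actually need computing. Roughly, in C-like
// pseudocode (all names here are hypothetical, not dav1d APIs):
//
//     for (i = 0; i < nstrips; i++) {
//         if (i > 0 && eob < thresh[i - 1]) {
//             zero_fill_rest(tmp, i);         // remaining strips are all zero
//             break;
//         }
//         horz_pass_8rows(tmp, coeff, i);     // transform + transpose one strip
//     }
//     for (j = 0; j < w; j += 8)
//         vert_pass_add_8cols(dst, tmp, j);   // transform columns, add to dst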