1/****************************************************************************** 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2020, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 *****************************************************************************/ 27 28#include "src/arm/asm.S" 29#include "util.S" 30 31// The exported functions in this file have got the following signature: 32// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, 33// int bitdepth_max); 34 35// Most of the functions use the following register layout: 36// x0-x3 external parameters 37// x4 function pointer to first transform 38// x5 function pointer to second transform 39// x6 output parameter for helper function 40// x7 input parameter for helper function 41// x8 input stride for helper function 42// x9-x12 scratch variables for helper functions 43// x13 pointer to list of eob thresholds 44// x14 return pointer for helper function 45// x15 return pointer for main function 46 47// The SIMD registers most often use the following layout: 48// v0-v1 multiplication coefficients 49// v2-v7 scratch registers 50// v8-v15 unused 51// v16-v31 inputs/outputs of transforms 52 53const idct_coeffs, align=4 54 // idct4 55 .int 2896, 2896*8*(1<<16), 1567, 3784 56 // idct8 57 .int 799, 4017, 3406, 2276 58 // idct16 59 .int 401, 4076, 3166, 2598 60 .int 1931, 3612, 3920, 1189 61 // idct32 62 .int 201, 4091, 3035, 2751 63 .int 1751, 3703, 3857, 1380 64 .int 995, 3973, 3513, 2106 65 .int 2440, 3290, 4052, 601 66endconst 67 68const idct64_coeffs, align=4 69 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) 70 .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) 71 .int 4076, 401, 4017, 799 72 73 .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) 74 .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) 75 .int -3166, -2598, -799, -4017 76 77 .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) 78 .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) 79 .int 3612, 1931, 2276, 3406 80 81 .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) 82 .int 3659*8*(1<<16), -1842*8*(1<<16), 
1285*8*(1<<16), 3889*8*(1<<16) 83 .int -3920, -1189, -3406, -2276 84endconst 85 86const iadst4_coeffs, align=4 87 .int 1321, 3803, 2482, 3344 88endconst 89 90const iadst8_coeffs, align=4 91 .int 4076, 401, 3612, 1931 92 .int 2598, 3166, 1189, 3920 93 // idct_coeffs 94 .int 2896, 0, 1567, 3784 95endconst 96 97const iadst16_coeffs, align=4 98 .int 4091, 201, 3973, 995 99 .int 3703, 1751, 3290, 2440 100 .int 2751, 3035, 2106, 3513 101 .int 1380, 3857, 601, 4052 102endconst 103 104.macro mul_mla d, s0, s1, c0, c1 105 mul \d\().4s, \s0\().4s, \c0 106 mla \d\().4s, \s1\().4s, \c1 107.endm 108 109.macro mul_mls d, s0, s1, c0, c1 110 mul \d\().4s, \s0\().4s, \c0 111 mls \d\().4s, \s1\().4s, \c1 112.endm 113 114.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 115 sqrdmulh \r0\sz, \r0\sz, \c 116 sqrdmulh \r1\sz, \r1\sz, \c 117 sqrdmulh \r2\sz, \r2\sz, \c 118 sqrdmulh \r3\sz, \r3\sz, \c 119.ifnb \r4 120 sqrdmulh \r4\sz, \r4\sz, \c 121 sqrdmulh \r5\sz, \r5\sz, \c 122 sqrdmulh \r6\sz, \r6\sz, \c 123 sqrdmulh \r7\sz, \r7\sz, \c 124.endif 125.endm 126 127.macro smin_4s r0, r1, r2 128 smin \r0\().4s, \r1\().4s, \r2\().4s 129.endm 130.macro smax_4s r0, r1, r2 131 smax \r0\().4s, \r1\().4s, \r2\().4s 132.endm 133 134.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 135.ifnb \load 136 ld1 {\load}, [\src], x1 137.endif 138.ifnb \shift 139 srshr \shift, \shift, #\shiftbits 140.endif 141.ifnb \addsrc 142 usqadd \adddst, \addsrc 143.endif 144.ifnb \min 145 smin \min, \min, v7.8h 146.endif 147.ifnb \store 148 st1 {\store}, [\dst], x1 149.endif 150.endm 151.macro load_add_store_8x16 dst, src 152 mov \src, \dst 153 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 154 load_add_store v2.8h, v16.8h, , , , , \dst, \src 155 load_add_store v3.8h, v17.8h, , , , , \dst, \src 156 load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src 157 load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src 158 load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src 159 load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src 160 load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src 161 load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src 162 load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src 163 load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src 164 load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src 165 load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src 166 load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src 167 load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src 168 load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src 169 load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src 170 load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src 171 load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src 172 load_add_store , , , , v27.8h, v26.8h, \dst, \src 173 load_add_store , , , , , v27.8h, \dst, \src 174.endm 175.macro load_add_store_8x8 dst, src, shiftbits=4 176 mov \src, \dst 177 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 178 load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits 179 load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits 180 load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits 181 load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits 182 load_add_store v16.8h, v20.8h, v18.8h, v4.8h, 
v3.8h, v2.8h, \dst, \src, \shiftbits 183 load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits 184 load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits 185 load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits 186 load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits 187 load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits 188 load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits 189 load_add_store , , , , , v19.8h, \dst, \src, \shiftbits 190.endm 191.macro load_add_store_8x4 dst, src, shiftbits=4 192 mov \src, \dst 193 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 194 load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits 195 load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits 196 load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits 197 load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits 198 load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits 199 load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits 200 load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits 201 load_add_store , , , , , v5.8h, \dst, \src, \shiftbits 202.endm 203.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src 204.ifnb \load 205 ld1 {\load}[0], [\src], x1 206.endif 207.ifnb \inssrc 208 ins \insdst\().d[1], \inssrc\().d[0] 209.endif 210.ifnb \shift 211 srshr \shift, \shift, #4 212.endif 213.ifnb \load 214 ld1 {\load}[1], [\src], x1 215.endif 216.ifnb \addsrc 217 usqadd \adddst, \addsrc 218.endif 219.ifnb \store 220 st1 {\store}[0], [\dst], x1 221.endif 222.ifnb \min 223 smin \min, \min, v7.8h 224.endif 225.ifnb \store 226 st1 {\store}[1], [\dst], x1 227.endif 228.endm 229.macro load_add_store_4x16 dst, src 230 mov \src, \dst 231 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 232 load_add_store4 v0.d, v17, v16, , , , , , \dst, \src 233 load_add_store4 v1.d, v19, v18, , , , , , \dst, \src 234 load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src 235 load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src 236 load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src 237 load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src 238 load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src 239 load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src 240 load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src 241 load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src 242 load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src 243 load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src 244 load_add_store4 , , , , , , , v23.d, \dst, \src 245.endm 246.macro load_add_store_4x8 dst, src 247 mov \src, \dst 248 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 249 load_add_store4 v0.d, v17, v16, , , , , , \dst, \src 250 load_add_store4 v1.d, v19, v18, , , , , , \dst, \src 251 load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src 252 load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src 253 load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src 254 load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src 255 load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src 256 load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src 257 load_add_store4 , , , , , , , v3.d, \dst, \src 258.endm 259 260.macro idct_dc w, h, shift 
261 cbnz w3, 1f 262 movz w16, #2896*8, lsl #16 263 ld1r {v16.4s}, [x2] 264 dup v0.2s, w16 265 sqrdmulh v20.4s, v16.4s, v0.s[0] 266 str wzr, [x2] 267.if (\w == 2*\h) || (2*\w == \h) 268 sqrdmulh v20.4s, v20.4s, v0.s[0] 269.endif 270.if \shift > 0 271 sqrshrn v16.4h, v20.4s, #\shift 272 sqrshrn2 v16.8h, v20.4s, #\shift 273.else 274 sqxtn v16.4h, v20.4s 275 sqxtn2 v16.8h, v20.4s 276.endif 277 sqrdmulh v16.8h, v16.8h, v0.h[1] 278 srshr v16.8h, v16.8h, #4 279 mov w4, #\h 280 b idct_dc_w\w\()_neon 2811: 282.endm 283 284function idct_dc_w4_neon 285 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 2861: 287 ld1 {v0.d}[0], [x0], x1 288 ld1 {v0.d}[1], [x0], x1 289 ld1 {v1.d}[0], [x0], x1 290 subs w4, w4, #4 291 ld1 {v1.d}[1], [x0], x1 292 usqadd v0.8h, v16.8h 293 sub x0, x0, x1, lsl #2 294 usqadd v1.8h, v16.8h 295 smin v0.8h, v0.8h, v31.8h 296 st1 {v0.d}[0], [x0], x1 297 smin v1.8h, v1.8h, v31.8h 298 st1 {v0.d}[1], [x0], x1 299 st1 {v1.d}[0], [x0], x1 300 st1 {v1.d}[1], [x0], x1 301 b.gt 1b 302 ret 303endfunc 304 305function idct_dc_w8_neon 306 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 3071: 308 ld1 {v0.8h}, [x0], x1 309 subs w4, w4, #4 310 ld1 {v1.8h}, [x0], x1 311 usqadd v0.8h, v16.8h 312 ld1 {v2.8h}, [x0], x1 313 usqadd v1.8h, v16.8h 314 ld1 {v3.8h}, [x0], x1 315 usqadd v2.8h, v16.8h 316 usqadd v3.8h, v16.8h 317 sub x0, x0, x1, lsl #2 318 smin v0.8h, v0.8h, v31.8h 319 smin v1.8h, v1.8h, v31.8h 320 st1 {v0.8h}, [x0], x1 321 smin v2.8h, v2.8h, v31.8h 322 st1 {v1.8h}, [x0], x1 323 smin v3.8h, v3.8h, v31.8h 324 st1 {v2.8h}, [x0], x1 325 st1 {v3.8h}, [x0], x1 326 b.gt 1b 327 ret 328endfunc 329 330function idct_dc_w16_neon 331 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 3321: 333 ld1 {v0.8h, v1.8h}, [x0], x1 334 subs w4, w4, #2 335 ld1 {v2.8h, v3.8h}, [x0], x1 336 usqadd v0.8h, v16.8h 337 usqadd v1.8h, v16.8h 338 sub x0, x0, x1, lsl #1 339 usqadd v2.8h, v16.8h 340 usqadd v3.8h, v16.8h 341 smin v0.8h, v0.8h, v31.8h 342 smin v1.8h, v1.8h, v31.8h 343 smin v2.8h, v2.8h, v31.8h 344 st1 {v0.8h, v1.8h}, [x0], x1 345 smin v3.8h, v3.8h, v31.8h 346 st1 {v2.8h, v3.8h}, [x0], x1 347 b.gt 1b 348 ret 349endfunc 350 351function idct_dc_w32_neon 352 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 3531: 354 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 355 subs w4, w4, #1 356 usqadd v0.8h, v16.8h 357 usqadd v1.8h, v16.8h 358 usqadd v2.8h, v16.8h 359 usqadd v3.8h, v16.8h 360 smin v0.8h, v0.8h, v31.8h 361 smin v1.8h, v1.8h, v31.8h 362 smin v2.8h, v2.8h, v31.8h 363 smin v3.8h, v3.8h, v31.8h 364 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 365 b.gt 1b 366 ret 367endfunc 368 369function idct_dc_w64_neon 370 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 371 sub x1, x1, #64 3721: 373 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 374 subs w4, w4, #1 375 usqadd v0.8h, v16.8h 376 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] 377 usqadd v1.8h, v16.8h 378 sub x0, x0, #64 379 usqadd v2.8h, v16.8h 380 usqadd v3.8h, v16.8h 381 usqadd v4.8h, v16.8h 382 usqadd v5.8h, v16.8h 383 usqadd v6.8h, v16.8h 384 usqadd v7.8h, v16.8h 385 smin v0.8h, v0.8h, v31.8h 386 smin v1.8h, v1.8h, v31.8h 387 smin v2.8h, v2.8h, v31.8h 388 smin v3.8h, v3.8h, v31.8h 389 smin v4.8h, v4.8h, v31.8h 390 smin v5.8h, v5.8h, v31.8h 391 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 392 smin v6.8h, v6.8h, v31.8h 393 smin v7.8h, v7.8h, v31.8h 394 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 395 b.gt 1b 396 ret 397endfunc 398 399.macro iwht4 400 add v16.4s, v16.4s, v17.4s 401 sub v21.4s, v18.4s, v19.4s 402 sub v20.4s, v16.4s, v21.4s 403 sshr v20.4s, v20.4s, #1 404 sub v18.4s, v20.4s, v17.4s 405 sub v17.4s, v20.4s, v19.4s 406 add v19.4s, 
v21.4s, v18.4s 407 sub v16.4s, v16.4s, v17.4s 408.endm 409 410.macro idct_4 r0, r1, r2, r3 411 mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] 412 mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] 413 mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] 414 mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] 415 srshr v6.4s, v6.4s, #12 416 srshr v2.4s, v2.4s, #12 417 srshr v7.4s, v4.4s, #12 418 srshr v3.4s, v3.4s, #12 419 sqadd \r0\().4s, v2.4s, v6.4s 420 sqsub \r3\().4s, v2.4s, v6.4s 421 sqadd \r1\().4s, v3.4s, v7.4s 422 sqsub \r2\().4s, v3.4s, v7.4s 423.endm 424 425function inv_dct_4s_x4_neon 426 AARCH64_VALID_CALL_TARGET 427 movrel x16, idct_coeffs 428 ld1 {v0.4s}, [x16] 429 idct_4 v16, v17, v18, v19 430 ret 431endfunc 432 433.macro iadst_4x4 o0, o1, o2, o3 434 movrel x16, iadst4_coeffs 435 ld1 {v0.4s}, [x16] 436 437 sub v3.4s, v16.4s, v18.4s 438 mul v4.4s, v16.4s, v0.s[0] 439 mla v4.4s, v18.4s, v0.s[1] 440 mla v4.4s, v19.4s, v0.s[2] 441 mul v7.4s, v17.4s, v0.s[3] 442 add v3.4s, v3.4s, v19.4s 443 mul v5.4s, v16.4s, v0.s[2] 444 mls v5.4s, v18.4s, v0.s[0] 445 mls v5.4s, v19.4s, v0.s[1] 446 447 add \o3\().4s, v4.4s, v5.4s 448 mul \o2\().4s, v3.4s, v0.s[3] 449 add \o0\().4s, v4.4s, v7.4s 450 add \o1\().4s, v5.4s, v7.4s 451 sub \o3\().4s, \o3\().4s, v7.4s 452 453 srshr \o0\().4s, \o0\().4s, #12 454 srshr \o2\().4s, \o2\().4s, #12 455 srshr \o1\().4s, \o1\().4s, #12 456 srshr \o3\().4s, \o3\().4s, #12 457.endm 458 459function inv_adst_4s_x4_neon 460 AARCH64_VALID_CALL_TARGET 461 iadst_4x4 v16, v17, v18, v19 462 ret 463endfunc 464 465function inv_flipadst_4s_x4_neon 466 AARCH64_VALID_CALL_TARGET 467 iadst_4x4 v19, v18, v17, v16 468 ret 469endfunc 470 471function inv_identity_4s_x4_neon 472 AARCH64_VALID_CALL_TARGET 473 movz w16, #(5793-4096)*8, lsl #16 474 dup v0.2s, w16 475 sqrdmulh v4.4s, v16.4s, v0.s[0] 476 sqrdmulh v5.4s, v17.4s, v0.s[0] 477 sqrdmulh v6.4s, v18.4s, v0.s[0] 478 sqrdmulh v7.4s, v19.4s, v0.s[0] 479 sqadd v16.4s, v16.4s, v4.4s 480 sqadd v17.4s, v17.4s, v5.4s 481 sqadd v18.4s, v18.4s, v6.4s 482 sqadd v19.4s, v19.4s, v7.4s 483 ret 484endfunc 485 486function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 487 mov x15, x30 488 movi v30.4s, #0 489 movi v31.4s, #0 490 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] 491 st1 {v30.4s, v31.4s}, [x2], #32 492 493 sshr v16.4s, v16.4s, #2 494 sshr v17.4s, v17.4s, #2 495 sshr v18.4s, v18.4s, #2 496 sshr v19.4s, v19.4s, #2 497 498 iwht4 499 500 st1 {v30.4s, v31.4s}, [x2], #32 501 transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 502 503 iwht4 504 505 ld1 {v0.d}[0], [x0], x1 506 sqxtn v16.4h, v16.4s 507 ld1 {v0.d}[1], [x0], x1 508 sqxtn2 v16.8h, v17.4s 509 ld1 {v1.d}[0], [x0], x1 510 sqxtn v18.4h, v18.4s 511 ld1 {v1.d}[1], [x0], x1 512 sqxtn2 v18.8h, v19.4s 513 514 b L(itx_4x4_end) 515endfunc 516 517// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers 518// x0-x4 external parameters 519// x5 function pointer to first transform 520// x6 function pointer to second transform 521function inv_txfm_add_4x4_neon 522 movi v30.4s, #0 523 movi v31.4s, #0 524 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] 525 st1 {v30.4s, v31.4s}, [x2], #32 526 527 blr x5 528 529 st1 {v30.4s, v31.4s}, [x2], #32 530 sqxtn v16.4h, v16.4s 531 sqxtn v17.4h, v17.4s 532 sqxtn v18.4h, v18.4s 533 sqxtn v19.4h, v19.4s 534 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 535 536 blr x6 537 538 ld1 {v0.d}[0], [x0], x1 539 ld1 {v0.d}[1], [x0], x1 540 ins v16.d[1], v17.d[0] 541 ins v18.d[1], v19.d[0] 542 ld1 {v1.d}[0], [x0], x1 543 ld1 {v1.d}[1], [x0], x1 544 srshr v16.8h, v16.8h, #4 545 srshr v18.8h, v18.8h, #4 
546 547L(itx_4x4_end): 548 dup v31.8h, w4 549 sub x0, x0, x1, lsl #2 550 usqadd v0.8h, v16.8h 551 usqadd v1.8h, v18.8h 552 smin v0.8h, v0.8h, v31.8h 553 st1 {v0.d}[0], [x0], x1 554 smin v1.8h, v1.8h, v31.8h 555 st1 {v0.d}[1], [x0], x1 556 st1 {v1.d}[0], [x0], x1 557 st1 {v1.d}[1], [x0], x1 558 559 ret x15 560endfunc 561 562.macro def_fn_4x4 txfm1, txfm2 563function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 564 mov x15, x30 565 566.ifc \txfm1\()_\txfm2, dct_dct 567 cbnz w3, 1f 568 movz w16, #2896*8, lsl #16 569 ld1r {v16.4s}, [x2] 570 dup v4.2s, w16 571 str wzr, [x2] 572 sqrdmulh v16.4s, v16.4s, v4.s[0] 573 ld1 {v0.d}[0], [x0], x1 574 sqxtn v20.4h, v16.4s 575 sqxtn2 v20.8h, v16.4s 576 ld1 {v0.d}[1], [x0], x1 577 sqrdmulh v20.8h, v20.8h, v4.h[1] 578 ld1 {v1.d}[0], [x0], x1 579 srshr v16.8h, v20.8h, #4 580 ld1 {v1.d}[1], [x0], x1 581 srshr v18.8h, v20.8h, #4 582 movi v30.8h, #0 583 b L(itx_4x4_end) 5841: 585.endif 586 adr x5, inv_\txfm1\()_4s_x4_neon 587 movrel x6, X(inv_\txfm2\()_4h_x4_neon) 588 b inv_txfm_add_4x4_neon 589endfunc 590.endm 591 592def_fn_4x4 dct, dct 593def_fn_4x4 identity, identity 594def_fn_4x4 dct, adst 595def_fn_4x4 dct, flipadst 596def_fn_4x4 dct, identity 597def_fn_4x4 adst, dct 598def_fn_4x4 adst, adst 599def_fn_4x4 adst, flipadst 600def_fn_4x4 flipadst, dct 601def_fn_4x4 flipadst, adst 602def_fn_4x4 flipadst, flipadst 603def_fn_4x4 identity, dct 604 605def_fn_4x4 adst, identity 606def_fn_4x4 flipadst, identity 607def_fn_4x4 identity, adst 608def_fn_4x4 identity, flipadst 609 610.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 611 idct_4 \r0, \r2, \r4, \r6 612 613 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 614 mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 615.irp r, \r0, \r2, \r4, \r6 616 smin_4s \r, \r, v5 617.endr 618.irp r, \r0, \r2, \r4, \r6 619 smax_4s \r, \r, v4 620.endr 621 622 mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a 623 mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a 624 mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a 625 mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a 626 srshr \r1\().4s, v2.4s, #12 // t4a 627 srshr \r7\().4s, v3.4s, #12 // t7a 628 srshr \r3\().4s, v6.4s, #12 // t5a 629 srshr \r5\().4s, v7.4s, #12 // t6a 630 631 sqadd v2.4s, \r1\().4s, \r3\().4s // t4 632 sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a 633 sqadd v3.4s, \r7\().4s, \r5\().4s // t7 634 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a 635 636.irp r, v2, \r1, v3, \r3 637 smin_4s \r, \r, v5 638.endr 639.irp r, v2, \r1, v3, \r3 640 smax_4s \r, \r, v4 641.endr 642 643 mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 644 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 645 srshr v7.4s, v7.4s, #12 // t5 646 srshr v6.4s, v6.4s, #12 // t6 647 648 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 649 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 650 sqadd \r1\().4s, \r2\().4s, v6.4s // out1 651 sqsub v6.4s, \r2\().4s, v6.4s // out6 652 sqadd \r2\().4s, \r4\().4s, v7.4s // out2 653 sqsub \r5\().4s, \r4\().4s, v7.4s // out5 654 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 655 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 656 mov \r6\().16b, v6.16b // out6 657.endm 658 659function inv_dct_4s_x8_neon 660 AARCH64_VALID_CALL_TARGET 661 movrel x16, idct_coeffs 662 ld1 {v0.4s, v1.4s}, [x16] 663 idct_8 v16, v17, v18, v19, v20, v21, v22, v23 664 ret 665endfunc 666 667.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 668 movrel x16, iadst8_coeffs 669 ld1 {v0.4s, v1.4s}, [x16], #32 670 671 mul_mla v2, v23, v16, v0.s[0], v0.s[1] 672 mul_mls v4, v23, v16, v0.s[1], 
v0.s[0] 673 mul_mla v6, v21, v18, v0.s[2], v0.s[3] 674 srshr v16.4s, v2.4s, #12 // t0a 675 srshr v23.4s, v4.4s, #12 // t1a 676 mul_mls v2, v21, v18, v0.s[3], v0.s[2] 677 mul_mla v4, v19, v20, v1.s[0], v1.s[1] 678 srshr v18.4s, v6.4s, #12 // t2a 679 srshr v21.4s, v2.4s, #12 // t3a 680 mul_mls v6, v19, v20, v1.s[1], v1.s[0] 681 mul_mla v2, v17, v22, v1.s[2], v1.s[3] 682 srshr v20.4s, v4.4s, #12 // t4a 683 srshr v19.4s, v6.4s, #12 // t5a 684 mul_mls v4, v17, v22, v1.s[3], v1.s[2] 685 srshr v22.4s, v2.4s, #12 // t6a 686 srshr v17.4s, v4.4s, #12 // t7a 687 688 ld1 {v0.4s}, [x16] 689 690 movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 691 692 sqadd v2.4s, v16.4s, v20.4s // t0 693 sqsub v3.4s, v16.4s, v20.4s // t4 694 mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 695 sqadd v4.4s, v23.4s, v19.4s // t1 696 sqsub v5.4s, v23.4s, v19.4s // t5 697 sqadd v6.4s, v18.4s, v22.4s // t2 698 sqsub v7.4s, v18.4s, v22.4s // t6 699 sqadd v18.4s, v21.4s, v17.4s // t3 700 sqsub v19.4s, v21.4s, v17.4s // t7 701 702.irp r, v2, v3, v4, v5, v6, v7, v18, v19 703 smin_4s \r, \r, v1 704.endr 705.irp r, v2, v3, v4, v5, v6, v7, v18, v19 706 smax_4s \r, \r, v20 707.endr 708 709 mul_mla v16, v3, v5, v0.s[3], v0.s[2] 710 mul_mls v20, v3, v5, v0.s[2], v0.s[3] 711 mul_mls v22, v19, v7, v0.s[3], v0.s[2] 712 713 srshr v3.4s, v16.4s, #12 // t4a 714 srshr v5.4s, v20.4s, #12 // t5a 715 716 mul_mla v16, v19, v7, v0.s[2], v0.s[3] 717 718 srshr v7.4s, v22.4s, #12 // t6a 719 srshr v19.4s, v16.4s, #12 // t7a 720 721 sqadd \o0\().4s, v2.4s, v6.4s // out0 722 sqsub v2.4s, v2.4s, v6.4s // t2 723 sqadd \o7\().4s, v4.4s, v18.4s // out7 724 sqsub v4.4s, v4.4s, v18.4s // t3 725 726 mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 727 728 sqadd \o1\().4s, v3.4s, v7.4s // out1 729 sqsub v3.4s, v3.4s, v7.4s // t6 730 sqadd \o6\().4s, v5.4s, v19.4s // out6 731 sqsub v5.4s, v5.4s, v19.4s // t7 732 733 // Not clipping the output registers, as they will be downshifted and 734 // narrowed afterwards anyway. 
735.irp r, v2, v4, v3, v5 736 smin_4s \r, \r, v1 737.endr 738.irp r, v2, v4, v3, v5 739 smax_4s \r, \r, v18 740.endr 741 742 sqneg \o7\().4s, \o7\().4s // out7 743 sqneg \o1\().4s, \o1\().4s // out1 744 745 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) 746 mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) 747 mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) 748 srshr v2.4s, v18.4s, #12 // out3 749 mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) 750 srshr v3.4s, v20.4s, #12 // out5 751 srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) 752 srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) 753 754 sqneg \o3\().4s, v2.4s // out3 755 sqneg \o5\().4s, v3.4s // out5 756.endm 757 758function inv_adst_4s_x8_neon 759 AARCH64_VALID_CALL_TARGET 760 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 761 ret 762endfunc 763 764function inv_flipadst_4s_x8_neon 765 AARCH64_VALID_CALL_TARGET 766 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 767 ret 768endfunc 769 770function inv_identity_4s_x8_neon 771 AARCH64_VALID_CALL_TARGET 772 sqshl v16.4s, v16.4s, #1 773 sqshl v17.4s, v17.4s, #1 774 sqshl v18.4s, v18.4s, #1 775 sqshl v19.4s, v19.4s, #1 776 sqshl v20.4s, v20.4s, #1 777 sqshl v21.4s, v21.4s, #1 778 sqshl v22.4s, v22.4s, #1 779 sqshl v23.4s, v23.4s, #1 780 ret 781endfunc 782 783function inv_txfm_add_8x8_neon 784 movi v31.4s, #0 785 786 cmp w3, w13 787 mov x11, #32 788 b.lt 1f 789 790 add x6, x2, #16 791.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 792 ld1 {\i}, [x6] 793 st1 {v31.4s}, [x6], x11 794.endr 795 796 blr x4 797 798 sqrshrn v24.4h, v16.4s, #1 799 sqrshrn v25.4h, v17.4s, #1 800 sqrshrn v26.4h, v18.4s, #1 801 sqrshrn v27.4h, v19.4s, #1 802 sqrshrn2 v24.8h, v20.4s, #1 803 sqrshrn2 v25.8h, v21.4s, #1 804 sqrshrn2 v26.8h, v22.4s, #1 805 sqrshrn2 v27.8h, v23.4s, #1 806 807 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 808 809 b 2f 810 8111: 812.irp i, v24.8h, v25.8h, v26.8h, v27.8h 813 movi \i, #0 814.endr 815 8162: 817 818.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 819 ld1 {\i}, [x2] 820 st1 {v31.4s}, [x2], x11 821.endr 822 823 blr x4 824 825 sqrshrn v16.4h, v16.4s, #1 826 sqrshrn v17.4h, v17.4s, #1 827 sqrshrn v18.4h, v18.4s, #1 828 sqrshrn v19.4h, v19.4s, #1 829 sqrshrn2 v16.8h, v20.4s, #1 830 sqrshrn2 v17.8h, v21.4s, #1 831 sqrshrn2 v18.8h, v22.4s, #1 832 sqrshrn2 v19.8h, v23.4s, #1 833 834 transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 835 836 mov v20.16b, v24.16b 837 mov v21.16b, v25.16b 838 mov v22.16b, v26.16b 839 mov v23.16b, v27.16b 840 841 blr x5 842 843 load_add_store_8x8 x0, x7 844 ret x15 845endfunc 846 847.macro def_fn_8x8 txfm1, txfm2, eob_half 848function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 849 mov x15, x30 850 851.ifc \txfm1\()_\txfm2, dct_dct 852 idct_dc 8, 8, 1 853.endif 854 movrel x5, X(inv_\txfm2\()_8h_x8_neon) 855 mov w13, #\eob_half 856 adr x4, inv_\txfm1\()_4s_x8_neon 857 b inv_txfm_add_8x8_neon 858endfunc 859.endm 860 861def_fn_8x8 dct, dct, 10 862def_fn_8x8 identity, identity, 10 863def_fn_8x8 dct, adst, 10 864def_fn_8x8 dct, flipadst, 10 865def_fn_8x8 dct, identity, 4 866def_fn_8x8 adst, dct, 10 867def_fn_8x8 adst, adst, 10 868def_fn_8x8 adst, flipadst, 10 869def_fn_8x8 flipadst, dct, 10 870def_fn_8x8 flipadst, adst, 10 871def_fn_8x8 flipadst, flipadst, 10 872def_fn_8x8 identity, dct, 4 873def_fn_8x8 adst, identity, 4 874def_fn_8x8 flipadst, identity, 4 875def_fn_8x8 identity, adst, 4 876def_fn_8x8 identity, flipadst, 4 877 878function 
inv_txfm_add_8x4_neon 879 movi v28.4s, #0 880 movi v29.4s, #0 881 movi v30.4s, #0 882 movi v31.4s, #0 883 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] 884 st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 885 movz w16, #2896*8, lsl #16 886 dup v0.2s, w16 887 ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] 888 st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] 889 890 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 891 892 blr x4 893 894 sqxtn v16.4h, v16.4s 895 sqxtn v17.4h, v17.4s 896 sqxtn v18.4h, v18.4s 897 sqxtn v19.4h, v19.4s 898 sqxtn v20.4h, v20.4s 899 sqxtn v21.4h, v21.4s 900 sqxtn v22.4h, v22.4s 901 sqxtn v23.4h, v23.4s 902 903 transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 904 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 905 ins v16.d[1], v20.d[0] 906 ins v17.d[1], v21.d[0] 907 ins v18.d[1], v22.d[0] 908 ins v19.d[1], v23.d[0] 909 910 blr x5 911 912 load_add_store_8x4 x0, x7 913 ret x15 914endfunc 915 916function inv_txfm_add_4x8_neon 917 movz w16, #2896*8, lsl #16 918 movi v31.4s, #0 919 dup v30.2s, w16 920 921 cmp w3, w13 922 mov x11, #32 923 b.lt 1f 924 925 add x6, x2, #16 926.irp i, v16.4s, v17.4s, v18.4s, v19.4s 927 ld1 {\i}, [x6] 928 st1 {v31.4s}, [x6], x11 929.endr 930 scale_input .4s, v30.s[0], v16, v17, v18, v19 931 blr x4 932 sqxtn v20.4h, v16.4s 933 sqxtn v21.4h, v17.4s 934 sqxtn v22.4h, v18.4s 935 sqxtn v23.4h, v19.4s 936 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 937 938 b 2f 939 9401: 941.irp i, v20, v21, v22, v23 942 movi \i\().4h, #0 943.endr 944 9452: 946 947.irp i, v16.4s, v17.4s, v18.4s, v19.4s 948 ld1 {\i}, [x2] 949 st1 {v31.4s}, [x2], x11 950.endr 951 scale_input .4s, v30.s[0], v16, v17, v18, v19 952 blr x4 953 sqxtn v16.4h, v16.4s 954 sqxtn v17.4h, v17.4s 955 sqxtn v18.4h, v18.4s 956 sqxtn v19.4h, v19.4s 957 transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 958 959 blr x5 960 961 load_add_store_4x8 x0, x7 962 ret x15 963endfunc 964 965.macro def_fn_48 w, h, txfm1, txfm2, eob_half 966function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 967 mov x15, x30 968 969.ifc \txfm1\()_\txfm2, dct_dct 970 idct_dc \w, \h, 0 971.endif 972 adr x4, inv_\txfm1\()_4s_x\w\()_neon 973.if \w == 4 974 mov w13, #\eob_half 975.endif 976 movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) 977 b inv_txfm_add_\w\()x\h\()_neon 978endfunc 979.endm 980 981.macro def_fns_48 w, h 982def_fn_48 \w, \h, dct, dct, 13 983def_fn_48 \w, \h, identity, identity, 13 984def_fn_48 \w, \h, dct, adst, 13 985def_fn_48 \w, \h, dct, flipadst, 13 986def_fn_48 \w, \h, dct, identity, 4 987def_fn_48 \w, \h, adst, dct, 13 988def_fn_48 \w, \h, adst, adst, 13 989def_fn_48 \w, \h, adst, flipadst, 13 990def_fn_48 \w, \h, flipadst, dct, 13 991def_fn_48 \w, \h, flipadst, adst, 13 992def_fn_48 \w, \h, flipadst, flipadst, 13 993def_fn_48 \w, \h, identity, dct, 16 994def_fn_48 \w, \h, adst, identity, 4 995def_fn_48 \w, \h, flipadst, identity, 4 996def_fn_48 \w, \h, identity, adst, 16 997def_fn_48 \w, \h, identity, flipadst, 16 998.endm 999 1000def_fns_48 4, 8 1001def_fns_48 8, 4 1002 1003 1004function inv_dct_4s_x16_neon 1005 AARCH64_VALID_CALL_TARGET 1006 movrel x16, idct_coeffs 1007 ld1 {v0.4s, v1.4s}, [x16], #32 1008 1009 idct_8 v16, v18, v20, v22, v24, v26, v28, v30 1010 1011 // idct_8 leaves the row_clip_max/min constants in v5 and v4 1012.irp r, v16, v18, v20, v22, v24, v26, v28, v30 1013 smin \r\().4s, \r\().4s, v5.4s 1014.endr 1015.irp r, v16, v18, v20, v22, v24, v26, v28, v30 1016 smax \r\().4s, \r\().4s, v4.4s 1017.endr 1018 1019 ld1 {v0.4s, v1.4s}, [x16] 1020 sub x16, x16, #32 1021 1022 
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a 1023 mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a 1024 mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a 1025 srshr v17.4s, v2.4s, #12 // t8a 1026 srshr v31.4s, v3.4s, #12 // t15a 1027 mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a 1028 mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a 1029 srshr v23.4s, v6.4s, #12 // t9a 1030 srshr v25.4s, v2.4s, #12 // t14a 1031 mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a 1032 mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a 1033 srshr v21.4s, v3.4s, #12 // t10a 1034 srshr v27.4s, v6.4s, #12 // t13a 1035 mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a 1036 srshr v19.4s, v2.4s, #12 // t11a 1037 srshr v29.4s, v3.4s, #12 // t12a 1038 1039 ld1 {v0.4s}, [x16] 1040 1041 sqsub v2.4s, v17.4s, v23.4s // t9 1042 sqadd v17.4s, v17.4s, v23.4s // t8 1043 sqsub v3.4s, v31.4s, v25.4s // t14 1044 sqadd v31.4s, v31.4s, v25.4s // t15 1045 sqsub v23.4s, v19.4s, v21.4s // t10 1046 sqadd v19.4s, v19.4s, v21.4s // t11 1047 sqadd v25.4s, v29.4s, v27.4s // t12 1048 sqsub v29.4s, v29.4s, v27.4s // t13 1049 1050.irp r, v2, v17, v3, v31, v23, v19, v25, v29 1051 smin \r\().4s, \r\().4s, v5.4s 1052.endr 1053.irp r, v2, v17, v3, v31, v23, v19, v25, v29 1054 smax \r\().4s, \r\().4s, v4.4s 1055.endr 1056 1057 mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a 1058 mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a 1059 srshr v21.4s, v7.4s, #12 // t9a 1060 srshr v27.4s, v6.4s, #12 // t14a 1061 1062 mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a 1063 mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a 1064 srshr v29.4s, v7.4s, #12 // t13a 1065 neg v6.4s, v6.4s 1066 srshr v23.4s, v6.4s, #12 // t10a 1067 1068 sqsub v2.4s, v17.4s, v19.4s // t11a 1069 sqadd v17.4s, v17.4s, v19.4s // t8a 1070 sqsub v3.4s, v31.4s, v25.4s // t12a 1071 sqadd v31.4s, v31.4s, v25.4s // t15a 1072 sqadd v19.4s, v21.4s, v23.4s // t9 1073 sqsub v21.4s, v21.4s, v23.4s // t10 1074 sqsub v25.4s, v27.4s, v29.4s // t13 1075 sqadd v27.4s, v27.4s, v29.4s // t14 1076 1077.irp r, v2, v17, v3, v31, v19, v21, v25, v27 1078 smin \r\().4s, \r\().4s, v5.4s 1079.endr 1080.irp r, v2, v17, v3, v31, v19, v21, v25, v27 1081 smax \r\().4s, \r\().4s, v4.4s 1082.endr 1083 1084 mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 1085 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 1086 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a 1087 1088 srshr v7.4s, v7.4s, #12 // t11 1089 srshr v6.4s, v6.4s, #12 // t12 1090 mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a 1091 srshr v2.4s, v2.4s, #12 // t10a 1092 srshr v3.4s, v3.4s, #12 // t13a 1093 1094 sqadd v1.4s, v16.4s, v31.4s // out0 1095 sqsub v31.4s, v16.4s, v31.4s // out15 1096 mov v16.16b, v1.16b 1097 sqadd v23.4s, v30.4s, v17.4s // out7 1098 sqsub v1.4s, v30.4s, v17.4s // out8 1099 sqadd v17.4s, v18.4s, v27.4s // out1 1100 sqsub v30.4s, v18.4s, v27.4s // out14 1101 sqadd v18.4s, v20.4s, v3.4s // out2 1102 sqsub v29.4s, v20.4s, v3.4s // out13 1103 sqadd v3.4s, v28.4s, v19.4s // out6 1104 sqsub v25.4s, v28.4s, v19.4s // out9 1105 sqadd v19.4s, v22.4s, v6.4s // out3 1106 sqsub v28.4s, v22.4s, v6.4s // out12 1107 sqadd v20.4s, v24.4s, v7.4s // out4 1108 sqsub v27.4s, v24.4s, v7.4s // out11 1109 sqadd v21.4s, v26.4s, v2.4s // out5 1110 sqsub v26.4s, v26.4s, v2.4s // out10 1111 mov v24.16b, v1.16b 1112 mov v22.16b, v3.16b 1113 1114 ret 1115endfunc 1116 1117.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 1118 movrel x16, iadst16_coeffs 1119 ld1 {v0.4s, v1.4s}, [x16], #32 1120 1121 mul_mla v2, v31, v16, 
v0.s[0], v0.s[1] // -> t0 1122 mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 1123 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 1124 srshr v16.4s, v2.4s, #12 // t0 1125 srshr v31.4s, v4.4s, #12 // t1 1126 mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 1127 mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 1128 srshr v18.4s, v6.4s, #12 // t2 1129 srshr v29.4s, v2.4s, #12 // t3 1130 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 1131 mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 1132 srshr v20.4s, v4.4s, #12 // t4 1133 srshr v27.4s, v6.4s, #12 // t5 1134 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 1135 ld1 {v0.4s, v1.4s}, [x16] 1136 movrel x16, idct_coeffs 1137 mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 1138 srshr v22.4s, v2.4s, #12 // t6 1139 srshr v25.4s, v4.4s, #12 // t7 1140 mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 1141 mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 1142 srshr v23.4s, v6.4s, #12 // t8 1143 srshr v24.4s, v2.4s, #12 // t9 1144 mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 1145 mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 1146 srshr v21.4s, v4.4s, #12 // t10 1147 srshr v26.4s, v6.4s, #12 // t11 1148 mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 1149 mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 1150 srshr v19.4s, v2.4s, #12 // t12 1151 srshr v28.4s, v4.4s, #12 // t13 1152 mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 1153 srshr v17.4s, v6.4s, #12 // t14 1154 srshr v30.4s, v2.4s, #12 // t15 1155 1156 ld1 {v0.4s, v1.4s}, [x16] 1157 1158 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 1159 mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 1160 1161 sqsub v2.4s, v16.4s, v23.4s // t8a 1162 sqadd v16.4s, v16.4s, v23.4s // t0a 1163 sqsub v3.4s, v31.4s, v24.4s // t9a 1164 sqadd v31.4s, v31.4s, v24.4s // t1a 1165 sqadd v23.4s, v18.4s, v21.4s // t2a 1166 sqsub v18.4s, v18.4s, v21.4s // t10a 1167 sqadd v24.4s, v29.4s, v26.4s // t3a 1168 sqsub v29.4s, v29.4s, v26.4s // t11a 1169 sqadd v21.4s, v20.4s, v19.4s // t4a 1170 sqsub v20.4s, v20.4s, v19.4s // t12a 1171 sqadd v26.4s, v27.4s, v28.4s // t5a 1172 sqsub v27.4s, v27.4s, v28.4s // t13a 1173 sqadd v19.4s, v22.4s, v17.4s // t6a 1174 sqsub v22.4s, v22.4s, v17.4s // t14a 1175 sqadd v28.4s, v25.4s, v30.4s // t7a 1176 sqsub v25.4s, v25.4s, v30.4s // t15a 1177 1178.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 1179 smin_4s \r, \r, v5 1180.endr 1181.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 1182 smax_4s \r, \r, v7 1183.endr 1184 1185 mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 1186 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 1187 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 1188 srshr v17.4s, v4.4s, #12 // t8 1189 srshr v30.4s, v6.4s, #12 // t9 1190 mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 1191 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 1192 srshr v18.4s, v2.4s, #12 // t10 1193 srshr v29.4s, v4.4s, #12 // t11 1194 mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 1195 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 1196 srshr v27.4s, v6.4s, #12 // t12 1197 srshr v20.4s, v2.4s, #12 // t13 1198 mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 1199 srshr v25.4s, v4.4s, #12 // t14 1200 srshr v22.4s, v6.4s, #12 // t15 1201 1202 sqsub v2.4s, v16.4s, v21.4s // t4 1203 sqadd v16.4s, v16.4s, v21.4s // t0 1204 sqsub v3.4s, v31.4s, v26.4s // t5 1205 sqadd v31.4s, v31.4s, v26.4s // t1 1206 sqadd v21.4s, v23.4s, v19.4s // t2 1207 sqsub v23.4s, v23.4s, v19.4s // t6 1208 sqadd 
v26.4s, v24.4s, v28.4s // t3 1209 sqsub v24.4s, v24.4s, v28.4s // t7 1210 sqadd v19.4s, v17.4s, v27.4s // t8a 1211 sqsub v17.4s, v17.4s, v27.4s // t12a 1212 sqadd v28.4s, v30.4s, v20.4s // t9a 1213 sqsub v30.4s, v30.4s, v20.4s // t13a 1214 sqadd v27.4s, v18.4s, v25.4s // t10a 1215 sqsub v18.4s, v18.4s, v25.4s // t14a 1216 sqadd v20.4s, v29.4s, v22.4s // t11a 1217 sqsub v29.4s, v29.4s, v22.4s // t15a 1218 1219.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 1220 smin_4s \r, \r, v5 1221.endr 1222.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 1223 smax_4s \r, \r, v7 1224.endr 1225 1226 mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a 1227 mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a 1228 mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a 1229 srshr v22.4s, v4.4s, #12 // t4a 1230 srshr v25.4s, v6.4s, #12 // t5a 1231 mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a 1232 mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 1233 srshr v24.4s, v2.4s, #12 // t6a 1234 srshr v23.4s, v4.4s, #12 // t7a 1235 mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 1236 mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 1237 srshr v17.4s, v6.4s, #12 // t12 1238 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 1239 srshr v29.4s, v2.4s, #12 // t13 1240 srshr v30.4s, v4.4s, #12 // t14 1241 srshr v18.4s, v6.4s, #12 // t15 1242 1243 sqsub v2.4s, v16.4s, v21.4s // t2a 1244.ifc \o0, v16 1245 sqadd \o0\().4s, v16.4s, v21.4s // out0 1246 sqsub v21.4s, v31.4s, v26.4s // t3a 1247 sqadd \o15\().4s, v31.4s, v26.4s // out15 1248.else 1249 sqadd v4.4s, v16.4s, v21.4s // out0 1250 sqsub v21.4s, v31.4s, v26.4s // t3a 1251 sqadd \o15\().4s, v31.4s, v26.4s // out15 1252 mov \o0\().16b, v4.16b 1253.endif 1254 1255 sqsub v3.4s, v29.4s, v18.4s // t15a 1256 sqadd \o13\().4s, v29.4s, v18.4s // out13 1257 sqadd \o2\().4s, v17.4s, v30.4s // out2 1258 sqsub v26.4s, v17.4s, v30.4s // t14a 1259 1260 sqadd \o1\().4s, v19.4s, v27.4s // out1 1261 sqsub v27.4s, v19.4s, v27.4s // t10 1262 sqadd \o14\().4s, v28.4s, v20.4s // out14 1263 sqsub v20.4s, v28.4s, v20.4s // t11 1264 1265 sqadd \o3\().4s, v22.4s, v24.4s // out3 1266 sqsub v22.4s, v22.4s, v24.4s // t6 1267 sqadd \o12\().4s, v25.4s, v23.4s // out12 1268 sqsub v23.4s, v25.4s, v23.4s // t7 1269 1270 // Not clipping the output registers, as they will be downshifted and 1271 // narrowed afterwards anyway. 
1272.irp r, v2, v21, v3, v26, v27, v20, v22, v23 1273 smin_4s \r, \r, v5 1274.endr 1275.irp r, v2, v21, v3, v26, v27, v20, v22, v23 1276 smax_4s \r, \r, v7 1277.endr 1278 1279 sqneg \o15\().4s, \o15\().4s // out15 1280 sqneg \o13\().4s, \o13\().4s // out13 1281 sqneg \o1\().4s, \o1\().4s // out1 1282 sqneg \o3\().4s, \o3\().4s // out3 1283 1284 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) 1285 mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) 1286 mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) 1287 1288 srshr v24.4s, v24.4s, #12 // out8 1289 srshr v4.4s, v4.4s, #12 // out7 1290 srshr v5.4s, v6.4s, #12 // out5 1291 mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) 1292 mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) 1293 srshr v26.4s, v6.4s, #12 // out10 1294 1295 mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) 1296 mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) 1297 mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) 1298 1299 srshr \o4\().4s, v2.4s, #12 // out4 1300 srshr v6.4s, v6.4s, #12 // out11 1301 srshr v7.4s, v21.4s, #12 // out9 1302 srshr \o6\().4s, v22.4s, #12 // out6 1303 1304.ifc \o8, v23 1305 mov \o8\().16b, v24.16b 1306 mov \o10\().16b, v26.16b 1307.endif 1308 1309 sqneg \o7\().4s, v4.4s // out7 1310 sqneg \o5\().4s, v5.4s // out5 1311 sqneg \o11\().4s, v6.4s // out11 1312 sqneg \o9\().4s, v7.4s // out9 1313.endm 1314 1315function inv_adst_4s_x16_neon 1316 AARCH64_VALID_CALL_TARGET 1317 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 1318 ret 1319endfunc 1320 1321function inv_flipadst_4s_x16_neon 1322 AARCH64_VALID_CALL_TARGET 1323 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 1324 ret 1325endfunc 1326 1327function inv_identity_4s_x16_neon 1328 AARCH64_VALID_CALL_TARGET 1329 movz w16, #2*(5793-4096)*8, lsl #16 1330 dup v0.2s, w16 1331.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1332 sqrdmulh v2.4s, v\i\().4s, v0.s[0] 1333 sqadd v\i\().4s, v\i\().4s, v\i\().4s 1334 sqadd v\i\().4s, v\i\().4s, v2.4s 1335.endr 1336 ret 1337endfunc 1338 1339.macro identity_4x16_shift1 c 1340.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1341 sqrdmulh v3.4s, \i, \c 1342 srshr v3.4s, v3.4s, #1 1343 sqadd \i, \i, v3.4s 1344.endr 1345.endm 1346 1347.macro identity_4x16 c 1348.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1349 sqrdmulh v3.4s, \i, \c 1350 sqadd \i, \i, \i 1351 sqadd \i, \i, v3.4s 1352.endr 1353.endm 1354 1355.macro def_horz_16 scale=0, shift=2, suffix 1356function inv_txfm_horz\suffix\()_16x4_neon 1357 mov x14, x30 1358 movi v7.4s, #0 1359.if \scale 1360 movz w16, #2896*8, lsl #16 1361 dup v0.2s, w16 1362.endif 1363.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1364 ld1 {\i}, [x7] 1365 st1 {v7.4s}, [x7], x8 1366.endr 1367.if \scale 1368 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1369 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 1370.endif 1371 blr x4 1372 sqrshrn v16.4h, v16.4s, #\shift 1373 sqrshrn v17.4h, v17.4s, #\shift 1374 sqrshrn v18.4h, v18.4s, #\shift 1375 sqrshrn v19.4h, v19.4s, #\shift 1376 sqrshrn2 v16.8h, v20.4s, #\shift 1377 sqrshrn2 v17.8h, v21.4s, 
#\shift 1378 sqrshrn2 v18.8h, v22.4s, #\shift 1379 sqrshrn2 v19.8h, v23.4s, #\shift 1380 sqrshrn v20.4h, v24.4s, #\shift 1381 sqrshrn v21.4h, v25.4s, #\shift 1382 sqrshrn v22.4h, v26.4s, #\shift 1383 sqrshrn v23.4h, v27.4s, #\shift 1384 sqrshrn2 v20.8h, v28.4s, #\shift 1385 sqrshrn2 v21.8h, v29.4s, #\shift 1386 sqrshrn2 v22.8h, v30.4s, #\shift 1387 sqrshrn2 v23.8h, v31.4s, #\shift 1388.if \scale 1389 b L(horz_16x4_epilog) 1390.else 1391L(horz_16x4_epilog): 1392 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 1393 transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 1394 1395.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h 1396 st1 {\i}, [x6], #16 1397.endr 1398 1399 ret x14 1400.endif 1401endfunc 1402.endm 1403 1404def_horz_16 scale=1, shift=1, suffix=_scale 1405def_horz_16 scale=0, shift=2 1406 1407function inv_txfm_add_vert_8x16_neon 1408 mov x14, x30 1409.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1410 ld1 {v\i\().8h}, [x7], x8 1411.endr 1412 blr x5 1413 load_add_store_8x16 x6, x7 1414 ret x14 1415endfunc 1416 1417function inv_txfm_add_16x16_neon 1418 mov x15, x30 1419 sub sp, sp, #512 1420 ldrh w12, [x13], #2 1421.irp i, 0, 4, 8, 12 1422 add x6, sp, #(\i*16*2) 1423.if \i > 0 1424 mov w8, #(16 - \i) 1425 cmp w3, w12 1426 b.lt 1f 1427.if \i < 12 1428 ldrh w12, [x13], #2 1429.endif 1430.endif 1431 add x7, x2, #(\i*4) 1432 mov x8, #16*4 1433 bl inv_txfm_horz_16x4_neon 1434.endr 1435 b 3f 14361: 1437 movi v4.8h, #0 1438 movi v5.8h, #0 1439 movi v6.8h, #0 1440 movi v7.8h, #0 14412: 1442 subs w8, w8, #4 1443.rept 2 1444 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 1445.endr 1446 b.gt 2b 14473: 1448.irp i, 0, 8 1449 add x6, x0, #(\i*2) 1450 add x7, sp, #(\i*2) 1451 mov x8, #32 1452 bl inv_txfm_add_vert_8x16_neon 1453.endr 1454 1455 add sp, sp, #512 1456 ret x15 1457endfunc 1458 1459const eob_16x16 1460 .short 10, 36, 78, 256 1461endconst 1462 1463const eob_16x16_identity 1464 .short 4, 8, 12, 256 1465endconst 1466 1467.macro def_fn_16x16 txfm1, txfm2 1468function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 1469.ifc \txfm1\()_\txfm2, dct_dct 1470 idct_dc 16, 16, 2 1471.endif 1472 adr x4, inv_\txfm1\()_4s_x16_neon 1473 movrel x5, X(inv_\txfm2\()_8h_x16_neon) 1474.ifc \txfm1, identity 1475.ifc \txfm2, identity 1476 movrel x13, eob_16x16 1477.else 1478 movrel x13, eob_16x16_identity 1479.endif 1480.else 1481.ifc \txfm2, identity 1482 movrel x13, eob_16x16_identity 1483.else 1484 movrel x13, eob_16x16 1485.endif 1486.endif 1487 b inv_txfm_add_16x16_neon 1488endfunc 1489.endm 1490 1491def_fn_16x16 dct, dct 1492def_fn_16x16 identity, identity 1493def_fn_16x16 dct, adst 1494def_fn_16x16 dct, flipadst 1495def_fn_16x16 dct, identity 1496def_fn_16x16 adst, dct 1497def_fn_16x16 adst, adst 1498def_fn_16x16 adst, flipadst 1499def_fn_16x16 flipadst, dct 1500def_fn_16x16 flipadst, adst 1501def_fn_16x16 flipadst, flipadst 1502def_fn_16x16 identity, dct 1503 1504function inv_txfm_add_16x4_neon 1505 mov x15, x30 1506 movi v4.4s, #0 1507 1508.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1509 ld1 {\i}, [x2] 1510 st1 {v4.4s}, [x2], #16 1511.endr 1512 1513 blr x4 1514 1515 sqrshrn v16.4h, v16.4s, #1 1516 sqrshrn v17.4h, v17.4s, #1 1517 sqrshrn v18.4h, v18.4s, #1 1518 sqrshrn v19.4h, v19.4s, #1 1519 sqrshrn2 v16.8h, v20.4s, #1 1520 sqrshrn2 v17.8h, v21.4s, #1 1521 sqrshrn2 v18.8h, v22.4s, #1 1522 sqrshrn2 v19.8h, v23.4s, #1 1523 transpose_4x8h v16, v17, v18, v19, v2, 
v3, v4, v5 1524 blr x5 1525 mov x6, x0 1526 load_add_store_8x4 x6, x7 1527 1528 sqrshrn v16.4h, v24.4s, #1 1529 sqrshrn v17.4h, v25.4s, #1 1530 sqrshrn v18.4h, v26.4s, #1 1531 sqrshrn v19.4h, v27.4s, #1 1532 sqrshrn2 v16.8h, v28.4s, #1 1533 sqrshrn2 v17.8h, v29.4s, #1 1534 sqrshrn2 v18.8h, v30.4s, #1 1535 sqrshrn2 v19.8h, v31.4s, #1 1536 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 1537 blr x5 1538 add x6, x0, #16 1539 load_add_store_8x4 x6, x7 1540 1541 ret x15 1542endfunc 1543 1544function inv_txfm_add_4x16_neon 1545 ldrh w12, [x13, #4] 1546 mov x15, x30 1547 1548 mov x11, #64 1549 1550 cmp w3, w12 1551 ldrh w12, [x13, #2] 1552 b.lt 1f 1553 1554 add x6, x2, #48 1555 movi v2.4s, #0 1556.irp i, v16.4s, v17.4s, v18.4s, v19.4s 1557 ld1 {\i}, [x6] 1558 st1 {v2.4s}, [x6], x11 1559.endr 1560 blr x4 1561 sqrshrn v28.4h, v16.4s, #1 1562 sqrshrn v29.4h, v17.4s, #1 1563 sqrshrn v30.4h, v18.4s, #1 1564 sqrshrn v31.4h, v19.4s, #1 1565 transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 1566 1567 b 2f 15681: 1569.irp i, v28.4h, v29.4h, v30.4h, v31.4h 1570 movi \i, #0 1571.endr 15722: 1573 cmp w3, w12 1574 ldrh w12, [x13, #0] 1575 b.lt 1f 1576 1577 add x6, x2, #32 1578 movi v2.4s, #0 1579.irp i, v16.4s, v17.4s, v18.4s, v19.4s 1580 ld1 {\i}, [x6] 1581 st1 {v2.4s}, [x6], x11 1582.endr 1583 blr x4 1584 sqrshrn v24.4h, v16.4s, #1 1585 sqrshrn v25.4h, v17.4s, #1 1586 sqrshrn v26.4h, v18.4s, #1 1587 sqrshrn v27.4h, v19.4s, #1 1588 transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 1589 1590 b 2f 15911: 1592.irp i, v24.4h, v25.4h, v26.4h, v27.4h 1593 movi \i, #0 1594.endr 15952: 1596 cmp w3, w12 1597 b.lt 1f 1598 1599 add x6, x2, #16 1600 movi v2.4s, #0 1601.irp i, v16.4s, v17.4s, v18.4s, v19.4s 1602 ld1 {\i}, [x6] 1603 st1 {v2.4s}, [x6], x11 1604.endr 1605 blr x4 1606 sqrshrn v20.4h, v16.4s, #1 1607 sqrshrn v21.4h, v17.4s, #1 1608 sqrshrn v22.4h, v18.4s, #1 1609 sqrshrn v23.4h, v19.4s, #1 1610 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 1611 1612 b 2f 16131: 1614.irp i, v20.4h, v21.4h, v22.4h, v23.4h 1615 movi \i, #0 1616.endr 16172: 1618 1619 movi v2.4s, #0 1620.irp i, v16.4s, v17.4s, v18.4s, v19.4s 1621 ld1 {\i}, [x2] 1622 st1 {v2.4s}, [x2], x11 1623.endr 1624 blr x4 1625 sqrshrn v16.4h, v16.4s, #1 1626 sqrshrn v17.4h, v17.4s, #1 1627 sqrshrn v18.4h, v18.4s, #1 1628 sqrshrn v19.4h, v19.4s, #1 1629 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 1630 1631 blr x5 1632 1633 load_add_store_4x16 x0, x6 1634 1635 ret x15 1636endfunc 1637 1638const eob_4x16 1639 .short 13, 29, 45, 64 1640endconst 1641 1642const eob_4x16_identity1 1643 .short 16, 32, 48, 64 1644endconst 1645 1646const eob_4x16_identity2 1647 .short 4, 8, 12, 64 1648endconst 1649 1650.macro def_fn_416 w, h, txfm1, txfm2 1651function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 1652.ifc \txfm1\()_\txfm2, dct_dct 1653 idct_dc \w, \h, 1 1654.endif 1655.if \w == 4 1656 adr x4, inv_\txfm1\()_4s_x\w\()_neon 1657 movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) 1658.ifc \txfm1, identity 1659.ifc \txfm2, identity 1660 movrel x13, eob_4x16 1661.else 1662 movrel x13, eob_4x16_identity1 1663.endif 1664.else 1665.ifc \txfm2, identity 1666 movrel x13, eob_4x16_identity2 1667.else 1668 movrel x13, eob_4x16 1669.endif 1670.endif 1671.else 1672 adr x4, inv_\txfm1\()_4s_x\w\()_neon 1673 movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) 1674.endif 1675 b inv_txfm_add_\w\()x\h\()_neon 1676endfunc 1677.endm 1678 1679.macro def_fns_416 w, h 1680def_fn_416 \w, \h, dct, dct 1681def_fn_416 \w, \h, identity, identity 1682def_fn_416 \w, \h, dct, adst 
1683def_fn_416 \w, \h, dct, flipadst 1684def_fn_416 \w, \h, dct, identity 1685def_fn_416 \w, \h, adst, dct 1686def_fn_416 \w, \h, adst, adst 1687def_fn_416 \w, \h, adst, flipadst 1688def_fn_416 \w, \h, flipadst, dct 1689def_fn_416 \w, \h, flipadst, adst 1690def_fn_416 \w, \h, flipadst, flipadst 1691def_fn_416 \w, \h, identity, dct 1692def_fn_416 \w, \h, adst, identity 1693def_fn_416 \w, \h, flipadst, identity 1694def_fn_416 \w, \h, identity, adst 1695def_fn_416 \w, \h, identity, flipadst 1696.endm 1697 1698def_fns_416 4, 16 1699def_fns_416 16, 4 1700 1701 1702function inv_txfm_add_16x8_neon 1703 mov x15, x30 1704 stp d8, d9, [sp, #-0x40]! 1705 stp d10, d11, [sp, #0x10] 1706 stp d12, d13, [sp, #0x20] 1707 stp d14, d15, [sp, #0x30] 1708 1709 cmp w3, w13 1710 mov x11, #32 1711 b.lt 1f 1712 1713 movi v4.4s, #0 1714 movz w16, #2896*8, lsl #16 1715 dup v0.2s, w16 1716 1717 add x6, x2, #16 1718.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1719 ld1 {\i}, [x6] 1720 st1 {v4.4s}, [x6], x11 1721.endr 1722 1723 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1724 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 1725 blr x4 1726 1727 sqrshrn v8.4h, v16.4s, #1 1728 sqrshrn v9.4h, v17.4s, #1 1729 sqrshrn v10.4h, v18.4s, #1 1730 sqrshrn v11.4h, v19.4s, #1 1731 sqrshrn2 v8.8h, v20.4s, #1 1732 sqrshrn2 v9.8h, v21.4s, #1 1733 sqrshrn2 v10.8h, v22.4s, #1 1734 sqrshrn2 v11.8h, v23.4s, #1 1735 sqrshrn v12.4h, v24.4s, #1 1736 sqrshrn v13.4h, v25.4s, #1 1737 sqrshrn v14.4h, v26.4s, #1 1738 sqrshrn v15.4h, v27.4s, #1 1739 sqrshrn2 v12.8h, v28.4s, #1 1740 sqrshrn2 v13.8h, v29.4s, #1 1741 sqrshrn2 v14.8h, v30.4s, #1 1742 sqrshrn2 v15.8h, v31.4s, #1 1743 1744 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 1745 transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 1746 1747 b 2f 17481: 1749.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h 1750 movi \i, #0 1751.endr 17522: 1753 movz w16, #2896*8, lsl #16 1754 dup v0.2s, w16 1755 1756 movi v4.4s, #0 1757.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1758 ld1 {\i}, [x2] 1759 st1 {v4.4s}, [x2], x11 1760.endr 1761 1762 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1763 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 1764 blr x4 1765 1766 sqrshrn v16.4h, v16.4s, #1 1767 sqrshrn v17.4h, v17.4s, #1 1768 sqrshrn v18.4h, v18.4s, #1 1769 sqrshrn v19.4h, v19.4s, #1 1770 sqrshrn2 v16.8h, v20.4s, #1 1771 sqrshrn2 v17.8h, v21.4s, #1 1772 sqrshrn2 v18.8h, v22.4s, #1 1773 sqrshrn2 v19.8h, v23.4s, #1 1774 1775 mov v20.16b, v8.16b 1776 mov v21.16b, v9.16b 1777 mov v22.16b, v10.16b 1778 mov v23.16b, v11.16b 1779 1780 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 1781 1782 sqrshrn v8.4h, v24.4s, #1 1783 sqrshrn v9.4h, v25.4s, #1 1784 sqrshrn v10.4h, v26.4s, #1 1785 sqrshrn v11.4h, v27.4s, #1 1786 sqrshrn2 v8.8h, v28.4s, #1 1787 sqrshrn2 v9.8h, v29.4s, #1 1788 sqrshrn2 v10.8h, v30.4s, #1 1789 sqrshrn2 v11.8h, v31.4s, #1 1790 1791 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 1792 1793 blr x5 1794 1795 mov x6, x0 1796 load_add_store_8x8 x6, x7 1797 1798 mov v16.16b, v8.16b 1799 mov v17.16b, v9.16b 1800 mov v18.16b, v10.16b 1801 mov v19.16b, v11.16b 1802 mov v20.16b, v12.16b 1803 mov v21.16b, v13.16b 1804 mov v22.16b, v14.16b 1805 mov v23.16b, v15.16b 1806 1807 blr x5 1808 1809 add x0, x0, #16 1810 load_add_store_8x8 x0, x7 1811 
1812 ldp d14, d15, [sp, #0x30] 1813 ldp d12, d13, [sp, #0x20] 1814 ldp d10, d11, [sp, #0x10] 1815 ldp d8, d9, [sp], 0x40 1816 ret x15 1817endfunc 1818 1819function inv_txfm_add_8x16_neon 1820 mov x15, x30 1821 stp d8, d9, [sp, #-0x20]! 1822 stp d10, d11, [sp, #0x10] 1823 ldrh w12, [x13, #4] 1824 1825 mov x11, #64 1826 1827 cmp w3, w12 1828 ldrh w12, [x13, #2] 1829 b.lt 1f 1830 1831 add x6, x2, #48 1832 movi v4.4s, #0 1833 movz w16, #2896*8, lsl #16 1834 dup v0.2s, w16 1835.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 1836 ld1 {\i}, [x6] 1837 st1 {v4.4s}, [x6], x11 1838.endr 1839 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1840 blr x4 1841 1842 sqrshrn v28.4h, v16.4s, #1 1843 sqrshrn v29.4h, v17.4s, #1 1844 sqrshrn v30.4h, v18.4s, #1 1845 sqrshrn v31.4h, v19.4s, #1 1846 sqrshrn2 v28.8h, v20.4s, #1 1847 sqrshrn2 v29.8h, v21.4s, #1 1848 sqrshrn2 v30.8h, v22.4s, #1 1849 sqrshrn2 v31.8h, v23.4s, #1 1850 transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 1851 1852 b 2f 1853 18541: 1855.irp i, v28.8h, v29.8h, v30.8h, v31.8h 1856 movi \i, #0 1857.endr 1858 18592: 1860 cmp w3, w12 1861 ldrh w12, [x13, #0] 1862 b.lt 1f 1863 1864 add x6, x2, #32 1865 movi v4.4s, #0 1866 movz w16, #2896*8, lsl #16 1867 dup v0.2s, w16 1868.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 1869 ld1 {\i}, [x6] 1870 st1 {v4.4s}, [x6], x11 1871.endr 1872 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1873 blr x4 1874 1875 sqrshrn v24.4h, v16.4s, #1 1876 sqrshrn v25.4h, v17.4s, #1 1877 sqrshrn v26.4h, v18.4s, #1 1878 sqrshrn v27.4h, v19.4s, #1 1879 sqrshrn2 v24.8h, v20.4s, #1 1880 sqrshrn2 v25.8h, v21.4s, #1 1881 sqrshrn2 v26.8h, v22.4s, #1 1882 sqrshrn2 v27.8h, v23.4s, #1 1883 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 1884 1885 b 2f 1886 18871: 1888.irp i, v24.8h, v25.8h, v26.8h, v27.8h 1889 movi \i, #0 1890.endr 1891 18922: 1893 cmp w3, w12 1894 b.lt 1f 1895 1896 add x6, x2, #16 1897 movi v4.4s, #0 1898 movz w16, #2896*8, lsl #16 1899 dup v0.2s, w16 1900.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 1901 ld1 {\i}, [x6] 1902 st1 {v4.4s}, [x6], x11 1903.endr 1904 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1905 blr x4 1906 1907 sqrshrn v8.4h, v16.4s, #1 1908 sqrshrn v9.4h, v17.4s, #1 1909 sqrshrn v10.4h, v18.4s, #1 1910 sqrshrn v11.4h, v19.4s, #1 1911 sqrshrn2 v8.8h, v20.4s, #1 1912 sqrshrn2 v9.8h, v21.4s, #1 1913 sqrshrn2 v10.8h, v22.4s, #1 1914 sqrshrn2 v11.8h, v23.4s, #1 1915 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 1916 1917 b 2f 1918 19191: 1920.irp i, v8.8h, v9.8h, v10.8h, v11.8h 1921 movi \i, #0 1922.endr 1923 19242: 1925 movi v4.4s, #0 1926 movz w16, #2896*8, lsl #16 1927 dup v0.2s, w16 1928.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 1929 ld1 {\i}, [x2] 1930 st1 {v4.4s}, [x2], x11 1931.endr 1932 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 1933 blr x4 1934 1935 sqrshrn v16.4h, v16.4s, #1 1936 sqrshrn v17.4h, v17.4s, #1 1937 sqrshrn v18.4h, v18.4s, #1 1938 sqrshrn v19.4h, v19.4s, #1 1939 sqrshrn2 v16.8h, v20.4s, #1 1940 sqrshrn2 v17.8h, v21.4s, #1 1941 sqrshrn2 v18.8h, v22.4s, #1 1942 sqrshrn2 v19.8h, v23.4s, #1 1943 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 1944 1945 mov v20.16b, v8.16b 1946 mov v21.16b, v9.16b 1947 mov v22.16b, v10.16b 1948 mov v23.16b, v11.16b 1949 1950 blr x5 1951 1952 load_add_store_8x16 x0, x6 1953 1954 ldp d10, d11, [sp, #0x10] 1955 ldp d8, d9, [sp], 0x20 1956 1957 ret x15 1958endfunc 1959 1960const 
eob_8x16 1961 .short 10, 43, 75, 128 1962endconst 1963 1964const eob_8x16_identity1 1965 .short 4, 64, 96, 128 1966endconst 1967 1968const eob_8x16_identity2 1969 .short 4, 8, 12, 128 1970endconst 1971 1972.macro def_fn_816 w, h, txfm1, txfm2 1973function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 1974.ifc \txfm1\()_\txfm2, dct_dct 1975 idct_dc \w, \h, 1 1976.endif 1977 adr x4, inv_\txfm1\()_4s_x\w\()_neon 1978 movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) 1979.ifc \txfm1, identity 1980.ifc \txfm2, identity 1981 movrel x13, eob_8x16 1982.else 1983 movrel x13, eob_8x16_identity1 1984.endif 1985.else 1986.ifc \txfm2, identity 1987 movrel x13, eob_8x16_identity2 1988.else 1989 movrel x13, eob_8x16 1990.endif 1991.endif 1992.if \h == 8 1993 ldrh w13, [x13] 1994.endif 1995 b inv_txfm_add_\w\()x\h\()_neon 1996endfunc 1997.endm 1998 1999.macro def_fns_816 w, h 2000def_fn_816 \w, \h, dct, dct 2001def_fn_816 \w, \h, identity, identity 2002def_fn_816 \w, \h, dct, adst 2003def_fn_816 \w, \h, dct, flipadst 2004def_fn_816 \w, \h, dct, identity 2005def_fn_816 \w, \h, adst, dct 2006def_fn_816 \w, \h, adst, adst 2007def_fn_816 \w, \h, adst, flipadst 2008def_fn_816 \w, \h, flipadst, dct 2009def_fn_816 \w, \h, flipadst, adst 2010def_fn_816 \w, \h, flipadst, flipadst 2011def_fn_816 \w, \h, identity, dct 2012def_fn_816 \w, \h, adst, identity 2013def_fn_816 \w, \h, flipadst, identity 2014def_fn_816 \w, \h, identity, adst 2015def_fn_816 \w, \h, identity, flipadst 2016.endm 2017 2018def_fns_816 8, 16 2019def_fns_816 16, 8 2020 2021function inv_dct32_odd_4s_x16_neon 2022 movrel x16, idct_coeffs, 4*16 2023 ld1 {v0.4s, v1.4s}, [x16], #32 2024 2025 mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a 2026 mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a 2027 mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a 2028 srshr v16.4s, v2.4s, #12 // t16a 2029 srshr v31.4s, v4.4s, #12 // t31a 2030 mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a 2031 mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a 2032 srshr v24.4s, v6.4s, #12 // t17a 2033 srshr v23.4s, v2.4s, #12 // t30a 2034 mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a 2035 mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a 2036 srshr v20.4s, v4.4s, #12 // t18a 2037 srshr v27.4s, v6.4s, #12 // t29a 2038 mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a 2039 ld1 {v0.4s, v1.4s}, [x16] 2040 sub x16, x16, #4*24 2041 mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a 2042 srshr v28.4s, v2.4s, #12 // t19a 2043 srshr v19.4s, v4.4s, #12 // t28a 2044 mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a 2045 mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a 2046 srshr v18.4s, v6.4s, #12 // t20a 2047 srshr v29.4s, v2.4s, #12 // t27a 2048 mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a 2049 mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a 2050 srshr v26.4s, v4.4s, #12 // t21a 2051 srshr v21.4s, v6.4s, #12 // t26a 2052 mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a 2053 mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a 2054 srshr v22.4s, v2.4s, #12 // t22a 2055 srshr v25.4s, v4.4s, #12 // t25a 2056 mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a 2057 srshr v30.4s, v6.4s, #12 // t23a 2058 srshr v17.4s, v2.4s, #12 // t24a 2059 2060 ld1 {v0.4s, v1.4s}, [x16] 2061 2062 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 2063 mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 2064 2065 sqsub v2.4s, v16.4s, v24.4s // t17 2066 sqadd v16.4s, v16.4s, v24.4s // t16 2067 sqsub v3.4s, v31.4s, v23.4s // t30 2068 sqadd v31.4s, v31.4s, v23.4s 
// t31 2069 sqsub v24.4s, v28.4s, v20.4s // t18 2070 sqadd v28.4s, v28.4s, v20.4s // t19 2071 sqadd v23.4s, v18.4s, v26.4s // t20 2072 sqsub v18.4s, v18.4s, v26.4s // t21 2073 sqsub v20.4s, v30.4s, v22.4s // t22 2074 sqadd v30.4s, v30.4s, v22.4s // t23 2075 sqadd v26.4s, v17.4s, v25.4s // t24 2076 sqsub v17.4s, v17.4s, v25.4s // t25 2077 sqsub v22.4s, v29.4s, v21.4s // t26 2078 sqadd v29.4s, v29.4s, v21.4s // t27 2079 sqadd v25.4s, v19.4s, v27.4s // t28 2080 sqsub v19.4s, v19.4s, v27.4s // t29 2081 2082.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 2083 smin \r\().4s, \r\().4s, v5.4s 2084.endr 2085.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 2086 smax \r\().4s, \r\().4s, v4.4s 2087.endr 2088 2089 mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a 2090 mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a 2091 mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a 2092 srshr v21.4s, v7.4s, #12 // t17a 2093 srshr v27.4s, v6.4s, #12 // t30a 2094 neg v2.4s, v2.4s // -> t18a 2095 mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a 2096 mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a 2097 srshr v19.4s, v2.4s, #12 // t18a 2098 srshr v24.4s, v7.4s, #12 // t29a 2099 mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a 2100 mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a 2101 srshr v22.4s, v6.4s, #12 // t21a 2102 srshr v18.4s, v2.4s, #12 // t26a 2103 neg v7.4s, v7.4s // -> t22a 2104 mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a 2105 srshr v17.4s, v7.4s, #12 // t22a 2106 srshr v20.4s, v6.4s, #12 // t25a 2107 2108 sqsub v2.4s, v27.4s, v24.4s // t29 2109 sqadd v27.4s, v27.4s, v24.4s // t30 2110 sqsub v3.4s, v21.4s, v19.4s // t18 2111 sqadd v21.4s, v21.4s, v19.4s // t17 2112 sqsub v24.4s, v16.4s, v28.4s // t19a 2113 sqadd v16.4s, v16.4s, v28.4s // t16a 2114 sqsub v19.4s, v30.4s, v23.4s // t20a 2115 sqadd v30.4s, v30.4s, v23.4s // t23a 2116 sqsub v28.4s, v17.4s, v22.4s // t21 2117 sqadd v17.4s, v17.4s, v22.4s // t22 2118 sqadd v23.4s, v26.4s, v29.4s // t24a 2119 sqsub v26.4s, v26.4s, v29.4s // t27a 2120 sqadd v22.4s, v20.4s, v18.4s // t25 2121 sqsub v20.4s, v20.4s, v18.4s // t26 2122 sqsub v29.4s, v31.4s, v25.4s // t28a 2123 sqadd v31.4s, v31.4s, v25.4s // t31a 2124 2125.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 2126 smin \r\().4s, \r\().4s, v5.4s 2127.endr 2128.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 2129 smax \r\().4s, \r\().4s, v4.4s 2130.endr 2131 2132 mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a 2133 mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a 2134 mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 2135 srshr v18.4s, v7.4s, #12 // t18a 2136 srshr v25.4s, v6.4s, #12 // t29a 2137 mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 2138 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 2139 srshr v29.4s, v2.4s, #12 // t19 2140 srshr v24.4s, v7.4s, #12 // t28 2141 neg v6.4s, v6.4s // -> t20 2142 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 2143 mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a 2144 srshr v26.4s, v6.4s, #12 // t20 2145 srshr v19.4s, v2.4s, #12 // t27 2146 neg v7.4s, v7.4s // -> t21a 2147 mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a 2148 srshr v20.4s, v7.4s, #12 // t21a 2149 srshr v28.4s, v6.4s, #12 // t26a 2150 2151 sqsub v2.4s, v16.4s, v30.4s // t23 2152 sqadd v16.4s, v16.4s, v30.4s // t16 = out16 2153 sqsub v3.4s, v31.4s, v23.4s // t24 2154 sqadd v31.4s, v31.4s, v23.4s // t31 = out31 2155 sqsub v23.4s, v21.4s, v17.4s // t22a 
2156 sqadd v17.4s, v21.4s, v17.4s // t17a = out17 2157 sqadd v30.4s, v27.4s, v22.4s // t30a = out30 2158 sqsub v21.4s, v27.4s, v22.4s // t25a 2159 sqsub v27.4s, v18.4s, v20.4s // t21 2160 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 2161 sqadd v7.4s, v29.4s, v26.4s // t19a = out19 2162 sqsub v26.4s, v29.4s, v26.4s // t20a 2163 sqadd v29.4s, v25.4s, v28.4s // t29 = out29 2164 sqsub v25.4s, v25.4s, v28.4s // t26 2165 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 2166 sqsub v24.4s, v24.4s, v19.4s // t27a 2167 mov v19.16b, v7.16b // out19 2168 2169.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 2170 smin \r\().4s, \r\().4s, v5.4s 2171.endr 2172.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 2173 smax \r\().4s, \r\().4s, v4.4s 2174.endr 2175 2176 mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 2177 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 2178 srshr v20.4s, v7.4s, #12 // t20 2179 srshr v22.4s, v6.4s, #12 // t27 2180 2181 mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a 2182 mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a 2183 mov v27.16b, v22.16b // t27 2184 srshr v26.4s, v7.4s, #12 // t26a 2185 2186 mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 2187 mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 2188 srshr v21.4s, v6.4s, #12 // t21a 2189 srshr v22.4s, v24.4s, #12 // t22 2190 srshr v25.4s, v7.4s, #12 // t25 2191 2192 mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a 2193 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a 2194 srshr v23.4s, v7.4s, #12 // t23a 2195 srshr v24.4s, v6.4s, #12 // t24a 2196 2197 ret 2198endfunc 2199 2200.macro def_horz_32 scale=0, shift=2, suffix 2201function inv_txfm_horz\suffix\()_dct_32x4_neon 2202 mov x14, x30 2203 movi v7.4s, #0 2204 lsl x8, x8, #1 2205.if \scale 2206 movz w16, #2896*8, lsl #16 2207 dup v0.2s, w16 2208.endif 2209 2210.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 2211 ld1 {\i}, [x7] 2212 st1 {v7.4s}, [x7], x8 2213.endr 2214 sub x7, x7, x8, lsl #4 2215 add x7, x7, x8, lsr #1 2216.if \scale 2217 scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 2218 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 2219.endif 2220 bl inv_dct_4s_x16_neon 2221 2222 // idct_16 leaves the row_clip_max/min constants in v5 and v4 2223.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 2224 smin_4s \r, \r, v5 2225.endr 2226.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 2227 smax_4s \r, \r, v4 2228.endr 2229 2230 transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 2231 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 2232 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 2233 transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 2234 2235.macro store1 r0, r1, r2, r3 2236 st1 {\r0}, [x6], #16 2237 st1 {\r1}, [x6], #16 2238 st1 {\r2}, [x6], #16 2239 st1 {\r3}, [x6], #16 2240.endm 2241 store1 v16.4s, v20.4s, v24.4s, v28.4s 2242 store1 v17.4s, v21.4s, v25.4s, v29.4s 2243 store1 v18.4s, v22.4s, v26.4s, v30.4s 2244 store1 v19.4s, v23.4s, v27.4s, v31.4s 2245.purgem store1 2246 sub x6, x6, #64*4 2247 2248 movi v7.4s, #0 2249.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 2250 ld1 {\i}, [x7] 2251 st1 {v7.4s}, [x7], x8 2252.endr 2253.if \scale 2254 // This relies on the fact that the idct also leaves the right coeff in v0.s[1] 
2255 scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 2256 scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 2257.endif 2258 bl inv_dct32_odd_4s_x16_neon 2259 transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 2260 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 2261 transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 2262 transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 2263.macro store2 r0, r1, r2, r3, shift 2264 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] 2265 sqsub v4.4s, v0.4s, \r0 2266 sqadd v0.4s, v0.4s, \r0 2267 sqsub v5.4s, v1.4s, \r1 2268 sqadd v1.4s, v1.4s, \r1 2269 sqsub v6.4s, v2.4s, \r2 2270 sqadd v2.4s, v2.4s, \r2 2271 sqsub v7.4s, v3.4s, \r3 2272 sqadd v3.4s, v3.4s, \r3 2273 sqrshrn v0.4h, v0.4s, #\shift 2274 sqrshrn2 v0.8h, v1.4s, #\shift 2275 sqrshrn v1.4h, v2.4s, #\shift 2276 sqrshrn2 v1.8h, v3.4s, #\shift 2277 sqrshrn v2.4h, v7.4s, #\shift 2278 sqrshrn2 v2.8h, v6.4s, #\shift 2279 sqrshrn v3.4h, v5.4s, #\shift 2280 sqrshrn2 v3.8h, v4.4s, #\shift 2281 st1 {v0.8h, v1.8h}, [x6], #32 2282 rev64 v2.8h, v2.8h 2283 rev64 v3.8h, v3.8h 2284 st1 {v2.8h, v3.8h}, [x6], #32 2285.endm 2286 2287 store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift 2288 store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift 2289 store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift 2290 store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift 2291.purgem store2 2292 ret x14 2293endfunc 2294.endm 2295 2296def_horz_32 scale=0, shift=2 2297def_horz_32 scale=1, shift=1, suffix=_scale 2298 2299function inv_txfm_add_vert_dct_8x32_neon 2300 mov x14, x30 2301 lsl x8, x8, #1 2302 2303.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2304 ld1 {v\i\().8h}, [x7], x8 2305.endr 2306 sub x7, x7, x8, lsl #4 2307 2308 bl X(inv_dct_8h_x16_neon) 2309 2310.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2311 st1 {v\i\().8h}, [x7], x8 2312.endr 2313 sub x7, x7, x8, lsl #4 2314 add x7, x7, x8, lsr #1 2315 2316.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 2317 ld1 {v\i\().8h}, [x7], x8 2318.endr 2319 sub x7, x7, x8, lsl #4 2320 sub x7, x7, x8, lsr #1 2321 bl X(inv_dct32_odd_8h_x16_neon) 2322 2323 neg x9, x8 2324 mov x10, x6 2325 mvni v1.8h, #0xfc, lsl #8 // 0x3ff 2326.macro combine r0, r1, r2, r3, op, stride 2327 ld1 {v5.8h}, [x7], \stride 2328 ld1 {v2.8h}, [x10], x1 2329 ld1 {v6.8h}, [x7], \stride 2330 ld1 {v3.8h}, [x10], x1 2331 \op v5.8h, v5.8h, \r0 2332 ld1 {v7.8h}, [x7], \stride 2333 ld1 {v4.8h}, [x10], x1 2334 srshr v5.8h, v5.8h, #4 2335 \op v6.8h, v6.8h, \r1 2336 usqadd v2.8h, v5.8h 2337 srshr v6.8h, v6.8h, #4 2338 \op v7.8h, v7.8h, \r2 2339 ld1 {v5.8h}, [x7], \stride 2340 usqadd v3.8h, v6.8h 2341 smin v2.8h, v2.8h, v1.8h 2342 srshr v7.8h, v7.8h, #4 2343 \op v5.8h, v5.8h, \r3 2344 st1 {v2.8h}, [x6], x1 2345 ld1 {v2.8h}, [x10], x1 2346 usqadd v4.8h, v7.8h 2347 smin v3.8h, v3.8h, v1.8h 2348 srshr v5.8h, v5.8h, #4 2349 st1 {v3.8h}, [x6], x1 2350 usqadd v2.8h, v5.8h 2351 smin v4.8h, v4.8h, v1.8h 2352 st1 {v4.8h}, [x6], x1 2353 smin v2.8h, v2.8h, v1.8h 2354 st1 {v2.8h}, [x6], x1 2355.endm 2356 combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 2357 combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 2358 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 2359 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 2360 sub x7, x7, x8 2361 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 2362 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 2363 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 2364 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 2365.purgem combine 2366 
2367 ret x14 2368endfunc 2369 2370const eob_32x32 2371 .short 10, 36, 78, 136, 210, 300, 406, 1024 2372endconst 2373 2374const eob_16x32 2375 .short 10, 36, 78, 151, 215, 279, 343, 512 2376endconst 2377 2378const eob_16x32_shortside 2379 .short 10, 36, 78, 512 2380endconst 2381 2382const eob_8x32 2383 .short 10, 43, 75, 107, 139, 171, 203, 256 2384endconst 2385 2386function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 2387 movi v0.8h, #0 2388 movi v1.8h, #0 2389 movrel x13, eob_32x32, 2 2390 2391 mov x8, #4*32 23921: 2393 mov w9, #0 2394 movrel x12, eob_32x32, 2 23952: 2396 add w9, w9, #8 2397 ld1 {v16.4s, v17.4s}, [x2] 2398 st1 {v0.4s, v1.4s}, [x2], x8 2399 ld1 {v18.4s, v19.4s}, [x2] 2400 st1 {v0.4s, v1.4s}, [x2], x8 2401 ld1 {v20.4s, v21.4s}, [x2] 2402 st1 {v0.4s, v1.4s}, [x2], x8 2403 ld1 {v22.4s, v23.4s}, [x2] 2404 st1 {v0.4s, v1.4s}, [x2], x8 2405 ld1 {v24.4s, v25.4s}, [x2] 2406 st1 {v0.4s, v1.4s}, [x2], x8 2407 ld1 {v26.4s, v27.4s}, [x2] 2408 st1 {v0.4s, v1.4s}, [x2], x8 2409 ld1 {v28.4s, v29.4s}, [x2] 2410 st1 {v0.4s, v1.4s}, [x2], x8 2411 ld1 {v30.4s, v31.4s}, [x2] 2412 st1 {v0.4s, v1.4s}, [x2], x8 2413 sqxtn v16.4h, v16.4s 2414 sqxtn2 v16.8h, v17.4s 2415 sqxtn v17.4h, v18.4s 2416 sqxtn2 v17.8h, v19.4s 2417 sqxtn v18.4h, v20.4s 2418 sqxtn2 v18.8h, v21.4s 2419 sqxtn v19.4h, v22.4s 2420 sqxtn2 v19.8h, v23.4s 2421 sqxtn v20.4h, v24.4s 2422 sqxtn2 v20.8h, v25.4s 2423 sqxtn v21.4h, v26.4s 2424 sqxtn2 v21.8h, v27.4s 2425 sqxtn v22.4h, v28.4s 2426 sqxtn2 v22.8h, v29.4s 2427 sqxtn v23.4h, v30.4s 2428 sqxtn2 v23.8h, v31.4s 2429 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2430 2431 load_add_store_8x8 x0, x7, shiftbits=2 2432 ldrh w11, [x12], #4 2433 sub x0, x0, x1, lsl #3 2434 add x0, x0, #2*8 2435 cmp w3, w11 2436 b.ge 2b 2437 2438 ldrh w11, [x13], #4 2439 cmp w3, w11 2440 b.lt 9f 2441 2442 sub x0, x0, w9, uxtw #1 2443 add x0, x0, x1, lsl #3 2444 msub x2, x8, x9, x2 2445 add x2, x2, #4*8 2446 b 1b 24479: 2448 ret 2449endfunc 2450 2451.macro shift_16_regs op, shift 2452.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 2453 \op \i, \i, #\shift 2454.endr 2455.endm 2456 2457.macro def_identity_1632 w, h, wshort, hshort 2458function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 2459 movz w16, #2896*8, lsl #16 2460 movz w17, #2*(5793-4096)*8, lsl #16 2461 movi v0.4s, #0 2462 movi v1.4s, #0 2463 movrel x13, eob_16x32\hshort, 2 2464 2465 mov x8, #4*\h 24661: 2467 mov w9, #0 2468 movrel x12, eob_16x32\wshort, 2 24692: 2470 add w9, w9, #8 2471 ld1 {v16.4s, v17.4s}, [x2] 2472 st1 {v0.4s, v1.4s}, [x2], x8 2473 dup v2.2s, w16 2474 ld1 {v18.4s, v19.4s}, [x2] 2475 st1 {v0.4s, v1.4s}, [x2], x8 2476 mov v2.s[1], w17 2477 ld1 {v20.4s, v21.4s}, [x2] 2478 st1 {v0.4s, v1.4s}, [x2], x8 2479 ld1 {v22.4s, v23.4s}, [x2] 2480 st1 {v0.4s, v1.4s}, [x2], x8 2481 ld1 {v24.4s, v25.4s}, [x2] 2482 st1 {v0.4s, v1.4s}, [x2], x8 2483 ld1 {v26.4s, v27.4s}, [x2] 2484 st1 {v0.4s, v1.4s}, [x2], x8 2485 ld1 {v28.4s, v29.4s}, [x2] 2486 st1 {v0.4s, v1.4s}, [x2], x8 2487 ld1 {v30.4s, v31.4s}, [x2] 2488 st1 {v0.4s, v1.4s}, [x2], x8 2489 scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 2490 scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 2491 2492.if \w == 16 2493 // 16x32 2494 identity_4x16_shift1 v2.s[1] 2495.else 2496 // 32x16 2497 shift_16_regs sqshl, 1 2498 identity_4x16 v2.s[1] 2499.endif 2500 sqxtn v16.4h, v16.4s 2501 sqxtn2 v16.8h, v17.4s 2502 sqxtn v17.4h, v18.4s 
2503 sqxtn2 v17.8h, v19.4s 2504 sqxtn v18.4h, v20.4s 2505 sqxtn2 v18.8h, v21.4s 2506 sqxtn v19.4h, v22.4s 2507 sqxtn2 v19.8h, v23.4s 2508 sqxtn v20.4h, v24.4s 2509 sqxtn2 v20.8h, v25.4s 2510 sqxtn v21.4h, v26.4s 2511 sqxtn2 v21.8h, v27.4s 2512 sqxtn v22.4h, v28.4s 2513 sqxtn2 v22.8h, v29.4s 2514 sqxtn v23.4h, v30.4s 2515 sqxtn2 v23.8h, v31.4s 2516 2517 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2518 2519.if \w == 16 2520 load_add_store_8x8 x0, x7, shiftbits=2 2521.else 2522 load_add_store_8x8 x0, x7, shiftbits=4 2523.endif 2524 ldrh w11, [x12], #4 2525 sub x0, x0, x1, lsl #3 2526 add x0, x0, #16 2527 cmp w3, w11 2528 b.ge 2b 2529 2530 ldrh w11, [x13], #4 2531 cmp w3, w11 2532 b.lt 9f 2533 2534 sub x0, x0, w9, uxtw #1 2535 add x0, x0, x1, lsl #3 2536 msub x2, x8, x9, x2 2537 add x2, x2, #4*8 2538 b 1b 25399: 2540 ret 2541endfunc 2542.endm 2543 2544def_identity_1632 16, 32, _shortside, 2545def_identity_1632 32, 16, , _shortside 2546 2547.macro def_identity_832 w, h 2548function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 2549 movi v0.4s, #0 2550 movi v1.4s, #0 2551 // Working on 8x8 blocks, read every other entry from eob_8x32 2552 movrel x13, eob_8x32, 2 2553 2554 mov w8, #4*\h 25551: 2556 // Working on 8x8 blocks, read every other entry from eob_8x32 2557 ldrh w12, [x13], #4 2558 ld1 {v16.4s, v17.4s}, [x2] 2559 st1 {v0.4s, v1.4s}, [x2], x8 2560 ld1 {v18.4s, v19.4s}, [x2] 2561 st1 {v0.4s, v1.4s}, [x2], x8 2562 ld1 {v20.4s, v21.4s}, [x2] 2563 st1 {v0.4s, v1.4s}, [x2], x8 2564 ld1 {v22.4s, v23.4s}, [x2] 2565 st1 {v0.4s, v1.4s}, [x2], x8 2566 ld1 {v24.4s, v25.4s}, [x2] 2567 st1 {v0.4s, v1.4s}, [x2], x8 2568 ld1 {v26.4s, v27.4s}, [x2] 2569 st1 {v0.4s, v1.4s}, [x2], x8 2570 ld1 {v28.4s, v29.4s}, [x2] 2571 st1 {v0.4s, v1.4s}, [x2], x8 2572 ld1 {v30.4s, v31.4s}, [x2] 2573 st1 {v0.4s, v1.4s}, [x2], x8 2574 2575.if \w == 8 2576 sqrshrn v16.4h, v16.4s, #1 2577 sqrshrn2 v16.8h, v17.4s, #1 2578 sqrshrn v17.4h, v18.4s, #1 2579 sqrshrn2 v17.8h, v19.4s, #1 2580 sqrshrn v18.4h, v20.4s, #1 2581 sqrshrn2 v18.8h, v21.4s, #1 2582 sqrshrn v19.4h, v22.4s, #1 2583 sqrshrn2 v19.8h, v23.4s, #1 2584 sqrshrn v20.4h, v24.4s, #1 2585 sqrshrn2 v20.8h, v25.4s, #1 2586 sqrshrn v21.4h, v26.4s, #1 2587 sqrshrn2 v21.8h, v27.4s, #1 2588 sqrshrn v22.4h, v28.4s, #1 2589 sqrshrn2 v22.8h, v29.4s, #1 2590 sqrshrn v23.4h, v30.4s, #1 2591 sqrshrn2 v23.8h, v31.4s, #1 2592.else 2593 sqxtn v16.4h, v16.4s 2594 sqxtn2 v16.8h, v17.4s 2595 sqxtn v17.4h, v18.4s 2596 sqxtn2 v17.8h, v19.4s 2597 sqxtn v18.4h, v20.4s 2598 sqxtn2 v18.8h, v21.4s 2599 sqxtn v19.4h, v22.4s 2600 sqxtn2 v19.8h, v23.4s 2601 sqxtn v20.4h, v24.4s 2602 sqxtn2 v20.8h, v25.4s 2603 sqxtn v21.4h, v26.4s 2604 sqxtn2 v21.8h, v27.4s 2605 sqxtn v22.4h, v28.4s 2606 sqxtn2 v22.8h, v29.4s 2607 sqxtn v23.4h, v30.4s 2608 sqxtn2 v23.8h, v31.4s 2609.endif 2610 2611 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 2612 2613 2614 cmp w3, w12 2615.if \w == 8 2616 load_add_store_8x8 x0, x7, shiftbits=2 2617.else 2618 load_add_store_8x8 x0, x7, shiftbits=3 2619.endif 2620 2621 b.lt 9f 2622.if \w == 8 2623 sub x2, x2, x8, lsl #3 2624 add x2, x2, #4*8 2625.else 2626 sub x0, x0, x1, lsl #3 2627 add x0, x0, #2*8 2628.endif 2629 b 1b 2630 26319: 2632 ret 2633endfunc 2634.endm 2635 2636def_identity_832 8, 32 2637def_identity_832 32, 8 2638 2639function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 2640 idct_dc 32, 32, 2 2641 2642 mov x15, x30 2643 sub sp, sp, #2048 2644 movrel x13, eob_32x32 2645 ldrh w12, [x13], #2 2646 2647.irp i, 0, 4, 8, 
12, 16, 20, 24, 28 2648 add x6, sp, #(\i*32*2) 2649.if \i > 0 2650 mov w8, #(32 - \i) 2651 cmp w3, w12 2652 b.lt 1f 2653.if \i < 28 2654 ldrh w12, [x13], #2 2655.endif 2656.endif 2657 add x7, x2, #(\i*4) 2658 mov x8, #32*4 2659 bl inv_txfm_horz_dct_32x4_neon 2660.endr 2661 b 3f 2662 26631: 2664 movi v4.8h, #0 2665 movi v5.8h, #0 2666 movi v6.8h, #0 2667 movi v7.8h, #0 26682: 2669 subs w8, w8, #4 2670.rept 4 2671 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2672.endr 2673 b.gt 2b 2674 26753: 2676.irp i, 0, 8, 16, 24 2677 add x6, x0, #(\i*2) 2678 add x7, sp, #(\i*2) 2679 mov x8, #32*2 2680 bl inv_txfm_add_vert_dct_8x32_neon 2681.endr 2682 2683 add sp, sp, #2048 2684 ret x15 2685endfunc 2686 2687function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 2688 idct_dc 16, 32, 1 2689 2690 mov x15, x30 2691 sub sp, sp, #1024 2692 movrel x13, eob_16x32 2693 ldrh w12, [x13], #2 2694 adr x4, inv_dct_4s_x16_neon 2695 2696.irp i, 0, 4, 8, 12, 16, 20, 24, 28 2697 add x6, sp, #(\i*16*2) 2698 add x7, x2, #(\i*4) 2699.if \i > 0 2700 mov w8, #(32 - \i) 2701 cmp w3, w12 2702 b.lt 1f 2703.if \i < 28 2704 ldrh w12, [x13], #2 2705.endif 2706.endif 2707 mov x8, #4*32 2708 bl inv_txfm_horz_scale_16x4_neon 2709.endr 2710 b 3f 2711 27121: 2713 movi v4.8h, #0 2714 movi v5.8h, #0 2715 movi v6.8h, #0 2716 movi v7.8h, #0 27172: 2718 subs w8, w8, #4 2719.rept 2 2720 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2721.endr 2722 b.gt 2b 2723 27243: 2725.irp i, 0, 8 2726 add x6, x0, #(\i*2) 2727 add x7, sp, #(\i*2) 2728 mov x8, #16*2 2729 bl inv_txfm_add_vert_dct_8x32_neon 2730.endr 2731 2732 add sp, sp, #1024 2733 ret x15 2734endfunc 2735 2736function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 2737 idct_dc 32, 16, 1 2738 2739 mov x15, x30 2740 sub sp, sp, #1024 2741 2742 movrel x13, eob_16x32 2743 movrel x5, X(inv_dct_8h_x16_neon) 2744 ldrh w12, [x13], #2 2745 2746.irp i, 0, 4, 8, 12 2747 add x6, sp, #(\i*32*2) 2748 add x7, x2, #(\i*4) 2749.if \i > 0 2750 mov w8, #(16 - \i) 2751 cmp w3, w12 2752 b.lt 1f 2753.if \i < 12 2754 ldrh w12, [x13], #2 2755.endif 2756.endif 2757 mov x8, #4*16 2758 bl inv_txfm_horz_scale_dct_32x4_neon 2759.endr 2760 b 3f 2761 27621: 2763 movi v4.8h, #0 2764 movi v5.8h, #0 2765 movi v6.8h, #0 2766 movi v7.8h, #0 27672: 2768 subs w8, w8, #4 2769.rept 4 2770 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2771.endr 2772 b.gt 2b 2773 27743: 2775.irp i, 0, 8, 16, 24 2776 add x6, x0, #(\i*2) 2777 add x7, sp, #(\i*2) 2778 mov x8, #32*2 2779 bl inv_txfm_add_vert_8x16_neon 2780.endr 2781 2782 add sp, sp, #1024 2783 ret x15 2784endfunc 2785 2786function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 2787 idct_dc 8, 32, 2 2788 2789 mov x15, x30 2790 sub sp, sp, #512 2791 2792 movrel x13, eob_8x32 2793 2794 movi v28.4s, #0 2795 mov x8, #4*32 2796 mov w9, #32 2797 mov x6, sp 2798 mov x7, x2 27991: 2800.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2801 ld1 {v\i\().4s}, [x7] 2802 st1 {v28.4s}, [x7], x8 2803.endr 2804 ldrh w12, [x13], #2 2805 sub w9, w9, #4 2806 sub x7, x7, x8, lsl #3 2807 add x7, x7, #4*4 2808 2809 bl inv_dct_4s_x8_neon 2810 2811 sqrshrn v16.4h, v16.4s, #2 2812 sqrshrn v17.4h, v17.4s, #2 2813 sqrshrn v18.4h, v18.4s, #2 2814 sqrshrn v19.4h, v19.4s, #2 2815 sqrshrn2 v16.8h, v20.4s, #2 2816 sqrshrn2 v17.8h, v21.4s, #2 2817 sqrshrn2 v18.8h, v22.4s, #2 2818 sqrshrn2 v19.8h, v23.4s, #2 2819 2820 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 2821 2822 cmp w3, w12 2823 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 2824 2825 b.ge 1b 2826 cbz w9, 3f 2827 2828 movi v29.8h, #0 2829 movi v30.8h, #0 2830 movi 
v31.8h, #0 28312: 2832 subs w9, w9, #4 2833 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 2834 b.gt 2b 2835 28363: 2837 mov x6, x0 2838 mov x7, sp 2839 mov x8, #8*2 2840 bl inv_txfm_add_vert_dct_8x32_neon 2841 2842 add sp, sp, #512 2843 ret x15 2844endfunc 2845 2846function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 2847 idct_dc 32, 8, 2 2848 2849 mov x15, x30 2850 sub sp, sp, #512 2851 2852.irp i, 0, 4 2853 add x6, sp, #(\i*32*2) 2854 add x7, x2, #(\i*4) 2855.if \i > 0 2856 cmp w3, #10 2857 b.lt 1f 2858.endif 2859 mov x8, #8*4 2860 bl inv_txfm_horz_dct_32x4_neon 2861.endr 2862 b 2f 2863 28641: 2865 movi v4.8h, #0 2866 movi v5.8h, #0 2867 movi v6.8h, #0 2868 movi v7.8h, #0 2869.rept 4 2870 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 2871.endr 2872 28732: 2874 mov x8, #2*32 2875 mov w9, #0 28761: 2877 add x6, x0, x9, lsl #1 2878 add x7, sp, x9, lsl #1 // #(\i*2) 2879 2880.irp i, 16, 17, 18, 19, 20, 21, 22, 23 2881 ld1 {v\i\().8h}, [x7], x8 2882.endr 2883 add w9, w9, #8 2884 2885 bl X(inv_dct_8h_x8_neon) 2886 2887 cmp w9, #32 2888 2889 load_add_store_8x8 x6, x7 2890 2891 b.lt 1b 2892 2893 add sp, sp, #512 2894 ret x15 2895endfunc 2896 2897function inv_dct64_step1_neon 2898 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 2899 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 2900 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 2901 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 2902 2903 ld1 {v0.4s, v1.4s}, [x17], #32 2904 2905 sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a 2906 sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a 2907 sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a 2908 sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a 2909 sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a 2910 sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a 2911 sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a 2912 sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a 2913 2914 ld1 {v0.4s}, [x17], #16 2915 2916 sqadd v24.4s, v16.4s, v17.4s // t32 2917 sqsub v25.4s, v16.4s, v17.4s // t33 2918 sqsub v26.4s, v19.4s, v18.4s // t34 2919 sqadd v27.4s, v19.4s, v18.4s // t35 2920 sqadd v28.4s, v20.4s, v21.4s // t60 2921 sqsub v29.4s, v20.4s, v21.4s // t61 2922 sqsub v30.4s, v23.4s, v22.4s // t62 2923 sqadd v31.4s, v23.4s, v22.4s // t63 2924 2925.irp r, v24, v25, v26, v27, v28, v29, v30, v31 2926 smin_4s \r, \r, v5 2927.endr 2928.irp r, v24, v25, v26, v27, v28, v29, v30, v31 2929 smax_4s \r, \r, v4 2930.endr 2931 2932 mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a 2933 mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a 2934 neg v2.4s, v2.4s // t34a 2935 mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a 2936 srshr v26.4s, v2.4s, #12 // t34a 2937 mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a 2938 srshr v29.4s, v7.4s, #12 // t61a 2939 srshr v25.4s, v6.4s, #12 // t33a 2940 srshr v30.4s, v2.4s, #12 // t62a 2941 2942 sqadd v16.4s, v24.4s, v27.4s // t32a 2943 sqsub v19.4s, v24.4s, v27.4s // t35a 2944 sqadd v17.4s, v25.4s, v26.4s // t33 2945 sqsub v18.4s, v25.4s, v26.4s // t34 2946 sqsub v20.4s, v31.4s, v28.4s // t60a 2947 sqadd v23.4s, v31.4s, v28.4s // t63a 2948 sqsub v21.4s, v30.4s, v29.4s // t61 2949 sqadd v22.4s, v30.4s, v29.4s // t62 2950 2951.irp r, v16, v19, v17, v18, v20, v23, v21, v22 2952 smin_4s \r, \r, v5 2953.endr 2954.irp r, v16, v19, v17, v18, v20, v23, v21, v22 2955 smax_4s \r, \r, v4 2956.endr 2957 2958 mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a 2959 mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a 2960 mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 2961 srshr v21.4s, v2.4s, #12 // t61a 2962 srshr v18.4s, v7.4s, #12 // t34a 2963 mul_mls v2, v20, v19, 
v0.s[3], v0.s[2] // -> t35 2964 srshr v20.4s, v6.4s, #12 // t60 2965 srshr v19.4s, v2.4s, #12 // t35 2966 2967 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 2968 st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 2969 2970 ret 2971endfunc 2972 2973function inv_dct64_step2_neon 2974 movrel x16, idct_coeffs 2975 ld1 {v0.4s}, [x16] 29761: 2977 // t32a/33/34a/35/60/61a/62/63a 2978 // t56a/57/58a/59/36/37a/38/39a 2979 // t40a/41/42a/43/52/53a/54/55a 2980 // t48a/49/50a/51/44/45a/46/47a 2981 ldr q16, [x6, #4*4*0] // t32a 2982 ldr q17, [x9, #4*4*8] // t39a 2983 ldr q18, [x9, #4*4*0] // t63a 2984 ldr q19, [x6, #4*4*8] // t56a 2985 ldr q20, [x6, #4*4*16] // t40a 2986 ldr q21, [x9, #4*4*24] // t47a 2987 ldr q22, [x9, #4*4*16] // t55a 2988 ldr q23, [x6, #4*4*24] // t48a 2989 2990 sqadd v24.4s, v16.4s, v17.4s // t32 2991 sqsub v25.4s, v16.4s, v17.4s // t39 2992 sqadd v26.4s, v18.4s, v19.4s // t63 2993 sqsub v27.4s, v18.4s, v19.4s // t56 2994 sqsub v28.4s, v21.4s, v20.4s // t40 2995 sqadd v29.4s, v21.4s, v20.4s // t47 2996 sqadd v30.4s, v23.4s, v22.4s // t48 2997 sqsub v31.4s, v23.4s, v22.4s // t55 2998 2999.irp r, v24, v25, v26, v27, v28, v29, v30, v31 3000 smin_4s \r, \r, v5 3001.endr 3002.irp r, v24, v25, v26, v27, v28, v29, v30, v31 3003 smax_4s \r, \r, v4 3004.endr 3005 3006 mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a 3007 mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a 3008 mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a 3009 srshr v25.4s, v2.4s, #12 // t56a 3010 srshr v27.4s, v7.4s, #12 // t39a 3011 neg v6.4s, v6.4s // t40a 3012 mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a 3013 srshr v31.4s, v6.4s, #12 // t40a 3014 srshr v28.4s, v2.4s, #12 // t55a 3015 3016 sqadd v16.4s, v24.4s, v29.4s // t32a 3017 sqsub v19.4s, v24.4s, v29.4s // t47a 3018 sqadd v17.4s, v27.4s, v31.4s // t39 3019 sqsub v18.4s, v27.4s, v31.4s // t40 3020 sqsub v20.4s, v26.4s, v30.4s // t48a 3021 sqadd v23.4s, v26.4s, v30.4s // t63a 3022 sqsub v21.4s, v25.4s, v28.4s // t55 3023 sqadd v22.4s, v25.4s, v28.4s // t56 3024 3025.irp r, v16, v19, v17, v18, v20, v23, v21, v22 3026 smin_4s \r, \r, v5 3027.endr 3028.irp r, v16, v19, v17, v18, v20, v23, v21, v22 3029 smax_4s \r, \r, v4 3030.endr 3031 3032 mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a 3033 mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a 3034 mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 3035 srshr v18.4s, v2.4s, #12 // t40a 3036 srshr v21.4s, v7.4s, #12 // t55a 3037 mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 3038 srshr v19.4s, v6.4s, #12 // t47 3039 srshr v20.4s, v2.4s, #12 // t48 3040 3041 str q16, [x6, #4*4*0] // t32a 3042 str q17, [x9, #4*4*0] // t39 3043 str q18, [x6, #4*4*8] // t40a 3044 str q19, [x9, #4*4*8] // t47 3045 str q20, [x6, #4*4*16] // t48 3046 str q21, [x9, #4*4*16] // t55a 3047 str q22, [x6, #4*4*24] // t56 3048 str q23, [x9, #4*4*24] // t63a 3049 3050 add x6, x6, #4*4 3051 sub x9, x9, #4*4 3052 cmp x6, x9 3053 b.lt 1b 3054 ret 3055endfunc 3056 3057.macro load8 src, strd, zero, clear 3058.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 3059.if \clear 3060 ld1 {\i}, [\src] 3061 st1 {\zero}, [\src], \strd 3062.else 3063 ld1 {\i}, [\src], \strd 3064.endif 3065.endr 3066.endm 3067 3068.macro store16 dst 3069.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 3070 st1 {\i}, [\dst], #16 3071.endr 3072.endm 3073 3074.macro clear_upper8 3075.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 3076 movi \i, #0 
3077.endr 3078.endm 3079 3080.macro movi_if reg, val, cond 3081.if \cond 3082 movi \reg, \val 3083.endif 3084.endm 3085 3086.macro movz16dup_if reg, gpr, val, cond 3087.if \cond 3088 movz \gpr, \val, lsl #16 3089 dup \reg, \gpr 3090.endif 3091.endm 3092 3093.macro st1_if regs, dst, cond 3094.if \cond 3095 st1 \regs, \dst 3096.endif 3097.endm 3098 3099.macro str_if reg, dst, cond 3100.if \cond 3101 str \reg, \dst 3102.endif 3103.endm 3104 3105.macro stroff_if reg, dst, dstoff, cond 3106.if \cond 3107 str \reg, \dst, \dstoff 3108.endif 3109.endm 3110 3111.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 3112.if \cond 3113 scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 3114.endif 3115.endm 3116 3117.macro def_dct64_func suffix, clear=0, scale=0 3118function inv_txfm_dct\suffix\()_4s_x64_neon 3119 mov x14, x30 3120 mov x6, sp 3121 lsl x8, x8, #2 3122 3123 movz16dup_if v0.2s, w16, #2896*8, \scale 3124 movi_if v7.4s, #0, \clear 3125 load8 x7, x8, v7.4s, \clear 3126 clear_upper8 3127 sub x7, x7, x8, lsl #3 3128 add x7, x7, x8, lsr #1 3129 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 3130 3131 bl inv_dct_4s_x16_neon 3132 3133 // idct_16 leaves the row_clip_max/min constants in v5 and v4 3134.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 3135 smin_4s \r, \r, v5 3136.endr 3137.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 3138 smax_4s \r, \r, v4 3139.endr 3140 3141 store16 x6 3142 3143 movz16dup_if v0.2s, w16, #2896*8, \scale 3144 movi_if v7.8h, #0, \clear 3145 load8 x7, x8, v7.4s, \clear 3146 clear_upper8 3147 sub x7, x7, x8, lsl #3 3148 lsr x8, x8, #1 3149 sub x7, x7, x8, lsr #1 3150 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 3151 3152 bl inv_dct32_odd_4s_x16_neon 3153 3154 add x10, x6, #16*15 3155 sub x6, x6, #16*16 3156 3157 mov x9, #-16 3158 3159 movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 3160 mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 3161 3162.macro store_addsub r0, r1, r2, r3 3163 ld1 {v2.4s}, [x6], #16 3164 ld1 {v3.4s}, [x6], #16 3165 sqadd v6.4s, v2.4s, \r0 3166 sqsub \r0, v2.4s, \r0 3167 ld1 {v4.4s}, [x6], #16 3168 sqadd v7.4s, v3.4s, \r1 3169 sqsub \r1, v3.4s, \r1 3170 smin v6.4s, v6.4s, v1.4s 3171 smin \r0, \r0, v1.4s 3172 ld1 {v5.4s}, [x6], #16 3173 sqadd v2.4s, v4.4s, \r2 3174 sub x6, x6, #16*4 3175 smax v6.4s, v6.4s, v0.4s 3176 smax \r0, \r0, v0.4s 3177 sqsub \r2, v4.4s, \r2 3178 smin v7.4s, v7.4s, v1.4s 3179 smin \r1, \r1, v1.4s 3180 st1 {v6.4s}, [x6], #16 3181 st1 {\r0}, [x10], x9 3182 smin v2.4s, v2.4s, v1.4s 3183 smin \r2, \r2, v1.4s 3184 smax v7.4s, v7.4s, v0.4s 3185 smax \r1, \r1, v0.4s 3186 sqadd v3.4s, v5.4s, \r3 3187 sqsub \r3, v5.4s, \r3 3188 smax v2.4s, v2.4s, v0.4s 3189 smax \r2, \r2, v0.4s 3190 smin v3.4s, v3.4s, v1.4s 3191 smin \r3, \r3, v1.4s 3192 st1 {v7.4s}, [x6], #16 3193 st1 {\r1}, [x10], x9 3194 smax v3.4s, v3.4s, v0.4s 3195 smax \r3, \r3, v0.4s 3196 st1 {v2.4s}, [x6], #16 3197 st1 {\r2}, [x10], x9 3198 st1 {v3.4s}, [x6], #16 3199 st1 {\r3}, [x10], x9 3200.endm 3201 store_addsub v31.4s, v30.4s, v29.4s, v28.4s 3202 store_addsub v27.4s, v26.4s, v25.4s, v24.4s 3203 store_addsub v23.4s, v22.4s, v21.4s, v20.4s 3204 store_addsub v19.4s, v18.4s, v17.4s, v16.4s 3205.purgem store_addsub 3206 3207 add x6, x6, #4*4*16 3208 3209 movrel x17, idct64_coeffs 3210 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff 3211 mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 
0xfffe0000 3212 movz16dup_if v0.2s, w16, #2896*8, \scale 3213 movi_if v7.4s, #0, \clear 3214 add x9, x7, x8, lsl #4 // offset 16 3215 add x10, x7, x8, lsl #3 // offset 8 3216 sub x9, x9, x8 // offset 15 3217 sub x11, x10, x8 // offset 7 3218 ld1 {v16.4s}, [x7] // in1 (offset 0) 3219 ld1 {v17.4s}, [x9] // in31 (offset 15) 3220 ld1 {v18.4s}, [x10] // in17 (offset 8) 3221 ld1 {v19.4s}, [x11] // in15 (offset 7) 3222 st1_if {v7.4s}, [x7], \clear 3223 st1_if {v7.4s}, [x9], \clear 3224 st1_if {v7.4s}, [x10], \clear 3225 st1_if {v7.4s}, [x11], \clear 3226 scale_if \scale, v0.s[0], v16, v17, v18, v19 3227 bl inv_dct64_step1_neon 3228 movz16dup_if v0.2s, w16, #2896*8, \scale 3229 movi_if v7.4s, #0, \clear 3230 add x7, x7, x8, lsl #2 // offset 4 3231 sub x9, x9, x8, lsl #2 // offset 11 3232 sub x10, x7, x8 // offset 3 3233 add x11, x9, x8 // offset 12 3234 ld1 {v16.4s}, [x10] // in7 (offset 3) 3235 ld1 {v17.4s}, [x11] // in25 (offset 12) 3236 ld1 {v18.4s}, [x9] // in23 (offset 11) 3237 ld1 {v19.4s}, [x7] // in9 (offset 4) 3238 st1_if {v7.4s}, [x7], \clear 3239 st1_if {v7.4s}, [x9], \clear 3240 st1_if {v7.4s}, [x10], \clear 3241 st1_if {v7.4s}, [x11], \clear 3242 scale_if \scale, v0.s[0], v16, v17, v18, v19 3243 bl inv_dct64_step1_neon 3244 movz16dup_if v0.2s, w16, #2896*8, \scale 3245 movi_if v7.4s, #0, \clear 3246 sub x10, x10, x8, lsl #1 // offset 1 3247 sub x9, x9, x8, lsl #1 // offset 9 3248 add x7, x7, x8 // offset 5 3249 add x11, x11, x8 // offset 13 3250 ldr q16, [x10, x8] // in5 (offset 2) 3251 ldr q17, [x11] // in27 (offset 13) 3252 ldr q18, [x9, x8] // in21 (offset 10) 3253 ldr q19, [x7] // in11 (offset 5) 3254 stroff_if q7, [x10, x8], \clear 3255 str_if q7, [x11], \clear 3256 stroff_if q7, [x9, x8], \clear 3257 str_if q7, [x7], \clear 3258 scale_if \scale, v0.s[0], v16, v17, v18, v19 3259 bl inv_dct64_step1_neon 3260 movz16dup_if v0.2s, w16, #2896*8, \scale 3261 movi_if v7.4s, #0, \clear 3262 ldr q16, [x10] // in3 (offset 1) 3263 ldr q17, [x11, x8] // in29 (offset 14) 3264 ldr q18, [x9] // in19 (offset 9) 3265 ldr q19, [x7, x8] // in13 (offset 6) 3266 str_if q7, [x10], \clear 3267 stroff_if q7, [x11, x8], \clear 3268 str_if q7, [x9], \clear 3269 stroff_if q7, [x7, x8], \clear 3270 scale_if \scale, v0.s[0], v16, v17, v18, v19 3271 bl inv_dct64_step1_neon 3272 3273 sub x6, x6, #4*4*32 3274 add x9, x6, #4*4*7 3275 3276 bl inv_dct64_step2_neon 3277 3278 ret x14 3279endfunc 3280.endm 3281 3282def_dct64_func _clear, clear=1 3283def_dct64_func _clear_scale, clear=1, scale=1 3284 3285 3286function inv_txfm_horz_dct_64x4_neon 3287 mov x14, x30 3288 3289 mov x7, sp 3290 add x8, sp, #4*4*(64 - 4) 3291 add x9, x6, #2*56 3292 mov x10, #2*64 3293 mov x11, #-4*4*4 3294 3295 dup v7.4s, w12 32961: 3297 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 3298 ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 3299 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 3300 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 3301 transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 3302 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 3303 transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 3304 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 3305 3306.macro store_addsub src0, src1, src2, src3 3307 sqsub v1.4s, \src0, \src1 3308 sqadd v0.4s, \src0, \src1 3309 sqsub v3.4s, \src2, \src3 3310 srshl v1.4s, v1.4s, v7.4s 3311 sqadd v2.4s, \src2, \src3 3312 srshl v3.4s, v3.4s, v7.4s 3313 srshl v0.4s, v0.4s, v7.4s 3314 srshl v2.4s, v2.4s, v7.4s 3315 sqxtn v3.4h, v3.4s 3316 sqxtn2 v3.8h, v1.4s 3317 sqxtn v0.4h, v0.4s 3318 sqxtn2 v0.8h, 
v2.4s 3319 rev64 v3.8h, v3.8h 3320 st1 {v0.8h}, [x6], x10 3321 st1 {v3.8h}, [x9], x10 3322.endm 3323 store_addsub v16.4s, v31.4s, v20.4s, v27.4s 3324 store_addsub v17.4s, v30.4s, v21.4s, v26.4s 3325 store_addsub v18.4s, v29.4s, v22.4s, v25.4s 3326 store_addsub v19.4s, v28.4s, v23.4s, v24.4s 3327.purgem store_addsub 3328 sub x6, x6, x10, lsl #2 3329 sub x9, x9, x10, lsl #2 3330 add x6, x6, #16 3331 sub x9, x9, #16 3332 3333 cmp x7, x8 3334 b.lt 1b 3335 ret x14 3336endfunc 3337 3338function inv_txfm_add_vert_dct_8x64_neon 3339 mov x14, x30 3340 lsl x8, x8, #1 3341 3342 mov x7, sp 3343 add x8, sp, #2*8*(64 - 4) 3344 add x9, x6, x1, lsl #6 3345 sub x9, x9, x1 3346 neg x10, x1 3347 mov x11, #-2*8*4 3348 33491: 3350 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 3351 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 3352 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 3353 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 3354 3355 mvni v7.8h, #0xfc, lsl #8 // 0x3ff 3356.macro add_dest_addsub src0, src1, src2, src3 3357 ld1 {v0.8h}, [x6], x1 3358 ld1 {v1.8h}, [x9], x10 3359 sqadd v4.8h, \src0, \src1 3360 ld1 {v2.8h}, [x6] 3361 sqsub \src0, \src0, \src1 3362 ld1 {v3.8h}, [x9] 3363 sqadd v5.8h, \src2, \src3 3364 sqsub \src2, \src2, \src3 3365 sub x6, x6, x1 3366 sub x9, x9, x10 3367 srshr v4.8h, v4.8h, #4 3368 srshr v5.8h, v5.8h, #4 3369 srshr \src0, \src0, #4 3370 usqadd v0.8h, v4.8h 3371 srshr \src2, \src2, #4 3372 usqadd v1.8h, \src0 3373 usqadd v2.8h, v5.8h 3374 smin v0.8h, v0.8h, v7.8h 3375 usqadd v3.8h, \src2 3376 smin v1.8h, v1.8h, v7.8h 3377 st1 {v0.8h}, [x6], x1 3378 smin v2.8h, v2.8h, v7.8h 3379 st1 {v1.8h}, [x9], x10 3380 smin v3.8h, v3.8h, v7.8h 3381 st1 {v2.8h}, [x6], x1 3382 st1 {v3.8h}, [x9], x10 3383.endm 3384 add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h 3385 add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h 3386 add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h 3387 add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h 3388.purgem add_dest_addsub 3389 cmp x7, x8 3390 b.lt 1b 3391 3392 ret x14 3393endfunc 3394 3395function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 3396 idct_dc 64, 64, 2 3397 3398 mov x15, x30 3399 3400 sub_sp 64*32*2+64*4*4 3401 add x5, sp, #64*4*4 3402 3403 movrel x13, eob_32x32 3404 3405.irp i, 0, 4, 8, 12, 16, 20, 24, 28 3406 add x6, x5, #(\i*64*2) 3407.if \i > 0 3408 mov w8, #(32 - \i) 3409 cmp w3, w12 3410 b.lt 1f 3411.endif 3412 add x7, x2, #(\i*4) 3413 mov x8, #32*4 3414 mov x12, #-2 // shift 3415 bl inv_txfm_dct_clear_4s_x64_neon 3416 add x6, x5, #(\i*64*2) 3417 bl inv_txfm_horz_dct_64x4_neon 3418.if \i < 28 3419 ldrh w12, [x13], #2 3420.endif 3421.endr 3422 b 3f 3423 34241: 3425 movi v4.8h, #0 3426 movi v5.8h, #0 3427 movi v6.8h, #0 3428 movi v7.8h, #0 34292: 3430 subs w8, w8, #2 3431.rept 4 3432 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3433.endr 3434 b.gt 2b 3435 34363: 3437.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3438 add x7, x5, #(\i*2) 3439 mov x8, #64*2 3440 bl X(inv_txfm_dct_8h_x64_neon) 3441 add x6, x0, #(\i*2) 3442 bl inv_txfm_add_vert_dct_8x64_neon 3443.endr 3444 3445 add sp, x5, #64*32*2 3446 ret x15 3447endfunc 3448 3449function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 3450 idct_dc 64, 32, 1 3451 3452 mov x15, x30 3453 3454 sub_sp 64*32*2+64*4*4 3455 add x5, sp, #64*4*4 3456 3457 movrel x13, eob_32x32 3458 3459.irp i, 0, 4, 8, 12, 16, 20, 24, 28 3460 add x6, x5, #(\i*64*2) 3461.if \i > 0 3462 mov w8, #(32 - \i) 3463 cmp w3, w12 3464 b.lt 1f 3465.endif 3466 add x7, x2, #(\i*4) 3467 mov x8, #32*4 3468 mov x12, #-1 // shift 3469 bl 
inv_txfm_dct_clear_scale_4s_x64_neon 3470 add x6, x5, #(\i*64*2) 3471 bl inv_txfm_horz_dct_64x4_neon 3472.if \i < 28 3473 ldrh w12, [x13], #2 3474.endif 3475.endr 3476 b 3f 3477 34781: 3479 movi v4.8h, #0 3480 movi v5.8h, #0 3481 movi v6.8h, #0 3482 movi v7.8h, #0 34832: 3484 subs w8, w8, #2 3485.rept 4 3486 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3487.endr 3488 b.gt 2b 3489 34903: 3491.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3492 add x6, x0, #(\i*2) 3493 add x7, x5, #(\i*2) 3494 mov x8, #64*2 3495 bl inv_txfm_add_vert_dct_8x32_neon 3496.endr 3497 3498 add sp, x5, #64*32*2 3499 ret x15 3500endfunc 3501 3502function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 3503 idct_dc 32, 64, 1 3504 3505 mov x15, x30 3506 3507 sub_sp 32*32*2+64*8*2 3508 add x5, sp, #64*8*2 3509 3510 movrel x13, eob_32x32 3511 ldrh w12, [x13], #2 3512 3513.irp i, 0, 4, 8, 12, 16, 20, 24, 28 3514 add x6, x5, #(\i*32*2) 3515.if \i > 0 3516 mov w8, #(32 - \i) 3517 cmp w3, w12 3518 b.lt 1f 3519 ldrh w12, [x13], #2 3520.endif 3521 add x7, x2, #(\i*4) 3522 mov x8, #32*4 3523 bl inv_txfm_horz_scale_dct_32x4_neon 3524.endr 3525 b 3f 3526 35271: 3528 movi v4.8h, #0 3529 movi v5.8h, #0 3530 movi v6.8h, #0 3531 movi v7.8h, #0 35322: 3533 subs w8, w8, #4 3534.rept 4 3535 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3536.endr 3537 b.gt 2b 3538 35393: 3540.irp i, 0, 8, 16, 24 3541 add x7, x5, #(\i*2) 3542 mov x8, #32*2 3543 bl X(inv_txfm_dct_8h_x64_neon) 3544 add x6, x0, #(\i*2) 3545 bl inv_txfm_add_vert_dct_8x64_neon 3546.endr 3547 3548 add sp, x5, #32*32*2 3549 ret x15 3550endfunc 3551 3552function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 3553 idct_dc 64, 16, 2 3554 3555 mov x15, x30 3556 3557 sub_sp 64*16*2+64*4*4 3558 add x4, sp, #64*4*4 3559 3560 movrel x13, eob_16x32 3561 3562.irp i, 0, 4, 8, 12 3563 add x6, x4, #(\i*64*2) 3564.if \i > 0 3565 mov w8, #(16 - \i) 3566 cmp w3, w12 3567 b.lt 1f 3568.endif 3569 add x7, x2, #(\i*4) 3570 mov x8, #16*4 3571 mov x12, #-2 // shift 3572 bl inv_txfm_dct_clear_4s_x64_neon 3573 add x6, x4, #(\i*64*2) 3574 bl inv_txfm_horz_dct_64x4_neon 3575.if \i < 12 3576 ldrh w12, [x13], #2 3577.endif 3578.endr 3579 b 3f 3580 35811: 3582 movi v4.8h, #0 3583 movi v5.8h, #0 3584 movi v6.8h, #0 3585 movi v7.8h, #0 35862: 3587 subs w8, w8, #2 3588.rept 4 3589 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3590.endr 3591 b.gt 2b 3592 35933: 3594 movrel x5, X(inv_dct_8h_x16_neon) 3595.irp i, 0, 8, 16, 24, 32, 40, 48, 56 3596 add x6, x0, #(\i*2) 3597 add x7, x4, #(\i*2) 3598 mov x8, #64*2 3599 bl inv_txfm_add_vert_8x16_neon 3600.endr 3601 3602 add sp, x4, #64*16*2 3603 ret x15 3604endfunc 3605 3606function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 3607 idct_dc 16, 64, 2 3608 3609 mov x15, x30 3610 3611 sub_sp 16*32*2+64*8*2 3612 add x5, sp, #64*8*2 3613 3614 movrel x13, eob_16x32 3615 ldrh w12, [x13], #2 3616 3617 adr x4, inv_dct_4s_x16_neon 3618.irp i, 0, 4, 8, 12, 16, 20, 24, 28 3619 add x6, x5, #(\i*16*2) 3620.if \i > 0 3621 mov w8, #(32 - \i) 3622 cmp w3, w12 3623 b.lt 1f 3624.if \i < 28 3625 ldrh w12, [x13], #2 3626.endif 3627.endif 3628 add x7, x2, #(\i*4) 3629 mov x8, #32*4 3630 bl inv_txfm_horz_16x4_neon 3631.endr 3632 b 3f 3633 36341: 3635 movi v4.8h, #0 3636 movi v5.8h, #0 3637 movi v6.8h, #0 3638 movi v7.8h, #0 36392: 3640 subs w8, w8, #4 3641.rept 2 3642 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 3643.endr 3644 b.gt 2b 3645 36463: 3647.irp i, 0, 8 3648 add x7, x5, #(\i*2) 3649 mov x8, #16*2 3650 bl X(inv_txfm_dct_8h_x64_neon) 3651 add x6, x0, #(\i*2) 3652 bl 
inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #16*32*2
        ret             x15
endfunc