/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

.macro avg d0, d1, t0, t1, t2, t3
        ld1 {\t0\().8h,\t1\().8h}, [x2], 32
        ld1 {\t2\().8h,\t3\().8h}, [x3], 32
        sqadd \t0\().8h, \t0\().8h, \t2\().8h
        sqadd \t1\().8h, \t1\().8h, \t3\().8h
        smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
        sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
.endm

.macro w_avg d0, d1, t0, t1, t2, t3
        ld1 {\t0\().8h,\t1\().8h}, [x2], 32
        ld1 {\t2\().8h,\t3\().8h}, [x3], 32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
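        // Rough C-like sketch of the computation below (for orientation only;
        // tmp1/tmp2 are the prep() outputs, the negated weight sits in v27):
        //   dst = clip((tmp2 + (((tmp2 - tmp1) * -weight) >> 4) + rnd) >> intermediate_bits
        //              + (PREP_BIAS >> intermediate_bits), 0, bitdepth_max)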
        ssubl \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
        ssubl \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
        mul \d0\().4s, \d0\().4s, v27.4s
        mul \t0\().4s, \t0\().4s, v27.4s
        mul \d1\().4s, \d1\().4s, v27.4s
        mul \t1\().4s, \t1\().4s, v27.4s
        sshr \d0\().4s, \d0\().4s, #4
        sshr \t0\().4s, \t0\().4s, #4
        sshr \d1\().4s, \d1\().4s, #4
        sshr \t1\().4s, \t1\().4s, #4
        saddw \d0\().4s, \d0\().4s, \t2\().4h
        saddw2 \t0\().4s, \t0\().4s, \t2\().8h
        saddw \d1\().4s, \d1\().4s, \t3\().4h
        saddw2 \t1\().4s, \t1\().4s, \t3\().8h
        uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
        uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
        srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax \d0\().8h, \d0\().8h, v30.8h // 0
        smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro mask d0, d1, t0, t1, t2, t3
        ld1 {v27.16b}, [x6], 16
        ld1 {\t0\().8h,\t1\().8h}, [x2], 32
        neg v27.16b, v27.16b
        ld1 {\t2\().8h,\t3\().8h}, [x3], 32
        sxtl v26.8h, v27.8b
        sxtl2 v27.8h, v27.16b
        sxtl v24.4s, v26.4h
        sxtl2 v25.4s, v26.8h
        sxtl v26.4s, v27.4h
        sxtl2 v27.4s, v27.8h
        ssubl \d0\().4s, \t2\().4h, \t0\().4h
        ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
        ssubl \d1\().4s, \t3\().4h, \t1\().4h
        ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
        mul \d0\().4s, \d0\().4s, v24.4s
        mul \t0\().4s, \t0\().4s, v25.4s
        mul \d1\().4s, \d1\().4s, v26.4s
        mul \t1\().4s, \t1\().4s, v27.4s
        sshr \d0\().4s, \d0\().4s, #6
        sshr \t0\().4s, \t0\().4s, #6
        sshr \d1\().4s, \d1\().4s, #6
        sshr \t1\().4s, \t1\().4s, #6
        saddw \d0\().4s, \d0\().4s, \t2\().4h
        saddw2 \t0\().4s, \t0\().4s, \t2\().8h
        saddw \d1\().4s, \d1\().4s, \t3\().4h
        saddw2 \t1\().4s, \t1\().4s, \t3\().8h
        uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
        uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
        srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax \d0\().8h, \d0\().8h, v30.8h // 0
        smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm

.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz w4, w4
.ifnc \type, avg
        dup v31.8h, \bdmax // bitdepth_max
        movi v30.8h, #0
.endif
        clz w7, \bdmax
        sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov w9, #1
        mov w8, #-2*PREP_BIAS
        lsl w9, w9, w7 // 1 << intermediate_bits
        add w7, w7, #1
        sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
        neg w7, w7 // -(intermediate_bits+1)
        dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
        dup v29.8h, w7 // -(intermediate_bits+1)
.else
        mov w8, #PREP_BIAS
        lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
        neg w7, w7 // -intermediate_bits
        dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
        dup v29.8h, w7 // -intermediate_bits
.endif
.ifc \type, w_avg
        dup v27.4s, w6
        neg v27.4s, v27.4s
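        // v27 = -weight (the w6 argument), consumed by the w_avg macro above.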
.endif
        movrel x7, \type\()_tbl
        sub w4, w4, #24
        \type v4, v5, v0, v1, v2, v3
        ldrsw x4, [x7, x4, lsl #2]
        add x7, x7, x4
        br x7
40:
        AARCH64_VALID_JUMP_TARGET
        add x7, x0, x1
        lsl x1, x1, #1
4:
        subs w5, w5, #4
        st1 {v4.8b}, [x0], x1
        st1 {v4.d}[1], [x7], x1
        st1 {v5.8b}, [x0], x1
        st1 {v5.d}[1], [x7], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 4b
80:
        AARCH64_VALID_JUMP_TARGET
        add x7, x0, x1
        lsl x1, x1, #1
8:
        st1 {v4.8h}, [x0], x1
        subs w5, w5, #2
        st1 {v5.8h}, [x7], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 8b
160:
        AARCH64_VALID_JUMP_TARGET
16:
        \type v6, v7, v0, v1, v2, v3
        st1 {v4.8h, v5.8h}, [x0], x1
        subs w5, w5, #2
        st1 {v6.8h, v7.8h}, [x0], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 16b
320:
        AARCH64_VALID_JUMP_TARGET
32:
        \type v6, v7, v0, v1, v2, v3
        subs w5, w5, #1
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 32b
640:
        AARCH64_VALID_JUMP_TARGET
        add x7, x0, #64
64:
        \type v6, v7, v0, v1, v2, v3
        \type v16, v17, v0, v1, v2, v3
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type v18, v19, v0, v1, v2, v3
        subs w5, w5, #1
        st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 64b
1280:
        AARCH64_VALID_JUMP_TARGET
        add x7, x0, #64
        mov x8, #128
        sub x1, x1, #128
128:
        \type v6, v7, v0, v1, v2, v3
        \type v16, v17, v0, v1, v2, v3
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
        \type v18, v19, v0, v1, v2, v3
        st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
        \type v4, v5, v0, v1, v2, v3
        \type v6, v7, v0, v1, v2, v3
        \type v16, v17, v0, v1, v2, v3
        subs w5, w5, #1
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type v18, v19, v0, v1, v2, v3
        st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le 0f
        \type v4, v5, v0, v1, v2, v3
        b 128b
0:
        ret
endfunc

jumptable \type\()_tbl
        .word 1280b - \type\()_tbl
        .word 640b - \type\()_tbl
        .word 320b - \type\()_tbl
        .word 160b - \type\()_tbl
        .word 80b - \type\()_tbl
        .word 40b - \type\()_tbl
endjumptable
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr w8, [sp]
        clz w9, w4
        movrel x10, w_mask_\type\()_tbl
        dup v31.8h, w8 // bitdepth_max
        sub w9, w9, #24
        clz w8, w8 // clz(bitdepth_max)
        ldrsw x9, [x10, x9, lsl #2]
        add x10, x10, x9
        sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov w9, #PREP_BIAS*64
        neg w8, w8 // -sh
        mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup v30.4s, w9 // PREP_BIAS*64
        dup v29.4s, w8 // -sh
        dup v0.8h, w11
.if \type == 444
        movi v1.16b, #64
.elseif \type == 422
        dup v2.8b, w7
        movi v3.8b, #129
        sub v3.8b, v3.8b, v2.8b
.elseif \type == 420
        dup v2.8h, w7
        movi v3.8h, #1, lsl #8
        sub v3.8h, v3.8h, v2.8h
.endif
        add x12, x0, x1
        lsl x1, x1, #1
        br x10
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs w5, w5, #4
        sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
        sabd v21.8h, v5.8h, v7.8h
        ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2 v17.4s, v6.8h, v4.8h
        ssubl v18.4s, v7.4h, v5.4h
        ssubl2 v19.4s, v7.8h, v5.8h
        uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub v21.8h, v0.8h, v21.8h
        sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
        sshll v6.4s, v5.4h, #6
        sshll2 v5.4s, v4.8h, #6
        sshll v4.4s, v4.4h, #6
        ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr v21.8h, v21.8h, #10
        add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add v5.4s, v5.4s, v30.4s
        add v6.4s, v6.4s, v30.4s
        add v7.4s, v7.4s, v30.4s
        uxtl v22.4s, v20.4h
        uxtl2 v23.4s, v20.8h
        uxtl v24.4s, v21.4h
        uxtl2 v25.4s, v21.8h
        mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla v5.4s, v17.4s, v23.4s
        mla v6.4s, v18.4s, v24.4s
        mla v7.4s, v19.4s, v25.4s
        srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl v5.4s, v5.4s, v29.4s
        srshl v6.4s, v6.4s, v29.4s
        srshl v7.4s, v7.4s, v29.4s
        sqxtun v4.4h, v4.4s // iclip_pixel
        sqxtun2 v4.8h, v5.4s
        sqxtun v5.4h, v6.4s
        sqxtun2 v5.8h, v7.4s
        umin v4.8h, v4.8h, v31.8h // iclip_pixel
        umin v5.8h, v5.8h, v31.8h
.if \type == 444
        uzp1 v20.16b, v20.16b, v21.16b // 64 - m
        sub v20.16b, v1.16b, v20.16b // m
        st1 {v20.16b}, [x6], #16
.elseif \type == 422
        addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn v20.8b, v20.8h
        uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        st1 {v20.8b}, [x6], #8
.elseif \type == 420
        trn1 v24.2d, v20.2d, v21.2d
        trn2 v25.2d, v20.2d, v21.2d
        add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
        sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str s20, [x6], #4
.endif
        st1 {v4.8b}, [x0], x1
        st1 {v4.d}[1], [x12], x1
        st1 {v5.8b}, [x0], x1
        st1 {v5.d}[1], [x12], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs w5, w5, #2
        sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
        sabd v21.8h, v5.8h, v7.8h
        ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2 v17.4s, v6.8h, v4.8h
        ssubl v18.4s, v7.4h, v5.4h
        ssubl2 v19.4s, v7.8h, v5.8h
        uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub v21.8h, v0.8h, v21.8h
        sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
        sshll v6.4s, v5.4h, #6
        sshll2 v5.4s, v4.8h, #6
        sshll v4.4s, v4.4h, #6
        ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr v21.8h, v21.8h, #10
        add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add v5.4s, v5.4s, v30.4s
        add v6.4s, v6.4s, v30.4s
        add v7.4s, v7.4s, v30.4s
        uxtl v22.4s, v20.4h
        uxtl2 v23.4s, v20.8h
        uxtl v24.4s, v21.4h
        uxtl2 v25.4s, v21.8h
        mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla v5.4s, v17.4s, v23.4s
        mla v6.4s, v18.4s, v24.4s
        mla v7.4s, v19.4s, v25.4s
        srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl v5.4s, v5.4s, v29.4s
        srshl v6.4s, v6.4s, v29.4s
        srshl v7.4s, v7.4s, v29.4s
        sqxtun v4.4h, v4.4s // iclip_pixel
        sqxtun2 v4.8h, v5.4s
        sqxtun v5.4h, v6.4s
        sqxtun2 v5.8h, v7.4s
        umin v4.8h, v4.8h, v31.8h // iclip_pixel
        umin v5.8h, v5.8h, v31.8h
.if \type == 444
        uzp1 v20.16b, v20.16b, v21.16b // 64 - m
        sub v20.16b, v1.16b, v20.16b // m
        st1 {v20.16b}, [x6], #16
.elseif \type == 422
        addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        xtn v20.8b, v20.8h
        uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        st1 {v20.8b}, [x6], #8
.elseif \type == 420
        add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
        addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
        sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str s20, [x6], #4
.endif
        st1 {v4.8h}, [x0], x1
        st1 {v5.8h}, [x12], x1
        b.gt 8b
        ret
1280:
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov w11, w4
        sub x1, x1, w4, uxtw #1
.if \type == 444
        add x10, x6, w4, uxtw
.elseif \type == 422
        add x10, x6, x11, lsr #1
.endif
        add x9, x3, w4, uxtw #1
        add x7, x2, w4, uxtw #1
161:
        mov w8, w4
16:
        ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
        ld1 {v6.8h, v7.8h}, [x7], #32
        ld1 {v18.8h, v19.8h}, [x9], #32
        subs w8, w8, #16
        sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
        sabd v21.8h, v5.8h, v17.8h
        ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2 v23.4s, v16.8h, v4.8h
        ssubl v24.4s, v17.4h, v5.4h
        ssubl2 v25.4s, v17.8h, v5.8h
        uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
        uqsub v21.8h, v0.8h, v21.8h
        sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
        sshll v26.4s, v5.4h, #6
        sshll2 v5.4s, v4.8h, #6
        sshll v4.4s, v4.4h, #6
        ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr v21.8h, v21.8h, #10
        add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add v5.4s, v5.4s, v30.4s
        add v26.4s, v26.4s, v30.4s
        add v27.4s, v27.4s, v30.4s
        uxtl v16.4s, v20.4h
        uxtl2 v17.4s, v20.8h
        uxtl v28.4s, v21.4h
        mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
        uxtl2 v16.4s, v21.8h
        mla v5.4s, v23.4s, v17.4s
        mla v26.4s, v24.4s, v28.4s
        mla v27.4s, v25.4s, v16.4s
        srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl v5.4s, v5.4s, v29.4s
        srshl v26.4s, v26.4s, v29.4s
        srshl v27.4s, v27.4s, v29.4s
        sqxtun v4.4h, v4.4s // iclip_pixel
        sqxtun2 v4.8h, v5.4s
        sqxtun v5.4h, v26.4s
        sqxtun2 v5.8h, v27.4s

        // Start of other half
        sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
        sabd v23.8h, v7.8h, v19.8h

        umin v4.8h, v4.8h, v31.8h // iclip_pixel
        umin v5.8h, v5.8h, v31.8h

        ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
        ssubl2 v17.4s, v18.8h, v6.8h
        ssubl v18.4s, v19.4h, v7.4h
        ssubl2 v19.4s, v19.8h, v7.8h
        uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
        uqsub v23.8h, v0.8h, v23.8h
        sshll v24.4s, v6.4h, #6 // tmp1 << 6
        sshll2 v25.4s, v6.8h, #6
        sshll v26.4s, v7.4h, #6
        sshll2 v27.4s, v7.8h, #6
        ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
        ushr v23.8h, v23.8h, #10
        add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
        add v25.4s, v25.4s, v30.4s
        add v26.4s, v26.4s, v30.4s
        add v27.4s, v27.4s, v30.4s
        uxtl v6.4s, v22.4h
        uxtl2 v7.4s, v22.8h
        uxtl v28.4s, v23.4h
        mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
        uxtl2 v6.4s, v23.8h
        mla v25.4s, v17.4s, v7.4s
        mla v26.4s, v18.4s, v28.4s
        mla v27.4s, v19.4s, v6.4s
        srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl v25.4s, v25.4s, v29.4s
        srshl v26.4s, v26.4s, v29.4s
        srshl v27.4s, v27.4s, v29.4s
        sqxtun v6.4h, v24.4s // iclip_pixel
        sqxtun2 v6.8h, v25.4s
        sqxtun v7.4h, v26.4s
        sqxtun2 v7.8h, v27.4s
        umin v6.8h, v6.8h, v31.8h // iclip_pixel
        umin v7.8h, v7.8h, v31.8h
.if \type == 444
        uzp1 v20.16b, v20.16b, v21.16b // 64 - m
        uzp1 v21.16b, v22.16b, v23.16b
        sub v20.16b, v1.16b, v20.16b // m
        sub v21.16b, v1.16b, v21.16b
        st1 {v20.16b}, [x6], #16
        st1 {v21.16b}, [x10], #16
.elseif \type == 422
        addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
        addp v21.8h, v22.8h, v23.8h
        xtn v20.8b, v20.8h
        xtn v21.8b, v21.8h
        uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        uhsub v21.8b, v3.8b, v21.8b
        st1 {v20.8b}, [x6], #8
        st1 {v21.8b}, [x10], #8
.elseif \type == 420
        add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
        add v21.8h, v21.8h, v23.8h
        addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
        sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1 {v20.8b}, [x6], #8
.endif
        st1 {v4.8h, v5.8h}, [x0], #32
        st1 {v6.8h, v7.8h}, [x12], #32
        b.gt 16b
        subs w5, w5, #2
        add x2, x2, w4, uxtw #1
        add x3, x3, w4, uxtw #1
        add x7, x7, w4, uxtw #1
        add x9, x9, w4, uxtw #1
.if \type == 444
        add x6, x6, w4, uxtw
        add x10, x10, w4, uxtw
.elseif \type == 422
        add x6, x6, x11, lsr #1
        add x10, x10, x11, lsr #1
.endif
        add x0, x0, x1
        add x12, x12, x1
        b.gt 161b
        ret
endfunc

jumptable w_mask_\type\()_tbl
        .word 1280b - w_mask_\type\()_tbl
        .word 640b - w_mask_\type\()_tbl
        .word 320b - w_mask_\type\()_tbl
        .word 160b - w_mask_\type\()_tbl
        .word 80b - w_mask_\type\()_tbl
        .word 40b - w_mask_\type\()_tbl
endjumptable
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


function blend_16bpc_neon, export=1
        movrel x6, blend_tbl
        clz w3, w3
        sub w3, w3, #26
        ldrsw x3, [x6, x3, lsl #2]
        add x6, x6, x3
        add x8, x0, x1
        br x6
40:
        AARCH64_VALID_JUMP_TARGET
        lsl x1, x1, #1
4:
        ld1 {v2.8b}, [x5], #8
        ld1 {v1.8h}, [x2], #16
        ldr d0, [x0]
        neg v2.8b, v2.8b // -m
        subs w4, w4, #2
        ld1 {v0.d}[1], [x8]
        sxtl v2.8h, v2.8b
        shl v2.8h, v2.8h, #9 // -m << 9
        sub v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add v0.8h, v0.8h, v1.8h
        st1 {v0.8b}, [x0], x1
        st1 {v0.d}[1], [x8], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        lsl x1, x1, #1
8:
        ld1 {v4.16b}, [x5], #16
        ld1 {v2.8h, v3.8h}, [x2], #32
        neg v5.16b, v4.16b // -m
        ld1 {v0.8h}, [x0]
        ld1 {v1.8h}, [x8]
        sxtl v4.8h, v5.8b
        sxtl2 v5.8h, v5.16b
        shl v4.8h, v4.8h, #9 // -m << 9
        shl v5.8h, v5.8h, #9
        sub v2.8h, v0.8h, v2.8h // a - b
        sub v3.8h, v1.8h, v3.8h
        subs w4, w4, #2
        sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v3.8h, v3.8h, v5.8h
        add v0.8h, v0.8h, v2.8h
        add v1.8h, v1.8h, v3.8h
        st1 {v0.8h}, [x0], x1
        st1 {v1.8h}, [x8], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        lsl x1, x1, #1
16:
        ld1 {v16.16b, v17.16b}, [x5], #32
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs w4, w4, #2
        neg v18.16b, v16.16b // -m
        neg v19.16b, v17.16b
        ld1 {v0.8h, v1.8h}, [x0]
        sxtl v16.8h, v18.8b
        sxtl2 v17.8h, v18.16b
        sxtl v18.8h, v19.8b
        sxtl2 v19.8h, v19.16b
        ld1 {v2.8h, v3.8h}, [x8]
        shl v16.8h, v16.8h, #9 // -m << 9
        shl v17.8h, v17.8h, #9
        shl v18.8h, v18.8h, #9
        shl v19.8h, v19.8h, #9
        sub v4.8h, v0.8h, v4.8h // a - b
        sub v5.8h, v1.8h, v5.8h
        sub v6.8h, v2.8h, v6.8h
        sub v7.8h, v3.8h, v7.8h
        sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v5.8h, v5.8h, v17.8h
        sqrdmulh v6.8h, v6.8h, v18.8h
        sqrdmulh v7.8h, v7.8h, v19.8h
        add v0.8h, v0.8h, v4.8h
        add v1.8h, v1.8h, v5.8h
        add v2.8h, v2.8h, v6.8h
        add v3.8h, v3.8h, v7.8h
        st1 {v0.8h, v1.8h}, [x0], x1
        st1 {v2.8h, v3.8h}, [x8], x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ld1 {v16.16b, v17.16b}, [x5], #32
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs w4, w4, #1
        neg v18.16b, v16.16b // -m
        neg v19.16b, v17.16b
        sxtl v16.8h, v18.8b
        sxtl2 v17.8h, v18.16b
        sxtl v18.8h, v19.8b
        sxtl2 v19.8h, v19.16b
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl v16.8h, v16.8h, #9 // -m << 9
        shl v17.8h, v17.8h, #9
        shl v18.8h, v18.8h, #9
        shl v19.8h, v19.8h, #9
        sub v4.8h, v0.8h, v4.8h // a - b
        sub v5.8h, v1.8h, v5.8h
        sub v6.8h, v2.8h, v6.8h
        sub v7.8h, v3.8h, v7.8h
        sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v5.8h, v5.8h, v17.8h
        sqrdmulh v6.8h, v6.8h, v18.8h
        sqrdmulh v7.8h, v7.8h, v19.8h
        add v0.8h, v0.8h, v4.8h
        add v1.8h, v1.8h, v5.8h
        add v2.8h, v2.8h, v6.8h
        add v3.8h, v3.8h, v7.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt 32b
        ret
endfunc

jumptable blend_tbl
        .word 320b - blend_tbl
        .word 160b - blend_tbl
        .word 80b - blend_tbl
        .word 40b - blend_tbl
endjumptable

function blend_h_16bpc_neon, export=1
        movrel x6, blend_h_tbl
        movrel x5, X(obmc_masks)
        add x5, x5, w4, uxtw
        sub w4, w4, w4, lsr #2
        clz w7, w3
        add x8, x0, x1
        lsl x1, x1, #1
        sub w7, w7, #24
        ldrsw x7, [x6, x7, lsl #2]
        add x6, x6, x7
        br x6
20:
        AARCH64_VALID_JUMP_TARGET
2:
        ld2r {v2.8b, v3.8b}, [x5], #2
        ld1 {v1.4h}, [x2], #8
        ext v2.8b, v2.8b, v3.8b, #6
        subs w4, w4, #2
        neg v2.8b, v2.8b // -m
        ldr s0, [x0]
        ld1 {v0.s}[1], [x8]
        sxtl v2.8h, v2.8b
        shl v2.4h, v2.4h, #9 // -m << 9
        sub v1.4h, v0.4h, v1.4h // a - b
        sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
        add v0.4h, v0.4h, v1.4h
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[1], [x8], x1
        b.gt 2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld2r {v2.8b, v3.8b}, [x5], #2
        ld1 {v1.8h}, [x2], #16
        ext v2.8b, v2.8b, v3.8b, #4
        subs w4, w4, #2
        neg v2.8b, v2.8b // -m
        ldr d0, [x0]
        ld1 {v0.d}[1], [x8]
        sxtl v2.8h, v2.8b
        shl v2.8h, v2.8h, #9 // -m << 9
        sub v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add v0.8h, v0.8h, v1.8h
        st1 {v0.8b}, [x0], x1
        st1 {v0.d}[1], [x8], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld2r {v4.8b, v5.8b}, [x5], #2
        ld1 {v2.8h, v3.8h}, [x2], #32
        neg v4.8b, v4.8b // -m
        neg v5.8b, v5.8b
        ld1 {v0.8h}, [x0]
        subs w4, w4, #2
        sxtl v4.8h, v4.8b
        sxtl v5.8h, v5.8b
        ld1 {v1.8h}, [x8]
        shl v4.8h, v4.8h, #9 // -m << 9
        shl v5.8h, v5.8h, #9
        sub v2.8h, v0.8h, v2.8h // a - b
        sub v3.8h, v1.8h, v3.8h
        sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v3.8h, v3.8h, v5.8h
        add v0.8h, v0.8h, v2.8h
        add v1.8h, v1.8h, v3.8h
        st1 {v0.8h}, [x0], x1
        st1 {v1.8h}, [x8], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld2r {v16.8b, v17.8b}, [x5], #2
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg v16.8b, v16.8b // -m
        neg v17.8b, v17.8b
        ld1 {v0.8h, v1.8h}, [x0]
        ld1 {v2.8h, v3.8h}, [x8]
        subs w4, w4, #2
        sxtl v16.8h, v16.8b
        sxtl v17.8h, v17.8b
        shl v16.8h, v16.8h, #9 // -m << 9
        shl v17.8h, v17.8h, #9
        sub v4.8h, v0.8h, v4.8h // a - b
        sub v5.8h, v1.8h, v5.8h
        sub v6.8h, v2.8h, v6.8h
        sub v7.8h, v3.8h, v7.8h
        sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v5.8h, v5.8h, v16.8h
        sqrdmulh v6.8h, v6.8h, v17.8h
        sqrdmulh v7.8h, v7.8h, v17.8h
        add v0.8h, v0.8h, v4.8h
        add v1.8h, v1.8h, v5.8h
        add v2.8h, v2.8h, v6.8h
        add v3.8h, v3.8h, v7.8h
        st1 {v0.8h, v1.8h}, [x0], x1
        st1 {v2.8h, v3.8h}, [x8], x1
        b.gt 16b
        ret
1280:
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub x1, x1, w3, uxtw #1
        add x7, x2, w3, uxtw #1
321:
        ld2r {v24.8b, v25.8b}, [x5], #2
        mov w6, w3
        neg v24.8b, v24.8b // -m
        neg v25.8b, v25.8b
        sxtl v24.8h, v24.8b
        sxtl v25.8h, v25.8b
        shl v24.8h, v24.8h, #9 // -m << 9
        shl v25.8h, v25.8h, #9
32:
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs w6, w6, #32
        sub v16.8h, v0.8h, v16.8h // a - b
        sub v17.8h, v1.8h, v17.8h
        sub v18.8h, v2.8h, v18.8h
        sub v19.8h, v3.8h, v19.8h
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
        sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v17.8h, v17.8h, v24.8h
        sqrdmulh v18.8h, v18.8h, v24.8h
        sqrdmulh v19.8h, v19.8h, v24.8h
        sub v20.8h, v4.8h, v20.8h // a - b
        sub v21.8h, v5.8h, v21.8h
        sub v22.8h, v6.8h, v22.8h
        sub v23.8h, v7.8h, v23.8h
        add v0.8h, v0.8h, v16.8h
        add v1.8h, v1.8h, v17.8h
        add v2.8h, v2.8h, v18.8h
        add v3.8h, v3.8h, v19.8h
        sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v21.8h, v21.8h, v25.8h
        sqrdmulh v22.8h, v22.8h, v25.8h
        sqrdmulh v23.8h, v23.8h, v25.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add v4.8h, v4.8h, v20.8h
        add v5.8h, v5.8h, v21.8h
        add v6.8h, v6.8h, v22.8h
        add v7.8h, v7.8h, v23.8h
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
        b.gt 32b
        subs w4, w4, #2
        add x0, x0, x1
        add x8, x8, x1
        add x2, x2, w3, uxtw #1
        add x7, x7, w3, uxtw #1
        b.gt 321b
        ret
endfunc

jumptable blend_h_tbl
        .word 1280b - blend_h_tbl
        .word 640b - blend_h_tbl
        .word 320b - blend_h_tbl
        .word 160b - blend_h_tbl
        .word 80b - blend_h_tbl
        .word 40b - blend_h_tbl
        .word 20b - blend_h_tbl
endjumptable

function blend_v_16bpc_neon, export=1
        movrel x6, blend_v_tbl
        movrel x5, X(obmc_masks)
        add x5, x5, w3, uxtw
        clz w3, w3
        add x8, x0, x1
        lsl x1, x1, #1
        sub w3, w3, #26
        ldrsw x3, [x6, x3, lsl #2]
        add x6, x6, x3
        br x6
20:
        AARCH64_VALID_JUMP_TARGET
        ld1r {v2.8b}, [x5]
        neg v2.8b, v2.8b // -m
        sxtl v2.8h, v2.8b
        shl v2.4h, v2.4h, #9 // -m << 9
2:
        ldr s1, [x2], #4
        ldr h0, [x0]
        subs w4, w4, #2
        ld1 {v1.h}[1], [x2]
        ld1 {v0.h}[1], [x8]
        add x2, x2, #4
        sub v1.4h, v0.4h, v1.4h // a - b
        sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
        add v0.4h, v0.4h, v1.4h
        st1 {v0.h}[0], [x0], x1
        st1 {v0.h}[1], [x8], x1
        b.gt 2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r {v2.2s}, [x5]
        sub x1, x1, #4
        neg v2.8b, v2.8b // -m
        sxtl v2.8h, v2.8b
        shl v2.8h, v2.8h, #9 // -m << 9
4:
        ld1 {v1.8h}, [x2], #16
        ldr d0, [x0]
        ld1 {v0.d}[1], [x8]
        subs w4, w4, #2
        sub v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add v0.8h, v0.8h, v1.8h
        str s0, [x0], #4
        st1 {v0.s}[2], [x8], #4
        st1 {v0.h}[2], [x0], x1
        st1 {v0.h}[6], [x8], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v4.8b}, [x5]
        sub x1, x1, #8
        neg v4.8b, v4.8b // -m
        sxtl v4.8h, v4.8b
        shl v4.8h, v4.8h, #9 // -m << 9
8:
        ld1 {v2.8h, v3.8h}, [x2], #32
        ld1 {v0.8h}, [x0]
        ld1 {v1.8h}, [x8]
        subs w4, w4, #2
        sub v2.8h, v0.8h, v2.8h // a - b
        sub v3.8h, v1.8h, v3.8h
        sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v3.8h, v3.8h, v4.8h
        add v0.8h, v0.8h, v2.8h
        add v1.8h, v1.8h, v3.8h
        str d0, [x0], #8
        str d1, [x8], #8
        st1 {v0.s}[2], [x0], x1
        st1 {v1.s}[2], [x8], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v16.16b}, [x5]
        sub x1, x1, #16
        neg v17.16b, v16.16b // -m
        sxtl v16.8h, v17.8b
        sxtl2 v17.8h, v17.16b
        shl v16.8h, v16.8h, #9 // -m << 9
        shl v17.4h, v17.4h, #9
16:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1 {v0.8h, v1.8h}, [x0]
        subs w4, w4, #2
        ld1 {v2.8h, v3.8h}, [x8]
        sub v4.8h, v0.8h, v4.8h // a - b
        sub v5.4h, v1.4h, v5.4h
        sub v6.8h, v2.8h, v6.8h
        sub v7.4h, v3.4h, v7.4h
        sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v5.4h, v5.4h, v17.4h
        sqrdmulh v6.8h, v6.8h, v16.8h
        sqrdmulh v7.4h, v7.4h, v17.4h
        add v0.8h, v0.8h, v4.8h
        add v1.4h, v1.4h, v5.4h
        add v2.8h, v2.8h, v6.8h
        add v3.4h, v3.4h, v7.4h
        st1 {v0.8h}, [x0], #16
        st1 {v2.8h}, [x8], #16
        st1 {v1.4h}, [x0], x1
        st1 {v3.4h}, [x8], x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v24.16b, v25.16b}, [x5]
        neg v26.16b, v24.16b // -m
        neg v27.8b, v25.8b
        sxtl v24.8h, v26.8b
        sxtl2 v25.8h, v26.16b
        sxtl v26.8h, v27.8b
        shl v24.8h, v24.8h, #9 // -m << 9
        shl v25.8h, v25.8h, #9
        shl v26.8h, v26.8h, #9
32:
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1 {v0.8h, v1.8h, v2.8h}, [x0]
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1 {v4.8h, v5.8h, v6.8h}, [x8]
        subs w4, w4, #2
        sub v16.8h, v0.8h, v16.8h // a - b
        sub v17.8h, v1.8h, v17.8h
        sub v18.8h, v2.8h, v18.8h
        sub v20.8h, v4.8h, v20.8h
        sub v21.8h, v5.8h, v21.8h
        sub v22.8h, v6.8h, v22.8h
        sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh v17.8h, v17.8h, v25.8h
        sqrdmulh v18.8h, v18.8h, v26.8h
        sqrdmulh v20.8h, v20.8h, v24.8h
        sqrdmulh v21.8h, v21.8h, v25.8h
        sqrdmulh v22.8h, v22.8h, v26.8h
        add v0.8h, v0.8h, v16.8h
        add v1.8h, v1.8h, v17.8h
        add v2.8h, v2.8h, v18.8h
        add v4.8h, v4.8h, v20.8h
        add v5.8h, v5.8h, v21.8h
        add v6.8h, v6.8h, v22.8h
        st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt 32b
        ret
endfunc

jumptable blend_v_tbl
        .word 320b - blend_v_tbl
        .word 160b - blend_v_tbl
        .word 80b - blend_v_tbl
        .word 40b - blend_v_tbl
        .word 20b - blend_v_tbl
endjumptable

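// Note on the blend kernels above, a rough per-pixel sketch:
//   dst = a + (((a - b) * -m + 32) >> 6)  ==  (a*(64 - m) + b*m + 32) >> 6
// where a is the existing destination pixel, b the overlapped pixel and m the
// 6 bit mask; the first form is what sqrdmulh on (a - b) and -m << 9 computes.
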
// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
function put_16bpc_neon, export=1
        movrel x10, put_16bpc_tbl
        ldrsw x9, [x10, x9, lsl #2]
        add x10, x10, x9
        br x10

20:
        AARCH64_VALID_JUMP_TARGET
2:
        ld1r {v0.4s}, [x2], x3
        ld1r {v1.4s}, [x2], x3
        subs w5, w5, #2
        st1 {v0.s}[0], [x0], x1
        st1 {v1.s}[0], [x0], x1
        b.gt 2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1 {v0.4h}, [x2], x3
        ld1 {v1.4h}, [x2], x3
        subs w5, w5, #2
        st1 {v0.4h}, [x0], x1
        st1 {v1.4h}, [x0], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add x8, x0, x1
        lsl x1, x1, #1
        add x9, x2, x3
        lsl x3, x3, #1
8:
        ld1 {v0.8h}, [x2], x3
        ld1 {v1.8h}, [x9], x3
        subs w5, w5, #2
        st1 {v0.8h}, [x0], x1
        st1 {v1.8h}, [x8], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ldp x6, x7, [x2]
        ldp x8, x9, [x2, #16]
        stp x6, x7, [x0]
        subs w5, w5, #1
        stp x8, x9, [x0, #16]
        add x2, x2, x3
        add x0, x0, x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ldp x6, x7, [x2]
        ldp x8, x9, [x2, #16]
        stp x6, x7, [x0]
        ldp x10, x11, [x2, #32]
        stp x8, x9, [x0, #16]
        subs w5, w5, #1
        ldp x12, x13, [x2, #48]
        stp x10, x11, [x0, #32]
        stp x12, x13, [x0, #48]
        add x2, x2, x3
        add x0, x0, x1
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ldp q0, q1, [x2]
        ldp q2, q3, [x2, #32]
        stp q0, q1, [x0]
        ldp q4, q5, [x2, #64]
        stp q2, q3, [x0, #32]
        ldp q6, q7, [x2, #96]
        subs w5, w5, #1
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x2, x2, x3
        add x0, x0, x1
        b.gt 64b
        ret
1280:
        AARCH64_VALID_JUMP_TARGET
128:
        ldp q0, q1, [x2]
        ldp q2, q3, [x2, #32]
        stp q0, q1, [x0]
        ldp q4, q5, [x2, #64]
        stp q2, q3, [x0, #32]
        ldp q6, q7, [x2, #96]
        subs w5, w5, #1
        stp q4, q5, [x0, #64]
        ldp q16, q17, [x2, #128]
        stp q6, q7, [x0, #96]
        ldp q18, q19, [x2, #160]
        stp q16, q17, [x0, #128]
        ldp q20, q21, [x2, #192]
        stp q18, q19, [x0, #160]
        ldp q22, q23, [x2, #224]
        stp q20, q21, [x0, #192]
        stp q22, q23, [x0, #224]
        add x2, x2, x3
        add x0, x0, x1
        b.gt 128b
        ret
endfunc

jumptable put_16bpc_tbl
        .word 1280b - put_16bpc_tbl
        .word 640b - put_16bpc_tbl
        .word 320b - put_16bpc_tbl
        .word 160b - put_16bpc_tbl
        .word 80b - put_16bpc_tbl
        .word 40b - put_16bpc_tbl
        .word 20b - put_16bpc_tbl
endjumptable


// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
function prep_16bpc_neon
        movrel x10, prep_16bpc_tbl
        ldrsw x9, [x10, x9, lsl #2]
        dup v31.8h, w7 // intermediate_bits
        movi v30.8h, #(PREP_BIAS >> 8), lsl #8
        add x10, x10, x9
        br x10

40:
        AARCH64_VALID_JUMP_TARGET
        add x9, x1, x2
        lsl x2, x2, #1
4:
        ld1 {v0.8b}, [x1], x2
        ld1 {v0.d}[1], [x9], x2
        subs w4, w4, #2
        sshl v0.8h, v0.8h, v31.8h
        sub v0.8h, v0.8h, v30.8h
        st1 {v0.8h}, [x0], #16
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add x9, x1, x2
        lsl x2, x2, #1
8:
        ld1 {v0.8h}, [x1], x2
        ld1 {v1.8h}, [x9], x2
        subs w4, w4, #2
        sshl v0.8h, v0.8h, v31.8h
        sshl v1.8h, v1.8h, v31.8h
        sub v0.8h, v0.8h, v30.8h
        sub v1.8h, v1.8h, v30.8h
        st1 {v0.8h, v1.8h}, [x0], #32
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ldp q0, q1, [x1]
        add x1, x1, x2
        sshl v0.8h, v0.8h, v31.8h
        ldp q2, q3, [x1]
        add x1, x1, x2
        subs w4, w4, #2
        sshl v1.8h, v1.8h, v31.8h
        sshl v2.8h, v2.8h, v31.8h
        sshl v3.8h, v3.8h, v31.8h
        sub v0.8h, v0.8h, v30.8h
        sub v1.8h, v1.8h, v30.8h
        sub v2.8h, v2.8h, v30.8h
        sub v3.8h, v3.8h, v30.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ldp q0, q1, [x1]
        sshl v0.8h, v0.8h, v31.8h
        ldp q2, q3, [x1, #32]
        add x1, x1, x2
        sshl v1.8h, v1.8h, v31.8h
        sshl v2.8h, v2.8h, v31.8h
        sshl v3.8h, v3.8h, v31.8h
        subs w4, w4, #1
        sub v0.8h, v0.8h, v30.8h
        sub v1.8h, v1.8h, v30.8h
        sub v2.8h, v2.8h, v30.8h
        sub v3.8h, v3.8h, v30.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ldp q0, q1, [x1]
        subs w4, w4, #1
        sshl v0.8h, v0.8h, v31.8h
        ldp q2, q3, [x1, #32]
        sshl v1.8h, v1.8h, v31.8h
        ldp q4, q5, [x1, #64]
        sshl v2.8h, v2.8h, v31.8h
        sshl v3.8h, v3.8h, v31.8h
        ldp q6, q7, [x1, #96]
        add x1, x1, x2
        sshl v4.8h, v4.8h, v31.8h
        sshl v5.8h, v5.8h, v31.8h
        sshl v6.8h, v6.8h, v31.8h
        sshl v7.8h, v7.8h, v31.8h
        sub v0.8h, v0.8h, v30.8h
        sub v1.8h, v1.8h, v30.8h
        sub v2.8h, v2.8h, v30.8h
        sub v3.8h, v3.8h, v30.8h
        stp q0, q1, [x0]
        sub v4.8h, v4.8h, v30.8h
        sub v5.8h, v5.8h, v30.8h
        stp q2, q3, [x0, #32]
        sub v6.8h, v6.8h, v30.8h
        sub v7.8h, v7.8h, v30.8h
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x0, x0, x8
        b.gt 64b
        ret
1280:
        AARCH64_VALID_JUMP_TARGET
128:
        ldp q0, q1, [x1]
        subs w4, w4, #1
        sshl v0.8h, v0.8h, v31.8h
        ldp q2, q3, [x1, #32]
        sshl v1.8h, v1.8h, v31.8h
        ldp q4, q5, [x1, #64]
        sshl v2.8h, v2.8h, v31.8h
        sshl v3.8h, v3.8h, v31.8h
        ldp q6, q7, [x1, #96]
        sshl v4.8h, v4.8h, v31.8h
        sshl v5.8h, v5.8h, v31.8h
        ldp q16, q17, [x1, #128]
        sshl v6.8h, v6.8h, v31.8h
        sshl v7.8h, v7.8h, v31.8h
        ldp q18, q19, [x1, #160]
        sshl v16.8h, v16.8h, v31.8h
        sshl v17.8h, v17.8h, v31.8h
        ldp q20, q21, [x1, #192]
        sshl v18.8h, v18.8h, v31.8h
        sshl v19.8h, v19.8h, v31.8h
        ldp q22, q23, [x1, #224]
        add x1, x1, x2
        sshl v20.8h, v20.8h, v31.8h
        sshl v21.8h, v21.8h, v31.8h
        sshl v22.8h, v22.8h, v31.8h
        sshl v23.8h, v23.8h, v31.8h
        sub v0.8h, v0.8h, v30.8h
        sub v1.8h, v1.8h, v30.8h
        sub v2.8h, v2.8h, v30.8h
        sub v3.8h, v3.8h, v30.8h
        stp q0, q1, [x0]
        sub v4.8h, v4.8h, v30.8h
        sub v5.8h, v5.8h, v30.8h
        stp q2, q3, [x0, #32]
        sub v6.8h, v6.8h, v30.8h
        sub v7.8h, v7.8h, v30.8h
        stp q4, q5, [x0, #64]
        sub v16.8h, v16.8h, v30.8h
        sub v17.8h, v17.8h, v30.8h
        stp q6, q7, [x0, #96]
        sub v18.8h, v18.8h, v30.8h
        sub v19.8h, v19.8h, v30.8h
        stp q16, q17, [x0, #128]
        sub v20.8h, v20.8h, v30.8h
        sub v21.8h, v21.8h, v30.8h
        stp q18, q19, [x0, #160]
        sub v22.8h, v22.8h, v30.8h
        sub v23.8h, v23.8h, v30.8h
        stp q20, q21, [x0, #192]
        stp q22, q23, [x0, #224]
        add x0, x0, x8
        b.gt 128b
        ret
endfunc

jumptable prep_16bpc_tbl
        .word 1280b - prep_16bpc_tbl
        .word 640b - prep_16bpc_tbl
        .word 320b - prep_16bpc_tbl
        .word 160b - prep_16bpc_tbl
        .word 80b - prep_16bpc_tbl
        .word 40b - prep_16bpc_tbl
endjumptable


.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1 {\d0\wd}[0], [\s0], \strd
        ld1 {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1 {\d2\wd}[0], [\s0], \strd
        ld1 {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1 {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1 {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1 {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1 {\d0\wd}, [\s0], \strd
        ld1 {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1 {\d2\wd}, [\s0], \strd
        ld1 {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1 {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1 {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1 {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1 {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1 {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1 {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1 \r0\wd, \r0\wd, \r1\wd
        trn1 \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1 \r2\wd, \r2\wd, \r3\wd
        trn1 \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro umin_h c, wd, r0, r1, r2, r3
        umin \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        umin \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        umin \r2\wd, \r2\wd, \c\wd
        umin \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro sub_h c, wd, r0, r1, r2, r3
        sub \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        sub \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        sub \r2\wd, \r2\wd, \c\wd
        sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
.macro smull_smlal_4tap d, s0, s1, s2, s3
        smull \d\().4s, \s0\().4h, v0.h[0]
        smlal \d\().4s, \s1\().4h, v0.h[1]
        smlal \d\().4s, \s2\().4h, v0.h[2]
        smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4tap d, s0, s1, s2, s3
        smull2 \d\().4s, \s0\().8h, v0.h[0]
        smlal2 \d\().4s, \s1\().8h, v0.h[1]
        smlal2 \d\().4s, \s2\().8h, v0.h[2]
        smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull \d\().4s, \s1\().4h, v0.h[1]
        smlal \d\().4s, \s2\().4h, v0.h[2]
        smlal \d\().4s, \s3\().4h, v0.h[3]
        smlal \d\().4s, \s4\().4h, v0.h[4]
        smlal \d\().4s, \s5\().4h, v0.h[5]
        smlal \d\().4s, \s6\().4h, v0.h[6]
.endm
.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2 \d\().4s, \s1\().8h, v0.h[1]
        smlal2 \d\().4s, \s2\().8h, v0.h[2]
        smlal2 \d\().4s, \s3\().8h, v0.h[3]
        smlal2 \d\().4s, \s4\().8h, v0.h[4]
        smlal2 \d\().4s, \s5\().8h, v0.h[5]
        smlal2 \d\().4s, \s6\().8h, v0.h[6]
.endm
.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull \d\().4s, \s0\().4h, v0.h[0]
        smlal \d\().4s, \s1\().4h, v0.h[1]
        smlal \d\().4s, \s2\().4h, v0.h[2]
        smlal \d\().4s, \s3\().4h, v0.h[3]
        smlal \d\().4s, \s4\().4h, v0.h[4]
        smlal \d\().4s, \s5\().4h, v0.h[5]
        smlal \d\().4s, \s6\().4h, v0.h[6]
        smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2 \d\().4s, \s0\().8h, v0.h[0]
        smlal2 \d\().4s, \s1\().8h, v0.h[1]
        smlal2 \d\().4s, \s2\().8h, v0.h[2]
        smlal2 \d\().4s, \s3\().8h, v0.h[3]
        smlal2 \d\().4s, \s4\().8h, v0.h[4]
        smlal2 \d\().4s, \s5\().8h, v0.h[5]
        smlal2 \d\().4s, \s6\().8h, v0.h[6]
        smlal2 \d\().4s, \s7\().8h, v0.h[7]
.endm
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
        sqrshrun2 \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
        sqrshrun \r2\().4h, \r2\().4s, #\shift
        sqrshrun2 \r2\().8h, \r3\().4s, #\shift
.endif
.endm
.macro xtn_h r0, r1, r2, r3
        uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
.ifnb \r2
        uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
        srshl \r0\().4s, \r0\().4s, \shift\().4s
        srshl \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
        srshl \r2\().4s, \r2\().4s, \shift\().4s
        srshl \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
.macro st_s strd, reg, lanes
        st1 {\reg\().s}[0], [x0], \strd
        st1 {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1 {\reg\().s}[2], [x0], \strd
        st1 {\reg\().s}[3], [x9], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1 {\r0\().8b}, [x0], \strd
        st1 {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1 {\r1\().8b}, [x0], \strd
        st1 {\r1\().d}[1], [x9], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin_h v31, .8h, \r0, \r2
.else
        srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h \r0, \r1, \r2, \r3
        sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_d \strd, \r0, \r2
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1 {\r0\wd}, [x0], \strd
        st1 {\r1\wd}, [x9], \strd
.ifnb \r2
        st1 {\r2\wd}, [x0], \strd
        st1 {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1 {\r4\wd}, [x0], \strd
        st1 {\r5\wd}, [x9], \strd
        st1 {\r6\wd}, [x0], \strd
        st1 {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin_h v31, .8h, \r0, \r2
.else
        srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h \r0, \r1, \r2, \r3
        sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_8h \strd, \r0, \r2
.endm
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin \r0\().8h, \r0\().8h, v31.8h
        umin \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h \r0, \r1, \r2, \r3
        sub \r0\().8h, \r0\().8h, v29.8h
        sub \r1\().8h, \r2\().8h, v29.8h
.endif
        st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
        mov w9, \type_h
        mov w10, \type_v
        b \op\()_\taps\()_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
function \type\()_\taps\()_neon
.ifc \bdmax, w8
        ldr w8, [sp]
.endif
        mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
        mul \mx, \mx, w11
        mul \my, \my, w11
        add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
        add \my, \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
        uxtw \d_strd, \w
        lsl \d_strd, \d_strd, #1
.endif

        dup v31.8h, \bdmax // bitdepth_max
        clz \bdmax, \bdmax
        clz w9, \w
        sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov w12, #6
        tst \mx, #(0x7f << 14)
        sub w9, w9, #24
        add w13, w12, \bdmax // 6 + intermediate_bits
        sub w12, w12, \bdmax // 6 - intermediate_bits
        movrel x11, X(mc_subpel_filters), -8
        b.ne L(\type\()_\taps\()_h)
        tst \my, #(0x7f << 14)
        b.ne L(\type\()_\taps\()_v)
        b \type\()_16bpc_neon

L(\type\()_\taps\()_h):
        cmp \w, #4
        ubfx w10, \mx, #7, #7
        and \mx, \mx, #0x7f
        b.le 4f
        mov \mx, w10
4:
        tst \my, #(0x7f << 14)
        add \xmx, x11, \mx, uxtw #3
        b.ne L(\type\()_\taps\()_hv)

        movrel x10, \type\()_\taps\()_h_tbl
        ldrsw x9, [x10, x9, lsl #2]
.ifc \type, put
        mov w12, #34 // rounding for 10-bit
        mov w13, #40 // rounding for 12-bit
        cmp \bdmax, #2 // 10-bit: 4, 12-bit: 2
        csel w12, w12, w13, ne // select rounding based on \bdmax
.else
        neg w12, w12 // -(6 - intermediate_bits)
        movi v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        add x10, x10, x9
        dup v30.4s, w12 // rounding or shift amount
        br x10

20: // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        ldur s0, [\xmx, #2]
        sub \src, \src, #2
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
        sxtl v0.8h, v0.8b
2:
        ld1 {v4.8h}, [\src], \s_strd
        ld1 {v6.8h}, [\sr2], \s_strd
        mov v2.16b, v30.16b
        ext v5.16b, v4.16b, v4.16b, #2
        ext v7.16b, v6.16b, v6.16b, #2
        subs \h, \h, #2
        trn1 v3.2s, v4.2s, v6.2s
        trn2 v6.2s, v4.2s, v6.2s
        trn1 v4.2s, v5.2s, v7.2s
        trn2 v7.2s, v5.2s, v7.2s
        smlal v2.4s, v3.4h, v0.h[0]
        smlal v2.4s, v4.4h, v0.h[1]
        smlal v2.4s, v6.4h, v0.h[2]
        smlal v2.4s, v7.4h, v0.h[3]
        sqshrun v2.4h, v2.4s, #6
        umin v2.4h, v2.4h, v31.4h
        st1 {v2.s}[0], [\dst], \d_strd
        st1 {v2.s}[1], [\ds2], \d_strd
        b.gt 2b
        ret
.endif

40: // 4xN h
        AARCH64_VALID_JUMP_TARGET
        ldur s0, [\xmx, #2]
        sub \src, \src, #2
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
        sxtl v0.8h, v0.8b
4:
        ld1 {v16.8h}, [\src], \s_strd
        ld1 {v20.8h}, [\sr2], \s_strd
.ifc \type, put
        mov v2.16b, v30.16b
        mov v3.16b, v30.16b
.endif
        ext v17.16b, v16.16b, v16.16b, #2
        ext v18.16b, v16.16b, v16.16b, #4
        ext v19.16b, v16.16b, v16.16b, #6
        ext v21.16b, v20.16b, v20.16b, #2
        ext v22.16b, v20.16b, v20.16b, #4
        ext v23.16b, v20.16b, v20.16b, #6
        subs \h, \h, #2
.ifc \type, put
        smlal v2.4s, v16.4h, v0.h[0]
.else
        smull v2.4s, v16.4h, v0.h[0]
.endif
        smlal v2.4s, v17.4h, v0.h[1]
        smlal v2.4s, v18.4h, v0.h[2]
        smlal v2.4s, v19.4h, v0.h[3]
.ifc \type, put
        smlal v3.4s, v20.4h, v0.h[0]
.else
        smull v3.4s, v20.4h, v0.h[0]
.endif
        smlal v3.4s, v21.4h, v0.h[1]
        smlal v3.4s, v22.4h, v0.h[2]
        smlal v3.4s, v23.4h, v0.h[3]
.ifc \type, put
        sqshrun v16.4h, v2.4s, #6
        sqshrun2 v16.8h, v3.4s, #6
        umin v16.8h, v16.8h, v31.8h
.else
        srshl v16.4s, v2.4s, v30.4s // -(6-intermediate_bits)
        srshl v20.4s, v3.4s, v30.4s // -(6-intermediate_bits)
        uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
        sub v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
        st1 {v16.8b}, [\dst], \d_strd
        st1 {v16.d}[1], [\ds2], \d_strd
        b.gt 4b
        ret

80:
160:
320:
640:
1280: // 8xN, 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [\xmx]
.ifc \taps, 6tap
        sub \src, \src, #4
.else
        sub \src, \src, #6
.endif
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \s_strd, \s_strd, #1
        sxtl v0.8h, v0.8b

        sub \s_strd, \s_strd, \w, uxtw #1
        sub \s_strd, \s_strd, #16
.ifc \type, put
        lsl \d_strd, \d_strd, #1
        sub \d_strd, \d_strd, \w, uxtw #1
.endif
81:
        ld1 {v16.8h, v17.8h}, [\src], #32
        ld1 {v20.8h, v21.8h}, [\sr2], #32
        mov \mx, \w

8:
.ifc \taps, 6tap
    .ifc \type, put
        mov v18.16b, v30.16b
        mov v19.16b, v30.16b
        smlal v18.4s, v16.4h, v0.h[1]
        smlal2 v19.4s, v16.8h, v0.h[1]
        mov v22.16b, v30.16b
        mov v23.16b, v30.16b
        smlal v22.4s, v20.4h, v0.h[1]
        smlal2 v23.4s, v20.8h, v0.h[1]
    .else
        smull v18.4s, v16.4h, v0.h[1]
        smull2 v19.4s, v16.8h, v0.h[1]
        smull v22.4s, v20.4h, v0.h[1]
        smull2 v23.4s, v20.8h, v0.h[1]
    .endif
    .irpc i, 23456
        ext v24.16b, v16.16b, v17.16b, #(2*\i-2)
        ext v25.16b, v20.16b, v21.16b, #(2*\i-2)
        smlal v18.4s, v24.4h, v0.h[\i]
        smlal2 v19.4s, v24.8h, v0.h[\i]
        smlal v22.4s, v25.4h, v0.h[\i]
        smlal2 v23.4s, v25.8h, v0.h[\i]
    .endr
.else // 8tap
    .ifc \type, put
        mov v18.16b, v30.16b
        mov v19.16b, v30.16b
        smlal v18.4s, v16.4h, v0.h[0]
        smlal2 v19.4s, v16.8h, v0.h[0]
        mov v22.16b, v30.16b
        mov v23.16b, v30.16b
        smlal v22.4s, v20.4h, v0.h[0]
        smlal2 v23.4s, v20.8h, v0.h[0]
    .else
        smull v18.4s, v16.4h, v0.h[0]
        smull2 v19.4s, v16.8h, v0.h[0]
        smull v22.4s, v20.4h, v0.h[0]
        smull2 v23.4s, v20.8h, v0.h[0]
    .endif
    .irpc i, 1234567
        ext v24.16b, v16.16b, v17.16b, #(2*\i)
        ext v25.16b, v20.16b, v21.16b, #(2*\i)
        smlal v18.4s, v24.4h, v0.h[\i]
        smlal2 v19.4s, v24.8h, v0.h[\i]
        smlal v22.4s, v25.4h, v0.h[\i]
        smlal2 v23.4s, v25.8h, v0.h[\i]
    .endr
.endif
        subs \mx, \mx, #8
.ifc \type, put
        sqshrun v18.4h, v18.4s, #6
        sqshrun2 v18.8h, v19.4s, #6
        sqshrun v22.4h, v22.4s, #6
        sqshrun2 v22.8h, v23.4s, #6
        umin v18.8h, v18.8h, v31.8h
        umin v22.8h, v22.8h, v31.8h
.else
        srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
        srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
        srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
        srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
        uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
        uzp1 v22.8h, v22.8h, v23.8h // Ditto
        sub v18.8h, v18.8h, v28.8h // PREP_BIAS
        sub v22.8h, v22.8h, v28.8h // PREP_BIAS
.endif
        st1 {v18.8h}, [\dst], #16
        st1 {v22.8h}, [\ds2], #16
        b.le 9f

        mov v16.16b, v17.16b
        mov v20.16b, v21.16b
        ld1 {v17.8h}, [\src], #16
        ld1 {v21.8h}, [\sr2], #16
        b 8b

9:
        add \dst, \dst, \d_strd
        add \ds2, \ds2, \d_strd
        add \src, \src, \s_strd
        add \sr2, \sr2, \s_strd

        subs \h, \h, #2
        b.gt 81b
        ret
endfunc

jumptable \type\()_\taps\()_h_tbl
        .word 1280b - \type\()_\taps\()_h_tbl
        .word 640b - \type\()_\taps\()_h_tbl
        .word 320b - \type\()_\taps\()_h_tbl
        .word 160b - \type\()_\taps\()_h_tbl
        .word 80b - \type\()_\taps\()_h_tbl
        .word 40b - \type\()_\taps\()_h_tbl
        .word 20b - \type\()_\taps\()_h_tbl
endjumptable


function L(\type\()_\taps\()_v)
        cmp \h, #4
        ubfx w10, \my, #7, #7
        and \my, \my, #0x7f
        b.le 4f
        mov \my, w10
4:
        add \xmy, x11, \my, uxtw #3

.ifc \type, prep
        dup v30.4s, w12 // 6 - intermediate_bits
        movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        movrel x10, \type\()_\taps\()_v_tbl
        ldrsw x9, [x10, x9, lsl #2]
.ifc \type, prep
        neg v30.4s, v30.4s // -(6-intermediate_bits)
.endif
        add x10, x10, x9
        br x10

20: // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        b.gt 28f

        cmp \h, #2
        ldur s0, [\xmy, #2]
        sub \src, \src, \s_strd
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \s_strd, \s_strd, #1
        lsl \d_strd, \d_strd, #1
        sxtl v0.8h, v0.8b

        // 2x2 v
        load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s v1, v2, v3, v4, v5
        b.gt 24f
        smull_smlal_4tap v6, v1, v2, v3, v4
        sqrshrun_h 6, v6
        umin_h v31, .8h, v6
        st_s \d_strd, v6, 2
        ret

24: // 2x4 v
        load_s \sr2, \src, \s_strd, v6, v7
        interleave_1_s v5, v6, v7
        smull_smlal_4tap v16, v1, v2, v3, v4
        smull_smlal_4tap v17, v3, v4, v5, v6
        sqrshrun_h 6, v16, v17
        umin_h v31, .8h, v16
        st_s \d_strd, v16, 4
        ret

28: // 2x6, 2x8, 2x12, 2x16 v
        ld1 {v0.8b}, [\xmy]
        sub \sr2, \src, \s_strd, lsl #1
        add \ds2, \dst, \d_strd
        sub \src, \sr2, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
        sxtl v0.8h, v0.8b

        load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
        interleave_1_s v1, v2, v3, v4, v5
        interleave_1_s v5, v6, v7
216:
        subs \h, \h, #4
        load_s \sr2, \src, \s_strd, v16, v17, v18, v19
        interleave_1_s v7, v16, v17, v18, v19
        smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
        smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
        sqrshrun_h 6, v24, v25
        umin_h v31, .8h, v24
        st_s \d_strd, v24, 4
        b.le 0f
        cmp \h, #2
        mov v1.16b, v5.16b
        mov v2.16b, v6.16b
        mov v3.16b, v7.16b
        mov v4.16b, v16.16b
        mov v5.16b, v17.16b
        mov v6.16b, v18.16b
        mov v7.16b, v19.16b
        b.eq 26f
        b 216b
26:
        load_s \sr2, \src, \s_strd, v16, v17
        interleave_1_s v7, v16, v17
        smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
        sqrshrun_h 6, v24
        umin_h v31, .4h, v24
        st_s \d_strd, v24, 2
0:
        ret
.endif

40:
        AARCH64_VALID_JUMP_TARGET
        b.gt 480f

        // 4x2, 4x4 v
        cmp \h, #2
        ldur s0, [\xmy, #2]
        sub \src, \src, \s_strd
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \s_strd, \s_strd, #1
        lsl \d_strd, \d_strd, #1
        sxtl v0.8h, v0.8b

        load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4tap v6, v1, v2, v3, v4
        smull_smlal_4tap v7, v2, v3, v4, v5
        shift_store_4 \type, \d_strd, v6, v7
        b.le 0f
        load_4h \sr2, \src, \s_strd, v6, v7
        smull_smlal_4tap v1, v3, v4, v5, v6
        smull_smlal_4tap v2, v4, v5, v6, v7
        shift_store_4 \type, \d_strd, v1, v2
0:
        ret

480: // 4x6, 4x8, 4x12, 4x16 v
        ld1 {v0.8b}, [\xmy]
        sub \sr2, \src, \s_strd, lsl #1
        add \ds2, \dst, \d_strd
        sub \src, \sr2, \s_strd
        lsl \s_strd, \s_strd, #1
        lsl \d_strd, \d_strd, #1
        sxtl v0.8h, v0.8b

        load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22

48:
        subs \h, \h, #4
        load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_4 \type, \d_strd, v1, v2, v3, v4
        b.le 0f
        cmp \h, #2
        mov v16.8b, v20.8b
        mov v17.8b, v21.8b
        mov v18.8b, v22.8b
        mov v19.8b, v23.8b
        mov v20.8b, v24.8b
        mov v21.8b, v25.8b
        mov v22.8b, v26.8b
        b.eq 46f
        b 48b
46:
        load_4h \sr2, \src, \s_strd, v23, v24
        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_4 \type, \d_strd, v1, v2
0:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
        b.gt 880f

        // 8x2, 8x4 v
        cmp \h, #2
        ldur s0, [\xmy, #2]
        sub \src, \src, \s_strd
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \s_strd, \s_strd, #1
        lsl \d_strd, \d_strd, #1
        sxtl v0.8h, v0.8b

        load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        smull_smlal_4tap v16, v1, v2, v3, v4
        smull2_smlal2_4tap v17, v1, v2, v3, v4
        smull_smlal_4tap v18, v2, v3, v4, v5
        smull2_smlal2_4tap v19, v2, v3, v4, v5
        shift_store_8 \type, \d_strd, v16, v17, v18, v19
        b.le 0f
        load_8h \sr2, \src, \s_strd, v6, v7
        smull_smlal_4tap v16, v3, v4, v5, v6
        smull2_smlal2_4tap v17, v3, v4, v5, v6
        smull_smlal_4tap v18, v4, v5, v6, v7
        smull2_smlal2_4tap v19, v4, v5, v6, v7
        shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
        ret

880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
2028640: 20291280: 2030 AARCH64_VALID_JUMP_TARGET 2031 ld1 {v0.8b}, [\xmy] 2032 sub \src, \src, \s_strd 2033 sub \src, \src, \s_strd, lsl #1 2034 sxtl v0.8h, v0.8b 2035 mov \my, \h 2036168: 2037 add \ds2, \dst, \d_strd 2038 add \sr2, \src, \s_strd 2039 lsl \s_strd, \s_strd, #1 2040 lsl \d_strd, \d_strd, #1 2041 2042 load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 2043 204488: 2045 subs \h, \h, #2 2046 load_8h \sr2, \src, \s_strd, v23, v24 2047 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 2048 smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 2049 smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 2050 smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 2051 shift_store_8 \type, \d_strd, v1, v2, v3, v4 2052 b.le 9f 2053 subs \h, \h, #2 2054 load_8h \sr2, \src, \s_strd, v25, v26 2055 smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 2056 smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 2057 smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 2058 smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 2059 shift_store_8 \type, \d_strd, v1, v2, v3, v4 2060 b.le 9f 2061 mov v16.16b, v20.16b 2062 mov v17.16b, v21.16b 2063 mov v18.16b, v22.16b 2064 mov v19.16b, v23.16b 2065 mov v20.16b, v24.16b 2066 mov v21.16b, v25.16b 2067 mov v22.16b, v26.16b 2068 b 88b 20699: 2070 subs \w, \w, #8 2071 b.le 0f 2072 asr \s_strd, \s_strd, #1 2073 asr \d_strd, \d_strd, #1 2074 msub \src, \s_strd, \xmy, \src 2075 msub \dst, \d_strd, \xmy, \dst 2076 sub \src, \src, \s_strd, lsl #3 2077 mov \h, \my 2078 add \src, \src, #16 2079 add \dst, \dst, #16 2080 b 168b 20810: 2082 ret 2083 2084160: 2085 AARCH64_VALID_JUMP_TARGET 2086 b.gt 1680b 2087 2088 // 16x2, 16x4 v 2089 ldur s0, [\xmy, #2] 2090 sub \src, \src, \s_strd 2091 sxtl v0.8h, v0.8b 2092 2093 load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 209416: 2095 load_16h \src, \src, \s_strd, v22, v23 2096 subs \h, \h, #1 2097 smull_smlal_4tap v1, v16, v18, v20, v22 2098 smull2_smlal2_4tap v2, v16, v18, v20, v22 2099 smull_smlal_4tap v3, v17, v19, v21, v23 2100 smull2_smlal2_4tap v4, v17, v19, v21, v23 2101 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 2102 b.le 0f 2103 mov v16.16b, v18.16b 2104 mov v17.16b, v19.16b 2105 mov v18.16b, v20.16b 2106 mov v19.16b, v21.16b 2107 mov v20.16b, v22.16b 2108 mov v21.16b, v23.16b 2109 b 16b 21100: 2111 ret 2112endfunc 2113 2114jumptable \type\()_\taps\()_v_tbl 2115 .word 1280b - \type\()_\taps\()_v_tbl 2116 .word 640b - \type\()_\taps\()_v_tbl 2117 .word 320b - \type\()_\taps\()_v_tbl 2118 .word 160b - \type\()_\taps\()_v_tbl 2119 .word 80b - \type\()_\taps\()_v_tbl 2120 .word 40b - \type\()_\taps\()_v_tbl 2121 .word 20b - \type\()_\taps\()_v_tbl 2122endjumptable 2123 2124function L(\type\()_\taps\()_hv) 2125 cmp \h, #4 2126 ubfx w10, \my, #7, #7 2127 and \my, \my, #0x7f 2128 b.le 4f 2129 mov \my, w10 21304: 2131 add \xmy, x11, \my, uxtw #3 2132 2133 movrel x10, \type\()_\taps\()_hv_tbl 2134 dup v30.4s, w12 // 6 - intermediate_bits 2135 ldrsw x9, [x10, x9, lsl #2] 2136 neg v30.4s, v30.4s // -(6-intermediate_bits) 2137.ifc \type, put 2138 dup v29.4s, w13 // 6 + intermediate_bits 2139.else 2140 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 2141.endif 2142 add x10, x10, x9 2143.ifc \type, put 2144 neg v29.4s, v29.4s // -(6+intermediate_bits) 2145.endif 2146 br x10 2147 214820: 2149 AARCH64_VALID_JUMP_TARGET 2150.ifc \type, put 2151 ldur s0, [\xmx, #2] 2152 b.gt 280f 2153 ldur s1, [\xmy, #2] 2154 2155 // 2x2, 2x4 hv 2156 sub 
\sr2, \src, #2 2157 sub \src, \sr2, \s_strd 2158 add \ds2, \dst, \d_strd 2159 lsl \s_strd, \s_strd, #1 2160 lsl \d_strd, \d_strd, #1 2161 sxtl v0.8h, v0.8b 2162 sxtl v1.8h, v1.8b 2163 mov x15, x30 2164 2165 ld1 {v27.8h}, [\src], \s_strd 2166 ext v28.16b, v27.16b, v27.16b, #2 2167 smull v27.4s, v27.4h, v0.4h 2168 smull v28.4s, v28.4h, v0.4h 2169 addp v27.4s, v27.4s, v28.4s 2170 addp v16.4s, v27.4s, v27.4s 2171 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) 2172 bl L(\type\()_\taps\()_filter_2) 2173 // The intermediates from the horizontal pass fit in 16 bit without 2174 // any bias; we could just as well keep them as .4s, but narrowing 2175 // them to .4h gives a significant speedup on out of order cores 2176 // (at the cost of a smaller slowdown on in-order cores such as A53). 2177 xtn v16.4h, v16.4s 2178 2179 trn1 v16.2s, v16.2s, v24.2s 2180 mov v17.8b, v24.8b 2181 21822: 2183 bl L(\type\()_\taps\()_filter_2) 2184 2185 ext v18.8b, v17.8b, v24.8b, #4 2186 smull v2.4s, v16.4h, v1.h[0] 2187 smlal v2.4s, v17.4h, v1.h[1] 2188 smlal v2.4s, v18.4h, v1.h[2] 2189 smlal v2.4s, v24.4h, v1.h[3] 2190 2191 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2192 sqxtun v2.4h, v2.4s 2193 umin v2.4h, v2.4h, v31.4h 2194 subs \h, \h, #2 2195 st1 {v2.s}[0], [\dst], \d_strd 2196 st1 {v2.s}[1], [\ds2], \d_strd 2197 b.le 0f 2198 mov v16.8b, v18.8b 2199 mov v17.8b, v24.8b 2200 b 2b 2201 2202280: // 2x8, 2x16, 2x32 hv 2203 ld1 {v1.8b}, [\xmy] 2204 sub \src, \src, #2 2205 sub \sr2, \src, \s_strd, lsl #1 2206 sub \src, \sr2, \s_strd 2207 add \ds2, \dst, \d_strd 2208 lsl \s_strd, \s_strd, #1 2209 lsl \d_strd, \d_strd, #1 2210 sxtl v0.8h, v0.8b 2211 sxtl v1.8h, v1.8b 2212 mov x15, x30 2213 2214 ld1 {v27.8h}, [\src], \s_strd 2215 ext v28.16b, v27.16b, v27.16b, #2 2216 smull v27.4s, v27.4h, v0.4h 2217 smull v28.4s, v28.4h, v0.4h 2218 addp v27.4s, v27.4s, v28.4s 2219 addp v16.4s, v27.4s, v27.4s 2220 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) 2221 // The intermediates from the horizontal pass fit in 16 bit without 2222 // any bias; we could just as well keep them as .4s, but narrowing 2223 // them to .4h gives a significant speedup on out of order cores 2224 // (at the cost of a smaller slowdown on in-order cores such as A53). 
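        // In C terms, the scaling scheme used throughout this put path is
        // roughly the following (an illustrative sketch, not the reference
        // code; h_sum/v_sum stand for the raw horizontal and vertical
        // filter sums):
        //
        //   int ib  = clz(bitdepth_max) - 18; // intermediate_bits: 4 for 10 bpc, 2 for 12 bpc
        //   int mid = (h_sum + ((1 << (6 - ib)) >> 1)) >> (6 - ib); // srshl by -(6-ib), fits in int16_t
        //   int out = (v_sum + ((1 << (6 + ib)) >> 1)) >> (6 + ib); // srshl by -(6+ib)
        //   out     = iclip(out, 0, bitdepth_max);                  // sqxtun + umin against v31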
2225 2226 bl L(\type\()_\taps\()_filter_2) 2227 xtn v16.4h, v16.4s 2228 trn1 v16.2s, v16.2s, v24.2s 2229 mov v17.8b, v24.8b 2230 bl L(\type\()_\taps\()_filter_2) 2231 ext v18.8b, v17.8b, v24.8b, #4 2232 mov v19.8b, v24.8b 2233 bl L(\type\()_\taps\()_filter_2) 2234 ext v20.8b, v19.8b, v24.8b, #4 2235 mov v21.8b, v24.8b 2236 223728: 2238 bl L(\type\()_\taps\()_filter_2) 2239 ext v22.8b, v21.8b, v24.8b, #4 2240.ifc \taps, 6tap 2241 smull v3.4s, v17.4h, v1.h[1] 2242 smlal v3.4s, v18.4h, v1.h[2] 2243 smlal v3.4s, v19.4h, v1.h[3] 2244 smlal v3.4s, v20.4h, v1.h[4] 2245 smlal v3.4s, v21.4h, v1.h[5] 2246 smlal v3.4s, v22.4h, v1.h[6] 2247.else // 8tap 2248 smull v3.4s, v16.4h, v1.h[0] 2249 smlal v3.4s, v17.4h, v1.h[1] 2250 smlal v3.4s, v18.4h, v1.h[2] 2251 smlal v3.4s, v19.4h, v1.h[3] 2252 smlal v3.4s, v20.4h, v1.h[4] 2253 smlal v3.4s, v21.4h, v1.h[5] 2254 smlal v3.4s, v22.4h, v1.h[6] 2255 smlal v3.4s, v24.4h, v1.h[7] 2256.endif 2257 2258 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2259 sqxtun v3.4h, v3.4s 2260 umin v3.4h, v3.4h, v31.4h 2261 subs \h, \h, #2 2262 st1 {v3.s}[0], [\dst], \d_strd 2263 st1 {v3.s}[1], [\ds2], \d_strd 2264 b.le 0f 2265 mov v16.8b, v18.8b 2266 mov v17.8b, v19.8b 2267 mov v18.8b, v20.8b 2268 mov v19.8b, v21.8b 2269 mov v20.8b, v22.8b 2270 mov v21.8b, v24.8b 2271 b 28b 2272 22730: 2274 ret x15 2275 2276L(\type\()_\taps\()_filter_2): 2277 ld1 {v25.8h}, [\sr2], \s_strd 2278 ld1 {v27.8h}, [\src], \s_strd 2279 ext v26.16b, v25.16b, v25.16b, #2 2280 ext v28.16b, v27.16b, v27.16b, #2 2281 trn1 v24.2s, v25.2s, v27.2s 2282 trn2 v27.2s, v25.2s, v27.2s 2283 trn1 v25.2s, v26.2s, v28.2s 2284 trn2 v28.2s, v26.2s, v28.2s 2285 smull v24.4s, v24.4h, v0.h[0] 2286 smlal v24.4s, v25.4h, v0.h[1] 2287 smlal v24.4s, v27.4h, v0.h[2] 2288 smlal v24.4s, v28.4h, v0.h[3] 2289 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2290 xtn v24.4h, v24.4s 2291 ret 2292.endif 2293 229440: 2295 AARCH64_VALID_JUMP_TARGET 2296 ldur s0, [\xmx, #2] 2297 b.gt 480f 2298 ldur s1, [\xmy, #2] 2299 sub \sr2, \src, #2 2300 sub \src, \sr2, \s_strd 2301 add \ds2, \dst, \d_strd 2302 lsl \s_strd, \s_strd, #1 2303 lsl \d_strd, \d_strd, #1 2304 sxtl v0.8h, v0.8b 2305 sxtl v1.8h, v1.8b 2306 mov x15, x30 2307 2308 // 4x2, 4x4 hv 2309 ld1 {v25.8h}, [\src], \s_strd 2310 ext v26.16b, v25.16b, v25.16b, #2 2311 ext v27.16b, v25.16b, v25.16b, #4 2312 ext v28.16b, v25.16b, v25.16b, #6 2313 smull v25.4s, v25.4h, v0.h[0] 2314 smlal v25.4s, v26.4h, v0.h[1] 2315 smlal v25.4s, v27.4h, v0.h[2] 2316 smlal v25.4s, v28.4h, v0.h[3] 2317 srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2318 // The intermediates from the horizontal pass fit in 16 bit without 2319 // any bias; we could just as well keep them as .4s, but narrowing 2320 // them to .4h gives a significant speedup on out of order cores 2321 // (at the cost of a smaller slowdown on in-order cores such as A53). 
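        // For the prep variant further down in this block, the vertical sum
        // is finished with a plain rounding shift by 6 and a PREP_BIAS
        // subtraction instead of the put-style shift-and-clip; roughly
        // (an illustrative sketch, hypothetical names):
        //
        //   int16_t tmp = ((v_sum + 32) >> 6) - PREP_BIAS;  // rshrn #6, then sub v29.8h
        //
        // which leaves intermediate_bits of extra precision in the signed
        // 16-bit buffer consumed by the compound (avg/w_avg/mask) kernels.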
2322 xtn v16.4h, v16.4s 2323 2324 bl L(\type\()_\taps\()_filter_4) 2325 mov v17.8b, v24.8b 2326 mov v18.8b, v25.8b 2327 23284: 2329 bl L(\type\()_\taps\()_filter_4) 2330 smull v2.4s, v16.4h, v1.h[0] 2331 smlal v2.4s, v17.4h, v1.h[1] 2332 smlal v2.4s, v18.4h, v1.h[2] 2333 smlal v2.4s, v24.4h, v1.h[3] 2334 smull v3.4s, v17.4h, v1.h[0] 2335 smlal v3.4s, v18.4h, v1.h[1] 2336 smlal v3.4s, v24.4h, v1.h[2] 2337 smlal v3.4s, v25.4h, v1.h[3] 2338.ifc \type, put 2339 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2340 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2341 sqxtun v2.4h, v2.4s 2342 sqxtun2 v2.8h, v3.4s 2343 umin v2.8h, v2.8h, v31.8h 2344.else 2345 rshrn v2.4h, v2.4s, #6 2346 rshrn2 v2.8h, v3.4s, #6 2347 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2348.endif 2349 subs \h, \h, #2 2350 2351 st1 {v2.8b}, [\dst], \d_strd 2352 st1 {v2.d}[1], [\ds2], \d_strd 2353 b.le 0f 2354 mov v16.8b, v18.8b 2355 mov v17.8b, v24.8b 2356 mov v18.8b, v25.8b 2357 b 4b 2358 2359480: // 4x8, 4x16, 4x32 hv 2360 ld1 {v1.8b}, [\xmy] 2361 sub \src, \src, #2 2362.ifc \taps, 6tap 2363 sub \sr2, \src, \s_strd 2364 sub \src, \src, \s_strd, lsl #1 2365.else 2366 sub \sr2, \src, \s_strd, lsl #1 2367 sub \src, \sr2, \s_strd 2368.endif 2369 add \ds2, \dst, \d_strd 2370 lsl \s_strd, \s_strd, #1 2371 lsl \d_strd, \d_strd, #1 2372 sxtl v0.8h, v0.8b 2373 sxtl v1.8h, v1.8b 2374 mov x15, x30 2375 2376 ld1 {v25.8h}, [\src], \s_strd 2377 ext v26.16b, v25.16b, v25.16b, #2 2378 ext v27.16b, v25.16b, v25.16b, #4 2379 ext v28.16b, v25.16b, v25.16b, #6 2380 smull v25.4s, v25.4h, v0.h[0] 2381 smlal v25.4s, v26.4h, v0.h[1] 2382 smlal v25.4s, v27.4h, v0.h[2] 2383 smlal v25.4s, v28.4h, v0.h[3] 2384 srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2385 // The intermediates from the horizontal pass fit in 16 bit without 2386 // any bias; we could just as well keep them as .4s, but narrowing 2387 // them to .4h gives a significant speedup on out of order cores 2388 // (at the cost of a smaller slowdown on in-order cores such as A53). 
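        // The 6tap variant needs two fewer rows of vertical context than the
        // 8tap one: the first horizontal result lands directly in v18, one
        // setup call to the filter_4 helper is skipped, and the 48: loop
        // below multiplies by v1.h[1]..v1.h[6] instead of v1.h[0]..v1.h[7].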
2389.ifc \taps, 6tap 2390 xtn v18.4h, v16.4s 2391.else 2392 xtn v16.4h, v16.4s 2393 2394 bl L(\type\()_\taps\()_filter_4) 2395 mov v17.8b, v24.8b 2396 mov v18.8b, v25.8b 2397.endif 2398 bl L(\type\()_\taps\()_filter_4) 2399 mov v19.8b, v24.8b 2400 mov v20.8b, v25.8b 2401 bl L(\type\()_\taps\()_filter_4) 2402 mov v21.8b, v24.8b 2403 mov v22.8b, v25.8b 2404 240548: 2406 bl L(\type\()_\taps\()_filter_4) 2407.ifc \taps, 6tap 2408 smull v3.4s, v18.4h, v1.h[1] 2409 smlal v3.4s, v19.4h, v1.h[2] 2410 smlal v3.4s, v20.4h, v1.h[3] 2411 smlal v3.4s, v21.4h, v1.h[4] 2412 smlal v3.4s, v22.4h, v1.h[5] 2413 smlal v3.4s, v24.4h, v1.h[6] 2414 smull v4.4s, v19.4h, v1.h[1] 2415 smlal v4.4s, v20.4h, v1.h[2] 2416 smlal v4.4s, v21.4h, v1.h[3] 2417 smlal v4.4s, v22.4h, v1.h[4] 2418 smlal v4.4s, v24.4h, v1.h[5] 2419 smlal v4.4s, v25.4h, v1.h[6] 2420.else // 8tap 2421 smull v3.4s, v16.4h, v1.h[0] 2422 smlal v3.4s, v17.4h, v1.h[1] 2423 smlal v3.4s, v18.4h, v1.h[2] 2424 smlal v3.4s, v19.4h, v1.h[3] 2425 smlal v3.4s, v20.4h, v1.h[4] 2426 smlal v3.4s, v21.4h, v1.h[5] 2427 smlal v3.4s, v22.4h, v1.h[6] 2428 smlal v3.4s, v24.4h, v1.h[7] 2429 smull v4.4s, v17.4h, v1.h[0] 2430 smlal v4.4s, v18.4h, v1.h[1] 2431 smlal v4.4s, v19.4h, v1.h[2] 2432 smlal v4.4s, v20.4h, v1.h[3] 2433 smlal v4.4s, v21.4h, v1.h[4] 2434 smlal v4.4s, v22.4h, v1.h[5] 2435 smlal v4.4s, v24.4h, v1.h[6] 2436 smlal v4.4s, v25.4h, v1.h[7] 2437.endif 2438.ifc \type, put 2439 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2440 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2441 sqxtun v3.4h, v3.4s 2442 sqxtun2 v3.8h, v4.4s 2443 umin v3.8h, v3.8h, v31.8h 2444.else 2445 rshrn v3.4h, v3.4s, #6 2446 rshrn2 v3.8h, v4.4s, #6 2447 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2448.endif 2449 subs \h, \h, #2 2450 st1 {v3.8b}, [\dst], \d_strd 2451 st1 {v3.d}[1], [\ds2], \d_strd 2452 b.le 0f 2453.ifc \taps, 8tap 2454 mov v16.8b, v18.8b 2455 mov v17.8b, v19.8b 2456.endif 2457 mov v18.8b, v20.8b 2458 mov v19.8b, v21.8b 2459 mov v20.8b, v22.8b 2460 mov v21.8b, v24.8b 2461 mov v22.8b, v25.8b 2462 b 48b 24630: 2464 ret x15 2465 2466L(\type\()_\taps\()_filter_4): 2467 ld1 {v24.8h}, [\sr2], \s_strd 2468 ld1 {v25.8h}, [\src], \s_strd 2469 ext v26.16b, v24.16b, v24.16b, #2 2470 ext v27.16b, v24.16b, v24.16b, #4 2471 ext v28.16b, v24.16b, v24.16b, #6 2472 smull v24.4s, v24.4h, v0.h[0] 2473 smlal v24.4s, v26.4h, v0.h[1] 2474 smlal v24.4s, v27.4h, v0.h[2] 2475 smlal v24.4s, v28.4h, v0.h[3] 2476 ext v26.16b, v25.16b, v25.16b, #2 2477 ext v27.16b, v25.16b, v25.16b, #4 2478 ext v28.16b, v25.16b, v25.16b, #6 2479 smull v25.4s, v25.4h, v0.h[0] 2480 smlal v25.4s, v26.4h, v0.h[1] 2481 smlal v25.4s, v27.4h, v0.h[2] 2482 smlal v25.4s, v28.4h, v0.h[3] 2483 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2484 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2485 xtn v24.4h, v24.4s 2486 xtn v25.4h, v25.4s 2487 ret 2488 248980: 2490160: 2491320: 2492 AARCH64_VALID_JUMP_TARGET 2493 b.gt 880f 2494 ld1 {v0.8b}, [\xmx] 2495 ldur s1, [\xmy, #2] 2496.ifc \taps, 6tap 2497 sub \src, \src, #4 2498.else 2499 sub \src, \src, #6 2500.endif 2501 sub \src, \src, \s_strd 2502 sxtl v0.8h, v0.8b 2503 sxtl v1.8h, v1.8b 2504 mov x15, x30 2505 mov \my, \h 2506 2507164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2508 add \ds2, \dst, \d_strd 2509 add \sr2, \src, \s_strd 2510 lsl \d_strd, \d_strd, #1 2511 lsl \s_strd, \s_strd, #1 2512 2513 ld1 {v27.8h, v28.8h}, [\src], \s_strd 2514.ifc \taps, 6tap 2515 smull v24.4s, v27.4h, v0.h[1] 2516 smull2 v25.4s, v27.8h, v0.h[1] 2517 .irpc i, 23456 2518 ext 
v26.16b, v27.16b, v28.16b, #(2*\i-2) 2519 smlal v24.4s, v26.4h, v0.h[\i] 2520 smlal2 v25.4s, v26.8h, v0.h[\i] 2521 .endr 2522.else 2523 smull v24.4s, v27.4h, v0.h[0] 2524 smull2 v25.4s, v27.8h, v0.h[0] 2525 .irpc i, 1234567 2526 ext v26.16b, v27.16b, v28.16b, #(2*\i) 2527 smlal v24.4s, v26.4h, v0.h[\i] 2528 smlal2 v25.4s, v26.8h, v0.h[\i] 2529 .endr 2530.endif 2531 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2532 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2533 // The intermediates from the horizontal pass fit in 16 bit without 2534 // any bias; we could just as well keep them as .4s, but narrowing 2535 // them to .4h gives a significant speedup on out of order cores 2536 // (at the cost of a smaller slowdown on in-order cores such as A53), 2537 // and conserves register space (no need to clobber v8-v15). 2538 uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 2539 2540 bl L(\type\()_\taps\()_filter_8) 2541 mov v17.16b, v23.16b 2542 mov v18.16b, v24.16b 2543 25448: 2545 smull v2.4s, v16.4h, v1.h[0] 2546 smull2 v3.4s, v16.8h, v1.h[0] 2547 bl L(\type\()_\taps\()_filter_8) 2548 smull v4.4s, v17.4h, v1.h[0] 2549 smull2 v5.4s, v17.8h, v1.h[0] 2550 smlal v2.4s, v17.4h, v1.h[1] 2551 smlal2 v3.4s, v17.8h, v1.h[1] 2552 smlal v4.4s, v18.4h, v1.h[1] 2553 smlal2 v5.4s, v18.8h, v1.h[1] 2554 smlal v2.4s, v18.4h, v1.h[2] 2555 smlal2 v3.4s, v18.8h, v1.h[2] 2556 smlal v4.4s, v23.4h, v1.h[2] 2557 smlal2 v5.4s, v23.8h, v1.h[2] 2558 smlal v2.4s, v23.4h, v1.h[3] 2559 smlal2 v3.4s, v23.8h, v1.h[3] 2560 smlal v4.4s, v24.4h, v1.h[3] 2561 smlal2 v5.4s, v24.8h, v1.h[3] 2562.ifc \type, put 2563 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2564 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2565 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2566 srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) 2567 sqxtun v2.4h, v2.4s 2568 sqxtun2 v2.8h, v3.4s 2569 sqxtun v3.4h, v4.4s 2570 sqxtun2 v3.8h, v5.4s 2571 umin v2.8h, v2.8h, v31.8h 2572 umin v3.8h, v3.8h, v31.8h 2573.else 2574 rshrn v2.4h, v2.4s, #6 2575 rshrn2 v2.8h, v3.4s, #6 2576 rshrn v3.4h, v4.4s, #6 2577 rshrn2 v3.8h, v5.4s, #6 2578 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2579 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2580.endif 2581 subs \h, \h, #2 2582 st1 {v2.8h}, [\dst], \d_strd 2583 st1 {v3.8h}, [\ds2], \d_strd 2584 b.le 9f 2585 mov v16.16b, v18.16b 2586 mov v17.16b, v23.16b 2587 mov v18.16b, v24.16b 2588 b 8b 25899: 2590 subs \w, \w, #8 2591 b.le 0f 2592 asr \s_strd, \s_strd, #1 2593 asr \d_strd, \d_strd, #1 2594 msub \src, \s_strd, \xmy, \src 2595 msub \dst, \d_strd, \xmy, \dst 2596 sub \src, \src, \s_strd, lsl #2 2597 mov \h, \my 2598 add \src, \src, #16 2599 add \dst, \dst, #16 2600 b 164b 2601 2602880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 2603640: 26041280: 2605 AARCH64_VALID_JUMP_TARGET 2606 ld1 {v0.8b}, [\xmx] 2607 ld1 {v1.8b}, [\xmy] 2608.ifc \taps, 6tap 2609 sub \src, \src, #4 2610.else 2611 sub \src, \src, #6 2612 sub \src, \src, \s_strd 2613.endif 2614 sub \src, \src, \s_strd, lsl #1 2615 sxtl v0.8h, v0.8b 2616 sxtl v1.8h, v1.8b 2617 mov x15, x30 2618 mov \my, \h 2619 2620168: 2621 add \ds2, \dst, \d_strd 2622 add \sr2, \src, \s_strd 2623 lsl \d_strd, \d_strd, #1 2624 lsl \s_strd, \s_strd, #1 2625 2626 ld1 {v27.8h, v28.8h}, [\src], \s_strd 2627.ifc \taps, 6tap 2628 smull v24.4s, v27.4h, v0.h[1] 2629 smull2 v25.4s, v27.8h, v0.h[1] 2630 .irpc i, 23456 2631 ext v26.16b, v27.16b, v28.16b, #(2*\i-2) 2632 smlal v24.4s, v26.4h, v0.h[\i] 2633 smlal2 v25.4s, v26.8h, v0.h[\i] 2634 .endr 2635.else // 8tap 2636 smull v24.4s, v27.4h, v0.h[0] 2637 smull2 v25.4s, v27.8h, v0.h[0] 2638 .irpc i, 1234567 2639 ext v26.16b, v27.16b, v28.16b, #(2*\i) 2640 smlal v24.4s, v26.4h, v0.h[\i] 2641 smlal2 v25.4s, v26.8h, v0.h[\i] 2642 .endr 2643.endif 2644 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2645 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2646 // The intermediates from the horizontal pass fit in 16 bit without 2647 // any bias; we could just as well keep them as .4s, but narrowing 2648 // them to .4h gives a significant speedup on out of order cores 2649 // (at the cost of a smaller slowdown on in-order cores such as A53), 2650 // and conserves register space (no need to clobber v8-v15). 2651.ifc \taps, 6tap 2652 uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 2653.else 2654 uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 2655 2656 bl L(\type\()_\taps\()_filter_8) 2657 mov v17.16b, v23.16b 2658 mov v18.16b, v24.16b 2659.endif 2660 bl L(\type\()_\taps\()_filter_8) 2661 mov v19.16b, v23.16b 2662 mov v20.16b, v24.16b 2663 bl L(\type\()_\taps\()_filter_8) 2664 mov v21.16b, v23.16b 2665 mov v22.16b, v24.16b 2666 266788: 2668.ifc \taps, 6tap 2669 smull v2.4s, v18.4h, v1.h[1] 2670 smull2 v3.4s, v18.8h, v1.h[1] 2671 bl L(\type\()_\taps\()_filter_8) 2672 smull v4.4s, v19.4h, v1.h[1] 2673 smull2 v5.4s, v19.8h, v1.h[1] 2674 smlal v2.4s, v19.4h, v1.h[2] 2675 smlal2 v3.4s, v19.8h, v1.h[2] 2676 smlal v4.4s, v20.4h, v1.h[2] 2677 smlal2 v5.4s, v20.8h, v1.h[2] 2678 smlal v2.4s, v20.4h, v1.h[3] 2679 smlal2 v3.4s, v20.8h, v1.h[3] 2680 smlal v4.4s, v21.4h, v1.h[3] 2681 smlal2 v5.4s, v21.8h, v1.h[3] 2682 smlal v2.4s, v21.4h, v1.h[4] 2683 smlal2 v3.4s, v21.8h, v1.h[4] 2684 smlal v4.4s, v22.4h, v1.h[4] 2685 smlal2 v5.4s, v22.8h, v1.h[4] 2686 smlal v2.4s, v22.4h, v1.h[5] 2687 smlal2 v3.4s, v22.8h, v1.h[5] 2688 smlal v4.4s, v23.4h, v1.h[5] 2689 smlal2 v5.4s, v23.8h, v1.h[5] 2690 smlal v2.4s, v23.4h, v1.h[6] 2691 smlal2 v3.4s, v23.8h, v1.h[6] 2692 smlal v4.4s, v24.4h, v1.h[6] 2693 smlal2 v5.4s, v24.8h, v1.h[6] 2694.else // 8tap 2695 smull v2.4s, v16.4h, v1.h[0] 2696 smull2 v3.4s, v16.8h, v1.h[0] 2697 bl L(\type\()_\taps\()_filter_8) 2698 smull v4.4s, v17.4h, v1.h[0] 2699 smull2 v5.4s, v17.8h, v1.h[0] 2700 smlal v2.4s, v17.4h, v1.h[1] 2701 smlal2 v3.4s, v17.8h, v1.h[1] 2702 smlal v4.4s, v18.4h, v1.h[1] 2703 smlal2 v5.4s, v18.8h, v1.h[1] 2704 smlal v2.4s, v18.4h, v1.h[2] 2705 smlal2 v3.4s, v18.8h, v1.h[2] 2706 smlal v4.4s, v19.4h, v1.h[2] 2707 smlal2 v5.4s, v19.8h, v1.h[2] 2708 smlal v2.4s, v19.4h, v1.h[3] 2709 smlal2 v3.4s, v19.8h, v1.h[3] 2710 smlal v4.4s, v20.4h, v1.h[3] 2711 smlal2 v5.4s, v20.8h, v1.h[3] 2712 smlal v2.4s, v20.4h, v1.h[4] 2713 smlal2 v3.4s, v20.8h, v1.h[4] 2714 smlal v4.4s, v21.4h, v1.h[4] 2715 smlal2 v5.4s, v21.8h, 
v1.h[4] 2716 smlal v2.4s, v21.4h, v1.h[5] 2717 smlal2 v3.4s, v21.8h, v1.h[5] 2718 smlal v4.4s, v22.4h, v1.h[5] 2719 smlal2 v5.4s, v22.8h, v1.h[5] 2720 smlal v2.4s, v22.4h, v1.h[6] 2721 smlal2 v3.4s, v22.8h, v1.h[6] 2722 smlal v4.4s, v23.4h, v1.h[6] 2723 smlal2 v5.4s, v23.8h, v1.h[6] 2724 smlal v2.4s, v23.4h, v1.h[7] 2725 smlal2 v3.4s, v23.8h, v1.h[7] 2726 smlal v4.4s, v24.4h, v1.h[7] 2727 smlal2 v5.4s, v24.8h, v1.h[7] 2728.endif 2729.ifc \type, put 2730 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2731 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2732 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2733 srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) 2734 sqxtun v2.4h, v2.4s 2735 sqxtun2 v2.8h, v3.4s 2736 sqxtun v3.4h, v4.4s 2737 sqxtun2 v3.8h, v5.4s 2738 umin v2.8h, v2.8h, v31.8h 2739 umin v3.8h, v3.8h, v31.8h 2740.else 2741 rshrn v2.4h, v2.4s, #6 2742 rshrn2 v2.8h, v3.4s, #6 2743 rshrn v3.4h, v4.4s, #6 2744 rshrn2 v3.8h, v5.4s, #6 2745 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2746 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2747.endif 2748 subs \h, \h, #2 2749 st1 {v2.8h}, [\dst], \d_strd 2750 st1 {v3.8h}, [\ds2], \d_strd 2751 b.le 9f 2752.ifc \taps, 8tap 2753 mov v16.16b, v18.16b 2754 mov v17.16b, v19.16b 2755.endif 2756 mov v18.16b, v20.16b 2757 mov v19.16b, v21.16b 2758 mov v20.16b, v22.16b 2759 mov v21.16b, v23.16b 2760 mov v22.16b, v24.16b 2761 b 88b 27629: 2763 subs \w, \w, #8 2764 b.le 0f 2765 asr \s_strd, \s_strd, #1 2766 asr \d_strd, \d_strd, #1 2767 msub \src, \s_strd, \xmy, \src 2768 msub \dst, \d_strd, \xmy, \dst 2769 sub \src, \src, \s_strd, lsl #3 2770 mov \h, \my 2771 add \src, \src, #16 2772 add \dst, \dst, #16 2773.ifc \taps, 6tap 2774 add \src, \src, \s_strd, lsl #1 2775.endif 2776 b 168b 27770: 2778 ret x15 2779 2780L(\type\()_\taps\()_filter_8): 2781 ld1 {v4.8h, v5.8h}, [\sr2], \s_strd 2782 ld1 {v6.8h, v7.8h}, [\src], \s_strd 2783.ifc \taps, 6tap 2784 smull v25.4s, v4.4h, v0.h[1] 2785 smull2 v26.4s, v4.8h, v0.h[1] 2786 smull v27.4s, v6.4h, v0.h[1] 2787 smull2 v28.4s, v6.8h, v0.h[1] 2788.irpc i, 23456 2789 ext v23.16b, v4.16b, v5.16b, #(2*\i-2) 2790 ext v24.16b, v6.16b, v7.16b, #(2*\i-2) 2791 smlal v25.4s, v23.4h, v0.h[\i] 2792 smlal2 v26.4s, v23.8h, v0.h[\i] 2793 smlal v27.4s, v24.4h, v0.h[\i] 2794 smlal2 v28.4s, v24.8h, v0.h[\i] 2795.endr 2796.else // 8tap 2797 smull v25.4s, v4.4h, v0.h[0] 2798 smull2 v26.4s, v4.8h, v0.h[0] 2799 smull v27.4s, v6.4h, v0.h[0] 2800 smull2 v28.4s, v6.8h, v0.h[0] 2801.irpc i, 1234567 2802 ext v23.16b, v4.16b, v5.16b, #(2*\i) 2803 ext v24.16b, v6.16b, v7.16b, #(2*\i) 2804 smlal v25.4s, v23.4h, v0.h[\i] 2805 smlal2 v26.4s, v23.8h, v0.h[\i] 2806 smlal v27.4s, v24.4h, v0.h[\i] 2807 smlal2 v28.4s, v24.8h, v0.h[\i] 2808.endr 2809.endif 2810 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2811 srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) 2812 srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) 2813 srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) 2814 uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 2815 uzp1 v24.8h, v27.8h, v28.8h // Ditto 2816 ret 2817endfunc 2818 2819jumptable \type\()_\taps\()_hv_tbl 2820 .word 1280b - \type\()_\taps\()_hv_tbl 2821 .word 640b - \type\()_\taps\()_hv_tbl 2822 .word 320b - \type\()_\taps\()_hv_tbl 2823 .word 160b - \type\()_\taps\()_hv_tbl 2824 .word 80b - \type\()_\taps\()_hv_tbl 2825 .word 40b - \type\()_\taps\()_hv_tbl 2826 .word 20b - \type\()_\taps\()_hv_tbl 2827endjumptable 2828.endm 2829 2830 2831.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, 
xmx, my, xmy, bdmax, ds2, sr2 2832function \type\()_bilin_16bpc_neon, export=1 2833.ifc \bdmax, w8 2834 ldr w8, [sp] 2835.endif 2836 dup v1.8h, \mx 2837 dup v3.8h, \my 2838 mov w10, #16 2839 sub w9, w10, \mx 2840 sub w10, w10, \my 2841 dup v0.8h, w9 2842 dup v2.8h, w10 2843.ifc \type, prep 2844 uxtw \d_strd, \w 2845 lsl \d_strd, \d_strd, #1 2846.endif 2847 2848 clz \bdmax, \bdmax // bitdepth_max 2849 clz w9, \w 2850 sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 2851 mov w11, #4 2852 sub w9, w9, #24 2853 sub w11, w11, \bdmax // 4 - intermediate_bits 2854 add w12, \bdmax, #4 // 4 + intermediate_bits 2855 cbnz \mx, L(\type\()_bilin_h) 2856 cbnz \my, L(\type\()_bilin_v) 2857 b \type\()_16bpc_neon 2858 2859L(\type\()_bilin_h): 2860 cbnz \my, L(\type\()_bilin_hv) 2861 2862 movrel x10, \type\()_bilin_h_tbl 2863 dup v31.8h, w11 // 4 - intermediate_bits 2864 ldrsw x9, [x10, x9, lsl #2] 2865 neg v31.8h, v31.8h // -(4-intermediate_bits) 2866.ifc \type, put 2867 dup v30.8h, \bdmax // intermediate_bits 2868.else 2869 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 2870.endif 2871 add x10, x10, x9 2872.ifc \type, put 2873 neg v30.8h, v30.8h // -intermediate_bits 2874.endif 2875 br x10 2876 287720: // 2xN h 2878 AARCH64_VALID_JUMP_TARGET 2879.ifc \type, put 2880 add \ds2, \dst, \d_strd 2881 add \sr2, \src, \s_strd 2882 lsl \d_strd, \d_strd, #1 2883 lsl \s_strd, \s_strd, #1 28842: 2885 ld1 {v4.4h}, [\src], \s_strd 2886 ld1 {v6.4h}, [\sr2], \s_strd 2887 ext v5.8b, v4.8b, v4.8b, #2 2888 ext v7.8b, v6.8b, v6.8b, #2 2889 trn1 v4.2s, v4.2s, v6.2s 2890 trn1 v5.2s, v5.2s, v7.2s 2891 subs \h, \h, #2 2892 mul v4.4h, v4.4h, v0.4h 2893 mla v4.4h, v5.4h, v1.4h 2894 urshl v4.4h, v4.4h, v31.4h 2895 urshl v4.4h, v4.4h, v30.4h 2896 st1 {v4.s}[0], [\dst], \d_strd 2897 st1 {v4.s}[1], [\ds2], \d_strd 2898 b.gt 2b 2899 ret 2900.endif 2901 290240: // 4xN h 2903 AARCH64_VALID_JUMP_TARGET 2904 add \ds2, \dst, \d_strd 2905 add \sr2, \src, \s_strd 2906 lsl \d_strd, \d_strd, #1 2907 lsl \s_strd, \s_strd, #1 29084: 2909 ld1 {v4.8h}, [\src], \s_strd 2910 ld1 {v6.8h}, [\sr2], \s_strd 2911 ext v5.16b, v4.16b, v4.16b, #2 2912 ext v7.16b, v6.16b, v6.16b, #2 2913 trn1 v4.2d, v4.2d, v6.2d 2914 trn1 v5.2d, v5.2d, v7.2d 2915 subs \h, \h, #2 2916 mul v4.8h, v4.8h, v0.8h 2917 mla v4.8h, v5.8h, v1.8h 2918 urshl v4.8h, v4.8h, v31.8h 2919.ifc \type, put 2920 urshl v4.8h, v4.8h, v30.8h 2921.else 2922 sub v4.8h, v4.8h, v29.8h 2923.endif 2924 st1 {v4.8b}, [\dst], \d_strd 2925 st1 {v4.d}[1], [\ds2], \d_strd 2926 b.gt 4b 2927 ret 2928 292980: // 8xN h 2930 AARCH64_VALID_JUMP_TARGET 2931 add \ds2, \dst, \d_strd 2932 add \sr2, \src, \s_strd 2933 lsl \d_strd, \d_strd, #1 2934 lsl \s_strd, \s_strd, #1 29358: 2936 ldr h5, [\src, #16] 2937 ldr h7, [\sr2, #16] 2938 ld1 {v4.8h}, [\src], \s_strd 2939 ld1 {v6.8h}, [\sr2], \s_strd 2940 ext v5.16b, v4.16b, v5.16b, #2 2941 ext v7.16b, v6.16b, v7.16b, #2 2942 subs \h, \h, #2 2943 mul v4.8h, v4.8h, v0.8h 2944 mla v4.8h, v5.8h, v1.8h 2945 mul v6.8h, v6.8h, v0.8h 2946 mla v6.8h, v7.8h, v1.8h 2947 urshl v4.8h, v4.8h, v31.8h 2948 urshl v6.8h, v6.8h, v31.8h 2949.ifc \type, put 2950 urshl v4.8h, v4.8h, v30.8h 2951 urshl v6.8h, v6.8h, v30.8h 2952.else 2953 sub v4.8h, v4.8h, v29.8h 2954 sub v6.8h, v6.8h, v29.8h 2955.endif 2956 st1 {v4.8h}, [\dst], \d_strd 2957 st1 {v6.8h}, [\ds2], \d_strd 2958 b.gt 8b 2959 ret 2960160: 2961320: 2962640: 29631280: // 16xN, 32xN, ... 
h 2964 AARCH64_VALID_JUMP_TARGET 2965 add \ds2, \dst, \d_strd 2966 add \sr2, \src, \s_strd 2967 lsl \s_strd, \s_strd, #1 2968 2969 sub \s_strd, \s_strd, \w, uxtw #1 2970 sub \s_strd, \s_strd, #16 2971.ifc \type, put 2972 lsl \d_strd, \d_strd, #1 2973 sub \d_strd, \d_strd, \w, uxtw #1 2974.endif 2975161: 2976 ld1 {v16.8h}, [\src], #16 2977 ld1 {v21.8h}, [\sr2], #16 2978 mov \mx, \w 2979 298016: 2981 ld1 {v17.8h, v18.8h}, [\src], #32 2982 ld1 {v22.8h, v23.8h}, [\sr2], #32 2983 ext v19.16b, v16.16b, v17.16b, #2 2984 ext v20.16b, v17.16b, v18.16b, #2 2985 ext v24.16b, v21.16b, v22.16b, #2 2986 ext v25.16b, v22.16b, v23.16b, #2 2987 mul v16.8h, v16.8h, v0.8h 2988 mla v16.8h, v19.8h, v1.8h 2989 mul v17.8h, v17.8h, v0.8h 2990 mla v17.8h, v20.8h, v1.8h 2991 mul v21.8h, v21.8h, v0.8h 2992 mla v21.8h, v24.8h, v1.8h 2993 mul v22.8h, v22.8h, v0.8h 2994 mla v22.8h, v25.8h, v1.8h 2995 urshl v16.8h, v16.8h, v31.8h 2996 urshl v17.8h, v17.8h, v31.8h 2997 urshl v21.8h, v21.8h, v31.8h 2998 urshl v22.8h, v22.8h, v31.8h 2999 subs \mx, \mx, #16 3000.ifc \type, put 3001 urshl v16.8h, v16.8h, v30.8h 3002 urshl v17.8h, v17.8h, v30.8h 3003 urshl v21.8h, v21.8h, v30.8h 3004 urshl v22.8h, v22.8h, v30.8h 3005.else 3006 sub v16.8h, v16.8h, v29.8h 3007 sub v17.8h, v17.8h, v29.8h 3008 sub v21.8h, v21.8h, v29.8h 3009 sub v22.8h, v22.8h, v29.8h 3010.endif 3011 st1 {v16.8h, v17.8h}, [\dst], #32 3012 st1 {v21.8h, v22.8h}, [\ds2], #32 3013 b.le 9f 3014 3015 mov v16.16b, v18.16b 3016 mov v21.16b, v23.16b 3017 b 16b 3018 30199: 3020 add \dst, \dst, \d_strd 3021 add \ds2, \ds2, \d_strd 3022 add \src, \src, \s_strd 3023 add \sr2, \sr2, \s_strd 3024 3025 subs \h, \h, #2 3026 b.gt 161b 3027 ret 3028endfunc 3029 3030jumptable \type\()_bilin_h_tbl 3031 .word 1280b - \type\()_bilin_h_tbl 3032 .word 640b - \type\()_bilin_h_tbl 3033 .word 320b - \type\()_bilin_h_tbl 3034 .word 160b - \type\()_bilin_h_tbl 3035 .word 80b - \type\()_bilin_h_tbl 3036 .word 40b - \type\()_bilin_h_tbl 3037 .word 20b - \type\()_bilin_h_tbl 3038endjumptable 3039 3040 3041function L(\type\()_bilin_v) 3042 cmp \h, #4 3043 movrel x10, \type\()_bilin_v_tbl 3044.ifc \type, prep 3045 dup v31.8h, w11 // 4 - intermediate_bits 3046.endif 3047 ldrsw x9, [x10, x9, lsl #2] 3048.ifc \type, prep 3049 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 3050 neg v31.8h, v31.8h // -(4-intermediate_bits) 3051.endif 3052 add x10, x10, x9 3053 br x10 3054 305520: // 2xN v 3056 AARCH64_VALID_JUMP_TARGET 3057.ifc \type, put 3058 cmp \h, #2 3059 add \ds2, \dst, \d_strd 3060 add \sr2, \src, \s_strd 3061 lsl \s_strd, \s_strd, #1 3062 lsl \d_strd, \d_strd, #1 3063 3064 // 2x2 v 3065 ld1r {v16.4s}, [\src], \s_strd 3066 b.gt 24f 306722: 3068 ld1r {v17.4s}, [\sr2], \s_strd 3069 ld1r {v18.4s}, [\src], \s_strd 3070 trn1 v16.2s, v16.2s, v17.2s 3071 trn1 v17.2s, v17.2s, v18.2s 3072 mul v4.4h, v16.4h, v2.4h 3073 mla v4.4h, v17.4h, v3.4h 3074 urshr v4.8h, v4.8h, #4 3075 str s4, [\dst] 3076 st1 {v4.s}[1], [\ds2] 3077 ret 307824: // 2x4, 2x6, 2x8, ... 
v 3079 ld1r {v17.4s}, [\sr2], \s_strd 3080 ld1r {v18.4s}, [\src], \s_strd 3081 ld1r {v19.4s}, [\sr2], \s_strd 3082 ld1r {v20.4s}, [\src], \s_strd 3083 sub \h, \h, #4 3084 trn1 v16.2s, v16.2s, v17.2s 3085 trn1 v17.2s, v17.2s, v18.2s 3086 trn1 v18.2s, v18.2s, v19.2s 3087 trn1 v19.2s, v19.2s, v20.2s 3088 trn1 v16.2d, v16.2d, v18.2d 3089 trn1 v17.2d, v17.2d, v19.2d 3090 mul v4.8h, v16.8h, v2.8h 3091 mla v4.8h, v17.8h, v3.8h 3092 cmp \h, #2 3093 urshr v4.8h, v4.8h, #4 3094 st1 {v4.s}[0], [\dst], \d_strd 3095 st1 {v4.s}[1], [\ds2], \d_strd 3096 st1 {v4.s}[2], [\dst], \d_strd 3097 st1 {v4.s}[3], [\ds2], \d_strd 3098 b.lt 0f 3099 mov v16.8b, v20.8b 3100 b.eq 22b 3101 b 24b 31020: 3103 ret 3104.endif 3105 310640: // 4xN v 3107 AARCH64_VALID_JUMP_TARGET 3108 add \ds2, \dst, \d_strd 3109 add \sr2, \src, \s_strd 3110 lsl \s_strd, \s_strd, #1 3111 lsl \d_strd, \d_strd, #1 3112 ld1 {v16.4h}, [\src], \s_strd 31134: 3114 ld1 {v17.4h}, [\sr2], \s_strd 3115 ld1 {v18.4h}, [\src], \s_strd 3116 trn1 v16.2d, v16.2d, v17.2d 3117 trn1 v17.2d, v17.2d, v18.2d 3118 mul v4.8h, v16.8h, v2.8h 3119 mla v4.8h, v17.8h, v3.8h 3120 subs \h, \h, #2 3121.ifc \type, put 3122 urshr v4.8h, v4.8h, #4 3123.else 3124 urshl v4.8h, v4.8h, v31.8h 3125 sub v4.8h, v4.8h, v29.8h 3126.endif 3127 st1 {v4.8b}, [\dst], \d_strd 3128 st1 {v4.d}[1], [\ds2], \d_strd 3129 b.le 0f 3130 mov v16.8b, v18.8b 3131 b 4b 31320: 3133 ret 3134 313580: // 8xN v 3136 AARCH64_VALID_JUMP_TARGET 3137 add \ds2, \dst, \d_strd 3138 add \sr2, \src, \s_strd 3139 lsl \s_strd, \s_strd, #1 3140 lsl \d_strd, \d_strd, #1 3141 ld1 {v16.8h}, [\src], \s_strd 31428: 3143 ld1 {v17.8h}, [\sr2], \s_strd 3144 ld1 {v18.8h}, [\src], \s_strd 3145 mul v4.8h, v16.8h, v2.8h 3146 mla v4.8h, v17.8h, v3.8h 3147 mul v5.8h, v17.8h, v2.8h 3148 mla v5.8h, v18.8h, v3.8h 3149 subs \h, \h, #2 3150.ifc \type, put 3151 urshr v4.8h, v4.8h, #4 3152 urshr v5.8h, v5.8h, #4 3153.else 3154 urshl v4.8h, v4.8h, v31.8h 3155 urshl v5.8h, v5.8h, v31.8h 3156 sub v4.8h, v4.8h, v29.8h 3157 sub v5.8h, v5.8h, v29.8h 3158.endif 3159 st1 {v4.8h}, [\dst], \d_strd 3160 st1 {v5.8h}, [\ds2], \d_strd 3161 b.le 0f 3162 mov v16.16b, v18.16b 3163 b 8b 31640: 3165 ret 3166 3167160: // 16xN, 32xN, ... 
3168320: 3169640: 31701280: 3171 AARCH64_VALID_JUMP_TARGET 3172 mov \my, \h 31731: 3174 add \ds2, \dst, \d_strd 3175 add \sr2, \src, \s_strd 3176 lsl \s_strd, \s_strd, #1 3177 lsl \d_strd, \d_strd, #1 3178 3179 ld1 {v16.8h, v17.8h}, [\src], \s_strd 31802: 3181 ld1 {v18.8h, v19.8h}, [\sr2], \s_strd 3182 ld1 {v20.8h, v21.8h}, [\src], \s_strd 3183 mul v4.8h, v16.8h, v2.8h 3184 mla v4.8h, v18.8h, v3.8h 3185 mul v5.8h, v17.8h, v2.8h 3186 mla v5.8h, v19.8h, v3.8h 3187 mul v6.8h, v18.8h, v2.8h 3188 mla v6.8h, v20.8h, v3.8h 3189 mul v7.8h, v19.8h, v2.8h 3190 mla v7.8h, v21.8h, v3.8h 3191 subs \h, \h, #2 3192.ifc \type, put 3193 urshr v4.8h, v4.8h, #4 3194 urshr v5.8h, v5.8h, #4 3195 urshr v6.8h, v6.8h, #4 3196 urshr v7.8h, v7.8h, #4 3197.else 3198 urshl v4.8h, v4.8h, v31.8h 3199 urshl v5.8h, v5.8h, v31.8h 3200 urshl v6.8h, v6.8h, v31.8h 3201 urshl v7.8h, v7.8h, v31.8h 3202 sub v4.8h, v4.8h, v29.8h 3203 sub v5.8h, v5.8h, v29.8h 3204 sub v6.8h, v6.8h, v29.8h 3205 sub v7.8h, v7.8h, v29.8h 3206.endif 3207 st1 {v4.8h, v5.8h}, [\dst], \d_strd 3208 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 3209 b.le 9f 3210 mov v16.16b, v20.16b 3211 mov v17.16b, v21.16b 3212 b 2b 32139: 3214 subs \w, \w, #16 3215 b.le 0f 3216 asr \s_strd, \s_strd, #1 3217 asr \d_strd, \d_strd, #1 3218 msub \src, \s_strd, \xmy, \src 3219 msub \dst, \d_strd, \xmy, \dst 3220 sub \src, \src, \s_strd, lsl #1 3221 mov \h, \my 3222 add \src, \src, #32 3223 add \dst, \dst, #32 3224 b 1b 32250: 3226 ret 3227endfunc 3228 3229jumptable \type\()_bilin_v_tbl 3230 .word 1280b - \type\()_bilin_v_tbl 3231 .word 640b - \type\()_bilin_v_tbl 3232 .word 320b - \type\()_bilin_v_tbl 3233 .word 160b - \type\()_bilin_v_tbl 3234 .word 80b - \type\()_bilin_v_tbl 3235 .word 40b - \type\()_bilin_v_tbl 3236 .word 20b - \type\()_bilin_v_tbl 3237endjumptable 3238 3239function L(\type\()_bilin_hv) 3240 movrel x10, \type\()_bilin_hv_tbl 3241 dup v31.8h, w11 // 4 - intermediate_bits 3242 ldrsw x9, [x10, x9, lsl #2] 3243 neg v31.8h, v31.8h // -(4-intermediate_bits) 3244.ifc \type, put 3245 dup v30.4s, w12 // 4 + intermediate_bits 3246.else 3247 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 3248.endif 3249 add x10, x10, x9 3250.ifc \type, put 3251 neg v30.4s, v30.4s // -(4+intermediate_bits) 3252.endif 3253 br x10 3254 325520: // 2xN hv 3256 AARCH64_VALID_JUMP_TARGET 3257.ifc \type, put 3258 add \sr2, \src, \s_strd 3259 add \ds2, \dst, \d_strd 3260 lsl \s_strd, \s_strd, #1 3261 lsl \d_strd, \d_strd, #1 3262 3263 ld1 {v20.4h}, [\src], \s_strd 3264 ext v21.8b, v20.8b, v20.8b, #2 3265 mul v16.4h, v20.4h, v0.4h 3266 mla v16.4h, v21.4h, v1.4h 3267 urshl v16.4h, v16.4h, v31.4h 3268 32692: 3270 ld1 {v22.4h}, [\sr2], \s_strd 3271 ld1 {v24.4h}, [\src], \s_strd 3272 ext v23.8b, v22.8b, v22.8b, #2 3273 ext v25.8b, v24.8b, v24.8b, #2 3274 trn1 v22.2s, v22.2s, v24.2s 3275 trn1 v23.2s, v23.2s, v25.2s 3276 mul v17.4h, v22.4h, v0.4h 3277 mla v17.4h, v23.4h, v1.4h 3278 urshl v17.4h, v17.4h, v31.4h 3279 3280 trn1 v16.2s, v16.2s, v17.2s 3281 3282 umull v4.4s, v16.4h, v2.4h 3283 umlal v4.4s, v17.4h, v3.4h 3284 urshl v4.4s, v4.4s, v30.4s 3285 xtn v4.4h, v4.4s 3286 subs \h, \h, #2 3287 st1 {v4.s}[0], [\dst], \d_strd 3288 st1 {v4.s}[1], [\ds2], \d_strd 3289 b.le 0f 3290 trn2 v16.2s, v17.2s, v17.2s 3291 b 2b 32920: 3293 ret 3294.endif 3295 329640: // 4xN hv 3297 AARCH64_VALID_JUMP_TARGET 3298 add \sr2, \src, \s_strd 3299 add \ds2, \dst, \d_strd 3300 lsl \s_strd, \s_strd, #1 3301 lsl \d_strd, \d_strd, #1 3302 3303 ld1 {v20.8h}, [\src], \s_strd 3304 ext v21.16b, v20.16b, v20.16b, #2 3305 mul v16.4h, v20.4h, 
v0.4h 3306 mla v16.4h, v21.4h, v1.4h 3307 urshl v16.4h, v16.4h, v31.4h 3308 33094: 3310 ld1 {v22.8h}, [\sr2], \s_strd 3311 ld1 {v24.8h}, [\src], \s_strd 3312 ext v23.16b, v22.16b, v22.16b, #2 3313 ext v25.16b, v24.16b, v24.16b, #2 3314 trn1 v22.2d, v22.2d, v24.2d 3315 trn1 v23.2d, v23.2d, v25.2d 3316 mul v17.8h, v22.8h, v0.8h 3317 mla v17.8h, v23.8h, v1.8h 3318 urshl v17.8h, v17.8h, v31.8h 3319 3320 trn1 v16.2d, v16.2d, v17.2d 3321 3322 umull v4.4s, v16.4h, v2.4h 3323 umlal v4.4s, v17.4h, v3.4h 3324 umull2 v5.4s, v16.8h, v2.8h 3325 umlal2 v5.4s, v17.8h, v3.8h 3326.ifc \type, put 3327 urshl v4.4s, v4.4s, v30.4s 3328 urshl v5.4s, v5.4s, v30.4s 3329 uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 3330.else 3331 rshrn v4.4h, v4.4s, #4 3332 rshrn2 v4.8h, v5.4s, #4 3333 sub v4.8h, v4.8h, v29.8h 3334.endif 3335 subs \h, \h, #2 3336 st1 {v4.8b}, [\dst], \d_strd 3337 st1 {v4.d}[1], [\ds2], \d_strd 3338 b.le 0f 3339 trn2 v16.2d, v17.2d, v17.2d 3340 b 4b 33410: 3342 ret 3343 334480: // 8xN, 16xN, ... hv 3345160: 3346320: 3347640: 33481280: 3349 AARCH64_VALID_JUMP_TARGET 3350 mov \my, \h 3351 33521: 3353 add \sr2, \src, \s_strd 3354 add \ds2, \dst, \d_strd 3355 lsl \s_strd, \s_strd, #1 3356 lsl \d_strd, \d_strd, #1 3357 3358 ldr h21, [\src, #16] 3359 ld1 {v20.8h}, [\src], \s_strd 3360 ext v21.16b, v20.16b, v21.16b, #2 3361 mul v16.8h, v20.8h, v0.8h 3362 mla v16.8h, v21.8h, v1.8h 3363 urshl v16.8h, v16.8h, v31.8h 3364 33652: 3366 ldr h23, [\sr2, #16] 3367 ld1 {v22.8h}, [\sr2], \s_strd 3368 ldr h25, [\src, #16] 3369 ld1 {v24.8h}, [\src], \s_strd 3370 ext v23.16b, v22.16b, v23.16b, #2 3371 ext v25.16b, v24.16b, v25.16b, #2 3372 mul v17.8h, v22.8h, v0.8h 3373 mla v17.8h, v23.8h, v1.8h 3374 mul v18.8h, v24.8h, v0.8h 3375 mla v18.8h, v25.8h, v1.8h 3376 urshl v17.8h, v17.8h, v31.8h 3377 urshl v18.8h, v18.8h, v31.8h 3378 3379 umull v4.4s, v16.4h, v2.4h 3380 umlal v4.4s, v17.4h, v3.4h 3381 umull2 v5.4s, v16.8h, v2.8h 3382 umlal2 v5.4s, v17.8h, v3.8h 3383 umull v6.4s, v17.4h, v2.4h 3384 umlal v6.4s, v18.4h, v3.4h 3385 umull2 v7.4s, v17.8h, v2.8h 3386 umlal2 v7.4s, v18.8h, v3.8h 3387.ifc \type, put 3388 urshl v4.4s, v4.4s, v30.4s 3389 urshl v5.4s, v5.4s, v30.4s 3390 urshl v6.4s, v6.4s, v30.4s 3391 urshl v7.4s, v7.4s, v30.4s 3392 uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 3393 uzp1 v5.8h, v6.8h, v7.8h // Ditto 3394.else 3395 rshrn v4.4h, v4.4s, #4 3396 rshrn2 v4.8h, v5.4s, #4 3397 rshrn v5.4h, v6.4s, #4 3398 rshrn2 v5.8h, v7.4s, #4 3399 sub v4.8h, v4.8h, v29.8h 3400 sub v5.8h, v5.8h, v29.8h 3401.endif 3402 subs \h, \h, #2 3403 st1 {v4.8h}, [\dst], \d_strd 3404 st1 {v5.8h}, [\ds2], \d_strd 3405 b.le 9f 3406 mov v16.16b, v18.16b 3407 b 2b 34089: 3409 subs \w, \w, #8 3410 b.le 0f 3411 asr \s_strd, \s_strd, #1 3412 asr \d_strd, \d_strd, #1 3413 msub \src, \s_strd, \xmy, \src 3414 msub \dst, \d_strd, \xmy, \dst 3415 sub \src, \src, \s_strd, lsl #1 3416 mov \h, \my 3417 add \src, \src, #16 3418 add \dst, \dst, #16 3419 b 1b 34200: 3421 ret 3422endfunc 3423 3424jumptable \type\()_bilin_hv_tbl 3425 .word 1280b - \type\()_bilin_hv_tbl 3426 .word 640b - \type\()_bilin_hv_tbl 3427 .word 320b - \type\()_bilin_hv_tbl 3428 .word 160b - \type\()_bilin_hv_tbl 3429 .word 80b - \type\()_bilin_hv_tbl 3430 .word 40b - \type\()_bilin_hv_tbl 3431 .word 20b - \type\()_bilin_hv_tbl 3432endjumptable 3433.endm 3434 3435make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap 3436make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap 3437make_8tap_fn put, sharp, SHARP, SHARP, 8tap 3438make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap 
3439make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap 3440filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap 3441 3442make_8tap_fn put, regular, REGULAR, REGULAR, 6tap 3443make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap 3444make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap 3445make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap 3446filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap 3447filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 3448 3449make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap 3450make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap 3451make_8tap_fn prep, sharp, SHARP, SHARP, 8tap 3452make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap 3453make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap 3454filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap 3455 3456make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap 3457make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap 3458make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap 3459make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap 3460filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap 3461filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 3462 3463 3464.macro load_filter_row dst, src, inc 3465 asr w13, \src, #10 3466 add \src, \src, \inc 3467 ldr \dst, [x11, w13, sxtw #3] 3468.endm 3469 3470function warp_filter_horz_neon 3471 add w12, w5, #512 3472 3473 ld1 {v16.8h, v17.8h}, [x2], x3 3474 3475 load_filter_row d0, w12, w7 3476 load_filter_row d1, w12, w7 3477 load_filter_row d2, w12, w7 3478 sxtl v0.8h, v0.8b 3479 load_filter_row d3, w12, w7 3480 sxtl v1.8h, v1.8b 3481 load_filter_row d4, w12, w7 3482 sxtl v2.8h, v2.8b 3483 load_filter_row d5, w12, w7 3484 sxtl v3.8h, v3.8b 3485 load_filter_row d6, w12, w7 3486 sxtl v4.8h, v4.8b 3487 load_filter_row d7, w12, w7 3488 sxtl v5.8h, v5.8b 3489 ext v18.16b, v16.16b, v17.16b, #2*1 3490 smull v8.4s, v16.4h, v0.4h 3491 smull2 v9.4s, v16.8h, v0.8h 3492 sxtl v6.8h, v6.8b 3493 ext v19.16b, v16.16b, v17.16b, #2*2 3494 smull v10.4s, v18.4h, v1.4h 3495 smull2 v11.4s, v18.8h, v1.8h 3496 sxtl v7.8h, v7.8b 3497 ext v20.16b, v16.16b, v17.16b, #2*3 3498 smull v0.4s, v19.4h, v2.4h 3499 smull2 v1.4s, v19.8h, v2.8h 3500 ext v21.16b, v16.16b, v17.16b, #2*4 3501 addp v8.4s, v8.4s, v9.4s 3502 smull v2.4s, v20.4h, v3.4h 3503 smull2 v3.4s, v20.8h, v3.8h 3504 ext v22.16b, v16.16b, v17.16b, #2*5 3505 addp v9.4s, v10.4s, v11.4s 3506 smull v10.4s, v21.4h, v4.4h 3507 smull2 v11.4s, v21.8h, v4.8h 3508 ext v23.16b, v16.16b, v17.16b, #2*6 3509 addp v0.4s, v0.4s, v1.4s 3510 smull v18.4s, v22.4h, v5.4h 3511 smull2 v19.4s, v22.8h, v5.8h 3512 ext v16.16b, v16.16b, v17.16b, #2*7 3513 addp v1.4s, v2.4s, v3.4s 3514 addp v2.4s, v10.4s, v11.4s 3515 smull v20.4s, v23.4h, v6.4h 3516 smull2 v21.4s, v23.8h, v6.8h 3517 addp v3.4s, v18.4s, v19.4s 3518 smull v22.4s, v16.4h, v7.4h 3519 smull2 v23.4s, v16.8h, v7.8h 3520 addp v4.4s, v20.4s, v21.4s 3521 addp v5.4s, v22.4s, v23.4s 3522 3523 addp v8.4s, v8.4s, v9.4s 3524 addp v0.4s, v0.4s, v1.4s 3525 addp v2.4s, v2.4s, v3.4s 3526 addp v4.4s, v4.4s, v5.4s 3527 3528 addp v16.4s, v8.4s, v0.4s 3529 addp v17.4s, v2.4s, v4.4s 3530 3531 add w5, w5, w8 3532 3533 srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits) 3534 srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits) 3535 3536 ret 3537endfunc 3538 3539// void dav1d_warp_affine_8x8_16bpc_neon( 3540// pixel *dst, const ptrdiff_t dst_stride, 3541// const pixel *src, const ptrdiff_t 
src_stride, 3542// const int16_t *const abcd, int mx, int my, 3543// const int bitdepth_max) 3544.macro warp t 3545function warp_affine_8x8\t\()_16bpc_neon, export=1 3546 stp d8, d9, [sp, #-0x40]! 3547 stp d10, d11, [sp, #0x10] 3548 stp d12, d13, [sp, #0x20] 3549 stp d14, d15, [sp, #0x30] 3550 3551.ifb \t 3552 dup v15.8h, w7 // bitdepth_max 3553.else 3554 movi v15.8h, #(PREP_BIAS >> 8), lsl #8 3555.endif 3556 clz w7, w7 3557 // intermediate_bits = clz(bitdepth_max) - 18 3558.ifb \t 3559 sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 3560.endif 3561 sub w7, w7, #25 // -(7 - intermediate_bits) 3562.ifb \t 3563 neg w8, w8 // -(7 + intermediate_bits) 3564.endif 3565 dup v14.4s, w7 // -(7 - intermediate_bits) 3566.ifb \t 3567 dup v13.4s, w8 // -(7 + intermediate_bits) 3568.endif 3569 3570 ldr x4, [x4] 3571 sbfx x7, x4, #0, #16 3572 sbfx x8, x4, #16, #16 3573 sbfx x9, x4, #32, #16 3574 sbfx x4, x4, #48, #16 3575 mov w10, #8 3576 sub x2, x2, x3, lsl #1 3577 sub x2, x2, x3 3578 sub x2, x2, #6 3579 movrel x11, X(mc_warp_filter), 64*8 3580 mov x15, x30 3581.ifnb \t 3582 lsl x1, x1, #1 3583.endif 3584 3585 bl warp_filter_horz_neon 3586 uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2 3587 bl warp_filter_horz_neon 3588 uzp1 v25.8h, v16.8h, v17.8h // Ditto 3589 bl warp_filter_horz_neon 3590 uzp1 v26.8h, v16.8h, v17.8h // Ditto 3591 bl warp_filter_horz_neon 3592 uzp1 v27.8h, v16.8h, v17.8h // Ditto 3593 bl warp_filter_horz_neon 3594 uzp1 v28.8h, v16.8h, v17.8h // Ditto 3595 bl warp_filter_horz_neon 3596 uzp1 v29.8h, v16.8h, v17.8h // Ditto 3597 bl warp_filter_horz_neon 3598 uzp1 v30.8h, v16.8h, v17.8h // Ditto 3599 36001: 3601 add w14, w6, #512 3602 bl warp_filter_horz_neon 3603 uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2 3604 3605 load_filter_row d0, w14, w9 3606 load_filter_row d1, w14, w9 3607 load_filter_row d2, w14, w9 3608 load_filter_row d3, w14, w9 3609 load_filter_row d4, w14, w9 3610 load_filter_row d5, w14, w9 3611 load_filter_row d6, w14, w9 3612 load_filter_row d7, w14, w9 3613 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl 3614 3615 // This ordering of smull/smlal/smull2/smlal2 is highly 3616 // beneficial for Cortex A53 here. 
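        // In rough C terms the MAC chain below computes, for each column x
        // of the 8-pixel row (an illustrative sketch, not the reference
        // implementation; gamma/delta stand for abcd[2]/abcd[3]):
        //
        //   const int8_t *f = mc_warp_filter[64 + ((my + 512 + x*gamma) >> 10)];
        //   acc[x] = 0;
        //   for (int k = 0; k < 8; k++)
        //       acc[x] += mid[k][x] * f[k];   // mid[k] = v24..v31, horizontally filtered
        //
        // with my advanced by delta for the next output row (add w6, w6, w4),
        // before the put/prep specific shift, bias and clamp steps.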
3617 smull v16.4s, v24.4h, v0.4h 3618 smlal v16.4s, v25.4h, v1.4h 3619 smlal v16.4s, v26.4h, v2.4h 3620 smlal v16.4s, v27.4h, v3.4h 3621 smlal v16.4s, v28.4h, v4.4h 3622 smlal v16.4s, v29.4h, v5.4h 3623 smlal v16.4s, v30.4h, v6.4h 3624 smlal v16.4s, v31.4h, v7.4h 3625 smull2 v17.4s, v24.8h, v0.8h 3626 smlal2 v17.4s, v25.8h, v1.8h 3627 smlal2 v17.4s, v26.8h, v2.8h 3628 smlal2 v17.4s, v27.8h, v3.8h 3629 smlal2 v17.4s, v28.8h, v4.8h 3630 smlal2 v17.4s, v29.8h, v5.8h 3631 smlal2 v17.4s, v30.8h, v6.8h 3632 smlal2 v17.4s, v31.8h, v7.8h 3633 3634 mov v24.16b, v25.16b 3635 mov v25.16b, v26.16b 3636.ifb \t 3637 srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits) 3638 srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits) 3639.else 3640 rshrn v16.4h, v16.4s, #7 3641 rshrn2 v16.8h, v17.4s, #7 3642.endif 3643 mov v26.16b, v27.16b 3644.ifb \t 3645 sqxtun v16.4h, v16.4s 3646 sqxtun2 v16.8h, v17.4s 3647.else 3648 sub v16.8h, v16.8h, v15.8h // PREP_BIAS 3649.endif 3650 mov v27.16b, v28.16b 3651 mov v28.16b, v29.16b 3652.ifb \t 3653 umin v16.8h, v16.8h, v15.8h // bitdepth_max 3654.endif 3655 mov v29.16b, v30.16b 3656 mov v30.16b, v31.16b 3657 subs w10, w10, #1 3658 st1 {v16.8h}, [x0], x1 3659 3660 add w6, w6, w4 3661 b.gt 1b 3662 3663 ldp d14, d15, [sp, #0x30] 3664 ldp d12, d13, [sp, #0x20] 3665 ldp d10, d11, [sp, #0x10] 3666 ldp d8, d9, [sp], 0x40 3667 3668 ret x15 3669endfunc 3670.endm 3671 3672warp 3673warp t 3674 3675// void dav1d_emu_edge_16bpc_neon( 3676// const intptr_t bw, const intptr_t bh, 3677// const intptr_t iw, const intptr_t ih, 3678// const intptr_t x, const intptr_t y, 3679// pixel *dst, const ptrdiff_t dst_stride, 3680// const pixel *ref, const ptrdiff_t ref_stride) 3681function emu_edge_16bpc_neon, export=1 3682 ldp x8, x9, [sp] 3683 3684 // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 3685 // ref += iclip(x, 0, iw - 1) 3686 sub x12, x3, #1 // ih - 1 3687 cmp x5, x3 3688 sub x13, x2, #1 // iw - 1 3689 csel x12, x12, x5, ge // min(y, ih - 1) 3690 cmp x4, x2 3691 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) 3692 csel x13, x13, x4, ge // min(x, iw - 1) 3693 bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) 3694 madd x8, x12, x9, x8 // ref += iclip() * stride 3695 add x8, x8, x13, lsl #1 // ref += iclip() 3696 3697 // bottom_ext = iclip(y + bh - ih, 0, bh - 1) 3698 // top_ext = iclip(-y, 0, bh - 1) 3699 add x10, x5, x1 // y + bh 3700 neg x5, x5 // -y 3701 sub x10, x10, x3 // y + bh - ih 3702 sub x12, x1, #1 // bh - 1 3703 cmp x10, x1 3704 bic x5, x5, x5, asr #63 // max(-y, 0) 3705 csel x10, x10, x12, lt // min(y + bh - ih, bh-1) 3706 cmp x5, x1 3707 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) 3708 csel x5, x5, x12, lt // min(max(-y, 0), bh-1) 3709 3710 // right_ext = iclip(x + bw - iw, 0, bw - 1) 3711 // left_ext = iclip(-x, 0, bw - 1) 3712 add x11, x4, x0 // x + bw 3713 neg x4, x4 // -x 3714 sub x11, x11, x2 // x + bw - iw 3715 sub x13, x0, #1 // bw - 1 3716 cmp x11, x0 3717 bic x4, x4, x4, asr #63 // max(-x, 0) 3718 csel x11, x11, x13, lt // min(x + bw - iw, bw-1) 3719 cmp x4, x0 3720 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) 3721 csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) 3722 3723 // center_h = bh - top_ext - bottom_ext 3724 // dst += top_ext * PXSTRIDE(dst_stride) 3725 // center_w = bw - left_ext - right_ext 3726 sub x1, x1, x5 // bh - top_ext 3727 madd x6, x5, x7, x6 3728 sub x2, x0, x4 // bw - left_ext 3729 sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext 3730 sub x2, x2, x11 // center_w = bw - left_ext - right_ext 
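        // Overview of the fill below (a sketch; iclip() as in the comments
        // above): for each of the center_h rows, the leftmost clipped source
        // pixel is replicated left_ext times, center_w pixels are copied,
        // and the rightmost pixel is replicated right_ext times; afterwards
        // the last written row is copied downwards bottom_ext times and the
        // first written row upwards top_ext times, in 32-pixel column strips.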
3731 3732 mov x14, x6 // backup of dst 3733 3734.macro v_loop need_left, need_right 37350: 3736.if \need_left 3737 ld1r {v0.8h}, [x8] 3738 mov x12, x6 // out = dst 3739 mov x3, x4 3740 mov v1.16b, v0.16b 37411: 3742 subs x3, x3, #16 3743 st1 {v0.8h, v1.8h}, [x12], #32 3744 b.gt 1b 3745.endif 3746 mov x13, x8 3747 add x12, x6, x4, lsl #1 // out = dst + left_ext 3748 mov x3, x2 37491: 3750 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 3751 subs x3, x3, #32 3752 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 3753 b.gt 1b 3754.if \need_right 3755 add x3, x8, x2, lsl #1 // in + center_w 3756 sub x3, x3, #2 // in + center_w - 1 3757 add x12, x6, x4, lsl #1 // dst + left_ext 3758 ld1r {v0.8h}, [x3] 3759 add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w 3760 mov x3, x11 3761 mov v1.16b, v0.16b 37621: 3763 subs x3, x3, #16 3764 st1 {v0.8h, v1.8h}, [x12], #32 3765 b.gt 1b 3766.endif 3767 3768 subs x1, x1, #1 // center_h-- 3769 add x6, x6, x7 3770 add x8, x8, x9 3771 b.gt 0b 3772.endm 3773 3774 cbz x4, 2f 3775 // need_left 3776 cbz x11, 3f 3777 // need_left + need_right 3778 v_loop 1, 1 3779 b 5f 3780 37812: 3782 // !need_left 3783 cbz x11, 4f 3784 // !need_left + need_right 3785 v_loop 0, 1 3786 b 5f 3787 37883: 3789 // need_left + !need_right 3790 v_loop 1, 0 3791 b 5f 3792 37934: 3794 // !need_left + !need_right 3795 v_loop 0, 0 3796 37975: 3798 3799 cbz x10, 3f 3800 // need_bottom 3801 sub x8, x6, x7 // ref = dst - stride 3802 mov x4, x0 38031: 3804 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 3805 mov x3, x10 38062: 3807 subs x3, x3, #1 3808 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 3809 b.gt 2b 3810 msub x6, x7, x10, x6 // dst -= bottom_ext * stride 3811 subs x4, x4, #32 // bw -= 32 3812 add x6, x6, #64 // dst += 32 3813 b.gt 1b 3814 38153: 3816 cbz x5, 3f 3817 // need_top 3818 msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 38191: 3820 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 3821 mov x3, x5 38222: 3823 subs x3, x3, #1 3824 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 3825 b.gt 2b 3826 msub x6, x7, x5, x6 // dst -= top_ext * stride 3827 subs x0, x0, #32 // bw -= 32 3828 add x6, x6, #64 // dst += 32 3829 b.gt 1b 3830 38313: 3832 ret 3833endfunc 3834