/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
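// A rough scalar sketch of the per-pixel math implemented below (a hedged
// reading of the NEON code, not the dav1d C reference; b_row[]/a_row[] are
// illustrative names for the 16-bit and 32-bit coefficient rows passed in
// through the b/a pointer arrays):
//
//     for (int x = 0; x < w; x++) {
//         // 3x3 window: centre cross weighted 4, corners weighted 3
//         int a = 4 * (b_row[1][x] + b_row[0][x] + b_row[2][x] +
//                      b_row[1][x-1] + b_row[1][x+1]) +
//                 3 * (b_row[0][x-1] + b_row[0][x+1] +
//                      b_row[2][x-1] + b_row[2][x+1]);
//         int b = /* the same 4/3 weighting applied to a_row[0..2] */;
//         tmp[x] = (b + a * src[x] + (1 << 8)) >> 9;
//         // The second output row repeats this with the window moved down
//         // one row (rows 1..3) and src advanced by src_stride; it is
//         // written to tmp + FILTER_OUT_STRIDE.
//     }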
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        stp             d8, d9, [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x7, x8, [x3]
        ldp             x9, x3, [x3, #16]
        ldp             x10, x11, [x4]
        ldp             x12, x4, [x4, #16]

        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6, #1
        add             x2, x1, x2              // src + stride
        csel            x2, x1, x2, le          // if (h <= 1) x2 = x1
        add             x13, x0, x13, lsl #1

        movi            v30.8h, #3
        movi            v31.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
        ext             v8.16b, v0.16b, v1.16b, #2    // [0][1]
        ext             v9.16b, v2.16b, v3.16b, #2    // [1][1]
        ext             v10.16b, v4.16b, v5.16b, #2   // [2][1]
        ext             v11.16b, v0.16b, v1.16b, #4   // [0][2]
        ext             v12.16b, v2.16b, v3.16b, #4   // [1][2]
        ext             v13.16b, v4.16b, v5.16b, #4   // [2][2]

        add             v14.8h, v2.8h, v8.8h          // [1][0] + [0][1]
        add             v15.8h, v9.8h, v10.8h         // [1][1] + [2][1]

        add             v28.8h, v0.8h, v11.8h         // [0][0] + [0][2]
        add             v14.8h, v14.8h, v12.8h        // () + [1][2]
        add             v29.8h, v4.8h, v13.8h         // [2][0] + [2][2]

        ext             v8.16b, v6.16b, v7.16b, #2    // [3][1]
        ext             v11.16b, v6.16b, v7.16b, #4   // [3][2]

        add             v14.8h, v14.8h, v15.8h        // mid
        add             v15.8h, v28.8h, v29.8h        // corners

        add             v28.8h, v4.8h, v9.8h          // [2][0] + [1][1]
        add             v29.8h, v10.8h, v8.8h         // [2][1] + [3][1]

        add             v2.8h, v2.8h, v12.8h          // [1][0] + [1][2]
        add             v28.8h, v28.8h, v13.8h        // () + [2][2]
        add             v4.8h, v6.8h, v11.8h          // [3][0] + [3][2]

        add             v0.8h, v28.8h, v29.8h         // mid
        add             v2.8h, v2.8h, v4.8h           // corners

        shl             v4.8h, v14.8h, #2
        mla             v4.8h, v15.8h, v30.8h         // * 3 -> a

        shl             v0.8h, v0.8h, #2
        mla             v0.8h, v2.8h, v30.8h          // * 3 -> a

        ext             v8.16b, v16.16b, v17.16b, #4  // [0][1]
        ext             v9.16b, v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8 // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4 // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s, v8.4s, v19.4s          // [0][1] + [1][0]
        add             v9.4s, v9.4s, v20.4s
        add             v16.4s, v16.4s, v10.4s        // [0][0] + [0][2]
        add             v17.4s, v17.4s, v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8 // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s, v16.4s, v22.4s        // () + [2][0]
        add             v17.4s, v17.4s, v23.4s
        add             v28.4s, v12.4s, v14.4s        // [1][1] + [1][2]
        add             v29.4s, v13.4s, v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4 // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s, v8.4s, v28.4s          // mid (incomplete)
        add             v9.4s, v9.4s, v29.4s

        add             v19.4s, v19.4s, v14.4s        // [1][0] + [1][2]
        add             v20.4s, v20.4s, v15.4s
        add             v14.4s, v22.4s, v12.4s        // [2][0] + [1][1]
        add             v15.4s, v23.4s, v13.4s

        ext             v12.16b, v22.16b, v23.16b, #8 // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4 // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s, v8.4s, v10.4s          // () + [2][1] = mid
        add             v9.4s, v9.4s, v11.4s
        add             v14.4s, v14.4s, v10.4s        // () + [2][1]
        add             v15.4s, v15.4s, v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8 // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s, v16.4s, v12.4s        // () + [2][2] = corner
        add             v17.4s, v17.4s, v13.4s

        add             v12.4s, v12.4s, v28.4s        // [2][2] + [3][1]
        add             v13.4s, v13.4s, v29.4s
        add             v25.4s, v25.4s, v10.4s        // [3][0] + [3][2]
        add             v26.4s, v26.4s, v11.4s

        add             v14.4s, v14.4s, v12.4s        // mid
        add             v15.4s, v15.4s, v13.4s
        add             v19.4s, v19.4s, v25.4s        // corner
        add             v20.4s, v20.4s, v26.4s

.if \bpc == 8
        ld1             {v25.8b}, [x1], #8            // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16           // src
        ld1             {v26.8h}, [x2], #16
.endif

        shl             v8.4s, v8.4s, #2
        shl             v9.4s, v9.4s, #2
        mla             v8.4s, v16.4s, v31.4s         // * 3 -> b
        mla             v9.4s, v17.4s, v31.4s

.if \bpc == 8
        uxtl            v25.8h, v25.8b                // src
        uxtl            v26.8h, v26.8b
.endif

        shl             v14.4s, v14.4s, #2
        shl             v15.4s, v15.4s, #2
        mla             v14.4s, v19.4s, v31.4s        // * 3 -> b
        mla             v15.4s, v20.4s, v31.4s

        umlal           v8.4s, v4.4h, v25.4h          // b + a * src
        umlal2          v9.4s, v4.8h, v25.8h
        umlal           v14.4s, v0.4h, v26.4h         // b + a * src
        umlal2          v15.4s, v0.8h, v26.8h
        mov             v0.16b, v1.16b
        rshrn           v8.4h, v8.4s, #9
        rshrn2          v8.8h, v9.4s, #9
        mov             v2.16b, v3.16b
        rshrn           v14.4h, v14.4s, #9
        rshrn2          v14.8h, v15.4s, #9
        subs            w5, w5, #8
        mov             v4.16b, v5.16b
        st1             {v8.8h}, [x0], #16
        mov             v6.16b, v7.16b
        st1             {v14.8h}, [x13], #16

        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4], #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8, d9, [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
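// Hedged sketch of the per-pixel math below (illustrative pseudocode, not
// the dav1d C reference): t1 is the same 4/3-weighted 3x3 filter value as
// in sgr_finish_filter1_2rows above, and w1 is the self-guided weight.
//
//     int t1 = (b + a * src[x] + (1 << 8)) >> 9;
//     int u  = src[x] << 4;
//     int v  = (u << 7) + w1 * (t1 - u);
//     dst[x] = clip((v + (1 << 10)) >> 11, 0, bitdepth_max);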
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        ldp             x7, x8, [x1]
        ldr             x1, [x1, #16]
        ldp             x9, x10, [x2]
        ldr             x2, [x2, #16]

        dup             v31.8h, w4
        dup             v30.8h, w5

        movi            v6.8h, #3
        movi            v7.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x9], #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
        ext             v25.16b, v0.16b, v1.16b, #2   // -stride
        ext             v26.16b, v2.16b, v3.16b, #2   // 0
        ext             v27.16b, v4.16b, v5.16b, #2   // +stride
        ext             v28.16b, v0.16b, v1.16b, #4   // +1-stride
        ext             v29.16b, v2.16b, v3.16b, #4   // +1
        add             v2.8h, v2.8h, v25.8h          // -1, -stride
        ext             v25.16b, v4.16b, v5.16b, #4   // +1+stride
        add             v26.8h, v26.8h, v27.8h        // 0, +stride
        add             v0.8h, v0.8h, v28.8h          // -1-stride, +1-stride
        add             v2.8h, v2.8h, v26.8h
        add             v4.8h, v4.8h, v25.8h          // -1+stride, +1+stride
        add             v2.8h, v2.8h, v29.8h          // +1
        add             v0.8h, v0.8h, v4.8h

        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h, v2.8h, #2
        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4 // 0
        ext             v4.16b, v20.16b, v21.16b, #4
        mla             v2.8h, v0.8h, v6.8h           // * 3 -> a
        add             v25.4s, v25.4s, v19.4s        // -stride, -1
        add             v26.4s, v26.4s, v20.4s
        add             v16.4s, v16.4s, v27.4s        // -1-stride, +1-stride
        add             v17.4s, v17.4s, v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8 // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s, v16.4s, v22.4s        // -1+stride
        add             v17.4s, v17.4s, v23.4s
        add             v29.4s, v29.4s, v27.4s        // 0, +1
        add             v4.4s, v4.4s, v28.4s
        add             v25.4s, v25.4s, v29.4s
        add             v26.4s, v26.4s, v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
        ext             v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                // src
.else
        ld1             {v19.8h}, [x0]                // src
.endif
        add             v25.4s, v25.4s, v27.4s        // +stride
        add             v26.4s, v26.4s, v28.4s
        add             v16.4s, v16.4s, v29.4s        // +1+stride
        add             v17.4s, v17.4s, v4.4s
        shl             v25.4s, v25.4s, #2
        shl             v26.4s, v26.4s, #2
        mla             v25.4s, v16.4s, v7.4s         // * 3 -> b
        mla             v26.4s, v17.4s, v7.4s
.if \bpc == 8
        uxtl            v19.8h, v19.8b                // src
.endif
        mov             v0.16b, v1.16b
        umlal           v25.4s, v2.4h, v19.4h         // b + a * src
        umlal2          v26.4s, v2.8h, v19.8h
        mov             v2.16b, v3.16b
        rshrn           v25.4h, v25.4s, #9
        rshrn2          v25.8h, v26.4s, #9

        subs            w3, w3, #8

        // weighted1
        shl             v19.8h, v19.8h, #4            // u
        mov             v4.16b, v5.16b

        sub             v25.8h, v25.8h, v19.8h        // t1 - u
        ld1             {v1.8h}, [x9], #16
        ushll           v26.4s, v19.4h, #7            // u << 7
        ushll2          v27.4s, v19.8h, #7            // u << 7
        ld1             {v3.8h}, [x10], #16
        smlal           v26.4s, v25.4h, v31.4h        // v
        smlal2          v27.4s, v25.8h, v31.8h        // v
        ld1             {v5.8h}, [x2], #16
.if \bpc == 8
        rshrn           v26.4h, v26.4s, #11
        rshrn2          v26.8h, v27.4s, #11
        mov             v16.16b, v18.16b
        sqxtun          v26.8b, v26.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        sqrshrun        v26.4h, v26.4s, #11
        sqrshrun2       v26.8h, v27.4s, #11
        mov             v16.16b, v18.16b
        umin            v26.8h, v26.8h, v30.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b

3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
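// Hedged sketch of what the loop below computes (illustrative names, not
// the dav1d C reference). b0/b1 (16-bit) and a0/a1 (32-bit) stand for the
// two coefficient rows bracketing the even output row, and src2 for the
// second source row (src + stride); the 5x5 filter only has coefficients
// on every other row, so the odd output row reuses the lower row:
//
//     for (int x = 0; x < w; x++) {
//         // even row: rows above/below weighted 6, the four diagonals 5
//         int a = 6 * (b0[x] + b1[x]) +
//                 5 * (b0[x-1] + b0[x+1] + b1[x-1] + b1[x+1]);
//         int b = /* the same 6/5 weighting applied to a0/a1 */;
//         tmp[x] = (b + a * src[x] + (1 << 8)) >> 9;
//
//         // odd row: horizontal-only 6/5 weighting of the lower row
//         int a2 = 6 * b1[x] + 5 * (b1[x-1] + b1[x+1]);
//         int b2 = 6 * a1[x] + 5 * (a1[x-1] + a1[x+1]);
//         tmp[x + FILTER_OUT_STRIDE] = (b2 + a2 * src2[x] + (1 << 7)) >> 8;
//     }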
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        stp             d8, d9, [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x3, x7, [x3]
        ldp             x4, x8, [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6, #1
        add             x2, x1, x2              // src + stride
        csel            x2, x1, x2, le          // if (h <= 1) x2 = x1
        add             x10, x0, x10, lsl #1
        movi            v4.8h, #5
        movi            v5.4s, #5
        movi            v6.8h, #6
        movi            v7.4s, #6
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b, v1.16b, #4   // +1-stride
        ext             v25.16b, v2.16b, v3.16b, #4   // +1+stride
        ext             v22.16b, v0.16b, v1.16b, #2   // -stride
        ext             v23.16b, v2.16b, v3.16b, #2   // +stride
        add             v0.8h, v0.8h, v24.8h          // -1-stride, +1-stride
        add             v25.8h, v2.8h, v25.8h         // -1+stride, +1+stride
        add             v2.8h, v22.8h, v23.8h         // -stride, +stride
        add             v0.8h, v0.8h, v25.8h

        mul             v8.8h, v25.8h, v4.8h          // * 5
        mla             v8.8h, v23.8h, v6.8h          // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h, v0.8h, v4.8h           // * 5
        mla             v0.8h, v2.8h, v6.8h           // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8
        ld1             {v30.8b}, [x2], #8
.else
        ld1             {v31.8h}, [x1], #16
        ld1             {v30.8h}, [x2], #16
.endif
        add             v16.4s, v16.4s, v26.4s        // -1-stride, +1-stride
        add             v17.4s, v17.4s, v27.4s
        add             v19.4s, v19.4s, v28.4s        // -1+stride, +1+stride
        add             v20.4s, v20.4s, v29.4s
        add             v16.4s, v16.4s, v19.4s
        add             v17.4s, v17.4s, v20.4s

        mul             v9.4s, v19.4s, v5.4s          // * 5
        mla             v9.4s, v24.4s, v7.4s          // * 6
        mul             v10.4s, v20.4s, v5.4s         // * 5
        mla             v10.4s, v25.4s, v7.4s         // * 6

        add             v22.4s, v22.4s, v24.4s        // -stride, +stride
        add             v23.4s, v23.4s, v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s, v16.4s, v5.4s         // * 5
        mla             v16.4s, v22.4s, v7.4s         // * 6
        mul             v17.4s, v17.4s, v5.4s         // * 5
        mla             v17.4s, v23.4s, v7.4s         // * 6

.if \bpc == 8
        uxtl            v31.8h, v31.8b
        uxtl            v30.8h, v30.8b
.endif
        umlal           v16.4s, v0.4h, v31.4h         // b + a * src
        umlal2          v17.4s, v0.8h, v31.8h
        umlal           v9.4s, v8.4h, v30.4h          // b + a * src
        umlal2          v10.4s, v8.8h, v30.8h
        mov             v0.16b, v1.16b
        rshrn           v16.4h, v16.4s, #9
        rshrn2          v16.8h, v17.4s, #9
        rshrn           v9.4h, v9.4s, #8
        rshrn2          v9.8h, v10.4s, #8
        subs            w5, w5, #8
        mov             v2.16b, v3.16b
        st1             {v16.8h}, [x0], #16
        st1             {v9.8h}, [x10], #16

        b.le            9f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8, d9, [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
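// Hedged sketch of the per-pixel math below (illustrative pseudocode, not
// the dav1d C reference): t is the 6/5-weighted filter value computed as in
// sgr_finish_filter2_2rows above (shift 9 on the even row, shift 8 on the
// odd row), blended with the source just like sgr_finish_weighted1:
//
//     int u  = src[x] << 4;
//     int v  = (u << 7) + w1 * (t - u);
//     dst[x] = clip((v + (1 << 10)) >> 11, 0, bitdepth_max);
//
// Both output rows are produced per iteration; the second one is written
// to dst + stride.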
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        stp             d8, d9, [sp, #-0x30]!
        str             d10, [sp, #0x10]
        stp             d14, d15, [sp, #0x20]

        dup             v14.8h, w6
        dup             v15.8h, w7

        ldp             x2, x7, [x2]
        ldp             x3, x8, [x3]
        cmp             w5, #1
        add             x1, x0, x1              // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1, x2, x1, le
        movi            v4.8h, #5
        movi            v5.4s, #5
        movi            v6.8h, #6
        movi            v7.4s, #6
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b, v1.16b, #4   // +1-stride
        ext             v25.16b, v2.16b, v3.16b, #4   // +1+stride
        ext             v22.16b, v0.16b, v1.16b, #2   // -stride
        ext             v23.16b, v2.16b, v3.16b, #2   // +stride
        add             v0.8h, v0.8h, v24.8h          // -1-stride, +1-stride
        add             v25.8h, v2.8h, v25.8h         // -1+stride, +1+stride
        add             v2.8h, v22.8h, v23.8h         // -stride, +stride
        add             v0.8h, v0.8h, v25.8h

        mul             v8.8h, v25.8h, v4.8h          // * 5
        mla             v8.8h, v23.8h, v6.8h          // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h, v0.8h, v4.8h           // * 5
        mla             v0.8h, v2.8h, v6.8h           // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]
        ld1             {v30.8b}, [x1]
.else
        ld1             {v31.8h}, [x0]
        ld1             {v30.8h}, [x1]
.endif
        add             v16.4s, v16.4s, v26.4s        // -1-stride, +1-stride
        add             v17.4s, v17.4s, v27.4s
        add             v19.4s, v19.4s, v28.4s        // -1+stride, +1+stride
        add             v20.4s, v20.4s, v29.4s
        add             v16.4s, v16.4s, v19.4s
        add             v17.4s, v17.4s, v20.4s

        mul             v9.4s, v19.4s, v5.4s          // * 5
        mla             v9.4s, v24.4s, v7.4s          // * 6
        mul             v10.4s, v20.4s, v5.4s         // * 5
        mla             v10.4s, v25.4s, v7.4s         // * 6

        add             v22.4s, v22.4s, v24.4s        // -stride, +stride
        add             v23.4s, v23.4s, v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s, v16.4s, v5.4s         // * 5
        mla             v16.4s, v22.4s, v7.4s         // * 6
        mul             v17.4s, v17.4s, v5.4s         // * 5
        mla             v17.4s, v23.4s, v7.4s         // * 6

.if \bpc == 8
        uxtl            v31.8h, v31.8b
        uxtl            v30.8h, v30.8b
.endif
        umlal           v16.4s, v0.4h, v31.4h         // b + a * src
        umlal2          v17.4s, v0.8h, v31.8h
        umlal           v9.4s, v8.4h, v30.4h          // b + a * src
        umlal2          v10.4s, v8.8h, v30.8h
        mov             v0.16b, v1.16b
        rshrn           v16.4h, v16.4s, #9
        rshrn2          v16.8h, v17.4s, #9
        rshrn           v9.4h, v9.4s, #8
        rshrn2          v9.8h, v10.4s, #8

        subs            w4, w4, #8

        // weighted1
        shl             v31.8h, v31.8h, #4            // u
        shl             v30.8h, v30.8h, #4
        mov             v2.16b, v3.16b

        sub             v16.8h, v16.8h, v31.8h        // t1 - u
        sub             v9.8h, v9.8h, v30.8h
        ld1             {v1.8h}, [x3], #16
        ushll           v22.4s, v31.4h, #7            // u << 7
        ushll2          v23.4s, v31.8h, #7
        ushll           v24.4s, v30.4h, #7
        ushll2          v25.4s, v30.8h, #7
        ld1             {v3.8h}, [x8], #16
        smlal           v22.4s, v16.4h, v14.4h        // v
        smlal2          v23.4s, v16.8h, v14.8h
        mov             v16.16b, v18.16b
        smlal           v24.4s, v9.4h, v14.4h
        smlal2          v25.4s, v9.8h, v14.8h
        mov             v19.16b, v21.16b
.if \bpc == 8
        rshrn           v22.4h, v22.4s, #11
        rshrn2          v22.8h, v23.4s, #11
        rshrn           v23.4h, v24.4s, #11
        rshrn2          v23.8h, v25.4s, #11
        sqxtun          v22.8b, v22.8h
        sqxtun          v23.8b, v23.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        sqrshrun        v22.4h, v22.4s, #11
        sqrshrun2       v22.8h, v23.4s, #11
        sqrshrun        v23.4h, v24.4s, #11
        sqrshrun2       v23.8h, v25.4s, #11
        umin            v22.8h, v22.8h, v15.8h
        umin            v23.8h, v23.8h, v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x20]
        ldr             d10, [sp, #0x10]
        ldp             d8, d9, [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
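// Hedged sketch of the per-pixel blend below (illustrative pseudocode, not
// the dav1d C reference), combining the two intermediate filter outputs t1
// and t2 with the weights wt[0] and wt[1]:
//
//     int u  = src[x] << 4;
//     int v  = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u);
//     dst[x] = clip((v + (1 << 10)) >> 11, 0, bitdepth_max);
//
// Two rows are processed per iteration of the main loop; the "2:" tail
// handles an odd final row.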
function sgr_weighted2_\bpc\()bpc_neon, export=1
.if \bpc == 8
        ldr             x8, [sp]
.else
        ldp             x8, x9, [sp]
.endif
        cmp             w7, #2
        add             x10, x0, x1
        add             x11, x2, x3
        add             x12, x4, #2*FILTER_OUT_STRIDE
        add             x13, x5, #2*FILTER_OUT_STRIDE
        ld2r            {v30.8h, v31.8h}, [x8]  // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h, w9
.endif
        mov             x8, #4*FILTER_OUT_STRIDE
        lsl             x1, x1, #1
        lsl             x3, x3, #1
        add             x9, x6, #7
        bic             x9, x9, #7              // Aligned width
.if \bpc == 8
        sub             x1, x1, x9
        sub             x3, x3, x9
.else
        sub             x1, x1, x9, lsl #1
        sub             x3, x3, x9, lsl #1
.endif
        sub             x8, x8, x9, lsl #1
        mov             w9, w6
        b.lt            2f
1:
.if \bpc == 8
        ld1             {v0.8b}, [x2], #8
        ld1             {v16.8b}, [x11], #8
.else
        ld1             {v0.8h}, [x2], #16
        ld1             {v16.8h}, [x11], #16
.endif
        ld1             {v1.8h}, [x4], #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h}, [x5], #16
        ld1             {v18.8h}, [x13], #16
        subs            w6, w6, #8
.if \bpc == 8
        ushll           v0.8h, v0.8b, #4              // u
        ushll           v16.8h, v16.8b, #4            // u
.else
        shl             v0.8h, v0.8h, #4              // u
        shl             v16.8h, v16.8h, #4            // u
.endif
        sub             v1.8h, v1.8h, v0.8h           // t1 - u
        sub             v2.8h, v2.8h, v0.8h           // t2 - u
        sub             v17.8h, v17.8h, v16.8h        // t1 - u
        sub             v18.8h, v18.8h, v16.8h        // t2 - u
        ushll           v3.4s, v0.4h, #7              // u << 7
        ushll2          v4.4s, v0.8h, #7              // u << 7
        ushll           v19.4s, v16.4h, #7            // u << 7
        ushll2          v20.4s, v16.8h, #7            // u << 7
        smlal           v3.4s, v1.4h, v30.4h          // wt[0] * (t1 - u)
        smlal           v3.4s, v2.4h, v31.4h          // wt[1] * (t2 - u)
        smlal2          v4.4s, v1.8h, v30.8h          // wt[0] * (t1 - u)
        smlal2          v4.4s, v2.8h, v31.8h          // wt[1] * (t2 - u)
        smlal           v19.4s, v17.4h, v30.4h        // wt[0] * (t1 - u)
        smlal           v19.4s, v18.4h, v31.4h        // wt[1] * (t2 - u)
        smlal2          v20.4s, v17.8h, v30.8h        // wt[0] * (t1 - u)
        smlal2          v20.4s, v18.8h, v31.8h        // wt[1] * (t2 - u)
.if \bpc == 8
        rshrn           v3.4h, v3.4s, #11
        rshrn2          v3.8h, v4.4s, #11
        rshrn           v19.4h, v19.4s, #11
        rshrn2          v19.8h, v20.4s, #11
        sqxtun          v3.8b, v3.8h
        sqxtun          v19.8b, v19.8h
        st1             {v3.8b}, [x0], #8
        st1             {v19.8b}, [x10], #8
.else
        sqrshrun        v3.4h, v3.4s, #11
        sqrshrun2       v3.8h, v4.4s, #11
        sqrshrun        v19.4h, v19.4s, #11
        sqrshrun2       v19.8h, v20.4s, #11
        umin            v3.8h, v3.8h, v29.8h
        umin            v19.8h, v19.8h, v29.8h
        st1             {v3.8h}, [x0], #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b

        subs            w7, w7, #2
        cmp             w7, #1
        b.lt            0f
        mov             w6, w9
        add             x0, x0, x1
        add             x10, x10, x1
        add             x2, x2, x3
        add             x11, x11, x3
        add             x4, x4, x8
        add             x12, x12, x8
        add             x5, x5, x8
        add             x13, x13, x8
        b.eq            2f
        b               1b

2:
.if \bpc == 8
        ld1             {v0.8b}, [x2], #8
.else
        ld1             {v0.8h}, [x2], #16
.endif
        ld1             {v1.8h}, [x4], #16
        ld1             {v2.8h}, [x5], #16
        subs            w6, w6, #8
.if \bpc == 8
        ushll           v0.8h, v0.8b, #4              // u
.else
        shl             v0.8h, v0.8h, #4              // u
.endif
        sub             v1.8h, v1.8h, v0.8h           // t1 - u
        sub             v2.8h, v2.8h, v0.8h           // t2 - u
        ushll           v3.4s, v0.4h, #7              // u << 7
        ushll2          v4.4s, v0.8h, #7              // u << 7
        smlal           v3.4s, v1.4h, v30.4h          // wt[0] * (t1 - u)
        smlal           v3.4s, v2.4h, v31.4h          // wt[1] * (t2 - u)
        smlal2          v4.4s, v1.8h, v30.8h          // wt[0] * (t1 - u)
        smlal2          v4.4s, v2.8h, v31.8h          // wt[1] * (t2 - u)
.if \bpc == 8
        rshrn           v3.4h, v3.4s, #11
        rshrn2          v3.8h, v4.4s, #11
        sqxtun          v3.8b, v3.8h
        st1             {v3.8b}, [x0], #8
.else
        sqrshrun        v3.4h, v3.4s, #11
        sqrshrun2       v3.8h, v4.4s, #11
        umin            v3.8h, v3.8h, v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b
0:
        ret
endfunc
.endm