/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
*/
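/*
 * Rough C reference for the LSX routine below, mirroring dav1d's scalar
 * splat_mv_c() (a sketch, not the exact upstream source): each row pointed to
 * by rr gets bw4 copies of the 12-byte *rmv starting at column bx4. The
 * vector version keeps three rotated copies of the repeating 12-byte pattern
 * (vr1-vr3) so whole rows can be written with overlapping 16-byte stores.
 *
 *     static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
 *                            const int bx4, const int bw4, int bh4)
 *     {
 *         do {
 *             refmvs_block *const r = *rr++ + bx4;
 *             for (int x = 0; x < bw4; x++)
 *                 r[x] = *rmv;
 *         } while (--bh4);
 *     }
 */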

function splat_mv_lsx
    vld             vr0, a1, 0          // 0 1 ... 11 ...
    clz.w           t4, a3
    vaddi.bu        vr1, vr0, 0
    addi.w          t4, t4, -26
    vextrins.w      vr1, vr0, 0x30      // 0 1 2 ... 11 0 1 2 3
    la.local        t5, .SPLAT_LSX_JRTABLE
    vbsrl.v         vr2, vr1, 4         // 4 5 6 7 ... 11 0 1 2 3 0 0 0 0
    alsl.d          t6, t4, t5, 1
    vextrins.w      vr2, vr0, 0x31      // 4 5 6 7 ... 11 0 1 2 3 4 5 6 7
    ld.h            t7, t6, 0
    vbsrl.v         vr3, vr2, 4         // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d           t8, t5, t7
    alsl.d          a2, a2, a2, 1
    vextrins.w      vr3, vr0, 0x32      // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w          a2, a2, 2
    jirl            $r0, t8, 0

.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE

.SPLAT_W1_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    fst.d           f1, t3, 0
    fst.s           f3, t3, 8
    blt             zero, a4, .SPLAT_W1_LSX
    b               .splat_end
.SPLAT_W2_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    fst.d           f2, t3, 16
    blt             zero, a4, .SPLAT_W2_LSX
    b               .splat_end

.SPLAT_W4_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32
    blt             zero, a4, .SPLAT_W4_LSX
    b               .splat_end

.SPLAT_W8_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80
    blt             zero, a4, .SPLAT_W8_LSX
    b               .splat_end

.SPLAT_W16_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 2
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W16_LSX
    b               .splat_end

.SPLAT_W32_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 4
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W32_LSX

.splat_end:
endfunc

const la_div_mult
    .short 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
    .short 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
    .short 1024, 963, 910, 862, 819, 780, 744, 712
    .short 682, 655, 630, 606, 585, 564, 546, 528
endconst
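/*
 * la_div_mult matches dav1d's div_mult[] table: entry i is 16384 / i (entry 0
 * is unused), i.e. a Q14 reciprocal, so dividing a motion vector by a frame
 * distance becomes a multiply, a rounding shift by 14 and a clip to +/-0x3fff.
 * A hedged C sketch of the scalar projection this table serves (modelled on
 * dav1d's mv_projection(); iclip() clamps to the given range):
 *
 *     static inline union mv mv_projection(const union mv mv, const int num,
 *                                          const int den)
 *     {
 *         const int frac = num * div_mult[den];
 *         const int y = mv.y * frac, x = mv.x * frac;
 *         return (union mv) {
 *             .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
 *             .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff),
 *         };
 *     }
 */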

/*
 * temp reg: a6 a7
 */
.macro LOAD_SET_LOOP is_odd
    slli.d          a6, t6, 2
    add.d           a6, a6, t6          // col_w * 5
0:
    addi.d          a7, zero, 0         // x
.if \is_odd
    stx.w           t7, t3, a7
    addi.d          a7, a7, 5
    bge             a7, a6, 2f
.endif

1:
    stx.w           t7, t3, a7
    addi.d          a7, a7, 5
    stx.w           t7, t3, a7
    addi.d          a7, a7, 5
    blt             a7, a6, 1b
2:
    add.d           t3, t3, t2
    addi.d          t5, t5, 1
    blt             t5, a5, 0b
.endm

/*
 * static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
 *                         const int col_start8, const int col_end8,
 *                         const int row_start8, int row_end8)
 */
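/*
 * Rough C outline of load_tmvs_lsx below (a sketch following dav1d's scalar
 * load_tmvs_c, not the exact upstream source). Stage one, implemented by
 * LOAD_SET_LOOP, fills this tile row's slice of rp_proj with the invalid-MV
 * marker 0x80008000; each refmvs_temporal_block is 5 bytes, hence the "* 5"
 * address arithmetic, and the is_odd variant peels one store so the inner
 * loop can issue two stores per iteration. Stage two walks every enabled
 * mfmv reference and projects its saved MVs into rp_proj:
 *
 *     refmvs_temporal_block *rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
 *     for (int y = row_start8; y < row_end8; y++)
 *         for (int x = col_start8; x < col_end8; x++)
 *             rp_proj[(y & 15) * stride + x].mv.n = INVALID_MV;  // 0x80008000
 *
 *     for (int n = 0; n < rf->n_mfmvs; n++) {
 *         if (rf->mfmv_ref2cur[n] == INT_MIN) continue;  // reference unusable
 *         // For each saved block: scale its MV by ref2cur / ref2ref (see
 *         // mv_projection above), derive (pos_x, pos_y), and if the target
 *         // lands inside the allowed projection window, store the original
 *         // MV together with ref2ref into rp_proj.
 *     }
 */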
function load_tmvs_lsx
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64

    vld             vr16, a0, 16
    vld             vr0, a0, 52         // rf->mfmv_ref
    ld.w            s8, a0, 152         // rf->n_mfmvs
    vld             vr17, a0, 168       // [0] rf->rp_ref, [1] rf->rp_proj
    ld.d            t1, a0, 184         // stride
    ld.w            t0, a0, 200
    addi.w          t0, t0, -1
    bnez            t0, 1f
    addi.w          a1, zero, 0
1:
    addi.d          t0, a3, 8
    vinsgr2vr.w     vr1, t0, 0
    vinsgr2vr.w     vr1, a5, 1
    vmin.w          vr1, vr1, vr16      // [0] col_end8i, [1] row_end8
    addi.d          t0, a2, -8
    bge             t0, zero, 2f
    addi.w          t0, zero, 0         // t0 col_start8i
2:
    vpickve2gr.d    t4, vr17, 1         // rf->rp_proj
    slli.d          t2, t1, 2
    add.d           t2, t2, t1          // stride * 5
    slli.d          a1, a1, 4           // tile_row_idx * 16
    andi            t3, a4, 0xf
    add.d           t3, t3, a1          // tile_row_idx * 16 + (row_start8 & 15)
    mul.w           t3, t3, t2
    mul.w           t8, a1, t2
    vpickve2gr.w    a5, vr1, 1
    addi.d          t5, a4, 0
    sub.d           t6, a3, a2          // col_end8 - col_start8
    li.w            t7, 0x80008000
    slli.d          a7, a2, 2
    add.d           t3, t3, a2
    add.d           t3, t3, a7
    add.d           t3, t3, t4          // rp_proj
    andi            a6, t6, 1
    bnez            a6, 3f
    LOAD_SET_LOOP 0
    b               4f
3:
    LOAD_SET_LOOP 1
4:
    addi.d          a6, zero, 0         // n
    bge             a6, s8, .end_load
    add.d           t3, t8, t4          // rp_proj
    mul.w           t6, a4, t2
    addi.d          s7, zero, 40
    vpickve2gr.w    t1, vr1, 0          // col_end8i
    vbsrl.v         vr2, vr0, 4         // rf->mfmv_ref2cur
    addi.d          t5, a0, 64          // rf->mfmv_ref2ref
    la.local        t8, la_div_mult
    vld             vr6, t8, 0
    vld             vr7, t8, 16
    vld             vr8, t8, 32
    vld             vr9, t8, 48
    li.w            t8, 0x3fff
    vreplgr2vr.h    vr21, t8
    vxor.v          vr18, vr18, vr18    // zero
    vsub.h          vr20, vr18, vr21
    vpickev.b       vr12, vr7, vr6
    vpickod.b       vr13, vr7, vr6
    vpickev.b       vr14, vr9, vr8
    vpickod.b       vr15, vr9, vr8
    vpickve2gr.d    s6, vr17, 0         // rf->rp_ref
5:
    vld             vr10, t5, 0
    vld             vr11, t5, 16
    vpickev.h       vr10, vr11, vr10
    vpickev.b       vr10, vr11, vr10    // [1...7]

    vbsrl.v         vr0, vr0, 1
    vpickve2gr.wu   t8, vr2, 0          // ref2cur
    vbsrl.v         vr2, vr2, 4
    srli.d          t4, t8, 24
    xori            t4, t4, 0x80
    beqz            t4, 8f

    vreplgr2vr.h    vr23, t8
    vshuf.b         vr6, vr14, vr12, vr10
    vshuf.b         vr7, vr15, vr13, vr10
    vilvl.b         vr8, vr7, vr6
    vmulwev.w.h     vr6, vr8, vr23
    vmulwod.w.h     vr7, vr8, vr23

    vpickve2gr.b    s0, vr0, 0          // ref
    slli.d          t8, s0, 3
    ldx.d           s1, s6, t8          // rf->rp_ref[ref]
    addi.d          s0, s0, -4          // ref_sign
    vreplgr2vr.h    vr19, s0
    add.d           s1, s1, t6          // &rf->rp_ref[ref][row_start8 * stride]
    addi.d          s2, a4, 0           // y
    vilvl.w         vr8, vr7, vr6
    vilvh.w         vr9, vr7, vr6
6:  // for (int y = row_start8;
    andi            s3, s2, 0xff8

    addi.d          s4, s3, 8
    blt             a4, s3, 0f
    addi.d          s3, a4, 0           // y_proj_start
0:
    blt             s4, a5, 0f
    addi.d          s4, a5, 0           // y_proj_end
0:
    addi.d          s5, t0, 0           // x
7:  // for (int x = col_start8i;
    slli.d          a7, s5, 2
    add.d           a7, a7, s5
    add.d           a7, s1, a7          // rb
    vld             vr3, a7, 0          // [rb]
    vpickve2gr.b    t4, vr3, 4          // b_ref
    beqz            t4, .end_x
    vreplve.b       vr11, vr10, t4
    vpickve2gr.b    t7, vr11, 4         // ref2ref
    beqz            t7, .end_x
    vsllwil.w.h     vr4, vr3, 0
    vreplgr2vr.w    vr6, t4
    vshuf.w         vr6, vr9, vr8       // frac
    vmul.w          vr5, vr6, vr4
    vsrai.w         vr4, vr5, 31
    vadd.w          vr4, vr4, vr5
    vssrarni.h.w    vr4, vr4, 14
    vclip.h         vr4, vr4, vr20, vr21 // offset
    vxor.v          vr5, vr4, vr19      // offset.x ^ ref_sign
    vori.b          vr5, vr5, 0x1       // (offset.x ^ ref_sign) | 1
    vabsd.h         vr4, vr4, vr18
    vsrli.h         vr4, vr4, 6         // abs(offset.x) >> 6
    vsigncov.h      vr4, vr5, vr4       // apply_sign
    vpickve2gr.h    s0, vr4, 0
    add.d           s0, s2, s0          // pos_y
    blt             s0, s3, .n_posy
    bge             s0, s4, .n_posy
    andi            s0, s0, 0xf
    mul.w           s0, s0, t2          // pos
    vpickve2gr.h    t7, vr4, 1
    add.d           t7, t7, s5          // pos_x
    add.d           s0, t3, s0          // rp_proj + pos

.loop_posx:
    andi            t4, s5, 0xff8       // x_sb_align

    blt             t7, a2, .n_posx
    addi.d          t8, t4, -8
    blt             t7, t8, .n_posx

    bge             t7, a3, .n_posx
    addi.d          t4, t4, 16
    bge             t7, t4, .n_posx

    slli.d          t4, t7, 2
    add.d           t4, t4, t7          // pos_x * 5
    add.d           t4, s0, t4          // rp_proj[pos + pos_x]
    vstelm.w        vr3, t4, 0, 0
    vstelm.b        vr11, t4, 4, 4

.n_posx:
    addi.d          s5, s5, 1           // x + 1
    bge             s5, t1, .ret_posx
    addi.d          a7, a7, 5           // rb + 1
    vld             vr4, a7, 0          // [rb]
    vseq.b          vr5, vr4, vr3

    vpickve2gr.d    t8, vr5, 0
    cto.d           t8, t8
    blt             t8, s7, 7b

    addi.d          t7, t7, 1           // pos_x + 1

    /* Core computing loop expansion (second) */
    andi            t4, s5, 0xff8       // x_sb_align

    blt             t7, a2, .n_posx
    addi.d          t8, t4, -8
    blt             t7, t8, .n_posx

    bge             t7, a3, .n_posx
    addi.d          t4, t4, 16
    bge             t7, t4, .n_posx

    slli.d          t4, t7, 2
    add.d           t4, t4, t7          // pos_x * 5
    add.d           t4, s0, t4          // rp_proj[pos + pos_x]
    vstelm.w        vr3, t4, 0, 0
    vstelm.b        vr11, t4, 4, 4

    addi.d          s5, s5, 1           // x + 1
    bge             s5, t1, .ret_posx
    addi.d          a7, a7, 5           // rb + 1
    vld             vr4, a7, 0          // [rb]
    vseq.b          vr5, vr4, vr3

    vpickve2gr.d    t8, vr5, 0
    cto.d           t8, t8
    blt             t8, s7, 7b

    addi.d          t7, t7, 1           // pos_x + 1

    /* Core computing loop expansion (third) */
    andi            t4, s5, 0xff8       // x_sb_align

    blt             t7, a2, .n_posx
    addi.d          t8, t4, -8
    blt             t7, t8, .n_posx

    bge             t7, a3, .n_posx
    addi.d          t4, t4, 16
    bge             t7, t4, .n_posx

    slli.d          t4, t7, 2
    add.d           t4, t4, t7          // pos_x * 5
    add.d           t4, s0, t4          // rp_proj[pos + pos_x]
    vstelm.w        vr3, t4, 0, 0
    vstelm.b        vr11, t4, 4, 4

    addi.d          s5, s5, 1           // x + 1
    bge             s5, t1, .ret_posx
    addi.d          a7, a7, 5           // rb + 1
    vld             vr4, a7, 0          // [rb]
    vseq.b          vr5, vr4, vr3

    vpickve2gr.d    t8, vr5, 0
    cto.d           t8, t8
    blt             t8, s7, 7b

    addi.d          t7, t7, 1           // pos_x + 1

    b               .loop_posx

.n_posy:
    addi.d          s5, s5, 1           // x + 1
    bge             s5, t1, .ret_posx
    addi.d          a7, a7, 5           // rb + 1
    vld             vr4, a7, 0          // [rb]
    vseq.b          vr5, vr4, vr3

    vpickve2gr.d    t8, vr5, 0
    cto.d           t8, t8
    blt             t8, s7, 7b

    addi.d          s5, s5, 1           // x + 1
    bge             s5, t1, .ret_posx
    addi.d          a7, a7, 5           // rb + 1
    vld             vr4, a7, 0          // [rb]
    vseq.b          vr5, vr4, vr3

    vpickve2gr.d    t8, vr5, 0
    cto.d           t8, t8
    blt             t8, s7, 7b

    b               .n_posy

.end_x:
    addi.d          s5, s5, 1           // x + 1
    blt             s5, t1, 7b

.ret_posx:
    add.d           s1, s1, t2          // r + stride
    addi.d          s2, s2, 1           // y + 1
    blt             s2, a5, 6b
8:
    addi.d          a6, a6, 1           // n + 1
    addi.d          t5, t5, 28          // mfmv_ref2ref(offset) + 28
    blt             a6, s8, 5b

.end_load:
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
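/*
 * The tables below feed save_tmvs_lsx. mask_mult turns the per-candidate
 * eligibility tests into a 2-bit selector, mv_tbls[selector] is the byte
 * shuffle that builds the 5-byte {mv, ref} block to store (all-255 rows get
 * masked to zero, i.e. "invalid"), and mask_mv0/mask_mv1 extend that 5-byte
 * pattern past the first 16 bytes so wide blocks can be written with a few
 * overlapping vector stores. A hedged C sketch of the selector (mirroring the
 * scalar save_tmvs_c conditions; names are illustrative):
 *
 *     int sel = 0;
 *     if (ref[0] > 0 && ref_sign[ref[0] - 1] &&
 *         (abs(mv[0].x) | abs(mv[0].y)) < 4096) sel |= 1;
 *     if (ref[1] > 0 && ref_sign[ref[1] - 1] &&
 *         (abs(mv[1].x) | abs(mv[1].y)) < 4096) sel |= 2;
 *     // sel == 0: store an invalid block; sel == 1: store mv[0]/ref[0];
 *     // sel >= 2: store mv[1]/ref[1] (mv[1] is preferred when both qualify).
 */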

const mv_tbls
    .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
    .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
    .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
    .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult
    .byte 1, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const mask_mv0
    .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
endconst

const mask_mv1
    .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
endconst

// void dav1d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
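/*
 * Rough C outline of save_tmvs_lsx below (a sketch following dav1d's scalar
 * save_tmvs_c, not the exact upstream source; bw8_from_bs() and select_mv()
 * are illustrative helpers, not dav1d API). Each candidate block contributes
 * one 5-byte refmvs_temporal_block, replicated over its width in 8-pixel
 * units; the 10:/20:/40:/80:/160: handlers store 1, 2, 4, 8 or 16 such
 * entries per candidate, and the jump table at the end supplies both the
 * candidate stride and the handler for each block size:
 *
 *     for (int y = row_start8; y < row_end8; y++) {
 *         const refmvs_block *const b = rr[(y & 15) * 2];
 *         for (int x = col_start8; x < col_end8;) {
 *             const refmvs_block *const cand_b = &b[x * 2 + 1];
 *             const int bw8 = bw8_from_bs(cand_b->bs);  // first table column / 12
 *             const refmvs_temporal_block blk = select_mv(cand_b, ref_sign);
 *             for (int n = 0; n < bw8; n++, x++)
 *                 rp[x] = blk;
 *         }
 *         rp += stride;
 *     }
 */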
function save_tmvs_lsx
    addi.d          sp, sp, -0x28
    st.d            s0, sp, 0x00
    st.d            s1, sp, 0x08
    st.d            s2, sp, 0x10
    st.d            s3, sp, 0x18
    st.d            s4, sp, 0x20
    move            t0, ra

    vxor.v          vr10, vr10, vr10
    vld             vr11, a3, 0         // Load ref_sign[0] ... ref_sign[6]
    la.local        t2, .save_tmvs_tbl
    la.local        s1, mask_mult
    la.local        t7, mv_tbls
    vld             vr9, s1, 0          // Load mask_mult
    vslli.d         vr11, vr11, 8       // 0, ref_sign[0], ..., ref_sign[6]
    la.local        s3, mask_mv0
    vld             vr8, s3, 0          // Load mask_mv0
    la.local        s4, mask_mv1
    vld             vr7, s4, 0          // Load mask_mv1
    li.d            s0, 5
    li.d            t8, 12 * 2
    mul.d           a1, a1, s0          // stride *= 5
    sub.d           a5, a5, a7          // h = row_end8 - row_start8
    slli.d          a7, a7, 1           // row_start8 <<= 1
1:
    li.d            s0, 5
    andi            t3, a7, 30          // (y & 15) * 2
    slli.d          s4, t3, 3
    ldx.d           t3, a2, s4          // b = rr[(y & 15) * 2]
    addi.d          t3, t3, 12          // &b[... + 1]
    mul.d           s4, a4, t8
    add.d           t4, s4, t3          // end_cand_b = &b[col_end8*2 + 1]
    mul.d           s3, a6, t8
    add.d           t3, s3, t3          // cand_b = &b[x*2 + 1]
    mul.d           s4, a6, s0
    add.d           a3, s4, a0          // &rp[x]
2:
    /* First cand_b */
    ld.b            t5, t3, 10          // cand_b->bs
    vld             vr0, t3, 0          // cand_b->mv and ref
    alsl.d          t5, t5, t2, 2       // bt2 index
    ld.h            s3, t3, 8           // cand_b->ref
    ld.h            t6, t5, 0           // bt2
    move            s0, t2
    alsl.d          t3, t6, t3, 1       // Next cand_b += bt2 * 2
    vor.v           vr2, vr0, vr0
    vinsgr2vr.h     vr1, s3, 0
    move            t1, t3
    bge             t3, t4, 3f

    /* Next cand_b */
    ld.b            s0, t3, 10          // cand_b->bs
    vld             vr4, t3, 0          // cand_b->mv and ref
    alsl.d          s0, s0, t2, 2       // bt2 index
    ld.h            s4, t3, 8           // cand_b->ref
    ld.h            t6, s0, 0           // bt2
    alsl.d          t3, t6, t3, 1       // Next cand_b += bt2 * 2
    vpackev.d       vr2, vr4, vr0       // a0.mv[0], a0.mv[1], a1.mv[0], a1.mv[1]
    vinsgr2vr.h     vr1, s4, 1          // a0.ref[0], a0.ref[1], a1.ref[0], a1.ref[1]
3:
    vabsd.h         vr2, vr2, vr10      // abs(mv[].xy)
    vsle.b          vr16, vr10, vr1
    vand.v          vr1, vr16, vr1
    vshuf.b         vr1, vr11, vr11, vr1 // ref_sign[ref]
    vsrli.h         vr2, vr2, 12        // abs(mv[].xy) >> 12
    vilvl.b         vr1, vr1, vr1
    vmulwev.h.bu    vr1, vr1, vr9       // ref_sign[ref] * {1, 2}

    vseqi.w         vr2, vr2, 0         // abs(mv[].xy) < 4096
    vpickev.h       vr2, vr2, vr2       // abs() condition to 16 bit

    vand.v          vr1, vr2, vr1       // h[0-3] contains conditions for mv[0-1]
    vhaddw.wu.hu    vr1, vr1, vr1       // Combine condition for [1] and [0]
    vpickve2gr.wu   s1, vr1, 0          // Extract case for first block
    vpickve2gr.wu   s2, vr1, 1

    ld.hu           t5, t5, 2           // Fetch jump table entry
    ld.hu           s0, s0, 2
    alsl.d          s3, s1, t7, 4       // Load permutation table based on case
    vld             vr1, s3, 0
    alsl.d          s4, s2, t7, 4
    vld             vr5, s4, 0
    sub.d           t5, t2, t5          // Find jump table target
    sub.d           s0, t2, s0

    vshuf.b         vr0, vr0, vr0, vr1  // Permute cand_b to output refmvs_temporal_block
    vshuf.b         vr4, vr4, vr4, vr5
    vsle.b          vr16, vr10, vr1
    vand.v          vr0, vr16, vr0

    vsle.b          vr17, vr10, vr5
    vand.v          vr4, vr17, vr4
    // vr1 follows on vr0, with another 3 full repetitions of the pattern.
    vshuf.b         vr1, vr0, vr0, vr8  // 1, 2, 3, ... , 15, 16
    vshuf.b         vr5, vr4, vr4, vr8  // 1, 2, 3, ... , 15, 16
    // vr2 ends with 3 complete repetitions of the pattern.
    vshuf.b         vr2, vr1, vr0, vr7
    vshuf.b         vr6, vr5, vr4, vr7  // 4, 5, 6, 7, ... , 12, 13, 14, 15, 16, 17, 18, 19

    jirl            ra, t5, 0
    bge             t1, t4, 4f          // if (cand_b >= end)
    vor.v           vr0, vr4, vr4
    vor.v           vr1, vr5, vr5
    vor.v           vr2, vr6, vr6
    jirl            ra, s0, 0
    blt             t3, t4, 2b          // if (cand_b < end)

4:
    addi.d          a5, a5, -1          // h--
    addi.d          a7, a7, 2           // y += 2
    add.d           a0, a0, a1          // rp += stride
    blt             zero, a5, 1b

    ld.d            s0, sp, 0x00
    ld.d            s1, sp, 0x08
    ld.d            s2, sp, 0x10
    ld.d            s3, sp, 0x18
    ld.d            s4, sp, 0x20
    addi.d          sp, sp, 0x28

    move            ra, t0
    jirl            zero, ra, 0x00

10:
    addi.d          s1, a3, 4
    vstelm.w        vr0, a3, 0, 0       // .mv
    vstelm.b        vr0, s1, 0, 4       // .ref
    addi.d          a3, a3, 5
    jirl            zero, ra, 0x00
20:
    addi.d          s1, a3, 8
    vstelm.d        vr0, a3, 0, 0       // .mv
    vstelm.h        vr0, s1, 0, 4       // .ref
    addi.d          a3, a3, 2 * 5
    jirl            zero, ra, 0x00
40:
    vst             vr0, a3, 0
    vstelm.w        vr1, a3, 0x10, 0
    addi.d          a3, a3, 4 * 5
    jirl            zero, ra, 0x00

80:
    vst             vr0, a3, 0
    vst             vr1, a3, 0x10       // This writes 6 full entries plus 2 extra bytes
    vst             vr2, a3, 5 * 8 - 16 // Write the last few, overlapping with the first write.
    addi.d          a3, a3, 8 * 5
    jirl            zero, ra, 0x00
160:
    addi.d          s1, a3, 6 * 5
    addi.d          s2, a3, 12 * 5
    vst             vr0, a3, 0
    vst             vr1, a3, 0x10       // This writes 6 full entries plus 2 extra bytes
    vst             vr0, a3, 6 * 5
    vst             vr1, a3, 6 * 5 + 16 // Write another 6 full entries, slightly overlapping with the first set
    vstelm.d        vr0, s2, 0, 0       // Write 8 bytes (one full entry) after the first 12
    vst             vr2, a3, 5 * 16 - 16 // Write the last 3 entries
    addi.d          a3, a3, 16 * 5
    jirl            zero, ra, 0x00

.save_tmvs_tbl:
    .hword 16 * 12                      // bt2 * 12, 12 is sizeof(refmvs_block)
    .hword .save_tmvs_tbl - 160b
    .hword 16 * 12
    .hword .save_tmvs_tbl - 160b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
endfunc