/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "loongson_asm.S"

/*
 * Layout of the context structure addressed through a0, as used by every
 * routine in this file:
 *     +0   buf_pos            (8 bytes)
 *     +8   buf_end            (8 bytes)
 *     +16  dif                (8 bytes)
 *     +24  rng                (4 bytes)
 *     +28  cnt                (4 bytes)
 *     +32  allow_update_cdf   (4 bytes)
 */

// Per-lane minimum probability offsets. The decoders load a window of this
// table that ends (for the lanes that matter) at the trailing 0 entry.
const min_prob
  .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst

// Eight halfwords of 0xff00, used to mask rng down to (rng >> 8) << 8.
const ph_0xff00
.rept 8
  .short 0xff00
.endr
endconst

/*
 * msac_renorm_refill cnt
 *
 * Common tail of the symbol and bool decoders: normalise rng/dif and, when
 * the bit counter is exhausted, refill dif from the byte buffer.
 *
 * In:   a0  = context pointer (see layout above)
 *       t5  = new rng, not yet normalised
 *       t6  = new dif, not yet normalised
 *       cnt = register that already holds the value loaded from a0 + 28
 * Out:  rng, cnt and dif are stored back through a0; buf_pos is stored back
 *       when bytes were consumed.
 * Clobbers: t0-t7. The cnt register may be t0; it is last read before t0 is
 * reused for buf_pos.
 * Labels: uses the numeric local labels 1, 2, 3, 4 and 9.
 */
.macro msac_renorm_refill cnt
    clz.w           t4, t5
    xori            t4, t4, 16              // d = clz.w(rng) ^ 16
    sll.d           t6, t6, t4              // dif <<= d
    sll.w           t5, t5, t4              // rng <<= d
    sub.w           t7, \cnt, t4            // cnt - d
    st.w            t5, a0, 24              // store rng
    bgeu            \cnt, t4, 9f            // (unsigned)cnt >= d: no refill

    // refill
    ld.d            t0, a0, 0               // buf_pos
    ld.d            t1, a0, 8               // buf_end
    addi.d          t2, t0, 8
    bltu            t1, t2, 2f              // fewer than 8 bytes available

    ld.d            t3, t0, 0               // next_bits
    addi.w          t1, t7, -48             // shift_bits = cnt + 16 (- 64)
    nor             t3, t3, t3
    sub.w           t2, zero, t1
    revb.d          t3, t3                  // next_bits = bswap(next_bits)
    srli.w          t2, t2, 3               // num_bytes_read
    srl.d           t3, t3, t1              // next_bits >>= (shift_bits & 63)
    b               3f
1:
    // buf_pos >= buf_end: nothing left to read
    addi.w          t3, t7, -48
    srl.d           t3, t3, t3              // pad with ones
    b               4f
2:
    bgeu            t0, t1, 1b
    // 1..7 bytes left: load the last 8 bytes of the buffer and shift out the
    // ones that precede buf_pos
    ld.d            t3, t1, -8              // next_bits
    sub.w           t2, t2, t1              // (buf_pos + 8) - buf_end
    sub.w           t1, t1, t0              // num_bytes_left
    slli.w          t2, t2, 3
    srl.d           t3, t3, t2
    addi.w          t2, t7, -48
    nor             t3, t3, t3
    sub.w           t4, zero, t2
    revb.d          t3, t3
    srli.w          t4, t4, 3
    srl.d           t3, t3, t2
    sltu            t2, t1, t4
    maskeqz         t1, t1, t2
    masknez         t2, t4, t2
    or              t2, t2, t1              // num_bytes_read = min of the two
3:
    slli.w          t1, t2, 3
    add.d           t0, t0, t2
    add.w           t7, t7, t1              // cnt += num_bits_read
    st.d            t0, a0, 0               // store buf_pos
4:
    or              t6, t6, t3              // dif |= next_bits
9:
    st.w            t7, a0, 28              // store cnt
    st.d            t6, a0, 16              // store dif
.endm

/*
 * decode_symbol_adapt w
 *
 * Body shared by the 4-, 8- and 16-lane adaptive symbol decoders. w is the
 * number of 16-bit cdf lanes handled: one LSX vector for w <= 8, two vectors
 * for w == 16.
 *
 * In:   a0 = context pointer (see layout above)
 *       a1 = cdf, an array of 16-bit entries
 *       a2 = index of the adaptation counter inside cdf
 *            (presumably n_symbols; confirm against the caller)
 * Out:  a0 = index of the first lane whose v is <= the top 16 bits of dif
 * Stack: 48-byte scratch frame; rng is stored at sp + 0 and the per-lane v
 *        values from sp + 2, so that u for lane i is the halfword just below
 *        v for lane i.
 */
.macro decode_symbol_adapt w
    addi.d          sp, sp, -48
    vldrepl.h       vr0, a0, 24             // rng
    fst.s           f0, sp, 0               // val==0
    vld             vr1, a1, 0              // cdf
.if \w == 16
    vld             vr11, a1, 16
.endif
    vldrepl.d       vr2, a0, 16             // dif
    ld.w            t1, a0, 32              // allow_update_cdf
    la.local        t2, min_prob
    addi.d          t2, t2, 30
    slli.w          t3, a2, 1
    sub.d           t2, t2, t3              // &min_prob[15 - a2]
    vld             vr3, t2, 0              // min_prob
.if \w == 16
    vld             vr13, t2, 16
.endif
    vsrli.h         vr4, vr0, 8             // r = s->rng >> 8
    vslli.h         vr4, vr4, 8             // r << 8
    vsrli.h         vr5, vr1, 6
    vslli.h         vr5, vr5, 7
.if \w == 16
    vsrli.h         vr15, vr11, 6
    vslli.h         vr15, vr15, 7
.endif
    vmuh.hu         vr5, vr4, vr5
    vadd.h          vr5, vr5, vr3           // v
.if \w == 16
    vmuh.hu         vr15, vr4, vr15
    vadd.h          vr15, vr15, vr13
.endif
    addi.d          t8, sp, 2
    vst             vr5, t8, 0              // store v
.if \w == 16
    vst             vr15, t8, 16
.endif
    vreplvei.h      vr20, vr2, 3            // c
    vsle.hu         vr6, vr5, vr20          // lane mask: v <= c
.if \w == 16
    vsle.hu         vr16, vr15, vr20
    vpickev.b       vr21, vr16, vr6         // pack both masks to bytes
.endif
.if \w <= 8
    vmskltz.h       vr10, vr6               // one mask bit per lane
.else
    vmskltz.b       vr10, vr21
.endif
    beqz            t1, .renorm\()\w

    // update_cdf
    alsl.d          t1, a2, a1, 1           // &cdf[a2]
    ld.h            t2, t1, 0               // count
    srli.w          t3, t2, 4               // count >> 4
.if \w == 16
    addi.w          t3, t3, 5               // rate
.else
    addi.w          t3, t3, 4
    li.w            t5, 2
    sltu            t5, t5, a2              // a2 > 2
    add.w           t3, t3, t5              // rate
.endif
    sltui           t5, t2, 32
    add.w           t2, t2, t5              // count + (count < 32)
    vreplgr2vr.h    vr9, t3
    vseq.h          vr7, vr7, vr7           // all ones
    vavgr.hu        vr5, vr6, vr7           // i >= val ? -1 : 32768
    vsub.h          vr5, vr5, vr1
    vsub.h          vr8, vr1, vr6
.if \w == 16
    vavgr.hu        vr15, vr16, vr7
    vsub.h          vr15, vr15, vr11
    vsub.h          vr18, vr11, vr16
.endif
    vsra.h          vr5, vr5, vr9
    vadd.h          vr8, vr8, vr5
.if \w == 4
    fst.d           f8, a1, 0               // only four lanes are written back
.else
    vst             vr8, a1, 0
.endif
.if \w == 16
    vsra.h          vr15, vr15, vr9
    vadd.h          vr18, vr18, vr15
    vst             vr18, a1, 16
.endif
    st.h            t2, t1, 0               // store count after the cdf lanes

.renorm\()\w:
    vpickve2gr.h    t3, vr10, 0
    ctz.w           a7, t3                  // ret
    alsl.d          t3, a7, t8, 1           // &v[ret] in the scratch frame
    ld.hu           t4, t3, 0               // v
    ld.hu           t5, t3, -2              // u
    sub.w           t5, t5, t4              // rng
    slli.d          t4, t4, 48
    vpickve2gr.d    t6, vr2, 0
    sub.d           t6, t6, t4              // dif
    ld.w            t0, a0, 28              // cnt
    msac_renorm_refill t0
    move            a0, a7
    addi.d          sp, sp, 48
.endm

// Adaptive symbol decode, four cdf lanes. Registers: see decode_symbol_adapt.
function msac_decode_symbol_adapt4_lsx
    decode_symbol_adapt 4
endfunc

// Adaptive symbol decode, eight cdf lanes. Registers: see decode_symbol_adapt.
function msac_decode_symbol_adapt8_lsx
    decode_symbol_adapt 8
endfunc

// Adaptive symbol decode, sixteen cdf lanes. Registers: see decode_symbol_adapt.
function msac_decode_symbol_adapt16_lsx
    decode_symbol_adapt 16
endfunc

/*
 * msac_decode_bool_lsx
 *
 * Decode one bit with the probability value passed in a1.
 *
 * In:   a0 = context pointer (see layout above)
 *       a1 = probability value f; only f >> 6 is used
 * Out:  a0 = 1 when dif < (v << 48), otherwise 0
 * Clobbers: a1, a5, t0-t8
 */
function msac_decode_bool_lsx
    ld.w            t0, a0, 24              // rng
    srli.w          a1, a1, 6
    ld.d            t1, a0, 16              // dif
    srli.w          t2, t0, 8               // r >> 8
    mul.w           t2, t2, a1
    ld.w            a5, a0, 28              // cnt
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4               // v
    slli.d          t3, t2, 48              // vw
    sltu            t4, t1, t3              // dif < vw
    move            t8, t4                  // ret
    xori            t4, t4, 1               // dif >= vw
    maskeqz         t6, t3, t4              // (dif >= vw) ? vw : 0
    sub.d           t6, t1, t6              // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5              // r - 2v
    maskeqz         t7, t5, t4              // (dif >= vw) ? r - 2v : 0
    add.w           t5, t2, t7              // v(rng)

    msac_renorm_refill a5
    move            a0, t8
endfunc

/*
 * msac_decode_bool_equi_lsx
 *
 * Decode one bit with a fixed split: v = ((rng >> 8) << 7) + 4.
 *
 * In:   a0 = context pointer (see layout above)
 * Out:  a0 = 1 when dif < (v << 48), otherwise 0
 * Clobbers: a5, t0-t8
 */
function msac_decode_bool_equi_lsx
    ld.w            t0, a0, 24              // rng
    ld.d            t1, a0, 16              // dif
    ld.w            a5, a0, 28              // cnt
    srli.w          t2, t0, 8               // r >> 8
    slli.w          t2, t2, 7
    addi.w          t2, t2, 4               // v

    slli.d          t3, t2, 48              // vw
    sltu            t4, t1, t3              // dif < vw
    move            t8, t4                  // ret
    xori            t4, t4, 1               // dif >= vw
    maskeqz         t6, t3, t4              // (dif >= vw) ? vw : 0
    sub.d           t6, t1, t6              // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5              // r - 2v
    maskeqz         t7, t5, t4              // (dif >= vw) ? r - 2v : 0
    add.w           t5, t2, t7              // v(rng)

    msac_renorm_refill a5
    move            a0, t8
endfunc

/*
 * msac_decode_bool_adapt_lsx
 *
 * Decode one bit using cdf[0] as the probability and, when the context
 * allow_update_cdf field is non-zero, adapt cdf[0] and the counter in cdf[1].
 *
 * In:   a0 = context pointer (see layout above)
 *       a1 = cdf, two 16-bit entries: cdf[0] probability, cdf[1] counter
 * Out:  a0 = decoded bit (1 when dif < (v << 48), otherwise 0)
 * Clobbers: a3, a4, a5, a7, t0-t8
 */
function msac_decode_bool_adapt_lsx
    ld.hu           a3, a1, 0               // cdf[0] /f
    ld.w            t0, a0, 24              // rng
    ld.d            t1, a0, 16              // dif
    srli.w          t2, t0, 8               // r >> 8
    srli.w          a7, a3, 6
    mul.w           t2, t2, a7
    ld.w            a4, a0, 32              // allow_update_cdf
    ld.w            a5, a0, 28              // cnt
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4               // v
    slli.d          t3, t2, 48              // vw
    sltu            t4, t1, t3              // dif < vw
    move            t8, t4                  // bit
    xori            t4, t4, 1               // dif >= vw
    maskeqz         t6, t3, t4              // (dif >= vw) ? vw : 0
    sub.d           t6, t1, t6              // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5              // r - 2v
    maskeqz         t7, t5, t4              // (dif >= vw) ? r - 2v : 0
    add.w           t5, t2, t7              // v(rng)
    beqz            a4, .renorm

    // update_cdf (leaves t5, t6, t8 and a5 untouched for the renorm below)
    ld.hu           t0, a1, 2               // cdf[1]
    srli.w          t1, t0, 4
    addi.w          t1, t1, 4               // rate
    sltui           t2, t0, 32              // count < 32
    add.w           t0, t0, t2              // count + (count < 32)
    sub.w           a3, a3, t8              // cdf[0] -= bit
    slli.w          t4, t8, 15              // bit << 15
    sub.w           t7, a3, t4              // cdf[0] - bit - (bit << 15)
    sra.w           t7, t7, t1              // (cdf[0] - bit - (bit << 15)) >> rate
    sub.w           t7, a3, t7              // cdf[0]
    st.h            t7, a1, 0
    st.h            t0, a1, 2

.renorm:
    msac_renorm_refill a5
    move            a0, t8
endfunc

/*
 * HI_TOK allow_update_cdf
 *
 * Loop body and epilogue of msac_decode_hi_tok_lsx, expanded once with cdf
 * adaptation (argument 1) and once without (argument 0). It expects the
 * register state set up by msac_decode_hi_tok_lsx, including its 0x20-byte
 * scratch frame, and releases that frame at the end.
 *
 * Each pass decodes one symbol in the range 0..3 from the four-entry cdf and
 * repeats while the symbol is 3, for at most four passes.
 *
 * Scratch frame: sp + 0 holds rng, sp + 2 onwards holds the per-lane v, so u
 * for lane i is the halfword just below v for lane i.
 */
.macro HI_TOK allow_update_cdf
.\allow_update_cdf\()_hi_tok_lsx_start:
.if \allow_update_cdf == 1
    ld.hu           a4, a1, 0x06            // cdf[3]
.endif
    vsrli.h         vr1, vr0, 0x06          // cdf[val] >> EC_PROB_SHIFT
    vstelm.h        vr2, sp, 0, 0           // scratch[0] = rng
    vand.v          vr2, vr2, vr4           // (8 x rng) & 0xff00
    vslli.h         vr1, vr1, 0x07
    vmuh.hu         vr1, vr1, vr2
    vadd.h          vr1, vr1, vr5           // v += 4 * (3 - val), from min_prob
    vst             vr1, sp, 0x02           // scratch[1..] = v
    vssub.hu        vr1, vr1, vr3           // v - c
    vseqi.h         vr1, vr1, 0             // lane mask: v <= c
.if \allow_update_cdf == 1
    addi.d          t4, a4, 0x50
    srli.d          t4, t4, 0x04            // rate = (count >> 4) + 5
    sltui           t7, a4, 32
    add.w           a4, a4, t7              // count + (count < 32)

    vreplgr2vr.h    vr7, t4
    vavgr.hu        vr9, vr8, vr1
    vsub.h          vr9, vr9, vr0
    vsub.h          vr0, vr0, vr1
    vsra.h          vr9, vr9, vr7
    vadd.h          vr0, vr0, vr9
    vstelm.d        vr0, a1, 0, 0           // write back cdf[0..3]
    st.h            a4, a1, 0x06            // then overwrite cdf[3] with count
.endif
    vmsknz.b        vr7, vr1
    movfr2gr.s      t4, f7
    ctz.w           t4, t4                  // 2 * lane index = byte offset
    addi.d          t7, t4, 2
    ldx.hu          t6, sp, t4              // u
    ldx.hu          t5, sp, t7              // v
    addi.w          t3, t3, 0x05
    addi.w          t4, t4, -0x05           // positive only for lane index 3
    sub.w           t6, t6, t5              // u - v , rng for ctx_norm
    slli.d          t5, t5, 0x30            // (ec_win)v << (EC_WIN_SIZE - 16)
    sub.d           t1, t1, t5              // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16))
    // Init ctx_norm param
    clz.w           t7, t6
    xori            t7, t7, 0x10            // d = 15 ^ (31 ^ clz(rng))
    sll.d           t1, t1, t7              // dif << d
    sll.d           t6, t6, t7              // rng << d
    // update vr2 8 x rng
    vreplgr2vr.h    vr2, t6
    st.w            t6, a0, 0x18            // store rng
    move            t0, t2
    sub.w           t2, t2, t7              // cnt - d
    bgeu            t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // (unsigned)cnt >= (unsigned)d: skip the refill
    // Step into ctx_fill
    ld.d            t5, a0, 0x00            // buf_pos
    ld.d            t6, a0, 0x08            // end_pos
    addi.d          t7, t5, 0x08            // buf_pos + 8
    sub.d           t7, t7, t6              // (buf_pos + 8) - end_pos
    blt             zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
    // (end_pos - buf_pos) >= 8
    ld.d            t6, t5, 0x00            // load buf_pos[0]~buf_pos[7]
    addi.w          t7, t2, -0x30           // cnt - 0x30
    nor             t6, t6, t6              // not buf data
    revb.d          t6, t6                  // Byte reversal
    srl.d           t6, t6, t7              // Replace left shift with right shift
    sub.w           t7, zero, t7            // neg
    srli.w          t7, t7, 0x03            // Loop times
    or              t1, t1, t6              // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
    b               .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
    bge             t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
    // end_pos - buf_pos < 8 && buf_pos < end_pos
    ld.d            t0, t6, -0x08
    slli.d          t7, t7, 0x03
    srl.d           t6, t0, t7              // Retrieve the buf data and remove the excess data
    addi.w          t7, t2, -0x30           // cnt - 0x30
    nor             t6, t6, t6              // not
    revb.d          t6, t6                  // Byte reversal
    srl.d           t6, t6, t7              // Replace left shift with right shift
    sub.w           t7, zero, t7            // neg
    or              t1, t1, t6              // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
    ld.d            t6, a0, 0x08            // end_pos
    srli.w          t7, t7, 0x03            // Loop times
    sub.d           t6, t6, t5              // end_pos - buf_pos
    slt             t0, t6, t7
    maskeqz         a3, t6, t0              // min(loop_times, end_pos - buf_pos)
    masknez         t0, t7, t0
    or              t7, a3, t0
    b               .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
    // buf_pos >= end_pos
    addi.w          t7, t2, -0x10
    andi            t7, t7, 0xf
    nor             t0, zero, zero
    srl.d           t0, t0, t7
    or              t1, t1, t0              // dif |= ~(~(ec_win)0xff << c);
    b               .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
    add.d           t5, t5, t7              // buf_pos + Loop_times
    st.d            t5, a0, 0x00            // Store buf_pos
    alsl.w          t2, t7, t2, 0x03        // update cnt
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
    srli.d          t7, t1, 0x30
    vreplgr2vr.h    vr3, t7                 // broadcast the high 16 bits of dif
    add.w           t3, t4, t3              // update control parameter
    beqz            t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times.
    blt             zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
.\allow_update_cdf\()_hi_tok_lsx_end:
    addi.d          t3, t3, 0x1e
    st.d            t1, a0, 0x10            // store dif
    st.w            t2, a0, 0x1c            // store cnt
    srli.w          a0, t3, 0x01            // tok
    addi.d          sp, sp, 0x20            // release the scratch frame
.endm

/**
 * C reference: unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf)
 *
 * In:  a0 = s, a1 = cdf (cdf[0..2] probabilities, cdf[3] adaptation counter)
 * Out: a0 = tok
 *
 * Register allocation
 *   vr0: cdf;
 *   vr1: temp;
 *   vr2: rng;
 *   vr3: dif;
 *   vr4: const 0xff00ff00...ff00ff00;
 *   vr5: const 0x0004080c;
 *   vr8: const all ones;
 *   t0: allow_update_cdf, tmp;
 *   t1: dif;
 *   t2: cnt;
 *   t3: outermost control parameter, starts at -24 (0xffffffe8);
 *   t4: loop time
 *   t5: v, buf_pos, temp;
 *   t6: u, rng, end_pos, buf, temp;
 *   t7: temp;
 *
 * Stack: 0x20-byte scratch frame, of which HI_TOK touches the first 18 bytes.
 * The size is a multiple of 16 so that sp keeps the 16-byte alignment it has
 * on entry, as decode_symbol_adapt does with its 48-byte frame.
 */
function msac_decode_hi_tok_lsx
    fld.d           f0, a1, 0               // Load cdf[0]~cdf[3]
    vldrepl.h       vr2, a0, 0x18           // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid
    vldrepl.h       vr3, a0, 0x16           // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16)
    ld.w            t0, a0, 0x20            // allow_update_cdf
    la.local        t7, ph_0xff00
    vld             vr4, t7, 0x00           // 0xff00ff00...ff00ff00
    la.local        t7, min_prob
    vld             vr5, t7, 12 * 2         // 0x0004080c
    ld.d            t1, a0, 0x10            // dif
    ld.w            t2, a0, 0x1c            // cnt
    li.w            t3, -0x18               // 0xffffffe8; only read through addi.w
    vseq.h          vr8, vr8, vr8           // all ones
    addi.d          sp, sp, -0x20           // alloc stack
    beqz            t0, .hi_tok_lsx_no_update_cdf
    HI_TOK 1
    jirl            zero, ra, 0x0
.hi_tok_lsx_no_update_cdf:
    HI_TOK 0
endfunc