/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

#define REST_UNIT_STRIDE (400)

.macro MADD_HU_BU in0, in1, out0, out1
    vsllwil.hu.bu vr12, \in0, 0
    vexth.hu.bu vr13, \in0
    vmadd.h \out0, vr12, \in1
    vmadd.h \out1, vr13, \in1
.endm

const wiener_shuf
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

/*
void wiener_filter_h_lsx(int32_t *hor_ptr,
                         uint8_t *tmp_ptr,
                         const int16_t filterh[8],
                         const int w, const int h)
*/
function wiener_filter_h_8bpc_lsx
    addi.d sp, sp, -40
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    li.w t7, 1<<14 // clip_limit

    la.local t1, wiener_shuf
    vld vr4, t1, 0
    vld vr14, a2, 0 // filter[0][k]
    vreplvei.h vr21, vr14, 0
    vreplvei.h vr22, vr14, 1
    vreplvei.h vr23, vr14, 2
    vreplvei.h vr24, vr14, 3
    vreplvei.h vr25, vr14, 4
    vreplvei.h vr26, vr14, 5
    vreplvei.h vr27, vr14, 6
    vreplgr2vr.w vr0, t7

.WIENER_FILTER_H_H:
    addi.w a4, a4, -1 // h
    addi.w t0, a3, 0 // w
    addi.d t1, a1, 0 // tmp_ptr
    addi.d t2, a0, 0 // hor_ptr

.WIENER_FILTER_H_W:
    addi.w t0, t0, -16
    vld vr5, t1, 0
    vld vr13, t1, 16

    vsubi.bu vr14, vr4, 2
    vsubi.bu vr15, vr4, 1
    vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16
    vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17
    vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18
    vaddi.bu vr14, vr4, 1
    vaddi.bu vr15, vr4, 2
    vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19
    vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
    vaddi.bu vr14, vr4, 3
    vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21
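    // The accumulation below appears to follow the generic C horizontal
    // pass (hedged scalar sketch, not taken from this file):
    //   sum    = (1 << 14) + 128 * src[x + 3];
    //   for (k = 0; k < 7; k++) sum += filterh[k] * src[x + k];
    //   hor[x] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
    // vr17-vr20 hold the 128 * src[x + 3] term, vr14/vr15 collect the
    // 7-tap sums, and vsrari.w/vclip.w provide the rounded shift and clamp.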

    vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10
    vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18
    vsllwil.wu.hu vr17, vr15, 7 // 3 4 5 6
    vexth.wu.hu vr18, vr15 // 7 8 9 10
    vsllwil.wu.hu vr19, vr16, 7 // 11 12 13 14
    vexth.wu.hu vr20, vr16 // 15 16 17 18
    vslli.w vr18, vr18, 7
    vslli.w vr20, vr20, 7
    vxor.v vr15, vr15, vr15
    vxor.v vr14, vr14, vr14

    MADD_HU_BU vr5, vr21, vr14, vr15
    MADD_HU_BU vr6, vr22, vr14, vr15
    MADD_HU_BU vr7, vr23, vr14, vr15
    MADD_HU_BU vr8, vr24, vr14, vr15
    MADD_HU_BU vr9, vr25, vr14, vr15
    MADD_HU_BU vr10, vr26, vr14, vr15
    MADD_HU_BU vr11, vr27, vr14, vr15

    vsllwil.w.h vr5, vr14, 0 // 0 1 2 3
    vexth.w.h vr6, vr14 // 4 5 6 7
    vsllwil.w.h vr7, vr15, 0 // 8 9 10 11
    vexth.w.h vr8, vr15 // 12 13 14 15
    vadd.w vr17, vr17, vr5
    vadd.w vr18, vr18, vr6
    vadd.w vr19, vr19, vr7
    vadd.w vr20, vr20, vr8
    vadd.w vr17, vr17, vr0
    vadd.w vr18, vr18, vr0
    vadd.w vr19, vr19, vr0
    vadd.w vr20, vr20, vr0

    vsrli.w vr1, vr0, 1
    vsubi.wu vr1, vr1, 1
    vxor.v vr3, vr3, vr3
    vsrari.w vr17, vr17, 3
    vsrari.w vr18, vr18, 3
    vsrari.w vr19, vr19, 3
    vsrari.w vr20, vr20, 3
    vclip.w vr17, vr17, vr3, vr1
    vclip.w vr18, vr18, vr3, vr1
    vclip.w vr19, vr19, vr3, vr1
    vclip.w vr20, vr20, vr3, vr1

    vst vr17, t2, 0
    vst vr18, t2, 16
    vst vr19, t2, 32
    vst vr20, t2, 48
    addi.d t1, t1, 16
    addi.d t2, t2, 64
    blt zero, t0, .WIENER_FILTER_H_W

    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a0, a0, (REST_UNIT_STRIDE << 2)
    bnez a4, .WIENER_FILTER_H_H

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    addi.d sp, sp, 40
endfunc

.macro APPLY_FILTER in0, in1, in2
    alsl.d t7, \in0, \in1, 2
    vld vr10, t7, 0
    vld vr11, t7, 16
    vld vr12, t7, 32
    vld vr13, t7, 48
    vmadd.w vr14, vr10, \in2
    vmadd.w vr15, vr11, \in2
    vmadd.w vr16, vr12, \in2
    vmadd.w vr17, vr13, \in2
.endm

.macro wiener_filter_v_8bpc_core_lsx
    vreplgr2vr.w vr14, t6
    vreplgr2vr.w vr15, t6
    vreplgr2vr.w vr16, t6
    vreplgr2vr.w vr17, t6

    addi.w t7, t2, 0 // j + index k
    mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
    add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER t7, a2, vr2
    APPLY_FILTER t8, t7, vr3
    APPLY_FILTER t8, t7, vr4
    APPLY_FILTER t8, t7, vr5
    APPLY_FILTER t8, t7, vr6
    APPLY_FILTER t8, t7, vr7
    APPLY_FILTER t8, t7, vr8
    vssrarni.hu.w vr15, vr14, 11
    vssrarni.hu.w vr17, vr16, 11
    vssrlni.bu.h vr17, vr15, 0
.endm

/*
void wiener_filter_v_lsx(uint8_t *p,
                         const ptrdiff_t p_stride,
                         const int32_t *hor,
                         const int16_t filterv[8],
                         const int w, const int h)
*/
function wiener_filter_v_8bpc_lsx
    li.w t6, -(1 << 18)

    li.w t8, REST_UNIT_STRIDE
    ld.h t0, a3, 0
    ld.h t1, a3, 2
    vreplgr2vr.w vr2, t0
    vreplgr2vr.w vr3, t1
    ld.h t0, a3, 4
    ld.h t1, a3, 6
    vreplgr2vr.w vr4, t0
    vreplgr2vr.w vr5, t1
    ld.h t0, a3, 8
    ld.h t1, a3, 10
    vreplgr2vr.w vr6, t0
    vreplgr2vr.w vr7, t1
    ld.h t0, a3, 12
    vreplgr2vr.w vr8, t0

    andi t1, a4, 0xf
    sub.w t0, a4, t1 // w-w%16
    or t2, zero, zero // j
    or t4, zero, zero
    beqz t0, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_H:
    andi t1, a4, 0xf
    add.d t3, zero, a0 // p
    or t4, zero, zero // i

.WIENER_FILTER_V_W:

    wiener_filter_v_8bpc_core_lsx

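    // The core macro above packs 16 output pixels into vr17; assuming the
    // usual vertical Wiener formulation, each pixel is roughly
    //   v = -(1 << 18) + sum(filterv[k] * hor[(j + k) * REST_UNIT_STRIDE + i]);
    //   p = clip_u8((v + (1 << 10)) >> 11);
    // with the rounding shift and both saturations done by the
    // vssrarni.hu.w / vssrlni.bu.h pair.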
    mul.w t5, t2, a1 // j * stride
    add.w t5, t5, t4 // j * stride + i
    add.d t3, a0, t5
    addi.w t4, t4, 16
    vst vr17, t3, 0
    bne t0, t4, .WIENER_FILTER_V_W

    beqz t1, .WIENER_FILTER_V_W_EQ16

    wiener_filter_v_8bpc_core_lsx

    addi.d t3, t3, 16
    andi t1, a4, 0xf

.WIENER_FILTER_V_ST_REM:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM
.WIENER_FILTER_V_W_EQ16:
    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_H
    b .WIENER_FILTER_V_END

.WIENER_FILTER_V_W_LT16:
    andi t1, a4, 0xf
    add.d t3, zero, a0

    wiener_filter_v_8bpc_core_lsx

    mul.w t5, t2, a1 // j * stride
    add.d t3, a0, t5

.WIENER_FILTER_V_ST_REM_1:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM_1

    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_END:
endfunc

/*
void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
               const int w, const int h)
*/
function boxsum3_h_8bpc_lsx
    addi.d a2, a2, REST_UNIT_STRIDE
    li.w t0, 1
    addi.w a3, a3, -2
    addi.w a4, a4, -4

.LBS3_H_H:
    alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x
    alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x
    add.d t3, t0, a2 // s
    addi.w t5, a3, 0
.LBS3_H_W:
    vld vr0, t3, 0
    vld vr1, t3, REST_UNIT_STRIDE
    vld vr2, t3, (REST_UNIT_STRIDE<<1)

    vilvl.b vr3, vr1, vr0
    vhaddw.hu.bu vr4, vr3, vr3
    vilvh.b vr5, vr1, vr0
    vhaddw.hu.bu vr6, vr5, vr5
    vsllwil.hu.bu vr7, vr2, 0
    vexth.hu.bu vr8, vr2
    // sum_v
    vadd.h vr4, vr4, vr7
    vadd.h vr6, vr6, vr8
    vst vr4, t1, REST_UNIT_STRIDE<<1
    vst vr6, t1, (REST_UNIT_STRIDE<<1)+16
    addi.d t1, t1, 32
    // sumsq
    vmulwev.h.bu vr9, vr3, vr3
    vmulwod.h.bu vr10, vr3, vr3
    vmulwev.h.bu vr11, vr5, vr5
    vmulwod.h.bu vr12, vr5, vr5
    vaddwev.w.hu vr13, vr10, vr9
    vaddwod.w.hu vr14, vr10, vr9
    vaddwev.w.hu vr15, vr12, vr11
    vaddwod.w.hu vr16, vr12, vr11
    vmaddwev.w.hu vr13, vr7, vr7
    vmaddwod.w.hu vr14, vr7, vr7
    vmaddwev.w.hu vr15, vr8, vr8
    vmaddwod.w.hu vr16, vr8, vr8
    vilvl.w vr9, vr14, vr13
    vilvh.w vr10, vr14, vr13
    vilvl.w vr11, vr16, vr15
    vilvh.w vr12, vr16, vr15
    vst vr9, t2, REST_UNIT_STRIDE<<2
    vst vr10, t2, (REST_UNIT_STRIDE<<2)+16
    vst vr11, t2, (REST_UNIT_STRIDE<<2)+32
    vst vr12, t2, (REST_UNIT_STRIDE<<2)+48

    addi.d t2, t2, 64
    addi.w t5, t5, -16
    addi.d t3, t3, 16
    blt zero, t5, .LBS3_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE
    addi.d a4, a4, -1
    blt zero, a4, .LBS3_H_H
endfunc

/*
void boxsum3_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
function boxsum3_v_8bpc_lsx
    addi.d a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w a3, a3, -4
    addi.w a2, a2, -4

.LBS3_V_H:
    sub.w t3, a2, zero
    addi.d t0, a0, 4
    addi.d t1, a1, 2
    addi.d t5, a0, 8
    addi.d t6, a1, 4

    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
    vld vr3, t0, 0 // a2 0 1 2 3
    vld vr4, t0, 4 // b2 1 2 3 4
    vld vr5, t0, 8 // c2 2 3 4 5
    vld vr6, t0, 16 // 3 4 5 6
    vld vr7, t0, 20 // 4 5 6 7
    vld vr8, t0, 24 // 5 6 7 8
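    // Sliding 3-tap sums along x; a hedged sketch of the scalar loop this
    // vector code stands in for:
    //   sum[x]   = sum_in[x - 1]   + sum_in[x]   + sum_in[x + 1];
    //   sumsq[x] = sumsq_in[x - 1] + sumsq_in[x] + sumsq_in[x + 1];
    // t7/t8 carry one trailing element across iterations so lane 0 can be
    // patched in the .LBS3_V_W8 loop below.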
    vadd.h vr9, vr0, vr1
    vadd.w vr10, vr3, vr4
    vadd.w vr11, vr6, vr7
    vadd.h vr9, vr9, vr2
    vadd.w vr10, vr10, vr5
    vadd.w vr11, vr11, vr8
    vpickve2gr.h t7, vr2, 6
    vpickve2gr.w t8, vr8, 2
    vst vr9, t6, 0
    vst vr10, t5, 0
    vst vr11, t5, 16

    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 16
    addi.d t3, t3, -8
    ble t3, zero, .LBS3_V_H0

.LBS3_V_W8:
    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
    vld vr3, t0, 0 // a2 0 1 2 3
    vld vr4, t0, 4 // b2 1 2 3 4
    vld vr5, t0, 8 // c2 2 3 4 5
    vld vr6, t0, 16 // 3 4 5 6
    vld vr7, t0, 20 // 4 5 6 7
    vld vr8, t0, 24 // 5 6 7 8
    vinsgr2vr.h vr0, t7, 0
    vinsgr2vr.w vr3, t8, 0
    vpickve2gr.h t7, vr2, 6
    vpickve2gr.w t8, vr8, 2
    vadd.h vr9, vr0, vr1
    vadd.w vr10, vr3, vr4
    vadd.w vr11, vr6, vr7
    vadd.h vr9, vr9, vr2
    vadd.w vr10, vr10, vr5
    vadd.w vr11, vr11, vr8
    vst vr9, t6, 0
    vst vr10, t5, 0
    vst vr11, t5, 16
    addi.d t3, t3, -8
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 16
    blt zero, t3, .LBS3_V_W8

.LBS3_V_H0:
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -1
    bnez a3, .LBS3_V_H
endfunc

/*
boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
                          const int w, const int h,
                          const unsigned s)
*/
function boxsum3_sgf_h_8bpc_lsx
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, 12 // AA
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a1, a1, 6 // BB
    la.local t8, dav1d_sgr_x_by_x
    li.w t6, 455
    vreplgr2vr.w vr20, t6
    li.w t6, 255
    vreplgr2vr.w vr22, t6
    vaddi.wu vr21, vr22, 1 // 256
    vreplgr2vr.w vr6, a4
    vldi vr19, 0x809
    addi.w a2, a2, 2 // w + 2
    addi.w a3, a3, 2 // h + 2

.LBS3SGF_H_H:
    addi.w t2, a2, 0
    addi.d t0, a0, -4
    addi.d t1, a1, -2

.LBS3SGF_H_W:
    addi.w t2, t2, -8
    vld vr0, t0, 0 // AA[i]
    vld vr1, t0, 16
    vld vr2, t1, 0 // BB[i]

    vmul.w vr4, vr0, vr19 // a * n
    vmul.w vr5, vr1, vr19 // a * n
    vsllwil.w.h vr9, vr2, 0
    vexth.w.h vr10, vr2
    vmsub.w vr4, vr9, vr9 // p
    vmsub.w vr5, vr10, vr10 // p
    vmaxi.w vr4, vr4, 0
    vmaxi.w vr5, vr5, 0 // p
    vmul.w vr4, vr4, vr6 // p * s
    vmul.w vr5, vr5, vr6 // p * s
    vsrlri.w vr4, vr4, 20
    vsrlri.w vr5, vr5, 20 // z
    vmin.w vr4, vr4, vr22
    vmin.w vr5, vr5, vr22

    vpickve2gr.w t6, vr4, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 0
    vpickve2gr.w t6, vr4, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 1
    vpickve2gr.w t6, vr4, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 2
    vpickve2gr.w t6, vr4, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 3

    vpickve2gr.w t6, vr5, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 0
    vpickve2gr.w t6, vr5, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 1
    vpickve2gr.w t6, vr5, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 2
    vpickve2gr.w t6, vr5, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 3 // x

    vmul.w vr9, vr7, vr9 // x * BB[i]
    vmul.w vr10, vr8, vr10
    vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
    vmul.w vr10, vr10, vr20
    vsrlri.w vr9, vr9, 12
    vsrlri.w vr10, vr10, 12
    vsub.w vr7, vr21, vr7
    vsub.w vr8, vr21, vr8
    vpickev.h vr8, vr8, vr7

    vst vr9, t0, 0
    vst vr10, t0, 16
    vst vr8, t1, 0
    addi.d t0, t0, 32
    addi.d t1, t1, 16
    blt zero, t2, .LBS3SGF_H_W
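    // The loop above is the 3x3 (n == 9) self-guided a/b step; per lane it
    // appears to match the reference flow (hedged sketch):
    //   p     = max(AA[i] * 9 - BB[i] * BB[i], 0);
    //   z     = (p * s + (1 << 19)) >> 20;
    //   x     = dav1d_sgr_x_by_x[min(z, 255)];       // scalar lookups above
    //   AA[i] = (x * BB[i] * 455 + (1 << 11)) >> 12;  // 455 = sgr_one_by_x
    //   BB[i] = 256 - x;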

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.w a3, a3, -1
    bnez a3, .LBS3SGF_H_H
endfunc

/*
boxsum3_selfguided_filter(coef *dst, pixel *src,
                          int32_t *sumsq, coef *sum,
                          const int w, const int h)
*/
function boxsum3_sgf_v_8bpc_lsx
    addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
    addi.d a3, a3, REST_UNIT_STRIDE<<2
    addi.d a3, a3, 6
.LBS3SGF_V_H:
    // A int32_t *sumsq
    addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
    addi.d t1, a2, 0 // sumsq
    addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
    addi.d t6, a1, 0
    addi.w t7, a4, 0
    addi.d t8, a0, 0
    // B coef *sum
    addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
    addi.d t4, a3, 0
    addi.d t5, a3, REST_UNIT_STRIDE<<1

.LBS3SGF_V_W:
    vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE]
    vld vr1, t0, 16
    vld vr2, t1, -4 // P[i-1] -1 0 1 2
    vld vr3, t1, 12 // 3 4 5 6
    vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE]
    vld vr5, t2, 16
    vld vr6, t1, 0 // p[i] 0 1 2 3
    vld vr7, t1, 16 // 4 5 6 7
    vld vr8, t1, 4 // p[i+1] 1 2 3 4
    vld vr9, t1, 20 // 5 6 7 8

    vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
    vld vr11, t0, 12
    vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
    vld vr13, t2, 12
    vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
    vld vr15, t0, 20
    vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
    vld vr17, t2, 20

    vadd.w vr0, vr2, vr0
    vadd.w vr4, vr6, vr4
    vadd.w vr0, vr0, vr8
    vadd.w vr20, vr0, vr4
    vslli.w vr20, vr20, 2 // 0 1 2 3
    vadd.w vr0, vr1, vr3
    vadd.w vr4, vr5, vr7
    vadd.w vr0, vr0, vr9
    vadd.w vr21, vr0, vr4
    vslli.w vr21, vr21, 2 // 4 5 6 7
    vadd.w vr12, vr10, vr12
    vadd.w vr16, vr14, vr16
    vadd.w vr22, vr12, vr16
    vslli.w vr23, vr22, 1
    vadd.w vr22, vr23, vr22
    vadd.w vr11, vr11, vr13
    vadd.w vr15, vr15, vr17
    vadd.w vr0, vr11, vr15
    vslli.w vr23, vr0, 1
    vadd.w vr23, vr23, vr0
    vadd.w vr20, vr20, vr22 // b
    vadd.w vr21, vr21, vr23

    // B coef *sum
    vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE]
    vld vr1, t4, -2 // p[i - 1]
    vld vr2, t4, 0 // p[i]
    vld vr3, t4, 2 // p[i + 1]
    vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE]
    vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
    vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
    vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
    vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
    vaddwev.w.h vr9, vr0, vr1
    vaddwod.w.h vr10, vr0, vr1
    vaddwev.w.h vr11, vr2, vr3
    vaddwod.w.h vr12, vr2, vr3
    vadd.w vr9, vr11, vr9
    vadd.w vr10, vr12, vr10
    vilvl.w vr11, vr10, vr9 // 0 1 2 3
    vilvh.w vr12, vr10, vr9 // 4 5 6 7
    vsllwil.w.h vr0, vr4, 0
    vexth.w.h vr1, vr4
    vadd.w vr0, vr11, vr0
    vadd.w vr1, vr12, vr1
    vslli.w vr0, vr0, 2
    vslli.w vr1, vr1, 2
    vaddwev.w.h vr9, vr5, vr6
    vaddwod.w.h vr10, vr5, vr6
    vaddwev.w.h vr11, vr7, vr8
    vaddwod.w.h vr12, vr7, vr8
    vadd.w vr9, vr11, vr9
    vadd.w vr10, vr12, vr10
    vilvl.w vr13, vr10, vr9
    vilvh.w vr14, vr10, vr9
    vslli.w vr15, vr13, 1
    vslli.w vr16, vr14, 1
    vadd.w vr15, vr13, vr15 // a
    vadd.w vr16, vr14, vr16
    vadd.w vr22, vr0, vr15
    vadd.w vr23, vr1, vr16
    vld vr0, t6, 0 // src
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.wu.hu vr1, vr0, 0
    vexth.wu.hu vr2, vr0
    vmadd.w vr20, vr22, vr1
    vmadd.w vr21, vr23, vr2
    vssrlrni.h.w vr21, vr20, 9
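    // vr20/vr21 now hold 4 * (centre + cross) + 3 * diagonal sums of the
    // 32-bit A plane plus src times the equally weighted 16-bit B plane;
    // per pixel this is roughly (hedged sketch)
    //   dst[i] = (sumA(i) + sumB(i) * src[i] + (1 << 8)) >> 9;
    // which is what the vssrlrni.h.w above just rounded and narrowed.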
    vst vr21, t8, 0
    addi.d t8, t8, 16

    addi.d t0, t0, 32
    addi.d t1, t1, 32
    addi.d t2, t2, 32
    addi.d t3, t3, 16
    addi.d t4, t4, 16
    addi.d t5, t5, 16
    addi.d t6, t6, 8
    addi.w t7, t7, -8
    blt zero, t7, .LBS3SGF_V_W

    addi.w a5, a5, -1
    addi.d a0, a0, 384*2
    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a3, a3, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    bnez a5, .LBS3SGF_V_H
endfunc

function boxsum3_sgf_v_8bpc_lasx
    addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
    addi.d a3, a3, REST_UNIT_STRIDE<<2
    addi.d a3, a3, 6
.LBS3SGF_V_H_LASX:
    // A int32_t *sumsq
    addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
    addi.d t1, a2, 0 // sumsq
    addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
    addi.d t6, a1, 0
    addi.w t7, a4, 0
    addi.d t8, a0, 0
    // B coef *sum
    addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
    addi.d t4, a3, 0
    addi.d t5, a3, REST_UNIT_STRIDE<<1

.LBS3SGF_V_W_LASX:
    xvld xr0, t0, 0 // P[i - REST_UNIT_STRIDE]
    xvld xr1, t0, 32
    xvld xr2, t1, -4 // P[i-1] -1 0 1 2
    xvld xr3, t1, 28 // 3 4 5 6
    xvld xr4, t2, 0 // P[i + REST_UNIT_STRIDE]
    xvld xr5, t2, 32
    xvld xr6, t1, 0 // p[i] 0 1 2 3
    xvld xr7, t1, 32 // 4 5 6 7
    xvld xr8, t1, 4 // p[i+1] 1 2 3 4
    xvld xr9, t1, 36 // 5 6 7 8

    xvld xr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
    xvld xr11, t0, 28
    xvld xr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
    xvld xr13, t2, 28
    xvld xr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
    xvld xr15, t0, 36
    xvld xr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
    xvld xr17, t2, 36

    xvadd.w xr0, xr2, xr0
    xvadd.w xr4, xr6, xr4
    xvadd.w xr0, xr0, xr8
    xvadd.w xr20, xr0, xr4
    xvslli.w xr20, xr20, 2 // 0 1 2 3
    xvadd.w xr0, xr1, xr3
    xvadd.w xr4, xr5, xr7
    xvadd.w xr0, xr0, xr9
    xvadd.w xr21, xr0, xr4
    xvslli.w xr21, xr21, 2 // 4 5 6 7
    xvadd.w xr12, xr10, xr12
    xvadd.w xr16, xr14, xr16
    xvadd.w xr22, xr12, xr16
    xvslli.w xr23, xr22, 1
    xvadd.w xr22, xr23, xr22
    xvadd.w xr11, xr11, xr13
    xvadd.w xr15, xr15, xr17
    xvadd.w xr0, xr11, xr15
    xvslli.w xr23, xr0, 1
    xvadd.w xr23, xr23, xr0
    xvadd.w xr20, xr20, xr22 // b
    xvadd.w xr21, xr21, xr23

    // B coef *sum
    xvld xr0, t3, 0 // P[i - REST_UNIT_STRIDE]
    xvld xr1, t4, -2 // p[i - 1]
    xvld xr2, t4, 0 // p[i]
    xvld xr3, t4, 2 // p[i + 1]
    xvld xr4, t5, 0 // P[i + REST_UNIT_STRIDE]
    xvld xr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
    xvld xr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
    xvld xr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
    xvld xr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]

    xvaddwev.w.h xr9, xr0, xr1
    xvaddwod.w.h xr10, xr0, xr1
    xvaddwev.w.h xr11, xr2, xr3
    xvaddwod.w.h xr12, xr2, xr3
    xvadd.w xr9, xr11, xr9 // 0 2 4 6 8 10 12 14
    xvadd.w xr10, xr12, xr10 // 1 3 5 7 9 11 13 15
    xvilvl.w xr11, xr10, xr9 // 0 1 2 3 8 9 10 11
    xvilvh.w xr12, xr10, xr9 // 4 5 6 7 12 13 14 15
    xvsllwil.w.h xr0, xr4, 0 // 0 1 2 3 8 9 10 11
    xvexth.w.h xr1, xr4 // 4 5 6 7 12 13 14 15

    xvadd.w xr0, xr11, xr0
    xvadd.w xr1, xr12, xr1
    xvslli.w xr0, xr0, 2
    xvslli.w xr1, xr1, 2

    xvaddwev.w.h xr9, xr5, xr6
    xvaddwod.w.h xr10, xr5, xr6
    xvaddwev.w.h xr11, xr7, xr8
    xvaddwod.w.h xr12, xr7, xr8
    xvadd.w xr9, xr11, xr9
    xvadd.w xr10, xr12, xr10
    xvilvl.w xr13, xr10, xr9 // 0 1 2 3 8 9 10 11
    xvilvh.w xr14, xr10, xr9 // 4 5 6 7 12 13 14 15

    xvslli.w xr15, xr13, 1
    xvslli.w xr16, xr14, 1
    xvadd.w xr15, xr13, xr15 // a
    xvadd.w xr16, xr14, xr16
    xvadd.w xr22, xr0, xr15 // A B
    xvadd.w xr23, xr1, xr16 // C D

    vld vr0, t6, 0 // src
    vilvh.d vr2, vr0, vr0
    vext2xv.wu.bu xr1, xr0
    vext2xv.wu.bu xr2, xr2
    xvor.v xr15, xr22, xr22 // A B
    xvpermi.q xr22, xr23, 0b00000010 // A C
    xvpermi.q xr23, xr15, 0b00110001
    xvmadd.w xr20, xr22, xr1
    xvmadd.w xr21, xr23, xr2
    xvssrlrni.h.w xr21, xr20, 9
    xvpermi.d xr22, xr21, 0b11011000
    xvst xr22, t8, 0
    addi.d t8, t8, 32

    addi.d t0, t0, 64
    addi.d t1, t1, 64
    addi.d t2, t2, 64
    addi.d t3, t3, 32
    addi.d t4, t4, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 16
    addi.w t7, t7, -16
    blt zero, t7, .LBS3SGF_V_W_LASX

    addi.w a5, a5, -1
    addi.d a0, a0, 384*2
    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a3, a3, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    bnez a5, .LBS3SGF_V_H_LASX
endfunc

#define FILTER_OUT_STRIDE (384)

/*
sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
                 const int16_t *dst, const int w1,
                 const int w, const int h);
*/
function sgr_3x3_finish_8bpc_lsx
    vreplgr2vr.w vr3, a3 // w1
    andi t4, a4, 0x7
    sub.w t5, a4, t4

    beq zero, t5, .LSGR3X3_REM

.LSGR3X3_H:
    addi.d t0, a0, 0
    addi.d t1, a2, 0
    addi.w t2, t5, 0
    andi t4, a4, 0x7
.LSGR3X3_W:
    vld vr0, t0, 0
    vld vr1, t1, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    vstelm.d vr7, t0, 0, 0
    addi.d t0, t0, 8
    addi.d t1, t1, 16
    addi.d t2, t2, -8
    bne zero, t2, .LSGR3X3_W

    beq t4, zero, .LSGR3X3_NOREM

    vld vr0, t0, 0
    vld vr1, t1, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGR3X3_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGR3X3_ST

.LSGR3X3_NOREM:
    addi.w a5, a5, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez a5, .LSGR3X3_H
    b .LSGR3X3_END

.LSGR3X3_REM:
    andi t4, a4, 0x7
    addi.d t0, a0, 0
    vld vr0, t0, 0
    vld vr1, a2, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7
    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGR3X3_REM_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGR3X3_REM_ST
    addi.w a5, a5, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez a5, .LSGR3X3_REM

.LSGR3X3_END:
endfunc
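// sgr_3x3_finish above blends the filtered plane back into p; per pixel it
// should be equivalent to (hedged sketch)
//   u    = p[i] << 4;
//   v    = (u << 7) + w1 * (dst[i] - u);
//   p[i] = clip_u8((v + (1 << 10)) >> 11);
// which matches the vslli.w 7 / vmadd.w / vssrarni.hu.w 11 sequence used in
// all three store paths.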

/*
void boxsum5(int32_t *sumsq, coef *sum,
             const pixel *const src,
             const int w, const int h)
*/
function boxsum5_h_8bpc_lsx
    addi.w a4, a4, -4
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    li.w t6, 1
.LBOXSUM5_H_H:
    addi.w t3, a3, 0
    addi.d t2, a2, 0
    addi.d t0, a0, 0
    addi.d t1, a1, 0

.LBOXSUM5_H_W:
    vld vr0, t2, 0 // a
    vld vr1, t2, REST_UNIT_STRIDE // b
    vld vr2, t2, REST_UNIT_STRIDE<<1 // c
    vld vr3, t2, REST_UNIT_STRIDE*3 // d
    vld vr4, t2, REST_UNIT_STRIDE<<2 // e

    vilvl.b vr5, vr1, vr0
    vilvh.b vr6, vr1, vr0
    vilvl.b vr7, vr3, vr2
    vilvh.b vr8, vr3, vr2
    //sum_v
    vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7
    vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b
    vhaddw.hu.bu vr11, vr7, vr7
    vhaddw.hu.bu vr12, vr8, vr8
    vadd.h vr9, vr9, vr11
    vadd.h vr10, vr10, vr12 // a + b + c + d
    vsllwil.hu.bu vr11, vr4, 0
    vexth.hu.bu vr12, vr4
    vadd.h vr9, vr9, vr11
    vadd.h vr10, vr10, vr12
    vst vr9, t1, 0
    vst vr10, t1, 16
    addi.d t1, t1, 32

    // sumsq
    vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7
    vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15
    vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7
    vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15
    vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7
    vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15
    vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7
    vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15
    vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6
    vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7
    vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14
    vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b
    vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6
    vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7
    vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14
    vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d
    vadd.w vr5, vr5, vr19
    vadd.w vr6, vr6, vr20
    vadd.w vr7, vr7, vr21
    vadd.w vr8, vr8, vr22
    vmaddwev.w.hu vr5, vr11, vr11
    vmaddwod.w.hu vr6, vr11, vr11
    vmaddwev.w.hu vr7, vr12, vr12
    vmaddwod.w.hu vr8, vr12, vr12
    vilvl.w vr19, vr6, vr5
    vilvh.w vr20, vr6, vr5
    vilvl.w vr21, vr8, vr7
    vilvh.w vr22, vr8, vr7

    vst vr19, t0, 0
    vst vr20, t0, 16
    vst vr21, t0, 32
    vst vr22, t0, 48
    addi.d t0, t0, 64
    addi.d t2, t2, 16
    addi.w t3, t3, -16
    blt zero, t3, .LBOXSUM5_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE
    addi.d a4, a4, -1
    bnez a4, .LBOXSUM5_H_H
endfunc

/*
void boxsum5_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
function boxsum5_v_8bpc_lsx
    addi.d a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w a3, a3, -4
    addi.w a2, a2, -4

.LBOXSUM5_V_H:
    addi.w t3, a2, 0
    addi.d t0, a0, 0
    addi.d t1, a1, 0
    addi.d t2, a0, 8
    addi.d t3, a1, 4
    addi.d t4, a2, 0

    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2
    vld vr3, t1, 6 // d 3
    vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
    vadd.h vr5, vr0, vr1
    vadd.h vr6, vr2, vr3
    vpickve2gr.w t5, vr4, 2
    vadd.h vr5, vr5, vr6
    vadd.h vr5, vr5, vr4
    vst vr5, t3, 0
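    // Sliding 5-tap sums along x: 16-bit sums above, 32-bit sums of squares
    // below; a hedged sketch of the scalar equivalent is
    //   sum[x]   = s[x - 2] + s[x - 1] + s[x] + s[x + 1] + s[x + 2];
    //   sumsq[x] = the same five-term sum over the 32-bit squares;
    // t5 and vr8 carry trailing elements across .LBOXSUM5_V_W iterations so
    // the leading lanes can be patched without reloading.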

    vld vr0, t0, 0 // 0 1 2 3 a
    vld vr1, t0, 4 // 1 2 3 4 b
    vld vr2, t0, 8 // 2 3 4 5 c
    vld vr3, t0, 12 // 3 4 5 6 d
    vld vr4, t0, 16 // 4 5 6 7 e a
    vld vr5, t0, 20 // 5 6 7 8 b
    vld vr6, t0, 24 // 6 7 8 9 c
    vld vr7, t0, 28 // 7 8 9 10 d
    vld vr8, t0, 32 // 8 9 10 11 e

    vadd.w vr9, vr0, vr1
    vadd.w vr10, vr2, vr3
    vadd.w vr9, vr9, vr10
    vadd.w vr9, vr9, vr4
    vadd.w vr10, vr4, vr5
    vadd.w vr11, vr6, vr7
    vadd.w vr10, vr10, vr8
    vadd.w vr10, vr10, vr11
    vst vr9, t2, 0
    vst vr10, t2, 16

    addi.d t3, t3, 16
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t2, t2, 32
    addi.w t4, t4, -8
    ble t4, zero, .LBOXSUM5_V_H1

.LBOXSUM5_V_W:
    vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
    vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
    vld vr2, t1, 4 // c 2
    vld vr3, t1, 6 // d 3
    vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
    vinsgr2vr.w vr0, t5, 0
    vpickve2gr.w t5, vr4, 2
    vextrins.h vr1, vr0, 0x01
    vadd.h vr5, vr0, vr1
    vadd.h vr6, vr2, vr3
    vadd.h vr5, vr5, vr6
    vadd.h vr5, vr5, vr4
    vst vr5, t3, 0

    vaddi.hu vr0, vr8, 0 // 8 9 10 11 a
    vld vr1, t0, 4 // 9 10 11 12 b
    vld vr2, t0, 8 // 10 11 12 13 c
    vld vr3, t0, 12 // 14 15 16 17 d
    vld vr4, t0, 16 // 15 16 17 18 e a
    vld vr5, t0, 20 // 16 17 18 19 b
    vld vr6, t0, 24 // 17 18 19 20 c
    vld vr7, t0, 28 // 18 19 20 21 d
    vld vr8, t0, 32 // 19 20 21 22 e
    vextrins.w vr1, vr0, 0x01
    vadd.w vr9, vr0, vr1
    vadd.w vr10, vr2, vr3
    vadd.w vr9, vr9, vr10
    vadd.w vr9, vr9, vr4
    vadd.w vr10, vr4, vr5
    vadd.w vr11, vr6, vr7
    vadd.w vr10, vr10, vr8
    vadd.w vr10, vr10, vr11
    vst vr9, t2, 0
    vst vr10, t2, 16

    addi.d t3, t3, 16
    addi.d t1, t1, 16
    addi.d t0, t0, 32
    addi.d t2, t2, 32
    addi.w t4, t4, -8
    blt zero, t4, .LBOXSUM5_V_W

.LBOXSUM5_V_H1:
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -1
    bnez a3, .LBOXSUM5_V_H
endfunc

/*
selfguided_filter(int32_t *sumsq, coef *sum,
                  const int w, const int h,
                  const unsigned s)
*/
function boxsum5_sgf_h_8bpc_lsx
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, 12 // AA
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a1, a1, 6 // BB
    la.local t8, dav1d_sgr_x_by_x
    li.w t6, 164
    vreplgr2vr.w vr20, t6
    li.w t6, 255
    vreplgr2vr.w vr22, t6
    vaddi.wu vr21, vr22, 1 // 256
    vreplgr2vr.w vr6, a4
    vldi vr19, 0x819
    addi.w a2, a2, 2 // w + 2
    addi.w a3, a3, 2 // h + 2

.LBS5SGF_H_H:
    addi.w t2, a2, 0
    addi.d t0, a0, -4
    addi.d t1, a1, -2

.LBS5SGF_H_W:
    vld vr0, t0, 0 // AA[i]
    vld vr1, t0, 16
    vld vr2, t1, 0 // BB[i]

    vmul.w vr4, vr0, vr19 // a * n
    vmul.w vr5, vr1, vr19 // a * n
    vsllwil.w.h vr9, vr2, 0
    vexth.w.h vr10, vr2
    vmsub.w vr4, vr9, vr9 // p
    vmsub.w vr5, vr10, vr10 // p
    vmaxi.w vr4, vr4, 0
    vmaxi.w vr5, vr5, 0 // p
    vmul.w vr4, vr4, vr6 // p * s
    vmul.w vr5, vr5, vr6 // p * s
    vsrlri.w vr4, vr4, 20
    vsrlri.w vr5, vr5, 20 // z
    vmin.w vr4, vr4, vr22
    vmin.w vr5, vr5, vr22
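    // Same a/b flow as the 3x3 pass above, but with n = 25 (vldi 0x819) and
    // sgr_one_by_x = 164; the row loop advances two rows per iteration since
    // the 5x5 pass only needs every other row.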

    // load table data
    vpickve2gr.w t6, vr4, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 0
    vpickve2gr.w t6, vr4, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 1
    vpickve2gr.w t6, vr4, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 2
    vpickve2gr.w t6, vr4, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 3

    vpickve2gr.w t6, vr5, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 0
    vpickve2gr.w t6, vr5, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 1
    vpickve2gr.w t6, vr5, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 2
    vpickve2gr.w t6, vr5, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 3 // x

    vmul.w vr9, vr7, vr9 // x * BB[i]
    vmul.w vr10, vr8, vr10
    vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
    vmul.w vr10, vr10, vr20
    vsrlri.w vr9, vr9, 12
    vsrlri.w vr10, vr10, 12
    vsub.w vr7, vr21, vr7
    vsub.w vr8, vr21, vr8
    vpickev.h vr8, vr8, vr7
    vst vr9, t0, 0
    vst vr10, t0, 16
    vst vr8, t1, 0
    addi.d t0, t0, 32
    addi.d t1, t1, 16
    addi.w t2, t2, -8
    blt zero, t2, .LBS5SGF_H_W

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<2
    addi.w a3, a3, -2
    blt zero, a3, .LBS5SGF_H_H
endfunc

/*
selfguided_filter(coef *dst, pixel *src,
                  int32_t *sumsq, coef *sum,
                  const int w, const int h)
*/
function boxsum5_sgf_v_8bpc_lsx
    addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src
    addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A
    addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1
    addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B
    addi.w a5, a5, -1
    vldi vr10, 0x806
    vldi vr11, 0x805
    vldi vr22, 0x406

.LBS5SGF_V_H:
    addi.d t0, a0, 0
    addi.d t1, a1, 0
    addi.d t2, a2, 0
    addi.d t3, a3, 0
    addi.w t4, a4, 0

    addi.d t5, a0, 384*2
    addi.d t6, a1, REST_UNIT_STRIDE
    addi.d t7, a2, REST_UNIT_STRIDE<<2
    addi.d t8, a3, REST_UNIT_STRIDE<<1 // B
.LBS5SGF_V_W:
    // a
    vld vr0, t3, -REST_UNIT_STRIDE*2
    vld vr1, t3, REST_UNIT_STRIDE*2
    vld vr2, t3, (-REST_UNIT_STRIDE-1)*2
    vld vr3, t3, (REST_UNIT_STRIDE-1)*2
    vld vr4, t3, (1-REST_UNIT_STRIDE)*2
    vld vr5, t3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h vr6, vr0, vr1
    vaddwod.w.h vr7, vr0, vr1
    vmul.w vr6, vr6, vr10
    vmul.w vr7, vr7, vr10
    vaddwev.w.h vr8, vr2, vr3
    vaddwod.w.h vr9, vr2, vr3
    vaddwev.w.h vr12, vr4, vr5
    vaddwod.w.h vr13, vr4, vr5
    vadd.w vr8, vr8, vr12
    vadd.w vr9, vr9, vr13
    vmadd.w vr6, vr8, vr11
    vmadd.w vr7, vr9, vr11
    vilvl.w vr18, vr7, vr6
    vilvh.w vr19, vr7, vr6
    // b
    vld vr0, t2, -REST_UNIT_STRIDE*4
    vld vr1, t2, -REST_UNIT_STRIDE*4+16
    vld vr2, t2, REST_UNIT_STRIDE*4
    vld vr3, t2, REST_UNIT_STRIDE*4+16
    vld vr4, t2, (-REST_UNIT_STRIDE-1)*4
    vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
    vld vr8, t2, (REST_UNIT_STRIDE-1)*4
    vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16
    vld vr12, t2, (1-REST_UNIT_STRIDE)*4
    vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16
    vld vr14, t2, (1+REST_UNIT_STRIDE)*4
    vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w vr0, vr0, vr2 // 0 1 2 3
    vadd.w vr1, vr1, vr3 // 4 5 6 7
    vmul.w vr20, vr0, vr10
    vmul.w vr21, vr1, vr10
    vadd.w vr4, vr4, vr8 // 0 1 2 3
    vadd.w vr5, vr5, vr9 // 4 5 6 7
    vadd.w vr12, vr12, vr14
    vadd.w vr13, vr13, vr15
    vadd.w vr12, vr12, vr4
    vadd.w vr13, vr13, vr5
    vmadd.w vr20, vr12, vr11
    vmadd.w vr21, vr13, vr11
    vld vr2, t1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr20, vr18, vr3
    vmadd.w vr21, vr19, vr4
    vssrlrni.h.w vr21, vr20, 9
    vst vr21, t0, 0
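    // Rows are processed in pairs: the block above builds the even-row
    // output from the rows above/below (weights 6 and 5, rounded >> 9), and
    // the block below builds the odd-row output from a single A/B row
    // (weights 6 and 5 horizontally, rounded >> 8), which is the usual
    // layout for the 5x5 self-guided filter.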

    addi.d t1, t1, 8
    addi.d t2, t2, 32
    addi.d t3, t3, 16

    // a
    vld vr0, t8, 0
    vld vr1, t8, -2
    vld vr2, t8, 2
    vmulwev.w.h vr3, vr0, vr22
    vmulwod.w.h vr4, vr0, vr22
    vaddwev.w.h vr5, vr1, vr2
    vaddwod.w.h vr6, vr1, vr2
    vmadd.w vr3, vr5, vr11
    vmadd.w vr4, vr6, vr11
    vilvl.w vr19, vr4, vr3
    vilvh.w vr20, vr4, vr3
    // b
    vld vr0, t7, 0
    vld vr1, t7, -4
    vld vr2, t7, 4
    vld vr5, t7, 16
    vld vr6, t7, 12
    vld vr7, t7, 20
    vmul.w vr8, vr0, vr10
    vmul.w vr9, vr5, vr10
    vadd.w vr12, vr1, vr2
    vadd.w vr13, vr6, vr7
    vmadd.w vr8, vr12, vr11
    vmadd.w vr9, vr13, vr11
    vld vr2, t6, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr8, vr19, vr3
    vmadd.w vr9, vr20, vr4
    vssrlrni.h.w vr9, vr8, 8
    vst vr9, t0, 384*2

    addi.d t0, t0, 16
    addi.d t8, t8, 16
    addi.d t7, t7, 32
    addi.d t6, t6, 8
    addi.w t4, t4, -8
    blt zero, t4, .LBS5SGF_V_W

    addi.w a5, a5, -2
    addi.d a0, a0, 384*4 // dst
    addi.d a1, a1, REST_UNIT_STRIDE<<1 // src
    addi.d a2, a2, REST_UNIT_STRIDE<<2 //
    addi.d a2, a2, REST_UNIT_STRIDE<<2
    addi.d a3, a3, REST_UNIT_STRIDE<<2 //
    blt zero, a5, .LBS5SGF_V_H
    bnez a5, .LBS5SGF_END
.LBS5SGF_V_W1:
    // a
    vld vr0, a3, -REST_UNIT_STRIDE*2
    vld vr1, a3, REST_UNIT_STRIDE*2
    vld vr2, a3, (-REST_UNIT_STRIDE-1)*2
    vld vr3, a3, (REST_UNIT_STRIDE-1)*2
    vld vr4, a3, (1-REST_UNIT_STRIDE)*2
    vld vr5, a3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h vr6, vr0, vr1
    vaddwod.w.h vr7, vr0, vr1
    vmul.w vr6, vr6, vr10
    vmul.w vr7, vr7, vr10
    vaddwev.w.h vr8, vr2, vr3
    vaddwod.w.h vr9, vr2, vr3
    vaddwev.w.h vr12, vr4, vr5
    vaddwod.w.h vr13, vr4, vr5
    vadd.w vr8, vr8, vr12
    vadd.w vr9, vr9, vr13
    vmadd.w vr6, vr8, vr11
    vmadd.w vr7, vr9, vr11
    vilvl.w vr18, vr7, vr6
    vilvh.w vr19, vr7, vr6
    // b
    vld vr0, a2, -REST_UNIT_STRIDE*4
    vld vr1, a2, -REST_UNIT_STRIDE*4+16
    vld vr2, a2, REST_UNIT_STRIDE*4
    vld vr3, a2, REST_UNIT_STRIDE*4+16
    vld vr4, a2, (-REST_UNIT_STRIDE-1)*4
    vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
    vld vr8, a2, (REST_UNIT_STRIDE-1)*4
    vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16
    vld vr12, a2, (1-REST_UNIT_STRIDE)*4
    vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16
    vld vr14, a2, (1+REST_UNIT_STRIDE)*4
    vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w vr0, vr0, vr2 // 0 1 2 3
    vadd.w vr1, vr1, vr3 // 4 5 6 7
    vmul.w vr20, vr0, vr10
    vmul.w vr21, vr1, vr10
    vadd.w vr4, vr4, vr8 // 0 1 2 3
    vadd.w vr5, vr5, vr9 // 4 5 6 7
    vadd.w vr12, vr12, vr14
    vadd.w vr13, vr13, vr15
    vadd.w vr12, vr12, vr4
    vadd.w vr13, vr13, vr5
    vmadd.w vr20, vr12, vr11
    vmadd.w vr21, vr13, vr11
    vld vr2, a1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr3, vr2, 0
    vexth.wu.hu vr4, vr2
    vmadd.w vr20, vr18, vr3
    vmadd.w vr21, vr19, vr4
    vssrlrni.h.w vr21, vr20, 9
    vst vr21, a0, 0
    addi.d a3, a3, 16
    addi.d a2, a2, 32
    addi.d a1, a1, 8
    addi.d a0, a0, 16
    addi.w a4, a4, -8
    blt zero, a4, .LBS5SGF_V_W1
.LBS5SGF_END:
endfunc

/*
void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
                              const int16_t *dst0, const int16_t *dst1,
                              const int w0, const int w1,
                              const int w, const int h);
*/
function sgr_mix_finish_8bpc_lsx
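    // Weighted blend of both self-guided outputs; per pixel this should be
    // equivalent to (hedged sketch)
    //   u    = p[i] << 4;
    //   v    = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
    //   p[i] = clip_u8((v + (1 << 10)) >> 11);
    // with w0/w1 broadcast into vr3/vr13 below.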
    vreplgr2vr.w vr3, a4 // w0
    vreplgr2vr.w vr13, a5 // w1
    andi t4, a6, 0x7
    sub.w t5, a6, t4

    beq zero, t5, .LSGRMIX_REM

.LSGRMIX_H:
    addi.d t0, a0, 0
    addi.d t1, a2, 0 // dst0
    addi.d t3, a3, 0 // dst1
    addi.w t2, t5, 0
    andi t4, a6, 0x7
.LSGRMIX_W:
    vld vr0, t0, 0
    vld vr1, t1, 0
    vld vr10, t3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3
    vexth.wu.hu vr5, vr2 // u 4 5 6 7
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst0
    vexth.w.h vr9, vr1 // dst0
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    vstelm.d vr7, t0, 0, 0
    addi.d t0, t0, 8
    addi.d t1, t1, 16
    addi.d t3, t3, 16
    addi.d t2, t2, -8
    bne zero, t2, .LSGRMIX_W

    beq t4, zero, .LSGRMIX_W8

    vld vr0, t0, 0
    vld vr1, t1, 0
    vld vr10, t3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0

.LSGRMIX_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGRMIX_ST

.LSGRMIX_W8:
    addi.w a7, a7, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez a7, .LSGRMIX_H
    b .LSGR_MIX_END

.LSGRMIX_REM:
    andi t4, a6, 0x7
    vld vr0, a0, 0
    vld vr1, a2, 0
    vld vr10, a3, 0
    vsllwil.hu.bu vr2, vr0, 4 // u 8 h
    vsllwil.wu.hu vr4, vr2, 0 // p
    vexth.wu.hu vr5, vr2 // p
    vslli.w vr6, vr4, 7
    vslli.w vr7, vr5, 7
    vsllwil.w.h vr8, vr1, 0 // dst
    vexth.w.h vr9, vr1 // dst
    vsub.w vr8, vr8, vr4
    vsub.w vr9, vr9, vr5
    vmadd.w vr6, vr8, vr3 // v 0 - 3
    vmadd.w vr7, vr9, vr3 // v 4 - 7

    vsllwil.w.h vr11, vr10, 0 // dst1
    vexth.w.h vr12, vr10 // dst1
    vsub.w vr11, vr11, vr4
    vsub.w vr12, vr12, vr5
    vmadd.w vr6, vr11, vr13
    vmadd.w vr7, vr12, vr13

    vssrarni.hu.w vr7, vr6, 11
    vssrlni.bu.h vr7, vr7, 0
    addi.d t0, a0, 0
.LSGRMIX_REM_ST:
    vstelm.b vr7, t0, 0, 0
    addi.d t0, t0, 1
    vbsrl.v vr7, vr7, 1
    addi.w t4, t4, -1
    bnez t4, .LSGRMIX_REM_ST

    addi.w a7, a7, -1
    add.d a0, a0, a1
    addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez a7, .LSGRMIX_REM

.LSGR_MIX_END:
endfunc

.macro MADD_HU_BU_LASX in0, in1, out0, out1
    xvsllwil.hu.bu xr12, \in0, 0
    xvexth.hu.bu xr13, \in0
    xvmadd.h \out0, xr12, \in1
    xvmadd.h \out1, xr13, \in1
.endm
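// The LASX shuffle constant below repeats the 16-byte pattern in both
// 128-bit lanes because xvshuf.b, like most LASX byte shuffles, operates
// independently per 128-bit lane.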

const wiener_shuf_lasx
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

function wiener_filter_h_8bpc_lasx
    addi.d sp, sp, -40
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    li.w t7, 1<<14 // clip_limit

    la.local t1, wiener_shuf_lasx
    xvld xr4, t1, 0
    vld vr27, a2, 0 // filter[0][k]
    xvpermi.q xr14, xr27, 0b00000000
    xvrepl128vei.h xr21, xr14, 0
    xvrepl128vei.h xr22, xr14, 1
    xvrepl128vei.h xr23, xr14, 2
    xvrepl128vei.h xr24, xr14, 3
    xvrepl128vei.h xr25, xr14, 4
    xvrepl128vei.h xr26, xr14, 5
    xvrepl128vei.h xr27, xr14, 6
    xvreplgr2vr.w xr0, t7

.WIENER_FILTER_H_H_LASX:
    addi.w a4, a4, -1 // h
    addi.w t0, a3, 0 // w
    addi.d t1, a1, 0 // tmp_ptr
    addi.d t2, a0, 0 // hor_ptr

.WIENER_FILTER_H_W_LASX:
    addi.w t0, t0, -32
    xvld xr5, t1, 0
    xvld xr13, t1, 16

    xvsubi.bu xr14, xr4, 2
    xvsubi.bu xr15, xr4, 1
    xvshuf.b xr6, xr13, xr5, xr14 // 1 ... 8, 9 ... 16
    xvshuf.b xr7, xr13, xr5, xr15 // 2 ... 9, 10 ... 17
    xvshuf.b xr8, xr13, xr5, xr4 // 3 ... 10, 11 ... 18
    xvaddi.bu xr14, xr4, 1
    xvaddi.bu xr15, xr4, 2
    xvshuf.b xr9, xr13, xr5, xr14 // 4 ... 11, 12 ... 19
    xvshuf.b xr10, xr13, xr5, xr15 // 5 ... 12, 13 ... 20
    xvaddi.bu xr14, xr4, 3
    xvshuf.b xr11, xr13, xr5, xr14 // 6 ... 13, 14 ... 21

    xvsllwil.hu.bu xr15, xr8, 0 // 3 4 5 6 7 8 9 10
    xvexth.hu.bu xr16, xr8 // 11 12 13 14 15 16 17 18
    xvsllwil.wu.hu xr17, xr15, 7 // 3 4 5 6
    xvexth.wu.hu xr18, xr15 // 7 8 9 10
    xvsllwil.wu.hu xr19, xr16, 7 // 11 12 13 14
    xvexth.wu.hu xr20, xr16 // 15 16 17 18
    xvslli.w xr18, xr18, 7
    xvslli.w xr20, xr20, 7
    xvxor.v xr15, xr15, xr15
    xvxor.v xr14, xr14, xr14

    MADD_HU_BU_LASX xr5, xr21, xr14, xr15
    MADD_HU_BU_LASX xr6, xr22, xr14, xr15
    MADD_HU_BU_LASX xr7, xr23, xr14, xr15
    MADD_HU_BU_LASX xr8, xr24, xr14, xr15
    MADD_HU_BU_LASX xr9, xr25, xr14, xr15
    MADD_HU_BU_LASX xr10, xr26, xr14, xr15
    MADD_HU_BU_LASX xr11, xr27, xr14, xr15

    xvsllwil.w.h xr5, xr14, 0 // 0 1 2 3
    xvexth.w.h xr6, xr14 // 4 5 6 7
    xvsllwil.w.h xr7, xr15, 0 // 8 9 10 11
    xvexth.w.h xr8, xr15 // 12 13 14 15
    xvadd.w xr17, xr17, xr5
    xvadd.w xr18, xr18, xr6
    xvadd.w xr19, xr19, xr7
    xvadd.w xr20, xr20, xr8
    xvadd.w xr17, xr17, xr0
    xvadd.w xr18, xr18, xr0
    xvadd.w xr19, xr19, xr0
    xvadd.w xr20, xr20, xr0

    xvsrli.w xr1, xr0, 1
    xvsubi.wu xr1, xr1, 1
    xvxor.v xr3, xr3, xr3
    xvsrari.w xr17, xr17, 3
    xvsrari.w xr18, xr18, 3
    xvsrari.w xr19, xr19, 3
    xvsrari.w xr20, xr20, 3
    xvclip.w xr17, xr17, xr3, xr1
    xvclip.w xr18, xr18, xr3, xr1
    xvclip.w xr19, xr19, xr3, xr1
    xvclip.w xr20, xr20, xr3, xr1

    xvor.v xr5, xr17, xr17
    xvor.v xr6, xr19, xr19
    xvpermi.q xr17, xr18, 0b00000010
    xvpermi.q xr19, xr20, 0b00000010

    xvst xr17, t2, 0
    xvst xr19, t2, 32
    xvpermi.q xr18, xr5, 0b00110001
    xvpermi.q xr20, xr6, 0b00110001
    xvst xr18, t2, 64
    xvst xr20, t2, 96
    addi.d t1, t1, 32
    addi.d t2, t2, 128
    blt zero, t0, .WIENER_FILTER_H_W_LASX

    addi.d a1, a1, REST_UNIT_STRIDE
    addi.d a0, a0, (REST_UNIT_STRIDE << 2)
    bnez a4, .WIENER_FILTER_H_H_LASX

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    addi.d sp, sp, 40
endfunc
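// APPLY_FILTER_LASX below accumulates one 7-tap row per invocation, sixteen
// 32-bit values at a time; the LASX vertical filter mirrors the LSX version
// above, and its remainder paths reuse the wiener_filter_v_8bpc_core_lsx
// macro for the trailing w % 16 columns.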

.macro APPLY_FILTER_LASX in0, in1, in2
    alsl.d t7, \in0, \in1, 2
    xvld xr10, t7, 0
    xvld xr12, t7, 32
    xvmadd.w xr14, xr10, \in2
    xvmadd.w xr16, xr12, \in2
.endm

.macro wiener_filter_v_8bpc_core_lasx
    xvreplgr2vr.w xr14, t6
    xvreplgr2vr.w xr16, t6

    addi.w t7, t2, 0 // j + index k
    mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
    add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER_LASX t7, a2, xr2
    APPLY_FILTER_LASX t8, t7, xr3
    APPLY_FILTER_LASX t8, t7, xr4
    APPLY_FILTER_LASX t8, t7, xr5
    APPLY_FILTER_LASX t8, t7, xr6
    APPLY_FILTER_LASX t8, t7, xr7
    APPLY_FILTER_LASX t8, t7, xr8
    xvssrarni.hu.w xr16, xr14, 11
    xvpermi.d xr17, xr16, 0b11011000
    xvssrlni.bu.h xr17, xr17, 0
    xvpermi.d xr17, xr17, 0b00001000
.endm

function wiener_filter_v_8bpc_lasx
    li.w t6, -(1 << 18)

    li.w t8, REST_UNIT_STRIDE
    ld.h t0, a3, 0
    ld.h t1, a3, 2
    xvreplgr2vr.w xr2, t0
    xvreplgr2vr.w xr3, t1
    ld.h t0, a3, 4
    ld.h t1, a3, 6
    xvreplgr2vr.w xr4, t0
    xvreplgr2vr.w xr5, t1
    ld.h t0, a3, 8
    ld.h t1, a3, 10
    xvreplgr2vr.w xr6, t0
    xvreplgr2vr.w xr7, t1
    ld.h t0, a3, 12
    xvreplgr2vr.w xr8, t0

    andi t1, a4, 0xf
    sub.w t0, a4, t1 // w-w%16
    or t2, zero, zero // j
    or t4, zero, zero
    beqz t0, .WIENER_FILTER_V_W_LT16_LASX

.WIENER_FILTER_V_H_LASX:
    andi t1, a4, 0xf
    add.d t3, zero, a0 // p
    or t4, zero, zero // i

.WIENER_FILTER_V_W_LASX:

    wiener_filter_v_8bpc_core_lasx

    mul.w t5, t2, a1 // j * stride
    add.w t5, t5, t4 // j * stride + i
    add.d t3, a0, t5
    addi.w t4, t4, 16
    vst vr17, t3, 0
    bne t0, t4, .WIENER_FILTER_V_W_LASX

    beqz t1, .WIENER_FILTER_V_W_EQ16_LASX

    wiener_filter_v_8bpc_core_lsx

    addi.d t3, t3, 16
    andi t1, a4, 0xf

.WIENER_FILTER_V_ST_REM_LASX:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM_LASX
.WIENER_FILTER_V_W_EQ16_LASX:
    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_H_LASX
    b .WIENER_FILTER_V_LASX_END

.WIENER_FILTER_V_W_LT16_LASX:
    andi t1, a4, 0xf
    add.d t3, zero, a0

    wiener_filter_v_8bpc_core_lsx

    mul.w t5, t2, a1 // j * stride
    add.d t3, a0, t5

.WIENER_FILTER_V_ST_REM_1_LASX:
    vstelm.b vr17, t3, 0, 0
    vbsrl.v vr17, vr17, 1
    addi.d t3, t3, 1
    addi.w t1, t1, -1
    bnez t1, .WIENER_FILTER_V_ST_REM_1_LASX

    addi.w t2, t2, 1
    blt t2, a5, .WIENER_FILTER_V_W_LT16_LASX

.WIENER_FILTER_V_LASX_END:
endfunc

function boxsum3_sgf_h_8bpc_lasx
    addi.d a0, a0, (REST_UNIT_STRIDE<<2)+12 // AA
    //addi.d a0, a0, 12 // AA
    addi.d a1, a1, (REST_UNIT_STRIDE<<1)+6 // BB
    //addi.d a1, a1, 6 // BB
    la.local t8, dav1d_sgr_x_by_x
    li.w t6, 455
    xvreplgr2vr.w xr20, t6
    li.w t6, 255
    xvreplgr2vr.w xr22, t6
    xvaddi.wu xr21, xr22, 1 // 256
    xvreplgr2vr.w xr6, a4
    xvldi xr19, 0x809
    addi.w a2, a2, 2 // w + 2
    addi.w a3, a3, 2 // h + 2

.LBS3SGF_H_H_LASX:
    addi.w t2, a2, 0
    addi.d t0, a0, -4
    addi.d t1, a1, -2

.LBS3SGF_H_W_LASX:
    addi.w t2, t2, -16
    xvld xr0, t0, 0 // AA[i]
    xvld xr1, t0, 32
    xvld xr2, t1, 0 // BB[i]

    xvmul.w xr4, xr0, xr19 // a * n
    xvmul.w xr5, xr1, xr19
    vext2xv.w.h xr9, xr2
    xvpermi.q xr10, xr2, 0b00000001
    vext2xv.w.h xr10, xr10
    xvmsub.w xr4, xr9, xr9 // p
    xvmsub.w xr5, xr10, xr10
    xvmaxi.w xr4, xr4, 0
    xvmaxi.w xr5, xr5, 0
    xvmul.w xr4, xr4, xr6 // p * s
    xvmul.w xr5, xr5, xr6
    xvsrlri.w xr4, xr4, 20
    xvsrlri.w xr5, xr5, 20
    xvmin.w xr4, xr4, xr22
    xvmin.w xr5, xr5, xr22

    vpickve2gr.w t6, vr4, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 0
    vpickve2gr.w t6, vr4, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 1
    vpickve2gr.w t6, vr4, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 2
    vpickve2gr.w t6, vr4, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr7, t7, 3

    xvpickve2gr.w t6, xr4, 4
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr7, t7, 4
    xvpickve2gr.w t6, xr4, 5
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr7, t7, 5
    xvpickve2gr.w t6, xr4, 6
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr7, t7, 6
    xvpickve2gr.w t6, xr4, 7
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr7, t7, 7 // x

    vpickve2gr.w t6, vr5, 0
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 0
    vpickve2gr.w t6, vr5, 1
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 1
    vpickve2gr.w t6, vr5, 2
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 2
    vpickve2gr.w t6, vr5, 3
    ldx.bu t7, t8, t6
    vinsgr2vr.w vr8, t7, 3

    xvpickve2gr.w t6, xr5, 4
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr8, t7, 4
    xvpickve2gr.w t6, xr5, 5
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr8, t7, 5
    xvpickve2gr.w t6, xr5, 6
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr8, t7, 6
    xvpickve2gr.w t6, xr5, 7
    ldx.bu t7, t8, t6
    xvinsgr2vr.w xr8, t7, 7 // x

    xvmul.w xr9, xr7, xr9 // x * BB[i]
    xvmul.w xr10, xr8, xr10
    xvmul.w xr9, xr9, xr20 // x * BB[i] * sgr_one_by_x
    xvmul.w xr10, xr10, xr20
    xvsrlri.w xr9, xr9, 12
    xvsrlri.w xr10, xr10, 12
    xvsub.w xr7, xr21, xr7
    xvsub.w xr8, xr21, xr8
    xvpickev.h xr12, xr8, xr7
    xvpermi.d xr11, xr12, 0b11011000

    xvst xr9, t0, 0
    xvst xr10, t0, 32
    xvst xr11, t1, 0
    addi.d t0, t0, 64
    addi.d t1, t1, 32
    blt zero, t2, .LBS3SGF_H_W_LASX

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.w a3, a3, -1
    bnez a3, .LBS3SGF_H_H_LASX
endfunc

function boxsum3_h_8bpc_lasx
    addi.d a2, a2, REST_UNIT_STRIDE
    li.w t0, 1
    addi.w a3, a3, -2
    addi.w a4, a4, -4
.LBS3_H_H_LASX:
    alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x
    alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x
    add.d t3, t0, a2 // s
    addi.w t5, a3, 0

.LBS3_H_W_LASX:
    xvld xr0, t3, 0
    xvld xr1, t3, REST_UNIT_STRIDE
    xvld xr2, t3, (REST_UNIT_STRIDE<<1)

    xvilvl.b xr3, xr1, xr0
    xvhaddw.hu.bu xr4, xr3, xr3
    xvilvh.b xr5, xr1, xr0
    xvhaddw.hu.bu xr6, xr5, xr5
    xvsllwil.hu.bu xr7, xr2, 0
    xvexth.hu.bu xr8, xr2
    // sum_v
    xvadd.h xr4, xr4, xr7 // 0 2
    xvadd.h xr6, xr6, xr8 // 1 3
    xvor.v xr9, xr4, xr4
    xvpermi.q xr4, xr6, 0b00000010
    xvpermi.q xr6, xr9, 0b00110001
    xvst xr4, t1, REST_UNIT_STRIDE<<1
    xvst xr6, t1, (REST_UNIT_STRIDE<<1)+32
    addi.d t1, t1, 64
    // sumsq
    xvmulwev.h.bu xr9, xr3, xr3
    xvmulwod.h.bu xr10, xr3, xr3
    xvmulwev.h.bu xr11, xr5, xr5
    xvmulwod.h.bu xr12, xr5, xr5
    xvaddwev.w.hu xr13, xr10, xr9
    xvaddwod.w.hu xr14, xr10, xr9
    xvaddwev.w.hu xr15, xr12, xr11
    xvaddwod.w.hu xr16, xr12, xr11
    xvmaddwev.w.hu xr13, xr7, xr7
    xvmaddwod.w.hu xr14, xr7, xr7
    xvmaddwev.w.hu xr15, xr8, xr8
    xvmaddwod.w.hu xr16, xr8, xr8
    xvilvl.w xr9, xr14, xr13
    xvilvh.w xr10, xr14, xr13
    xvilvl.w xr11, xr16, xr15
    xvilvh.w xr12, xr16, xr15
    xvor.v xr7, xr9, xr9
    xvor.v xr8, xr11, xr11
    xvpermi.q xr9, xr10, 0b00000010
    xvpermi.q xr10, xr7, 0b00110001
    xvpermi.q xr11, xr12, 0b00000010
    xvpermi.q xr12, xr8, 0b00110001
    xvst xr9, t2, REST_UNIT_STRIDE<<2
    xvst xr11, t2, (REST_UNIT_STRIDE<<2)+32
    xvst xr10, t2, (REST_UNIT_STRIDE<<2)+64
    xvst xr12, t2, (REST_UNIT_STRIDE<<2)+96

    addi.d t2, t2, 128
    addi.w t5, t5, -32
    addi.d t3, t3, 32
    blt zero, t5, .LBS3_H_W_LASX

    addi.d a0, a0, REST_UNIT_STRIDE<<2
    addi.d a1, a1, REST_UNIT_STRIDE<<1
    addi.d a2, a2, REST_UNIT_STRIDE
    addi.d a4, a4, -1
    blt zero, a4, .LBS3_H_H_LASX
endfunc