1/* 2 * Copyright © 2024, VideoLAN and dav1d authors 3 * Copyright © 2024, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28#include "src/loongarch/loongson_asm.S" 29 30// static int cdef_find_dir_lsx(const pixel *img, const ptrdiff_t stride, 31// unsigned *const var HIGHBD_DECL_SUFFIX) 32// param: img: a0, stride: a1, var: a2 33function cdef_find_dir_8bpc_lsx 34 addi.d sp, sp, -64 35 fst.d f24, sp, 0 36 fst.d f25, sp, 8 37 fst.d f26, sp, 16 38 fst.d f27, sp, 24 39 fst.d f28, sp, 32 40 fst.d f29, sp, 40 41 fst.d f30, sp, 48 42 fst.d f31, sp, 56 43 44 li.d a3, 128 45 vreplgr2vr.w vr31, a3 46 47 // hv: vr0-vr3 diag: vr4-vr11 alt: vr12-vr23 48.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, \ 49 vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ 50 vr20, vr21, vr22, vr23 51 vxor.v \i, \i, \i 52.endr 53 54.CFDL01: // 8 55 // 0 56 fld.d f24, a0, 0 //img 57 vpermi.w vr25, vr24, 0x01 58 59 vsllwil.hu.bu vr24, vr24, 0 60 vsllwil.hu.bu vr24, vr24, 0 61 vsllwil.hu.bu vr25, vr25, 0 62 vsllwil.hu.bu vr25, vr25, 0 63 64 vsub.w vr24, vr24, vr31 //px 65 vsub.w vr25, vr25, vr31 66 67 vadd.w vr4, vr4, vr24 //diag[0][y+x] 68 vadd.w vr5, vr5, vr25 69 70 vpackev.w vr26, vr25, vr24 71 vpackod.w vr27, vr25, vr24 72 vpermi.w vr26, vr26, 0xd8 //px0246 73 vpermi.w vr27, vr27, 0xd8 //px1357 74 vadd.w vr12, vr12, vr26 75 vadd.w vr12, vr12, vr27 //alt[0][y+(x>>1)] 76 77 vhaddw.d.w vr28, vr24, vr24 78 vhaddw.q.d vr28, vr28, vr28 79 vpickve2gr.d a3, vr28, 0 80 vhaddw.d.w vr28, vr25, vr25 81 vhaddw.q.d vr28, vr28, vr28 82 vpickve2gr.d a4, vr28, 0 83 add.d a3, a3, a4 84 vinsgr2vr.w vr0, a3, 0 //hv[0][y] 85 86 vadd.w vr15, vr15, vr26 87 vadd.w vr15, vr15, vr27 //alt[1][3+y-(x>>1)] 88 vpermi.w vr15, vr15, 0x1b 89 90 vadd.w vr9, vr9, vr24 91 vadd.w vr8, vr8, vr25 92 vpermi.w vr8, vr8, 0x1b 93 vpermi.w vr9, vr9, 0x1b //diag[1][7+y-x] 94 95 vxor.v vr28, vr28, vr28 96 vxor.v vr29, vr29, vr29 97 vadd.w vr28, vr28, vr24 98 vadd.w vr29, vr29, vr25 99 vextrins.w vr18, vr28, 0x30 100 vshuf4i.w vr19, vr28, 0x39 101 vextrins.w vr19, vr29, 0x30 102 vshuf4i.w vr20, vr29, 0x39 //alt[2][3-(y>>1)+7] 103 
vinsgr2vr.w vr20, zero, 3 104 105 vadd.w vr2, vr2, vr24 106 vadd.w vr3, vr3, vr25 //hv[1][x] 107 108 vadd.w vr21, vr21, vr24 109 vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x] 110 111 add.d a0, a0, a1 112 113 // 1 114 fld.d f24, a0, 0 //img 115 vpermi.w vr25, vr24, 0x01 116 117 vsllwil.hu.bu vr24, vr24, 0 118 vsllwil.hu.bu vr24, vr24, 0 119 vsllwil.hu.bu vr25, vr25, 0 120 vsllwil.hu.bu vr25, vr25, 0 121 122 vsub.w vr24, vr24, vr31 //px 123 vsub.w vr25, vr25, vr31 124 125 vbsrl.v vr28, vr4, 4 //1-4 126 vbsrl.v vr29, vr5, 4 //5-8 127 vextrins.w vr28, vr5, 0x30 128 vadd.w vr28, vr28, vr24 //diag[0][y+x] 129 vadd.w vr29, vr29, vr25 130 vbsll.v vr5, vr29, 4 131 vextrins.w vr5, vr28, 0x03 132 vextrins.w vr6, vr29, 0x03 133 vextrins.w vr28, vr4, 0x30 134 vshuf4i.w vr4, vr28, 0x93 135 136 vbsrl.v vr28, vr12, 4 137 vextrins.w vr28, vr13, 0x30 138 vpackev.w vr26, vr25, vr24 139 vpackod.w vr27, vr25, vr24 140 vpermi.w vr26, vr26, 0xd8 //px0246 141 vpermi.w vr27, vr27, 0xd8 //px1357 142 vadd.w vr28, vr28, vr26 143 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 144 vextrins.w vr13, vr28, 0x03 145 vextrins.w vr28, vr12, 0x30 146 vshuf4i.w vr12, vr28, 0x93 147 148 vhaddw.d.w vr28, vr24, vr24 149 vhaddw.q.d vr28, vr28, vr28 150 vpickve2gr.d a3, vr28, 0 151 vhaddw.d.w vr28, vr25, vr25 152 vhaddw.q.d vr28, vr28, vr28 153 vpickve2gr.d a4, vr28, 0 154 add.d a3, a3, a4 155 vinsgr2vr.w vr0, a3, 1 //hv[0][y] 156 157 vbsrl.v vr28, vr15, 4 158 vextrins.w vr28, vr16, 0x30 159 vpermi.w vr28, vr28, 0x1b 160 vadd.w vr28, vr28, vr26 161 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 162 vextrins.w vr16, vr28, 0x00 163 vextrins.w vr28, vr15, 0x00 164 vshuf4i.w vr15, vr28, 0x6c 165 166 vbsrl.v vr28, vr8, 4 //4321 167 vbsrl.v vr29, vr9, 4 //8765 168 vextrins.w vr28, vr9, 0x30 169 vpermi.w vr28, vr28, 0x1b 170 vpermi.w vr29, vr29, 0x1b 171 vadd.w vr29, vr29, vr24 172 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] 173 vextrins.w vr10, vr29, 0x00 174 vextrins.w vr29, vr28, 0x00 175 vshuf4i.w vr9, vr29, 0x6c 176 
vextrins.w vr28, vr8, 0x00 177 vshuf4i.w vr8, vr28, 0x6c 178 179 vbsll.v vr28, vr19, 4 180 vextrins.w vr28, vr18, 0x03 181 vbsll.v vr29, vr20, 4 182 vextrins.w vr29, vr19, 0x03 183 vadd.w vr28, vr28, vr24 184 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] 185 vextrins.w vr18, vr28, 0x30 186 vextrins.w vr28, vr29, 0x00 187 vshuf4i.w vr19, vr28, 0x39 188 vbsrl.v vr20, vr29, 4 189 190 vadd.w vr2, vr2, vr24 191 vadd.w vr3, vr3, vr25 //hv[1][x] 192 193 vadd.w vr21, vr21, vr24 194 vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x] 195 196 add.d a0, a0, a1 197 198 // 2 199 fld.d f24, a0, 0 //img 200 vpermi.w vr25, vr24, 0x01 201 202 vsllwil.hu.bu vr24, vr24, 0 203 vsllwil.hu.bu vr24, vr24, 0 204 vsllwil.hu.bu vr25, vr25, 0 205 vsllwil.hu.bu vr25, vr25, 0 206 207 vsub.w vr24, vr24, vr31 //px 208 vsub.w vr25, vr25, vr31 209 210 vbsrl.v vr28, vr4, 8 211 vbsrl.v vr29, vr5, 8 212 vextrins.d vr28, vr5, 0x10 //2-5 213 vextrins.d vr29, vr6, 0x10 //6-9 214 vadd.w vr28, vr28, vr24 //diag[0][y+x] 215 vadd.w vr29, vr29, vr25 216 vextrins.d vr4, vr28, 0x10 217 vextrins.d vr5, vr28, 0x01 218 vextrins.d vr5, vr29, 0x10 219 vextrins.d vr6, vr29, 0x01 220 221 vbsrl.v vr28, vr12, 8 222 vextrins.d vr28, vr13, 0x10 223 vpackev.w vr26, vr25, vr24 224 vpackod.w vr27, vr25, vr24 225 vpermi.w vr26, vr26, 0xd8 //px0246 226 vpermi.w vr27, vr27, 0xd8 //px1357 227 vadd.w vr28, vr28, vr26 228 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 229 vextrins.d vr12, vr28, 0x10 230 vextrins.d vr13, vr28, 0x01 231 232 vhaddw.d.w vr28, vr24, vr24 233 vhaddw.q.d vr28, vr28, vr28 234 vpickve2gr.d a3, vr28, 0 235 vhaddw.d.w vr28, vr25, vr25 236 vhaddw.q.d vr28, vr28, vr28 237 vpickve2gr.d a4, vr28, 0 238 add.d a3, a3, a4 239 vinsgr2vr.w vr0, a3, 2 //hv[0][y] 240 241 vbsrl.v vr28, vr15, 8 242 vextrins.d vr28, vr16, 0x10 243 vpermi.w vr28, vr28, 0x1b 244 vadd.w vr28, vr28, vr26 245 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 246 vpermi.w vr28, vr28, 0x1b 247 vextrins.d vr15, vr28, 0x10 248 vextrins.d vr16, vr28, 0x01 249 250 
vbsrl.v vr28, vr8, 8 251 vextrins.d vr28, vr9, 0x10 252 vbsrl.v vr29, vr9, 8 253 vextrins.d vr29, vr10, 0x10 254 vpermi.w vr28, vr28, 0x1b //5432 255 vpermi.w vr29, vr29, 0x1b //9876 256 vadd.w vr29, vr29, vr24 257 vadd.w vr28, vr28, vr25 258 vpermi.w vr28, vr28, 0x1b 259 vpermi.w vr29, vr29, 0x1b 260 vextrins.d vr8, vr28, 0x10 261 vextrins.d vr9, vr28, 0x01 262 vextrins.d vr9, vr29, 0x10 263 vextrins.d vr10, vr29, 0x01 //diag[1][7+y-x] 264 265 vbsrl.v vr28, vr18, 8 266 vextrins.d vr28, vr19, 0x10 //2345 267 vbsrl.v vr29, vr19, 8 268 vextrins.d vr29, vr20, 0x10 //6789 269 vadd.w vr28, vr28, vr24 270 vadd.w vr29, vr29, vr25 271 vextrins.d vr18, vr28, 0x10 272 vextrins.d vr19, vr28, 0x01 273 vextrins.d vr19, vr29, 0x10 274 vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+7] 275 276 vadd.w vr2, vr2, vr24 277 vadd.w vr3, vr3, vr25 //hv[1][x] 278 279 vbsrl.v vr28, vr21, 4 280 vextrins.w vr28, vr22, 0x30 //1234 281 vbsrl.v vr29, vr22, 4 //5678 282 vadd.w vr28, vr28, vr24 283 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] 284 vextrins.w vr23, vr29, 0x03 285 vextrins.w vr29, vr28, 0x33 286 vshuf4i.w vr22, vr29, 0x93 287 vextrins.w vr28, vr21, 0x30 288 vshuf4i.w vr21, vr28, 0x93 289 290 add.d a0, a0, a1 291 292 // 3 293 fld.d f24, a0, 0 //img 294 vpermi.w vr25, vr24, 0x01 295 296 vsllwil.hu.bu vr24, vr24, 0 297 vsllwil.hu.bu vr24, vr24, 0 298 vsllwil.hu.bu vr25, vr25, 0 299 vsllwil.hu.bu vr25, vr25, 0 300 301 vsub.w vr24, vr24, vr31 //px 302 vsub.w vr25, vr25, vr31 303 304 vbsll.v vr28, vr5, 4 305 vextrins.w vr28, vr4, 0x03 //3456 306 vbsll.v vr29, vr6, 4 307 vextrins.w vr29, vr5, 0x03 //78910 308 vadd.w vr28, vr28, vr24 //diag[0][y+x] 309 vadd.w vr29, vr29, vr25 310 vextrins.w vr4, vr28, 0x30 311 vextrins.w vr28, vr29, 0x00 312 vshuf4i.w vr5, vr28, 0x39 313 vbsrl.v vr6, vr29, 4 314 315 vbsll.v vr28, vr13, 4 316 vextrins.w vr28, vr12, 0x03 317 vpackev.w vr26, vr25, vr24 318 vpackod.w vr27, vr25, vr24 319 vpermi.w vr26, vr26, 0xd8 //px0246 320 vpermi.w vr27, vr27, 0xd8 //px1357 321 
vadd.w vr28, vr28, vr26 322 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 323 vextrins.w vr12, vr28, 0x30 324 vbsrl.v vr13, vr28, 4 325 326 vhaddw.d.w vr28, vr24, vr24 327 vhaddw.q.d vr28, vr28, vr28 328 vpickve2gr.d a3, vr28, 0 329 vhaddw.d.w vr28, vr25, vr25 330 vhaddw.q.d vr28, vr28, vr28 331 vpickve2gr.d a4, vr28, 0 332 add.d a3, a3, a4 333 vinsgr2vr.w vr0, a3, 3 //hv[0][y] 334 335 vbsll.v vr28, vr16, 4 336 vextrins.w vr28, vr15, 0x03 337 vpermi.w vr28, vr28, 0x1b //6543 338 vadd.w vr28, vr28, vr26 339 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 340 vextrins.w vr15, vr28, 0x33 341 vshuf4i.w vr16, vr28, 0xc6 342 vinsgr2vr.w vr16, zero, 3 343 344 vbsll.v vr28, vr9, 4 345 vextrins.w vr28, vr8, 0x03 //3456 346 vbsll.v vr29, vr10, 4 347 vextrins.w vr29, vr9, 0x03 //78910 348 vpermi.w vr28, vr28, 0x1b //6543 349 vpermi.w vr29, vr29, 0x1b //10987 350 vadd.w vr29, vr29, vr24 351 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] 352 vextrins.w vr8, vr28, 0x33 353 vextrins.w vr28, vr29, 0x33 354 vshuf4i.w vr9, vr28, 0xc6 355 vshuf4i.w vr10, vr29, 0xc6 356 vinsgr2vr.w vr10, zero, 3 357 358 vbsrl.v vr28, vr18, 8 359 vextrins.d vr28, vr19, 0x10 //2345 360 vbsrl.v vr29, vr19, 8 361 vextrins.d vr29, vr20, 0x10 //6789 362 vadd.w vr28, vr28, vr24 363 vadd.w vr29, vr29, vr25 364 vextrins.d vr18, vr28, 0x10 365 vextrins.d vr19, vr28, 0x01 366 vextrins.d vr19, vr29, 0x10 367 vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+7] 368 369 vadd.w vr2, vr2, vr24 370 vadd.w vr3, vr3, vr25 //hv[1][x] 371 372 vbsrl.v vr28, vr21, 4 373 vextrins.w vr28, vr22, 0x30 //1234 374 vbsrl.v vr29, vr22, 4 //5678 375 vextrins.w vr29, vr23, 0x30 376 vadd.w vr28, vr28, vr24 377 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] 378 vextrins.w vr23, vr29, 0x03 379 vextrins.w vr29, vr28, 0x33 380 vshuf4i.w vr22, vr29, 0x93 381 vextrins.w vr28, vr21, 0x30 382 vshuf4i.w vr21, vr28, 0x93 383 384 add.d a0, a0, a1 385 386 // 4 387 fld.d f24, a0, 0 //img 388 vpermi.w vr25, vr24, 0x01 389 390 vsllwil.hu.bu vr24, vr24, 0 391 
vsllwil.hu.bu vr24, vr24, 0 392 vsllwil.hu.bu vr25, vr25, 0 393 vsllwil.hu.bu vr25, vr25, 0 394 395 vsub.w vr24, vr24, vr31 //px 396 vsub.w vr25, vr25, vr31 397 398 vadd.w vr5, vr5, vr24 //diag[0][y+x] 399 vadd.w vr6, vr6, vr25 400 401 vpackev.w vr26, vr25, vr24 402 vpackod.w vr27, vr25, vr24 403 vpermi.w vr26, vr26, 0xd8 //px0246 404 vpermi.w vr27, vr27, 0xd8 //px1357 405 vadd.w vr13, vr13, vr26 406 vadd.w vr13, vr13, vr27 //alt[0][y+(x>>1)] 407 408 vhaddw.d.w vr28, vr24, vr24 409 vhaddw.q.d vr28, vr28, vr28 410 vpickve2gr.d a3, vr28, 0 411 vhaddw.d.w vr28, vr25, vr25 412 vhaddw.q.d vr28, vr28, vr28 413 vpickve2gr.d a4, vr28, 0 414 add.d a3, a3, a4 415 vinsgr2vr.w vr1, a3, 0 //hv[0][y] 416 417 vpermi.w vr16, vr16, 0x1b 418 vadd.w vr16, vr16, vr26 419 vadd.w vr16, vr16, vr27 //alt[1][3+y-(x>>1)] 420 vpermi.w vr16, vr16, 0x1b 421 422 vpermi.w vr9, vr9, 0x1b 423 vpermi.w vr10, vr10, 0x1b 424 vadd.w vr10, vr10, vr24 425 vadd.w vr9, vr9, vr25 426 vpermi.w vr9, vr9, 0x1b 427 vpermi.w vr10, vr10, 0x1b //diag[1][7+y-x] 428 429 vbsrl.v vr28, vr18, 4 430 vextrins.w vr28, vr19, 0x30 //1234 431 vbsrl.v vr29, vr19, 4 432 vextrins.w vr29, vr20, 0x30 //5678 433 vadd.w vr28, vr28, vr24 434 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] 435 vextrins.w vr20, vr29, 0x03 436 vextrins.w vr29, vr28, 0x33 437 vshuf4i.w vr19, vr29, 0x93 438 vbsll.v vr18, vr28, 4 439 440 vadd.w vr2, vr2, vr24 441 vadd.w vr3, vr3, vr25 //hv[1][x] 442 443 vbsrl.v vr28, vr21, 8 444 vextrins.d vr28, vr22, 0x10 445 vbsrl.v vr29, vr22, 8 446 vextrins.d vr29, vr23, 0x10 447 vadd.w vr28, vr28, vr24 448 vadd.w vr29, vr29, vr25 449 vextrins.d vr21, vr28, 0x10 450 vextrins.d vr22, vr28, 0x01 451 vextrins.d vr22, vr29, 0x10 452 vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x] 453 454 add.d a0, a0, a1 455 456 // 5 457 fld.d f24, a0, 0 //img 458 vpermi.w vr25, vr24, 0x01 459 460 vsllwil.hu.bu vr24, vr24, 0 461 vsllwil.hu.bu vr24, vr24, 0 462 vsllwil.hu.bu vr25, vr25, 0 463 vsllwil.hu.bu vr25, vr25, 0 464 465 vsub.w vr24, 
vr24, vr31 //px 466 vsub.w vr25, vr25, vr31 467 468 vbsrl.v vr28, vr5, 4 //5-8 469 vbsrl.v vr29, vr6, 4 //9-12 470 vextrins.w vr28, vr6, 0x30 471 vadd.w vr28, vr28, vr24 //diag[0][y+x] 472 vadd.w vr29, vr29, vr25 473 vextrins.w vr7, vr29, 0x03 474 vextrins.w vr29, vr28, 0x33 475 vshuf4i.w vr6, vr29, 0x93 476 vextrins.w vr28, vr5, 0x30 477 vshuf4i.w vr5, vr28, 0x93 478 479 vbsrl.v vr28, vr13, 4 480 vextrins.w vr28, vr14, 0x30 481 vpackev.w vr26, vr25, vr24 482 vpackod.w vr27, vr25, vr24 483 vpermi.w vr26, vr26, 0xd8 //px0246 484 vpermi.w vr27, vr27, 0xd8 //px1357 485 vadd.w vr28, vr28, vr26 486 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 487 vextrins.w vr14, vr28, 0x03 488 vextrins.w vr28, vr13, 0x30 489 vshuf4i.w vr13, vr28, 0x93 490 491 vhaddw.d.w vr28, vr24, vr24 492 vhaddw.q.d vr28, vr28, vr28 493 vpickve2gr.d a3, vr28, 0 494 vhaddw.d.w vr28, vr25, vr25 495 vhaddw.q.d vr28, vr28, vr28 496 vpickve2gr.d a4, vr28, 0 497 add.d a3, a3, a4 498 vinsgr2vr.w vr1, a3, 1 //hv[0][y] 499 500 vbsrl.v vr28, vr16, 4 501 vextrins.w vr28, vr17, 0x30 502 vpermi.w vr28, vr28, 0x1b 503 vadd.w vr28, vr28, vr26 504 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 505 vextrins.w vr17, vr28, 0x00 506 vextrins.w vr28, vr16, 0x00 507 vshuf4i.w vr16, vr28, 0x6c 508 509 vbsrl.v vr28, vr9, 4 510 vbsrl.v vr29, vr10, 4 511 vextrins.w vr28, vr10, 0x30 512 vpermi.w vr28, vr28, 0x1b //8-5 513 vpermi.w vr29, vr29, 0x1b //12-9 514 vadd.w vr29, vr29, vr24 515 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] 516 vextrins.w vr11, vr29, 0x00 517 vextrins.w vr29, vr28, 0x00 518 vshuf4i.w vr10, vr29, 0x6c 519 vextrins.w vr28, vr9, 0x00 520 vshuf4i.w vr9, vr28, 0x6c 521 522 vbsrl.v vr28, vr18, 4 523 vextrins.w vr28, vr19, 0x30 //1234 524 vbsrl.v vr29, vr19, 4 525 vextrins.w vr29, vr20, 0x30 //5678 526 vadd.w vr28, vr28, vr24 527 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] 528 vextrins.w vr20, vr29, 0x03 529 vextrins.w vr29, vr28, 0x33 530 vshuf4i.w vr19, vr29, 0x93 531 vbsll.v vr18, vr28, 4 532 533 vadd.w vr2, vr2, 
vr24 534 vadd.w vr3, vr3, vr25 //hv[1][x] 535 536 vbsrl.v vr28, vr21, 8 537 vextrins.d vr28, vr22, 0x10 538 vbsrl.v vr29, vr22, 8 539 vextrins.d vr29, vr23, 0x10 540 vadd.w vr28, vr28, vr24 541 vadd.w vr29, vr29, vr25 542 vextrins.d vr21, vr28, 0x10 543 vextrins.d vr22, vr28, 0x01 544 vextrins.d vr22, vr29, 0x10 545 vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x] 546 547 add.d a0, a0, a1 548 549 // 6 550 fld.d f24, a0, 0 //img 551 vpermi.w vr25, vr24, 0x01 552 553 vsllwil.hu.bu vr24, vr24, 0 554 vsllwil.hu.bu vr24, vr24, 0 555 vsllwil.hu.bu vr25, vr25, 0 556 vsllwil.hu.bu vr25, vr25, 0 557 558 vsub.w vr24, vr24, vr31 //px 559 vsub.w vr25, vr25, vr31 560 561 vbsrl.v vr28, vr5, 8 562 vbsrl.v vr29, vr6, 8 563 vextrins.d vr28, vr6, 0x10 //6-9 564 vextrins.d vr29, vr7, 0x10 //10-13 565 vadd.w vr28, vr28, vr24 //diag[0][y+x] 566 vadd.w vr29, vr29, vr25 567 vextrins.d vr5, vr28, 0x10 568 vextrins.d vr6, vr28, 0x01 569 vextrins.d vr6, vr29, 0x10 570 vextrins.d vr7, vr29, 0x01 571 572 vbsrl.v vr28, vr13, 8 573 vextrins.d vr28, vr14, 0x10 574 vpackev.w vr26, vr25, vr24 575 vpackod.w vr27, vr25, vr24 576 vpermi.w vr26, vr26, 0xd8 //px0246 577 vpermi.w vr27, vr27, 0xd8 //px1357 578 vadd.w vr28, vr28, vr26 579 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 580 vextrins.d vr13, vr28, 0x10 581 vextrins.d vr14, vr28, 0x01 582 583 vhaddw.d.w vr28, vr24, vr24 584 vhaddw.q.d vr28, vr28, vr28 585 vpickve2gr.d a3, vr28, 0 586 vhaddw.d.w vr28, vr25, vr25 587 vhaddw.q.d vr28, vr28, vr28 588 vpickve2gr.d a4, vr28, 0 589 add.d a3, a3, a4 590 vinsgr2vr.w vr1, a3, 2 //hv[0][y] 591 592 vbsrl.v vr28, vr16, 8 593 vextrins.d vr28, vr17, 0x10 594 vpermi.w vr28, vr28, 0x1b 595 vadd.w vr28, vr28, vr26 596 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 597 vpermi.w vr28, vr28, 0x1b 598 vextrins.d vr16, vr28, 0x10 599 vextrins.d vr17, vr28, 0x01 600 601 vbsrl.v vr28, vr9, 8 602 vextrins.d vr28, vr10, 0x10 603 vbsrl.v vr29, vr10, 8 604 vextrins.d vr29, vr11, 0x10 605 vpermi.w vr28, vr28, 0x1b //9876 606 
vpermi.w vr29, vr29, 0x1b //13-10 607 vadd.w vr29, vr29, vr24 608 vadd.w vr28, vr28, vr25 609 vpermi.w vr28, vr28, 0x1b 610 vpermi.w vr29, vr29, 0x1b 611 vextrins.d vr9, vr28, 0x10 612 vextrins.d vr10, vr28, 0x01 613 vextrins.d vr10, vr29, 0x10 614 vextrins.d vr11, vr29, 0x01 //diag[1][7+y-x] 615 616 vadd.w vr18, vr18, vr24 //0123 617 vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+7] 618 619 vadd.w vr2, vr2, vr24 620 vadd.w vr3, vr3, vr25 //hv[1][x] 621 622 vbsll.v vr28, vr22, 4 623 vextrins.w vr28, vr21, 0x03 //3456 624 vbsll.v vr29, vr23, 4 625 vextrins.w vr29, vr22, 0x03 //78910 626 vadd.w vr28, vr28, vr24 627 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] 628 vextrins.w vr21, vr28, 0x30 629 vextrins.w vr28, vr29, 0x00 630 vshuf4i.w vr22, vr28, 0x39 631 vbsrl.v vr23, vr29, 4 632 633 add.d a0, a0, a1 634 635 // 7 636 fld.d f24, a0, 0 //img 637 vpermi.w vr25, vr24, 0x01 638 639 vsllwil.hu.bu vr24, vr24, 0 640 vsllwil.hu.bu vr24, vr24, 0 641 vsllwil.hu.bu vr25, vr25, 0 642 vsllwil.hu.bu vr25, vr25, 0 643 644 vsub.w vr24, vr24, vr31 //px 645 vsub.w vr25, vr25, vr31 646 647 vbsll.v vr28, vr6, 4 648 vextrins.w vr28, vr5, 0x03 //78910 649 vbsll.v vr29, vr7, 4 650 vextrins.w vr29, vr6, 0x03 //11-14 651 vadd.w vr28, vr28, vr24 //diag[0][y+x] 652 vadd.w vr29, vr29, vr25 653 vextrins.w vr5, vr28, 0x30 654 vextrins.w vr28, vr29, 0x00 655 vshuf4i.w vr6, vr28, 0x39 656 vbsrl.v vr7, vr29, 4 657 658 vbsll.v vr28, vr14, 4 659 vextrins.w vr28, vr13, 0x03 660 vpackev.w vr26, vr25, vr24 661 vpackod.w vr27, vr25, vr24 662 vpermi.w vr26, vr26, 0xd8 //px0246 663 vpermi.w vr27, vr27, 0xd8 //px1357 664 vadd.w vr28, vr28, vr26 665 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] 666 vextrins.w vr13, vr28, 0x30 667 vbsrl.v vr14, vr28, 4 668 669 vhaddw.d.w vr28, vr24, vr24 670 vhaddw.q.d vr28, vr28, vr28 671 vpickve2gr.d a3, vr28, 0 672 vhaddw.d.w vr28, vr25, vr25 673 vhaddw.q.d vr28, vr28, vr28 674 vpickve2gr.d a4, vr28, 0 675 add.d a3, a3, a4 676 vinsgr2vr.w vr1, a3, 3 //hv[0][y] 677 678 vbsll.v 
vr28, vr17, 4 679 vextrins.w vr28, vr16, 0x03 680 vpermi.w vr28, vr28, 0x1b //10987 681 vadd.w vr28, vr28, vr26 682 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] 683 vextrins.w vr16, vr28, 0x33 684 vshuf4i.w vr17, vr28, 0xc6 685 vinsgr2vr.w vr17, zero, 3 686 687 vbsll.v vr28, vr10, 4 688 vextrins.w vr28, vr9, 0x03 //7-10 689 vbsll.v vr29, vr11, 4 690 vextrins.w vr29, vr10, 0x03 //11-14 691 vpermi.w vr28, vr28, 0x1b //10-7 692 vpermi.w vr29, vr29, 0x1b //14-11 693 vadd.w vr29, vr29, vr24 694 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] 695 vextrins.w vr9, vr28, 0x33 696 vextrins.w vr28, vr29, 0x33 697 vshuf4i.w vr10, vr28, 0xc6 698 vshuf4i.w vr11, vr29, 0xc6 699 vinsgr2vr.w vr11, zero, 3 700 701 vadd.w vr18, vr18, vr24 //0123 702 vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+7] 703 704 vadd.w vr2, vr2, vr24 705 vadd.w vr3, vr3, vr25 //hv[1][x] 706 707 vbsll.v vr28, vr22, 4 708 vextrins.w vr28, vr21, 0x03 //3456 709 vbsll.v vr29, vr23, 4 710 vextrins.w vr29, vr22, 0x03 //78910 711 vadd.w vr28, vr28, vr24 712 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] 713 vextrins.w vr21, vr28, 0x30 714 vextrins.w vr28, vr29, 0x00 715 vshuf4i.w vr22, vr28, 0x39 716 vbsrl.v vr23, vr29, 4 717 718 add.d a0, a0, a1 719 720 vxor.v vr24, vr24, vr24 //unsigned cost[8] 721 vxor.v vr25, vr25, vr25 722 723 vmul.w vr26, vr0, vr0 724 vmul.w vr27, vr1, vr1 725 vhaddw.d.w vr28, vr26, vr26 726 vhaddw.q.d vr28, vr28, vr28 727 vpickve2gr.d a3, vr28, 0 728 vhaddw.d.w vr28, vr27, vr27 729 vhaddw.q.d vr28, vr28, vr28 730 vpickve2gr.d a4, vr28, 0 731 add.d a3, a3, a4 732 733 vmul.w vr26, vr2, vr2 734 vmul.w vr27, vr3, vr3 735 vhaddw.d.w vr28, vr26, vr26 736 vhaddw.q.d vr28, vr28, vr28 737 vpickve2gr.d a4, vr28, 0 738 vhaddw.d.w vr28, vr27, vr27 739 vhaddw.q.d vr28, vr28, vr28 740 vpickve2gr.d a5, vr28, 0 741 add.d a4, a4, a5 742 743 li.d a6, 105 744 mul.w a3, a3, a6 745 mul.w a4, a4, a6 746 vinsgr2vr.w vr24, a3, 2 747 vinsgr2vr.w vr25, a4, 2 748 749 vxor.v vr30, vr30, vr30 //div_table 750 vxor.v vr31, vr31, 
vr31 751 li.d t0, 840 752 vinsgr2vr.w vr30, t0, 0 753 li.d t0, 420 754 vinsgr2vr.w vr30, t0, 1 755 li.d t0, 280 756 vinsgr2vr.w vr30, t0, 2 757 li.d t0, 210 758 vinsgr2vr.w vr30, t0, 3 759 li.d t0, 168 760 vinsgr2vr.w vr31, t0, 0 761 li.d t0, 140 762 vinsgr2vr.w vr31, t0, 1 763 li.d t0, 120 764 vinsgr2vr.w vr31, t0, 2 765 766 vbsll.v vr27, vr7, 4 767 vextrins.w vr27, vr6, 0x03 768 vpermi.w vr27, vr27, 0x1b 769 vmul.w vr26, vr4, vr4 770 vmadd.w vr26, vr27, vr27 771 vmul.w vr26, vr26, vr30 772 vhaddw.d.w vr28, vr26, vr26 773 vhaddw.q.d vr28, vr28, vr28 774 vpickve2gr.d a3, vr28, 0 775 vbsll.v vr27, vr6, 4 776 vpermi.w vr27, vr27, 0x1b 777 vmul.w vr26, vr5, vr5 778 vmadd.w vr26, vr27, vr27 779 vmul.w vr26, vr26, vr31 780 vextrins.w vr26, vr31, 0x33 781 vhaddw.d.w vr28, vr26, vr26 782 vhaddw.q.d vr28, vr28, vr28 783 vpickve2gr.d a4, vr28, 0 784 add.d a3, a3, a4 //cost[0] 785 786 vbsll.v vr27, vr11, 4 787 vextrins.w vr27, vr10, 0x03 788 vpermi.w vr27, vr27, 0x1b 789 vmul.w vr26, vr8, vr8 790 vmadd.w vr26, vr27, vr27 791 vmul.w vr26, vr26, vr30 792 vhaddw.d.w vr28, vr26, vr26 793 vhaddw.q.d vr28, vr28, vr28 794 vpickve2gr.d a4, vr28, 0 795 vbsll.v vr27, vr10, 4 796 vpermi.w vr27, vr27, 0x1b 797 vmul.w vr26, vr9, vr9 798 vmadd.w vr26, vr27, vr27 799 vmul.w vr26, vr26, vr31 800 vextrins.w vr26, vr31, 0x33 801 vhaddw.d.w vr28, vr26, vr26 802 vhaddw.q.d vr28, vr28, vr28 803 vpickve2gr.d a5, vr28, 0 804 add.d a4, a4, a5 //cost[4] 805 806 vpickve2gr.w a5, vr5, 3 807 mul.w a5, a5, a5 808 mul.w a5, a5, a6 809 add.w a3, a3, a5 810 vinsgr2vr.w vr24, a3, 0 811 vpickve2gr.w a5, vr9, 3 812 mul.w a5, a5, a5 813 mul.w a5, a5, a6 814 add.w a4, a4, a5 815 vinsgr2vr.w vr25, a4, 0 816 817 //n=0 818 vpickve2gr.w a3, vr24, 1 819 vmul.w vr26, vr13, vr13 820 vhaddw.d.w vr28, vr26, vr26 821 vhaddw.q.d vr28, vr28, vr28 822 vpickve2gr.d a4, vr28, 0 823 vpickve2gr.w a5, vr12, 3 824 mul.w a5, a5, a5 825 add.d a3, a3, a4 826 add.d a3, a3, a5 827 mul.w a3, a3, a6 //*cost_ptr 828 829 vextrins.w vr29, 
vr30, 0x01 830 vextrins.w vr29, vr30, 0x13 831 vextrins.w vr29, vr31, 0x21 832 vextrins.w vr29, vr31, 0x33 833 vbsll.v vr27, vr14, 4 834 vpermi.w vr27, vr27, 0x1b 835 vmul.w vr28, vr12, vr12 836 vextrins.w vr28, vr31, 0x33 837 vmadd.w vr28, vr27, vr27 838 vmul.w vr26, vr28, vr29 839 vhaddw.d.w vr28, vr26, vr26 840 vhaddw.q.d vr28, vr28, vr28 841 vpickve2gr.d a4, vr28, 0 842 add.d a3, a3, a4 843 vinsgr2vr.w vr24, a3, 1 844 845 //n=1 846 vpickve2gr.w a3, vr24, 3 847 vmul.w vr26, vr16, vr16 848 vhaddw.d.w vr28, vr26, vr26 849 vhaddw.q.d vr28, vr28, vr28 850 vpickve2gr.d a4, vr28, 0 851 vpickve2gr.w a5, vr15, 3 852 mul.w a5, a5, a5 853 add.d a3, a3, a4 854 add.d a3, a3, a5 855 mul.w a3, a3, a6 //*cost_ptr 856 857 vbsll.v vr27, vr17, 4 858 vpermi.w vr27, vr27, 0x1b 859 vmul.w vr28, vr15, vr15 860 vextrins.w vr28, vr31, 0x33 861 vmadd.w vr28, vr27, vr27 862 vmul.w vr26, vr28, vr29 863 vhaddw.d.w vr28, vr26, vr26 864 vhaddw.q.d vr28, vr28, vr28 865 vpickve2gr.d a4, vr28, 0 866 add.d a3, a3, a4 867 vinsgr2vr.w vr24, a3, 3 868 869 //n=2 870 vpickve2gr.w a3, vr25, 1 871 vmul.w vr26, vr19, vr19 872 vhaddw.d.w vr28, vr26, vr26 873 vhaddw.q.d vr28, vr28, vr28 874 vpickve2gr.d a4, vr28, 0 875 vpickve2gr.w a5, vr18, 3 876 mul.w a5, a5, a5 877 add.d a3, a3, a4 878 add.d a3, a3, a5 879 mul.w a3, a3, a6 //*cost_ptr 880 881 vbsll.v vr27, vr20, 4 882 vpermi.w vr27, vr27, 0x1b 883 vmul.w vr28, vr18, vr18 884 vextrins.w vr28, vr31, 0x33 885 vmadd.w vr28, vr27, vr27 886 vmul.w vr26, vr28, vr29 887 vhaddw.d.w vr28, vr26, vr26 888 vhaddw.q.d vr28, vr28, vr28 889 vpickve2gr.d a4, vr28, 0 890 add.d a3, a3, a4 891 vinsgr2vr.w vr25, a3, 1 892 893 //n=3 894 vpickve2gr.w a3, vr25, 3 895 vmul.w vr26, vr22, vr22 896 vhaddw.d.w vr28, vr26, vr26 897 vhaddw.q.d vr28, vr28, vr28 898 vpickve2gr.d a4, vr28, 0 899 vpickve2gr.w a5, vr21, 3 900 mul.w a5, a5, a5 901 add.d a3, a3, a4 902 add.d a3, a3, a5 903 mul.w a3, a3, a6 //*cost_ptr 904 905 vbsll.v vr27, vr23, 4 906 vpermi.w vr27, vr27, 0x1b 907 vmul.w 
vr28, vr21, vr21 908 vextrins.w vr28, vr31, 0x33 909 vmadd.w vr28, vr27, vr27 910 vmul.w vr26, vr28, vr29 911 vhaddw.d.w vr28, vr26, vr26 912 vhaddw.q.d vr28, vr28, vr28 913 vpickve2gr.d a4, vr28, 0 914 add.d a3, a3, a4 915 vinsgr2vr.w vr25, a3, 3 916 917 xor a3, a3, a3 //best_dir 918 vpickve2gr.w a4, vr24, 0 //best_cost 919.BSETDIR01: 920 vpickve2gr.w a5, vr24, 1 921 bge a4, a5, .BSETDIR02 922 or a4, a5, a5 923 ori a3, zero, 1 924.BSETDIR02: 925 vpickve2gr.w a5, vr24, 2 926 bge a4, a5, .BSETDIR03 927 or a4, a5, a5 928 ori a3, zero, 2 929.BSETDIR03: 930 vpickve2gr.w a5, vr24, 3 931 bge a4, a5, .BSETDIR04 932 or a4, a5, a5 933 ori a3, zero, 3 934.BSETDIR04: 935 vpickve2gr.w a5, vr25, 0 936 bge a4, a5, .BSETDIR05 937 or a4, a5, a5 938 ori a3, zero, 4 939.BSETDIR05: 940 vpickve2gr.w a5, vr25, 1 941 bge a4, a5, .BSETDIR06 942 or a4, a5, a5 943 ori a3, zero, 5 944.BSETDIR06: 945 vpickve2gr.w a5, vr25, 2 946 bge a4, a5, .BSETDIR07 947 or a4, a5, a5 948 ori a3, zero, 6 949.BSETDIR07: 950 vpickve2gr.w a5, vr25, 3 951 bge a4, a5, .BSETDIREND 952 or a4, a5, a5 953 ori a3, zero, 7 954.BSETDIREND: 955 xori a5, a3, 4 956 li.d a1, 4 957 bge a5, a1, .GETCOST01 958 vreplve.w vr26, vr24, a5 959 b .GETCOST02 960.GETCOST01: 961 vreplve.w vr26, vr25, a5 962.GETCOST02: 963 vpickve2gr.w a5, vr26, 0 964 sub.w a5, a4, a5 965 srai.d a5, a5, 10 966 st.w a5, a2, 0 967 or a0, a3, a3 968 969 fld.d f24, sp, 0 970 fld.d f25, sp, 8 971 fld.d f26, sp, 16 972 fld.d f27, sp, 24 973 fld.d f28, sp, 32 974 fld.d f29, sp, 40 975 fld.d f30, sp, 48 976 fld.d f31, sp, 56 977 addi.d sp, sp, 64 978 979endfunc 980 981.macro cdef_fill tmp, stride, w, h 982 beqz \h, 700f //h 983 or t0, zero, zero //y 984100: 985 or t1, zero, zero //xx 986 srai.d s6, \w, 3 //x 987 beqz s6, 300f 988200: 989 vstx vr18, \tmp, t1 990 addi.d t1, t1, 16 991 addi.d s6, s6, -1 992 bnez s6, 200b 993300: 994 andi s6, \w, 4 995 beqz s6, 400f 996 fstx.d f18, \tmp, t1 997 addi.d t1, t1, 8 998400: 999 andi s6, \w, 2 1000 beqz s6, 500f 1001 
    fstx.s f18, \tmp, t1
    addi.d t1, t1, 4
500:
    andi s6, \w, 1
    beqz s6, 600f
    li.w s6, -16384
    stx.h s6, \tmp, t1
    addi.d t1, t1, 2
600:
    add.d \tmp, \tmp, \stride
    add.d \tmp, \tmp, \stride
    addi.d t0, t0, 1
    blt t0, \h, 100b
700:
.endm

// Offsets into the padded tmp buffer (pitch 12) for the two taps of each of
// the 8 directions; the first 4 rows are repeated so dir+4 never wraps.
const dav1d_cdef_directions
.byte  1 * 12 + 0,  2 * 12 + 0
.byte  1 * 12 + 0,  2 * 12 - 1
.byte -1 * 12 + 1, -2 * 12 + 2
.byte  0 * 12 + 1, -1 * 12 + 2
.byte  0 * 12 + 1,  0 * 12 + 2
.byte  0 * 12 + 1,  1 * 12 + 2
.byte  1 * 12 + 1,  2 * 12 + 2
.byte  1 * 12 + 0,  2 * 12 + 1
.byte  1 * 12 + 0,  2 * 12 + 0
.byte  1 * 12 + 0,  2 * 12 - 1
.byte -1 * 12 + 1, -2 * 12 + 2
.byte  0 * 12 + 1, -1 * 12 + 2
endconst

// out = apply_sign(min(|in0|, max(0, in1 - (|in0| >> in2))), in0)
// i.e. the CDEF constrain() on 8 x i16 lanes.
// NOTE(review): relies on vr23 being all-zero at every call site — confirm.
.macro constrain_vrh in0, in1, in2, tmp0, tmp1, out
    vabsd.h \tmp0, \in0, vr23 //adiff
    vsra.h \tmp1, \tmp0, \in2
    vsub.h \tmp1, \in1, \tmp1
    vmax.h \tmp1, vr23, \tmp1 //imax
    vmin.h \tmp0, \tmp0, \tmp1 //imin

    //apply_sign
    vslt.h \tmp1, \in0, vr23
    vandn.v \in0, \tmp1, \tmp0
    vsigncov.h \tmp0, \tmp1, \tmp0
    vor.v \out, \in0, \tmp0
.endm

// out = clamp(in0, in1, in2) on 8 x i16 lanes: take min(in0, in2), then
// select in1 for lanes where in0 < in1. Clobbers \in0 (becomes the mask).
.macro iclip_vrh in0, in1, in2, tmp0, tmp1, out
    vmin.h \tmp0, \in2, \in0
    vslt.h \in0, \in0, \in1
    vand.v \tmp1, \in0, \in1
    vandn.v \tmp0, \in0, \tmp0
    vor.v \out, \tmp1, \tmp0
.endm

// Copy the bordering pixels into the u16 tmp buffer, widening u8 -> u16.
// Register roles (as read by the code): a0 = src, a1 = src stride, a2 = left,
// a3 = top, a4 = bottom, s4 = tmp base, s5 = tmp stride (in elements),
// t5/t6 = x start/end, t7 = negative top-row count, s1 = h, t8 = y_end.
.macro cdef_padding_data
    //y < 0
    beqz t7, 90f
4:
    or t4, t5, t5 //data index xx
    slli.d t0, t4, 1
    mul.w t2, t7, s5
    slli.d t2, t2, 1
    add.d t2, s4, t2

    sub.d t3, t6, t5 //loop param x
    srai.d t3, t3, 3
    add.d t3, t3, t5
    beq t5, t3, 6f
5: // /8
    fldx.d f18, a3, t4
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, t2, t0
    addi.d t0, t0, 16
    addi.d t4, t4, 8

    addi.d t3, t3, -1
    bne t5, t3, 5b
6: // &4
    sub.d t1, t6, t5
    andi t1, t1, 4
    beqz t1, 7f

    fldx.s f18, a3, t4
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, t2, t0
    addi.d t0, t0, 8
    addi.d t4, t4, 4
7: // &2
    sub.d t1, t6, t5
    andi t1, t1, 2
    beqz t1, 9f

    ldx.bu t1, a3, t4
    stx.h t1, t2, t0
    addi.d t0, t0, 2
    addi.d t4, t4, 1
    ldx.bu t1, a3, t4
    stx.h t1, t2, t0
    addi.d t0, t0, 2
    addi.d t4, t4, 1
9:
    add.d a3, a3, a1
    addi.d t7, t7, 1
    bnez t7, 4b

90:
    // y < h
    // left columns: x runs from t5 (negative) up to 0; reads left[y][x+2]
    beqz s1, 12f
    beqz t5, 12f
    or t7, zero, zero //y
10:
    or t4, t5, t5 //data index x
11:
    slli.d t3, t7, 1
    addi.d t3, t3, 2
    add.d t3, t3, t4
    ldx.bu t1, a2, t3

    mul.w t3, t7, s5
    add.d t3, t3, t4
    slli.d t3, t3, 1
    stx.h t1, s4, t3

    addi.d t4, t4, 1
    bnez t4, 11b

    addi.d t7, t7, 1
    bne t7, s1, 10b

12:
    // y = 0 ; y < h
    // main block rows copied from src (a0) into tmp
    or s0, s4, s4
    beqz s1, 20f
    or s6, a0, a0
    or t7, zero, zero //y
    srai.d t4, t6, 3 //loop max
13:
    or t0, zero, zero //loop param
    or t3, t0, t0 //data index src
    or t1, t0, t0 //data index tmp
    beqz t4, 16f
15: // /8
    fldx.d f18, s6, t3
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, s0, t1
    addi.d t3, t3, 8
    addi.d t1, t1, 16

    addi.d t0, t0, 1
    blt t0, t4, 15b
16: // &4
    andi t0, t6, 4
    beqz t0, 17f

    fldx.s f18, s6, t3
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, s0, t1
    addi.d t3, t3, 4
    addi.d t1, t1, 8
17: // &2
    andi t0, t6, 2
    beqz t0, 19f

    ldx.bu t2, s6, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
    ldx.bu t2, s6, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
19: // src+ tmp+
    add.d s6, s6, a1
    add.d s0, s0, s5
    add.d s0, s0, s5

    addi.d t7, t7, 1
    blt t7, s1, 13b

    // y = h ; y < y_end
20:
    beq s1, t8, 27f
    or t7, s1, s1 //y
    sub.d t4, t6, t5
    srai.d t4, t4, 3
    add.d t4, t4, t5 //8 loop max
21:
    or t0, t5, t5 //xx
    or t3, t0, t0 //data index bottom
    slli.d t1, t0, 1 //data index tmp
    beq t5, t4, 23f
22: // /8
    fldx.d f18, a4, t3
    vsllwil.hu.bu vr18, vr18, 0
    vstx vr18, s0, t1
    addi.d t3, t3, 8
    addi.d t1, t1, 16

    addi.d t0, t0, 1
    blt t0, t4, 22b
23: // &4
    sub.d t0, t6, t5
    andi t0, t0, 4
    beqz t0, 24f

    fldx.s f18, a4, t3
    vsllwil.hu.bu vr18, vr18, 0
    fstx.d f18, s0, t1
    addi.d t3, t3, 4
    addi.d t1, t1, 8
24: // &2
    sub.d t0, t6, t5
    andi t0, t0, 2
    beqz t0, 26f

    ldx.bu t2, a4, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
    ldx.bu t2, a4, t3
    stx.h t2, s0, t1
    addi.d t3, t3, 1
    addi.d t1, t1, 2
26: // bottom+ tmp+
    add.d a4, a4, a1
    add.d s0, s0, s5
    add.d s0, s0, s5

    addi.d t7, t7, 1
    blt t7, t8, 21b
27:
    // padding end
.endm

// Broadcast the pri+sec filter parameters and load the 6 direction offsets.
// NOTE(review): unlike cdef_sec_init below, this does not load 31 into t2
// before `sub.w t3, t2, t3` — it appears to rely on t2 still holding 31 from
// the caller's pri_shift computation; confirm at the invocation sites.
.macro cdef_pri_sec_init
    clz.w t3, a6
    sub.w t3, t2, t3
    sub.w t3, s7, t3 //sec_shift

    vreplgr2vr.h vr4, t0 //pri_tap_k
    vreplgr2vr.h vr9, a5 //pri_strength
    vreplgr2vr.h vr10, t1 //pri_shift
    vreplgr2vr.h vr18, a6 //sec_strength
    vreplgr2vr.h vr19, t3 //sec_shift

    or t2, s1, s1 //dowhile loop param
    addi.d s1, a7, 2
    slli.d s1, s1, 1 //directions dir+2
    addi.d s2, a7, 4
    slli.d s2, s2, 1 //directions dir+4
    slli.d s3, a7, 1 //directions dir+0

    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    ld.b a2, s1, 0 //off01
    ld.b a3, s1, 1 //off11
    add.d s2, t0, s2
    ld.b s1, s2, 0 //off02
    ld.b s2, s2, 1 //off12
    add.d s3, t0, s3
    ld.b t0, s3, 0 //off03
    ld.b s3, s3, 1 //off13

    // scale the element offsets to byte offsets (tmp holds 16-bit data)
    slli.d a2, a2, 1
    slli.d a3, a3, 1
    slli.d s1, s1, 1
    slli.d s2, s2, 1
    slli.d t0, t0, 1
    slli.d s3, s3, 1
.endm

// Primary-only variant: broadcast pri parameters, load the dir+2 offsets.
.macro cdef_pri_init
    vreplgr2vr.h vr4, t0 //pri_tap_k
    vreplgr2vr.h vr9, a5 //pri_strength
    vreplgr2vr.h vr10, t1 //pri_shift

    or t2, s1, s1 //dowhile loop param
    addi.d s1, a7, 2
    slli.d s1, s1, 1 //directions dir+2

    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    ld.b a2, s1, 0 //off01
    ld.b a3, s1, 1 //off11

    slli.d a2, a2, 1
    slli.d a3, a3, 1
.endm

// Secondary-only variant: sec_shift = damping - (31 - clz(sec_strength)),
// i.e. damping - ulog2(sec_strength); loads the dir+0 and dir+4 offsets.
.macro cdef_sec_init
    clz.w t3, a6
    li.w t2, 31
    sub.w t3, t2, t3
    sub.w t3, s7, t3 //sec_shift

    vreplgr2vr.h vr18, a6 //sec_strength
    vreplgr2vr.h vr19, t3 //sec_shift

    or t2, s1, s1 //dowhile loop param
    addi.d s2, a7, 4
    slli.d s2, s2, 1 //directions dir+4
    slli.d s3, a7, 1 //directions dir+0

    la.local t0, dav1d_cdef_directions
    add.d s1, t0, s1
    add.d s2, t0, s2
    ld.b s1, s2, 0 //off02
    ld.b s2, s2, 1 //off12
    add.d s3, t0, s3
    ld.b t0, s3, 0 //off03
    ld.b s3, s3, 1 //off13

    slli.d s1, s1, 1
    slli.d s2, s2, 1
    slli.d t0, t0, 1
    slli.d s3, s3, 1
.endm

// w=8: vr11-vr14 = constrain(tap_px - px) for the four taps in vr5-vr8,
// with vr0 = px, \in0 = strength vector, \in1 = shift vector.
.macro cdef_process_data_w8 in0, in1
    vsub.h vr11, vr5, vr0
    vsub.h vr12, vr6, vr0
    vsub.h vr13, vr7, vr0
    vsub.h vr14, vr8, vr0

    constrain_vrh vr11, \in0, \in1, vr16, vr17, vr11
    constrain_vrh vr12, \in0, \in1, vr16, vr17, vr12
    constrain_vrh vr13, \in0, \in1, vr16, vr17, vr13
    constrain_vrh vr14, \in0, \in1, vr16, vr17, vr14
.endm

// w=4: two 4-pixel rows are packed per vector (vr6|vr5, vr8|vr7) before the
// same constrain step, producing vr12 and vr14.
.macro cdef_process_data_w4 in0, in1
    vpermi.w vr6, vr5, 0x44
    vpermi.w vr8, vr7, 0x44

    vsub.h vr12, vr6, vr0
    vsub.h vr14, vr8, vr0

    constrain_vrh vr12, \in0, \in1, vr16, vr17, vr12
    constrain_vrh vr14, \in0, \in1, vr16, vr17, vr14
.endm

// Accumulate sum += tap * diff, switching the tap weight between the first
// and second pair via the vr21/vr22 and/or masks (tap change).
.macro cdef_calc_sum_tapchange_w8
    vmul.h vr1, vr15, vr11 //sum
    vmadd.h vr1, vr15, vr12 //sum
    vand.v vr15, vr15, vr21
    vor.v vr15, vr15, vr22
    vmadd.h vr1, vr15, vr13 //sum
    vmadd.h vr1, vr15, vr14 //sum
.endm

// w=4 variant of the tap-changing accumulation.
.macro cdef_calc_sum_tapchange_w4
    vmul.h vr1, vr15, vr12
//sum 1352 vand.v vr15, vr15, vr21 1353 vor.v vr15, vr15, vr22 1354 vmadd.h vr1, vr15, vr14 //sum 1355.endm 1356 1357.macro cdef_calc_sum_no_tapchange_w4 in0 1358 vmadd.h vr1, \in0, vr12 1359 vmadd.h vr1, \in0, vr14 1360.endm 1361 1362.macro cdef_calc_sum_no_tapchange_w8 in0 1363 vmadd.h vr1, \in0, vr11 //sum 1364 vmadd.h vr1, \in0, vr12 1365 vmadd.h vr1, \in0, vr13 1366 vmadd.h vr1, \in0, vr14 1367.endm 1368 1369.macro cdef_calc_maxmin_w4 1370 vmin.hu vr3, vr6, vr3 1371 vmax.h vr2, vr6, vr2 1372 vmin.hu vr3, vr8, vr3 //min 1373 vmax.h vr2, vr8, vr2 //max 1374.endm 1375 1376.macro cdef_calc_maxmin_w8 1377 vmin.hu vr3, vr5, vr3 1378 vmax.h vr2, vr5, vr2 1379 vmin.hu vr3, vr6, vr3 1380 vmax.h vr2, vr6, vr2 1381 vmin.hu vr3, vr7, vr3 1382 vmax.h vr2, vr7, vr2 1383 vmin.hu vr3, vr8, vr3 //min 1384 vmax.h vr2, vr8, vr2 //max 1385.endm 1386 1387.macro cdef_calc_dst 1388 vslti.h vr5, vr1, 0 1389 vand.v vr5, vr5, vr20 1390 vsub.h vr5, vr1, vr5 1391 vaddi.hu vr5, vr5, 8 1392 vsrai.h vr5, vr5, 4 1393 vadd.h vr5, vr0, vr5 1394.endm 1395 1396//static NOINLINE void cdef_filter_block_lsx 1397// (pixel *dst, const ptrdiff_t dst_stride, 1398// const pixel (*left)[2], const pixel *const top, 1399// const int pri_strength, const int sec_strength, 1400// const int dir, const int damping, const int w, int h, 1401// const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) 1402// w=4 h=4 1403//param: dst:a0, dst_stride:a1, left:a2, top:a3, bottom:a4, pri_strength:a5 1404//sec_strength:a6, dir:a7, damping:s7, w:s0, h:s1, edges:s2 1405function cdef_filter_block_4x4_8bpc_lsx 1406 ld.w t0, sp, 0 1407 ld.w t1, sp, 8 1408 addi.d sp, sp, -(64+288) 1409 st.d s0, sp, 0 1410 st.d s1, sp, 8 1411 st.d s2, sp, 16 1412 st.d s3, sp, 24 1413 st.d s4, sp, 32 1414 st.d s5, sp, 40 1415 st.d s6, sp, 48 1416 st.d s7, sp, 56 1417 1418 li.w s0, 4 //w 1419 li.w s1, 4 //h 1420 or s2, t1, t1 //edges 1421 or s7, t0, t0 //damping 1422 1423 li.d s5, 12 //tmp_stride 1424 addi.d s4, sp, 64 1425 slli.d t0, s5, 1 1426 addi.d 
t0, t0, 2 1427 slli.d t0, t0, 1 1428 add.d s4, s4, t0 //ptr tmp 1429 vxor.v vr23, vr23, vr23 1430 li.w t2, 1 1431 vreplgr2vr.h vr20, t2 1432 vaddi.hu vr21, vr20, 2 1433 vaddi.hu vr22, vr20, 1 1434 1435 li.w t0, -16384 1436 vreplgr2vr.h vr18, t0 1437 1438 //padding 1439 li.w t5, -2 //x_start 1440 addi.d t6, s0, 2 //x_end 1441 li.w t7, -2 //y_start 1442 addi.d t8, s1, 2 //y_end 1443 li.w t2, 2 1444 1445 andi t4, s2, 4 1446 bnez t4, 1f 1447 1448 //CDEF_HAVE_TOP 1449 slli.d t3, s5, 2 1450 addi.d t4, s4, -4 1451 sub.d t4, t4, t3 1452 addi.d t3, s0, 4 1453 1454 cdef_fill t4, s5, t3, t2 1455 1456 or t7, zero, zero 1457 14581: //CDEF_HAVE_BOTTOM 1459 andi t4, s2,8 1460 bnez t4, 2f 1461 1462 mul.w t3, s1, s5 1463 slli.d t3, t3, 1 1464 add.d t4, s4, t3 1465 addi.d t4, t4, -4 1466 li.d t3, 8 1467 1468 cdef_fill t4, s5, t3, t2 1469 1470 addi.d t8, t8, -2 1471 14722: //CDEF_HAVE_LEFT 1473 andi t4, s2,1 1474 bnez t4, 3f 1475 1476 mul.w t3, t7, s5 1477 slli.d t3, t3, 1 1478 add.d t4, s4, t3 1479 addi.d t4, t4, -4 1480 sub.d t3, t8, t7 1481 1482 cdef_fill t4, s5, t2, t3 1483 1484 or t5, zero, zero 1485 14863: //CDEF_HAVE_RIGHT 1487 andi t4, s2,2 1488 bnez t4, 40f 1489 1490 mul.w t3, t7, s5 1491 slli.d t3, t3, 1 1492 add.d t4, s4, t3 1493 addi.d t4, t4, 8 1494 sub.d t3, t8, t7 1495 1496 cdef_fill t4, s5, t2, t3 1497 1498 addi.d t6, t6, -2 1499 150040: 1501 cdef_padding_data 1502 1503 beqz a5, 33f 1504 150528: //if (pri_strength) 1506 li.w t0, 4 1507 andi t1, a5, 1 1508 sub.d t0, t0, t1 //pri_tap 1509 1510 clz.w t1, a5 1511 li.d t2, 31 1512 sub.w t1, t2, t1 1513 sub.w t1, s7, t1 1514 1515 blt t1, zero, 281f 1516 or t1, t1, t1 1517 b 282f 1518281: 1519 or t1, zero, zero //t1: pri_shift 1520282: 1521 1522 beqz a6, 31f 1523 152429: //if (sec_strength) 1525 cdef_pri_sec_init 1526 152730: 1528 fld.s f0, a0, 0 //px 1529 vsllwil.hu.bu vr0, vr0, 0 1530 vpermi.w vr0, vr0, 0x44 1531 1532 vxor.v vr1, vr1, vr1 //sum 1533 vor.v vr2, vr0, vr0 //max 1534 vor.v vr3, vr0, vr0 //min 1535 vor.v vr15, 
vr4, vr4 //pri_tap_k 1536 1537 sub.d t4, s4, a2 1538 sub.d t5, s4, a3 1539 1540 fldx.d f5, s4, a2 //p0_00 1541 fld.d f6, t4, 0 //p0_01 1542 fldx.d f7, s4, a3 //p0_10 1543 fld.d f8, t5, 0 //p0_11 1544 1545 cdef_process_data_w4 vr9, vr10 1546 cdef_calc_sum_tapchange_w4 1547 cdef_calc_maxmin_w4 1548 1549 sub.d t4, s4, s1 //tmp[-off02] 1550 sub.d t5, s4, t0 //tmp[-off03] 1551 1552 fldx.d f5, s4, s1 //s0_00 1553 fld.d f6, t4, 0 //s0_01 1554 fldx.d f7, s4, t0 //s0_02 1555 fld.d f8, t5, 0 //s0_03 1556 1557 cdef_process_data_w4 vr18, vr19 1558 cdef_calc_sum_no_tapchange_w4 vr22 1559 cdef_calc_maxmin_w4 1560 1561 sub.d t4, s4, s2 //tmp[-off12] 1562 sub.d t5, s4, s3 //tmp[-off13] 1563 1564 fldx.d f5, s4, s2 //s0_10 1565 fld.d f6, t4, 0 //s0_11 1566 fldx.d f7, s4, s3 //s0_12 1567 fld.d f8, t5, 0 //s0_13 1568 1569 cdef_process_data_w4 vr18, vr19 1570 cdef_calc_sum_no_tapchange_w4 vr20 1571 cdef_calc_maxmin_w4 1572 1573 vshuf4i.w vr5, vr1, 0x0e 1574 vshuf4i.w vr6, vr3, 0x0e 1575 vshuf4i.w vr7, vr2, 0x0e 1576 vadd.h vr1, vr1, vr5 1577 vmin.hu vr3, vr6, vr3 1578 vmax.h vr2, vr7, vr2 1579 1580 cdef_calc_dst 1581 iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5 1582 1583 vsrlni.b.h vr5, vr5, 0 1584 fst.s f5, a0, 0 1585 1586 add.d a0, a0, a1 1587 add.d s4, s4, s5 1588 add.d s4, s4, s5 1589 1590 addi.d t2, t2, -1 1591 blt zero, t2, 30b 1592 b 35f 1593 159431: // pri_strength only 1595 cdef_pri_init 1596 159732: 1598 fld.s f0, a0, 0 //px 1599 vsllwil.hu.bu vr0, vr0, 0 1600 vpermi.w vr0, vr0, 0x44 1601 1602 vxor.v vr1, vr1, vr1 //sum 1603 vor.v vr15, vr4, vr4 //pri_tap_k 1604 1605 sub.d t4, s4, a2 1606 sub.d t5, s4, a3 1607 1608 fldx.d f5, s4, a2 //p0_00 1609 fld.d f6, t4, 0 //p0_01 1610 fldx.d f7, s4, a3 //p0_10 1611 fld.d f8, t5, 0 //p0_11 1612 1613 cdef_process_data_w4 vr9, vr10 1614 cdef_calc_sum_tapchange_w4 1615 1616 vshuf4i.w vr5, vr1, 0x0e 1617 vadd.h vr1, vr1, vr5 1618 1619 cdef_calc_dst 1620 1621 vsrlni.b.h vr5, vr5, 0 1622 fst.s f5, a0, 0 1623 1624 add.d a0, a0, a1 1625 add.d s4, 
s4, s5 1626 add.d s4, s4, s5 1627 1628 addi.d t2, t2, -1 1629 blt zero, t2, 32b 1630 b 35f 1631 163233: // sec_strength only 1633 cdef_sec_init 1634 163534: 1636 fld.s f0, a0, 0 //px 1637 vsllwil.hu.bu vr0, vr0, 0 1638 vpermi.w vr0, vr0, 0x44 1639 1640 vxor.v vr1, vr1, vr1 //sum 1641 1642 sub.d t4, s4, s1 //tmp[-off02] 1643 sub.d t5, s4, t0 //tmp[-off03] 1644 1645 fldx.d f5, s4, s1 //s0_00 1646 fld.d f6, t4, 0 //s0_01 1647 fldx.d f7, s4, t0 //s0_02 1648 fld.d f8, t5, 0 //s0_03 1649 1650 cdef_process_data_w4 vr18, vr19 1651 cdef_calc_sum_no_tapchange_w4 vr22 1652 1653 sub.d t4, s4, s2 //tmp[-off12] 1654 sub.d t5, s4, s3 //tmp[-off13] 1655 1656 fldx.d f5, s4, s2 //s0_10 1657 fld.d f6, t4, 0 //s0_11 1658 fldx.d f7, s4, s3 //s0_12 1659 fld.d f8, t5, 0 //s0_13 1660 1661 cdef_process_data_w4 vr18, vr19 1662 cdef_calc_sum_no_tapchange_w4 vr20 1663 1664 vshuf4i.w vr5, vr1, 0x0e 1665 vadd.h vr1, vr1, vr5 1666 1667 cdef_calc_dst 1668 1669 vsrlni.b.h vr5, vr5, 0 1670 fst.s f5, a0, 0 1671 1672 add.d a0, a0, a1 1673 add.d s4, s4, s5 1674 add.d s4, s4, s5 1675 1676 addi.d t2, t2, -1 1677 blt zero, t2, 34b 1678 167935: 1680 ld.d s0, sp, 0 1681 ld.d s1, sp, 8 1682 ld.d s2, sp, 16 1683 ld.d s3, sp, 24 1684 ld.d s4, sp, 32 1685 ld.d s5, sp, 40 1686 ld.d s6, sp, 48 1687 ld.d s7, sp, 56 1688 addi.d sp, sp, (64+288) 1689endfunc 1690 1691function cdef_filter_block_4x8_8bpc_lsx 1692 ld.w t0, sp, 0 1693 ld.w t1, sp, 8 1694 addi.d sp, sp, -(64+288) 1695 st.d s0, sp, 0 1696 st.d s1, sp, 8 1697 st.d s2, sp, 16 1698 st.d s3, sp, 24 1699 st.d s4, sp, 32 1700 st.d s5, sp, 40 1701 st.d s6, sp, 48 1702 st.d s7, sp, 56 1703 1704 li.w s0, 4 //w 1705 li.w s1, 8 //h 1706 or s2, t1, t1 //edges 1707 or s7, t0, t0 //damping 1708 1709 li.d s5, 12 //tmp_stride 1710 addi.d s4, sp, 64 1711 slli.d t0, s5, 1 1712 addi.d t0, t0, 2 1713 slli.d t0, t0, 1 1714 add.d s4, s4, t0 //ptr tmp 1715 vxor.v vr23, vr23, vr23 1716 li.w t2, 1 1717 vreplgr2vr.h vr20, t2 1718 vaddi.hu vr21, vr20, 2 1719 vaddi.hu vr22, vr20, 1 
1720 1721 li.w t0, -16384 1722 vreplgr2vr.h vr18, t0 1723 1724 //padding 1725 li.w t5, -2 //x_start 1726 addi.d t6, s0, 2 //x_end 1727 li.w t7, -2 //y_start 1728 addi.d t8, s1, 2 //y_end 1729 li.w t2, 2 1730 1731 andi t4, s2, 4 1732 bnez t4, 1f 1733 1734 //CDEF_HAVE_TOP 1735 slli.d t3, s5, 2 1736 addi.d t4, s4, -4 1737 sub.d t4, t4, t3 1738 addi.d t3, s0, 4 1739 1740 cdef_fill t4, s5, t3, t2 1741 1742 or t7, zero, zero 1743 17441: //CDEF_HAVE_BOTTOM 1745 andi t4, s2,8 1746 bnez t4, 2f 1747 1748 mul.w t3, s1, s5 1749 slli.d t3, t3, 1 1750 add.d t4, s4, t3 1751 addi.d t4, t4, -4 1752 li.d t3, 8 1753 1754 cdef_fill t4, s5, t3, t2 1755 1756 addi.d t8, t8, -2 1757 17582: //CDEF_HAVE_LEFT 1759 andi t4, s2,1 1760 bnez t4, 3f 1761 1762 mul.w t3, t7, s5 1763 slli.d t3, t3, 1 1764 add.d t4, s4, t3 1765 addi.d t4, t4, -4 1766 sub.d t3, t8, t7 1767 1768 cdef_fill t4, s5, t2, t3 1769 1770 or t5, zero, zero 1771 17723: //CDEF_HAVE_RIGHT 1773 andi t4, s2,2 1774 bnez t4, 40f 1775 1776 mul.w t3, t7, s5 1777 slli.d t3, t3, 1 1778 add.d t4, s4, t3 1779 addi.d t4, t4, 8 1780 sub.d t3, t8, t7 1781 1782 cdef_fill t4, s5, t2, t3 1783 1784 addi.d t6, t6, -2 1785 178640: 1787 cdef_padding_data 1788 1789 beqz a5, 33f 1790 179128: //if (pri_strength) 1792 li.w t0, 4 1793 andi t1, a5, 1 1794 sub.d t0, t0, t1 //pri_tap 1795 1796 clz.w t1, a5 1797 li.d t2, 31 1798 sub.w t1, t2, t1 1799 sub.w t1, s7, t1 1800 1801 blt t1, zero, 281f 1802 or t1, t1, t1 1803 b 282f 1804281: 1805 or t1, zero, zero //t1: pri_shift 1806282: 1807 1808 beqz a6, 31f 1809 181029: //if (sec_strength) 1811 cdef_pri_sec_init 1812 181330: 1814 fld.s f0, a0, 0 //px 1815 vsllwil.hu.bu vr0, vr0, 0 1816 vpermi.w vr0, vr0, 0x44 1817 1818 vxor.v vr1, vr1, vr1 //sum 1819 vor.v vr2, vr0, vr0 //max 1820 vor.v vr3, vr0, vr0 //min 1821 vor.v vr15, vr4, vr4 //pri_tap_k 1822 1823 sub.d t4, s4, a2 1824 sub.d t5, s4, a3 1825 1826 fldx.d f5, s4, a2 //p0_00 1827 fld.d f6, t4, 0 //p0_01 1828 fldx.d f7, s4, a3 //p0_10 1829 fld.d f8, t5, 0 
//p0_11 1830 1831 cdef_process_data_w4 vr9, vr10 1832 cdef_calc_sum_tapchange_w4 1833 cdef_calc_maxmin_w4 1834 1835 sub.d t4, s4, s1 //tmp[-off02] 1836 sub.d t5, s4, t0 //tmp[-off03] 1837 1838 fldx.d f5, s4, s1 //s0_00 1839 fld.d f6, t4, 0 //s0_01 1840 fldx.d f7, s4, t0 //s0_02 1841 fld.d f8, t5, 0 //s0_03 1842 1843 cdef_process_data_w4 vr18, vr19 1844 cdef_calc_sum_no_tapchange_w4 vr22 1845 cdef_calc_maxmin_w4 1846 1847 sub.d t4, s4, s2 //tmp[-off12] 1848 sub.d t5, s4, s3 //tmp[-off13] 1849 1850 fldx.d f5, s4, s2 //s0_10 1851 fld.d f6, t4, 0 //s0_11 1852 fldx.d f7, s4, s3 //s0_12 1853 fld.d f8, t5, 0 //s0_13 1854 1855 cdef_process_data_w4 vr18, vr19 1856 cdef_calc_sum_no_tapchange_w4 vr20 1857 cdef_calc_maxmin_w4 1858 1859 vshuf4i.w vr5, vr1, 0x0e 1860 vshuf4i.w vr6, vr3, 0x0e 1861 vshuf4i.w vr7, vr2, 0x0e 1862 vadd.h vr1, vr1, vr5 1863 vmin.hu vr3, vr6, vr3 1864 vmax.h vr2, vr7, vr2 1865 1866 cdef_calc_dst 1867 iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5 1868 1869 vsrlni.b.h vr5, vr5, 0 1870 fst.s f5, a0, 0 1871 1872 add.d a0, a0, a1 1873 add.d s4, s4, s5 1874 add.d s4, s4, s5 1875 1876 addi.d t2, t2, -1 1877 blt zero, t2, 30b 1878 b 35f 1879 188031: // pri_strength only 1881 cdef_pri_init 1882 188332: 1884 fld.s f0, a0, 0 //px 1885 vsllwil.hu.bu vr0, vr0, 0 1886 vpermi.w vr0, vr0, 0x44 1887 1888 vxor.v vr1, vr1, vr1 //sum 1889 vor.v vr15, vr4, vr4 //pri_tap_k 1890 1891 sub.d t4, s4, a2 1892 sub.d t5, s4, a3 1893 1894 fldx.d f5, s4, a2 //p0_00 1895 fld.d f6, t4, 0 //p0_01 1896 fldx.d f7, s4, a3 //p0_10 1897 fld.d f8, t5, 0 //p0_11 1898 1899 cdef_process_data_w4 vr9, vr10 1900 cdef_calc_sum_tapchange_w4 1901 1902 vshuf4i.w vr5, vr1, 0x0e 1903 vadd.h vr1, vr1, vr5 1904 1905 cdef_calc_dst 1906 1907 vsrlni.b.h vr5, vr5, 0 1908 fst.s f5, a0, 0 1909 1910 add.d a0, a0, a1 1911 add.d s4, s4, s5 1912 add.d s4, s4, s5 1913 1914 addi.d t2, t2, -1 1915 blt zero, t2, 32b 1916 b 35f 1917 191833: // sec_strength only 1919 cdef_sec_init 1920 192134: 1922 fld.s f0, a0, 0 //px 1923 
vsllwil.hu.bu vr0, vr0, 0 1924 vpermi.w vr0, vr0, 0x44 1925 1926 vxor.v vr1, vr1, vr1 //sum 1927 1928 sub.d t4, s4, s1 //tmp[-off02] 1929 sub.d t5, s4, t0 //tmp[-off03] 1930 1931 fldx.d f5, s4, s1 //s0_00 1932 fld.d f6, t4, 0 //s0_01 1933 fldx.d f7, s4, t0 //s0_02 1934 fld.d f8, t5, 0 //s0_03 1935 1936 cdef_process_data_w4 vr18, vr19 1937 cdef_calc_sum_no_tapchange_w4 vr22 1938 1939 sub.d t4, s4, s2 //tmp[-off12] 1940 sub.d t5, s4, s3 //tmp[-off13] 1941 1942 fldx.d f5, s4, s2 //s0_10 1943 fld.d f6, t4, 0 //s0_11 1944 fldx.d f7, s4, s3 //s0_12 1945 fld.d f8, t5, 0 //s0_13 1946 1947 cdef_process_data_w4 vr18, vr19 1948 cdef_calc_sum_no_tapchange_w4 vr20 1949 1950 vshuf4i.w vr5, vr1, 0x0e 1951 vadd.h vr1, vr1, vr5 1952 1953 cdef_calc_dst 1954 1955 vsrlni.b.h vr5, vr5, 0 1956 fst.s f5, a0, 0 1957 1958 add.d a0, a0, a1 1959 add.d s4, s4, s5 1960 add.d s4, s4, s5 1961 1962 addi.d t2, t2, -1 1963 blt zero, t2, 34b 1964 196535: 1966 ld.d s0, sp, 0 1967 ld.d s1, sp, 8 1968 ld.d s2, sp, 16 1969 ld.d s3, sp, 24 1970 ld.d s4, sp, 32 1971 ld.d s5, sp, 40 1972 ld.d s6, sp, 48 1973 ld.d s7, sp, 56 1974 addi.d sp, sp, (64+288) 1975endfunc 1976 1977function cdef_filter_block_8x8_8bpc_lsx 1978 ld.w t0, sp, 0 1979 ld.w t1, sp, 8 1980 addi.d sp, sp, -(64+288) 1981 st.d s0, sp, 0 1982 st.d s1, sp, 8 1983 st.d s2, sp, 16 1984 st.d s3, sp, 24 1985 st.d s4, sp, 32 1986 st.d s5, sp, 40 1987 st.d s6, sp, 48 1988 st.d s7, sp, 56 1989 1990 li.w s0, 8 //w 1991 li.w s1, 8 //h 1992 or s2, t1, t1 //edges 1993 or s7, t0, t0 //damping 1994 1995 // cdef_filter_block_kernel 1996 li.d s5, 12 //tmp_stride 1997 addi.d s4, sp, 64 1998 slli.d t0, s5, 1 1999 addi.d t0, t0, 2 2000 slli.d t0, t0, 1 2001 add.d s4, s4, t0 //ptr tmp 2002 vxor.v vr23, vr23, vr23 2003 li.w t2, 1 2004 vreplgr2vr.h vr20, t2 2005 vaddi.hu vr21, vr20, 2 2006 vaddi.hu vr22, vr20, 1 2007 2008 li.w t0, -16384 2009 vreplgr2vr.h vr18, t0 2010 2011 //padding 2012 li.w t5, -2 //x_start 2013 addi.d t6, s0, 2 //x_end 2014 li.w t7, -2 
//y_start 2015 addi.d t8, s1, 2 //y_end 2016 li.w t2, 2 2017 2018 andi t4, s2, 4 2019 bnez t4, 1f 2020 2021 //CDEF_HAVE_TOP 2022 slli.d t3, s5, 2 2023 addi.d t4, s4, -4 2024 sub.d t4, t4, t3 2025 addi.d t3, s0, 4 2026 2027 cdef_fill t4, s5, t3, t2 2028 2029 or t7, zero, zero 2030 20311: //CDEF_HAVE_BOTTOM 2032 andi t4, s2,8 2033 bnez t4, 2f 2034 2035 mul.w t3, s1, s5 2036 slli.d t3, t3, 1 2037 add.d t4, s4, t3 2038 addi.d t4, t4, -4 2039 li.d t3, 12 2040 2041 cdef_fill t4, s5, t3, t2 2042 2043 addi.d t8, t8, -2 2044 20452: //CDEF_HAVE_LEFT 2046 andi t4, s2,1 2047 bnez t4, 3f 2048 2049 mul.w t3, t7, s5 2050 slli.d t3, t3, 1 2051 add.d t4, s4, t3 2052 addi.d t4, t4, -4 2053 sub.d t3, t8, t7 2054 li.d t2, 2 2055 2056 cdef_fill t4, s5, t2, t3 2057 2058 or t5, zero, zero 2059 20603: //CDEF_HAVE_RIGHT 2061 andi t4, s2,2 2062 bnez t4, 40f 2063 2064 mul.w t3, t7, s5 2065 slli.d t3, t3, 1 2066 add.d t4, s4, t3 2067 addi.d t4, t4, 16 2068 sub.d t3, t8, t7 2069 li.d t2, 2 2070 2071 cdef_fill t4, s5, t2, t3 2072 2073 addi.d t6, t6, -2 2074 207540: 2076 cdef_padding_data 2077 2078 beqz a5, 33f 2079 208028: //if (pri_strength) 2081 li.w t0, 4 2082 andi t1, a5, 1 2083 sub.d t0, t0, t1 //pri_tap 2084 2085 //edit 2086 clz.w t1, a5 2087 li.d t2, 31 2088 sub.w t3, t2, t1 2089 sub.w t3, s7, t3 2090 2091 or t1, zero, zero //t1: pri_shift 2092 blt t3, zero, 281f 2093 or t1, t3, t3 2094281: 2095 2096 beqz a6, 31f 2097 209829: //if (sec_strength) 2099 cdef_pri_sec_init 2100 2101301: 2102 fld.d f0, a0, 0 //px 2103 vsllwil.hu.bu vr0, vr0, 0 2104 2105 vxor.v vr1, vr1, vr1 //sum 2106 vor.v vr2, vr0, vr0 //max 2107 vor.v vr3, vr0, vr0 //min 2108 vor.v vr15, vr4, vr4 //pri_tap_k 2109 2110 sub.d t4, s4, a2 2111 sub.d t5, s4, a3 2112 2113 vldx vr5, s4, a2 2114 vld vr6, t4, 0 2115 vldx vr7, s4, a3 2116 vld vr8, t5, 0 2117 2118 cdef_process_data_w8 vr9, vr10 2119 cdef_calc_sum_tapchange_w8 2120 cdef_calc_maxmin_w8 2121 2122 //s 00-03 2123 sub.d t4, s4, s1 //tmp[-off02] 2124 sub.d t5, s4, t0 
//tmp[-off03] 2125 2126 vldx vr5, s4, s1 2127 vld vr6, t4, 0 2128 vldx vr7, s4, t0 2129 vld vr8, t5, 0 2130 2131 cdef_process_data_w8 vr18, vr19 2132 cdef_calc_sum_no_tapchange_w8 vr22 2133 cdef_calc_maxmin_w8 2134 2135 //s 10-13 2136 sub.d t4, s4, s2 //tmp[-off12] 2137 sub.d t5, s4, s3 //tmp[-off13] 2138 2139 vldx vr5, s4, s2 2140 vld vr6, t4, 0 2141 vldx vr7, s4, s3 2142 vld vr8, t5, 0 2143 2144 cdef_process_data_w8 vr18, vr19 2145 cdef_calc_sum_no_tapchange_w8 vr20 2146 2147 cdef_calc_maxmin_w8 2148 cdef_calc_dst 2149 2150 iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5 2151 2152 vsrlni.b.h vr5, vr5, 0 2153 fst.d f5, a0, 0 2154 2155 add.d a0, a0, a1 2156 add.d s4, s4, s5 2157 add.d s4, s4, s5 2158 2159 addi.d t2, t2, -1 2160 blt zero, t2, 301b 2161 b 35f 2162 216331: // pri_strength only 2164 cdef_pri_init 2165 216632: 2167 fld.d f0, a0, 0 //px 2168 vsllwil.hu.bu vr0, vr0, 0 2169 2170 vxor.v vr1, vr1, vr1 //sum 2171 vor.v vr15, vr4, vr4 //pri_tap_k 2172 2173 sub.d t4, s4, a2 2174 sub.d t5, s4, a3 2175 2176 vldx vr5, s4, a2 2177 vld vr6, t4, 0 2178 vldx vr7, s4, a3 2179 vld vr8, t5, 0 2180 2181 cdef_process_data_w8 vr9, vr10 2182 cdef_calc_sum_tapchange_w8 2183 cdef_calc_dst 2184 2185 vsrlni.b.h vr5, vr5, 0 2186 fst.d f5, a0, 0 2187 2188 add.d a0, a0, a1 2189 add.d s4, s4, s5 2190 add.d s4, s4, s5 2191 2192 addi.d t2, t2, -1 2193 blt zero, t2, 32b 2194 b 35f 2195 219633: // sec_strength only 2197 cdef_sec_init 2198 219934: 2200 fld.d f0, a0, 0 //px 2201 vsllwil.hu.bu vr0, vr0, 0 2202 2203 vxor.v vr1, vr1, vr1 //sum 2204 2205 sub.d t4, s4, s1 //tmp[-off02] 2206 sub.d t5, s4, t0 //tmp[-off03] 2207 2208 vldx vr5, s4, s1 2209 vld vr6, t4, 0 2210 vldx vr7, s4, t0 2211 vld vr8, t5, 0 2212 2213 cdef_process_data_w8 vr18, vr19 2214 cdef_calc_sum_no_tapchange_w8 vr22 2215 2216 sub.d t4, s4, s2 //tmp[-off12] 2217 sub.d t5, s4, s3 //tmp[-off13] 2218 2219 vldx vr5, s4, s2 2220 vld vr6, t4, 0 2221 vldx vr7, s4, s3 2222 vld vr8, t5, 0 2223 2224 cdef_process_data_w8 vr18, vr19 2225 
cdef_calc_sum_no_tapchange_w8 vr20 2226 cdef_calc_dst 2227 2228 vsrlni.b.h vr5, vr5, 0 2229 fst.d f5, a0, 0 2230 2231 add.d a0, a0, a1 2232 add.d s4, s4, s5 2233 add.d s4, s4, s5 2234 2235 addi.d t2, t2, -1 2236 blt zero, t2, 34b 2237 223835: 2239 ld.d s0, sp, 0 2240 ld.d s1, sp, 8 2241 ld.d s2, sp, 16 2242 ld.d s3, sp, 24 2243 ld.d s4, sp, 32 2244 ld.d s5, sp, 40 2245 ld.d s6, sp, 48 2246 ld.d s7, sp, 56 2247 addi.d sp, sp, (64+288) 2248endfunc 2249 2250