/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

// depending on how many pixels need to be stored, returns:
// t4 = (1 << 0) : 0 pixels
// t4 = (1 << 4) : inner 4 pixels
// t4 = (1 << 6) : inner 6 pixels
// t4 = 0 : all pixels
.macro FILTER wd
functionl lpf_16_wd\wd\()_lsx
    vabsd.bu vr0, vr22, vr23  // abs(p1 - p0)
    vabsd.bu vr1, vr25, vr24  // abs(q1 - q0)
    vabsd.bu vr2, vr23, vr24  // abs(p0 - q0)
    vabsd.bu vr3, vr22, vr25  // abs(p1 - q1)
.if \wd >= 6
    vabsd.bu vr4, vr21, vr22  // abs(p2 - p1)
    vabsd.bu vr5, vr26, vr25  // abs(q2 - q1)
.endif
.if \wd >= 8
    vabsd.bu vr6, vr20, vr21  // abs(p3 - p2)
    vabsd.bu vr7, vr27, vr26  // abs(q3 - q2)
.endif
.if \wd >= 6
    vmax.bu vr4, vr4, vr5
.endif
    vsadd.bu vr2, vr2, vr2  // abs(p0 - q0) * 2
.if \wd >= 8
    vmax.bu vr6, vr6, vr7
.endif
    vsrli.b vr3, vr3, 1  // abs(p1 - q1) >> 1
.if \wd >= 8
    vmax.bu vr4, vr4, vr6
.endif
.if \wd >= 6
    vand.v vr4, vr4, vr14
.endif
    vmax.bu vr0, vr0, vr1  // max(abs(p1 - p0), abs(q1 - q0))
    vsadd.bu vr2, vr2, vr3  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
    vmax.bu vr4, vr0, vr4
    vsle.bu vr1, vr4, vr11  // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
    vsle.bu vr1, vr0, vr11  // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
    vsle.bu vr2, vr2, vr10  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
    vand.v vr1, vr1, vr2  // fm
    vand.v vr1, vr1, vr13  // fm && wd >= 4
.if \wd >= 6
    vand.v vr14, vr14, vr1  // fm && wd > 4
.endif
.if \wd >= 16
    vand.v vr15, vr15, vr1  // fm && wd == 16
.endif
    vhaddw.qu.du vr8, vr1, vr1
    vpickve2gr.du t6, vr8, 0
    bnez t6, 9f  // if (!fm || wd < 4) return;
    li.w t4, 1 << 0
    jirl zero, ra, 0x00

9:
.if \wd >= 6
    vabsd.bu vr2, vr21, vr23  // abs(p2 - p0)
    vabsd.bu vr3, vr22, vr23  // abs(p1 - p0)
    vabsd.bu vr4, vr25, vr24  // abs(q1 - q0)
    vabsd.bu vr5, vr26, vr24  // abs(q2 - q0)
.if \wd >= 8
    vabsd.bu vr6, vr20, vr23  // abs(p3 - p0)
    vabsd.bu vr7, vr27, vr24  // abs(q3 - q0)
.endif
    vmax.bu vr2, vr2, vr3
    vmax.bu vr4, vr4, vr5
.if \wd >= 8
    vmax.bu vr6, vr6, vr7
.endif
    vmax.bu vr2, vr2, vr4
.if \wd >= 8
    vmax.bu vr2, vr2, vr6
.endif

.if \wd == 16
    vabsd.bu vr3, vr17, vr23  // abs(p6 - p0)
    vabsd.bu vr4, vr18, vr23  // abs(p5 - p0)
    vabsd.bu vr5, vr19, vr23  // abs(p4 - p0)
.endif
    vslei.bu vr2, vr2, 1  // flat8in
.if \wd == 16
    vabsd.bu vr6, vr28, vr24  // abs(q4 - q0)
    vabsd.bu vr7, vr29, vr24  // abs(q5 - q0)
    vabsd.bu vr8, vr30, vr24  // abs(q6 - q0)
.endif
    vand.v vr14, vr2, vr14  // flat8in && fm && wd > 4
    vandn.v vr1, vr14, vr1  // fm && wd >= 4 && !flat8in
.if \wd == 16
    vmax.bu vr3, vr3, vr4
    vmax.bu vr5, vr5, vr6
.endif
    vhaddw.qu.du vr9, vr1, vr1
.if \wd == 16
    vmax.bu vr7, vr7, vr8
    vmax.bu vr3, vr3, vr5
    vmax.bu vr3, vr3, vr7
    vslei.bu vr3, vr3, 1  // flat8out
.endif
    vpickve2gr.du t6, vr9, 0
.if \wd == 16
    vand.v vr15, vr15, vr3  // flat8out && fm && wd == 16
    vand.v vr15, vr15, vr14  // flat8out && flat8in && fm && wd == 16
    vandn.v vr14, vr15, vr14  // flat8in && fm && wd >= 4 && !flat8out
.endif
    beqz t6, 1f  // skip wd == 4 case
.endif
    vxori.b vr2, vr22, 128  // p1 - 128
    vxori.b vr3, vr25, 128  // q1 - 128
    vslt.bu vr0, vr12, vr0  // hev
    vssub.b vr2, vr2, vr3  // iclip_diff(p1 - q1)
    vand.v vr4, vr2, vr0  // if (hev) iclip_diff(p1 - q1)
    vandn.v vr0, vr0, vr1  // (fm && wd >= 4 && !hev)
    vxor.v vr5, vr5, vr5
    vaddi.hu vr5, vr5, 3
    vsubwev.h.bu vr2, vr24, vr23
    vsubwod.h.bu vr3, vr24, vr23
    vmul.h vr2, vr2, vr5
    vmul.h vr3, vr3, vr5
    vxor.v vr6, vr6, vr6
    vaddwev.h.b vr7, vr4, vr6
    vaddwod.h.b vr6, vr4, vr6
    vadd.h vr2, vr2, vr7
    vadd.h vr3, vr3, vr6
    vssrani.b.h vr2, vr2, 0
    vssrani.b.h vr3, vr3, 0
    vilvl.b vr2, vr3, vr2  // f
    vxor.v vr6, vr6, vr6
    vaddi.bu vr5, vr6, 3
    vaddi.bu vr6, vr6, 4  // 4
    vsadd.b vr4, vr6, vr2  // imin(f + 4, 127)
    vsadd.b vr5, vr5, vr2  // imin(f + 3, 127)
    vsrai.b vr4, vr4, 3  // f1
    vsrai.b vr5, vr5, 3  // f2
    vaddi.bu vr2, vr23, 0  // p0
    vaddi.bu vr3, vr24, 0  // q0
    vxori.b vr2, vr2, 128
    vxori.b vr3, vr3, 128
    vsadd.b vr2, vr2, vr5  // p0 + f2, out p0
    vssub.b vr3, vr3, vr4  // q0 - f1, out q0
    vxori.b vr2, vr2, 128
    vxori.b vr3, vr3, 128
    vsrari.b vr4, vr4, 1  // (f1 + 1) >> 1
    vbitsel.v vr23, vr23, vr2, vr1  // if (fm && wd >= 4)
    vbitsel.v vr24, vr24, vr3, vr1  // if (fm && wd >= 4)
    vaddi.bu vr2, vr22, 0  // p1
    vaddi.bu vr3, vr25, 0  // q1
    vxori.b vr2, vr2, 128
    vxori.b vr3, vr3, 128
    vsadd.b vr2, vr2, vr4  // out p1
    vssub.b vr3, vr3, vr4  // out q1
    vxori.b vr2, vr2, 128
    vxori.b vr3, vr3, 128
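    // vr0 = fm && wd >= 4 && !hev (computed above), so the p1/q1 updates
    // below are only applied where the high edge variance flag is not set.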
    vbitsel.v vr22, vr22, vr2, vr0  // if (fm && wd >= 4 && !hev)
    vbitsel.v vr25, vr25, vr3, vr0  // if (fm && wd >= 4 && !hev)
1:

.if \wd == 6
    vhaddw.qu.du vr0, vr14, vr14
    vpickve2gr.du t6, vr0, 0
    beqz t6, 2f  // skip if there's no flat8in

    vaddwev.h.bu vr0, vr21, vr21
    vaddwod.h.bu vr1, vr21, vr21  // p2 * 2
    vaddwev.h.bu vr2, vr21, vr22
    vaddwod.h.bu vr3, vr21, vr22  // p2 + p1
    vaddwev.h.bu vr4, vr22, vr23
    vaddwod.h.bu vr5, vr22, vr23  // p1 + p0
    vaddwev.h.bu vr6, vr23, vr24
    vaddwod.h.bu vr7, vr23, vr24  // p0 + q0
    vadd.h vr8, vr0, vr2
    vadd.h vr9, vr1, vr3
    vadd.h vr10, vr4, vr6
    vadd.h vr11, vr5, vr7
    vaddwev.h.bu vr12, vr24, vr25
    vaddwod.h.bu vr13, vr24, vr25  // q0 + q1
    vadd.h vr8, vr8, vr10
    vadd.h vr9, vr9, vr11
    vsub.h vr12, vr12, vr0
    vsub.h vr13, vr13, vr1
    vaddwev.h.bu vr10, vr25, vr26
    vaddwod.h.bu vr11, vr25, vr26  // q1 + q2
    vssrlrni.bu.h vr0, vr8, 3
    vssrlrni.bu.h vr1, vr9, 3
    vilvl.b vr0, vr1, vr0  // out p1

    vadd.h vr8, vr8, vr12
    vadd.h vr9, vr9, vr13
    vsub.h vr10, vr10, vr2
    vsub.h vr11, vr11, vr3
    vaddwev.h.bu vr12, vr26, vr26  // q2 + q2
    vaddwod.h.bu vr13, vr26, vr26
    vssrlrni.bu.h vr1, vr8, 3
    vssrlrni.bu.h vr2, vr9, 3
    vilvl.b vr1, vr2, vr1  // out p0

    vadd.h vr8, vr8, vr10
    vadd.h vr9, vr9, vr11
    vsub.h vr12, vr12, vr4
    vsub.h vr13, vr13, vr5
    vssrlrni.bu.h vr2, vr8, 3
    vssrlrni.bu.h vr3, vr9, 3
    vilvl.b vr2, vr3, vr2  // out q0

    vbitsel.v vr22, vr22, vr0, vr14
    vadd.h vr8, vr8, vr12
    vadd.h vr9, vr9, vr13
    vbitsel.v vr23, vr23, vr1, vr14
    vssrlrni.bu.h vr3, vr8, 3
    vssrlrni.bu.h vr4, vr9, 3
    vilvl.b vr3, vr4, vr3
    vbitsel.v vr24, vr24, vr2, vr14
    vbitsel.v vr25, vr25, vr3, vr14
.elseif \wd >= 8
    vhaddw.qu.du vr0, vr14, vr14
    vpickve2gr.du t6, vr0, 0
.if \wd == 8
    beqz t6, 8f  // skip if there's no flat8in
.else
    beqz t6, 2f  // skip if there's no flat8in
.endif

    vaddwev.h.bu vr0, vr20, vr21
    vaddwod.h.bu vr1, vr20, vr21  // p3 + p2
    vaddwev.h.bu vr2, vr22, vr25
    vaddwod.h.bu vr3, vr22, vr25  // p1 + q1
    vaddwev.h.bu vr4, vr20, vr22
    vaddwod.h.bu vr5, vr20, vr22  // p3 + p1
    vaddwev.h.bu vr6, vr23, vr26
    vaddwod.h.bu vr7, vr23, vr26  // p0 + q2
    vadd.h vr8, vr0, vr0
    vadd.h vr9, vr1, vr1  // 2 * (p3 + p2)
    vxor.v vr10, vr10, vr10
    vaddwev.h.bu vr11, vr23, vr10
    vaddwod.h.bu vr12, vr23, vr10
    vaddwev.h.bu vr13, vr24, vr10
    vaddwod.h.bu vr10, vr24, vr10
    vadd.h vr8, vr8, vr11  // + p0
    vadd.h vr9, vr9, vr12
    vadd.h vr8, vr8, vr13  // + q0
    vadd.h vr9, vr9, vr10
    vadd.h vr8, vr8, vr4
    vadd.h vr9, vr9, vr5  // + p3 + p1
    vsub.h vr2, vr2, vr0
    vsub.h vr3, vr3, vr1  // p1 + q1 - p3 - p2
    vsub.h vr6, vr6, vr4
    vsub.h vr7, vr7, vr5  // p0 + q2 - p3 - p1
    vssrlrni.bu.h vr10, vr8, 3
    vssrlrni.bu.h vr11, vr9, 3
    vilvl.b vr10, vr11, vr10  // out p2

    vadd.h vr8, vr8, vr2
    vadd.h vr9, vr9, vr3
    vaddwev.h.bu vr0, vr20, vr23
    vaddwod.h.bu vr1, vr20, vr23  // p3 + p0
    vaddwev.h.bu vr2, vr24, vr27
    vaddwod.h.bu vr3, vr24, vr27  // q0 + q3
    vssrlrni.bu.h vr11, vr8, 3
    vssrlrni.bu.h vr12, vr9, 3
    vilvl.b vr11, vr12, vr11  // out p1

    vadd.h vr8, vr8, vr6
    vadd.h vr9, vr9, vr7
    vsub.h vr2, vr2, vr0  // q0 + q3 - p3 - p0
    vsub.h vr3, vr3, vr1
    vaddwev.h.bu vr4, vr21, vr24  // p2 + q0
    vaddwod.h.bu vr5, vr21, vr24
    vaddwev.h.bu vr6, vr25, vr27  // q1 + q3
    vaddwod.h.bu vr7, vr25, vr27
    vssrlrni.bu.h vr12, vr8, 3
    vssrlrni.bu.h vr13, vr9, 3
    vilvl.b vr12, vr13, vr12  // out p0

    vadd.h vr8, vr8, vr2
    vadd.h vr9, vr9, vr3
    vsub.h vr6, vr6, vr4  // q1 + q3 - p2 - q0
    vsub.h vr7, vr7, vr5
    vaddwev.h.bu vr0, vr22, vr25  // p1 + q1
    vaddwod.h.bu vr1, vr22, vr25
    vaddwev.h.bu vr2, vr26, vr27
    vaddwod.h.bu vr3, vr26, vr27  // q2 + q3
    vssrlrni.bu.h vr13, vr8, 3
    vssrlrni.bu.h vr4, vr9, 3
    vilvl.b vr13, vr4, vr13  // out q0

    vadd.h vr8, vr8, vr6
    vadd.h vr9, vr9, vr7
    vsub.h vr2, vr2, vr0  // q2 + q3 - p1 - q1
    vsub.h vr3, vr3, vr1
    vssrlrni.bu.h vr0, vr8, 3
    vssrlrni.bu.h vr1, vr9, 3
    vilvl.b vr0, vr1, vr0  // out q1

    vadd.h vr8, vr8, vr2
    vadd.h vr9, vr9, vr3

    vbitsel.v vr21, vr21, vr10, vr14
    vbitsel.v vr22, vr22, vr11, vr14
    vbitsel.v vr23, vr23, vr12, vr14
    vbitsel.v vr24, vr24, vr13, vr14
    vssrlrni.bu.h vr1, vr8, 3
    vssrlrni.bu.h vr2, vr9, 3
    vilvl.b vr1, vr2, vr1  // out q2
    vbitsel.v vr25, vr25, vr0, vr14
    vbitsel.v vr26, vr26, vr1, vr14
.endif
2:
.if \wd == 16
    vhaddw.qu.du vr2, vr15, vr15
    vpickve2gr.du t6, vr2, 0
    bnez t6, 1f  // check if flat8out is needed
    vhaddw.qu.du vr2, vr14, vr14
    vpickve2gr.du t6, vr2, 0
    beqz t6, 8f  // if there was no flat8in, just write the inner 4 pixels
    b 7f  // if flat8in was used, write the inner 6 pixels
1:

    vaddwev.h.bu vr2, vr17, vr17  // p6 + p6
    vaddwod.h.bu vr3, vr17, vr17
    vaddwev.h.bu vr4, vr17, vr18
    vaddwod.h.bu vr5, vr17, vr18  // p6 + p5
    vaddwev.h.bu vr6, vr17, vr19
    vaddwod.h.bu vr7, vr17, vr19  // p6 + p4
    vaddwev.h.bu vr8, vr17, vr20
    vaddwod.h.bu vr9, vr17, vr20  // p6 + p3
    vadd.h vr12, vr2, vr4
    vadd.h vr13, vr3, vr5
    vadd.h vr10, vr6, vr8
    vadd.h vr11, vr7, vr9
    vaddwev.h.bu vr6, vr17, vr21
    vaddwod.h.bu vr7, vr17, vr21  // p6 + p2
    vadd.h vr12, vr12, vr10
    vadd.h vr13, vr13, vr11
    vaddwev.h.bu vr8, vr17, vr22
    vaddwod.h.bu vr9, vr17, vr22  // p6 + p1
    vaddwev.h.bu vr10, vr18, vr23
    vaddwod.h.bu vr11, vr18, vr23  // p5 + p0
    vadd.h vr6, vr6, vr8
    vadd.h vr7, vr7, vr9
    vaddwev.h.bu vr8, vr19, vr24
    vaddwod.h.bu vr9, vr19, vr24  // p4 + q0
    vadd.h vr12, vr12, vr6
    vadd.h vr13, vr13, vr7
    vadd.h vr10, vr10, vr8
    vadd.h vr11, vr11, vr9
    vaddwev.h.bu vr6, vr20, vr25
    vaddwod.h.bu vr7, vr20, vr25  // p3 + q1
    vadd.h vr12, vr12, vr10
    vadd.h vr13, vr13, vr11
    vsub.h vr6, vr6, vr2
    vsub.h vr7, vr7, vr3
    vaddwev.h.bu vr2, vr21, vr26
    vaddwod.h.bu vr3, vr21, vr26  // p2 + q2
    vssrlrni.bu.h vr0, vr12, 4
    vssrlrni.bu.h vr1, vr13, 4
    vilvl.b vr0, vr1, vr0  // out p5
    vadd.h vr12, vr12, vr6
    vadd.h vr13, vr13, vr7  // - (p6 + p6) + (p3 + q1)
    vsub.h vr2, vr2, vr4
    vsub.h vr3, vr3, vr5
    vaddwev.h.bu vr4, vr22, vr27
    vaddwod.h.bu vr5, vr22, vr27  // p1 + q3
    vaddwev.h.bu vr6, vr17, vr19
    vaddwod.h.bu vr7, vr17, vr19  // p6 + p4
    vssrlrni.bu.h vr1, vr12, 4
    vssrlrni.bu.h vr8, vr13, 4
    vilvl.b vr1, vr8, vr1  // out p4
    vadd.h vr12, vr12, vr2
    vadd.h vr13, vr13, vr3  // - (p6 + p5) + (p2 + q2)
    vsub.h vr4, vr4, vr6
    vsub.h vr5, vr5, vr7
    vaddwev.h.bu vr6, vr23, vr28
    vaddwod.h.bu vr7, vr23, vr28  // p0 + q4
    vaddwev.h.bu vr8, vr17, vr20
    vaddwod.h.bu vr9, vr17, vr20  // p6 + p3
    vssrlrni.bu.h vr2, vr12, 4
    vssrlrni.bu.h vr10, vr13, 4
    vilvl.b vr2, vr10, vr2  // out p3
    vadd.h vr12, vr12, vr4
    vadd.h vr13, vr13, vr5  // - (p6 + p4) + (p1 + q3)
    vsub.h vr6, vr6, vr8
    vsub.h vr7, vr7, vr9
    vaddwev.h.bu vr8, vr24, vr29
    vaddwod.h.bu vr9, vr24, vr29  // q0 + q5
    vaddwev.h.bu vr4, vr17, vr21
    vaddwod.h.bu vr5, vr17, vr21  // p6 + p2
    vssrlrni.bu.h vr3, vr12, 4
    vssrlrni.bu.h vr11, vr13, 4
    vilvl.b vr3, vr11, vr3  // out p2
    vadd.h vr12, vr12, vr6
    vadd.h vr13, vr13, vr7  // - (p6 + p3) + (p0 + q4)
    vsub.h vr8, vr8, vr4
    vsub.h vr9, vr9, vr5
    vaddwev.h.bu vr6, vr25, vr30
    vaddwod.h.bu vr7, vr25, vr30  // q1 + q6
    vaddwev.h.bu vr10, vr17, vr22
    vaddwod.h.bu vr11, vr17, vr22  // p6 + p1
    vssrlrni.bu.h vr4, vr12, 4
    vssrlrni.bu.h vr5, vr13, 4
    vilvl.b vr4, vr5, vr4  // out p1

    vadd.h vr12, vr12, vr8
    vadd.h vr13, vr13, vr9  // - (p6 + p2) + (q0 + q5)
    vsub.h vr6, vr6, vr10
    vsub.h vr7, vr7, vr11
    vaddwev.h.bu vr8, vr26, vr30
    vaddwod.h.bu vr9, vr26, vr30  // q2 + q6
    vbitsel.v vr0, vr18, vr0, vr15  // out p5
    vaddwev.h.bu vr10, vr18, vr23
    vaddwod.h.bu vr11, vr18, vr23  // p5 + p0
    vssrlrni.bu.h vr5, vr12, 4
    vssrlrni.bu.h vr18, vr13, 4
    vilvl.b vr5, vr18, vr5  // out p0

    vadd.h vr12, vr12, vr6
    vadd.h vr13, vr13, vr7  // - (p6 + p1) + (q1 + q6)
    vsub.h vr8, vr8, vr10
    vsub.h vr9, vr9, vr11
    vaddwev.h.bu vr10, vr27, vr30
    vaddwod.h.bu vr11, vr27, vr30  // q3 + q6
    vbitsel.v vr1, vr19, vr1, vr15  // out p4

    vaddwev.h.bu vr18, vr19, vr24
    vaddwod.h.bu vr19, vr19, vr24  // p4 + q0
    vssrlrni.bu.h vr6, vr12, 4
    vssrlrni.bu.h vr7, vr13, 4
    vilvl.b vr6, vr7, vr6  // out q0

    vadd.h vr12, vr12, vr8
    vadd.h vr13, vr13, vr9  // - (p5 + p0) + (q2 + q6)
    vsub.h vr10, vr10, vr18
    vsub.h vr11, vr11, vr19
    vaddwev.h.bu vr8, vr28, vr30
    vaddwod.h.bu vr9, vr28, vr30  // q4 + q6
    vbitsel.v vr2, vr20, vr2, vr15  // out p3
    vaddwev.h.bu vr18, vr20, vr25
    vaddwod.h.bu vr19, vr20, vr25  // p3 + q1
    vssrlrni.bu.h vr7, vr12, 4
    vssrlrni.bu.h vr20, vr13, 4
    vilvl.b vr7, vr20, vr7  // out q1

    vadd.h vr12, vr12, vr10
    vadd.h vr13, vr13, vr11  // - (p4 + q0) + (q3 + q6)
    vsub.h vr18, vr8, vr18
    vsub.h vr19, vr9, vr19
    vaddwev.h.bu vr10, vr29, vr30
    vaddwod.h.bu vr11, vr29, vr30  // q5 + q6
    vbitsel.v vr3, vr21, vr3, vr15  // out p2
    vaddwev.h.bu vr20, vr21, vr26
    vaddwod.h.bu vr21, vr21, vr26  // p2 + q2
    vssrlrni.bu.h vr8, vr12, 4
    vssrlrni.bu.h vr9, vr13, 4
    vilvl.b vr8, vr9, vr8  // out q2

    vadd.h vr12, vr12, vr18
    vadd.h vr13, vr13, vr19  // - (p3 + q1) + (q4 + q6)
    vsub.h vr10, vr10, vr20
    vsub.h vr11, vr11, vr21
    vaddwev.h.bu vr18, vr30, vr30
    vaddwod.h.bu vr19, vr30, vr30  // q6 + q6
    vbitsel.v vr4, vr22, vr4, vr15  // out p1
    vaddwev.h.bu vr20, vr22, vr27
    vaddwod.h.bu vr21, vr22, vr27  // p1 + q3
    vssrlrni.bu.h vr9, vr12, 4
    vssrlrni.bu.h vr22, vr13, 4
    vilvl.b vr9, vr22, vr9  // out q3
    vadd.h vr12, vr12, vr10
    vadd.h vr13, vr13, vr11  // - (p2 + q2) + (q5 + q6)
    vsub.h vr18, vr18, vr20
    vsub.h vr19, vr19, vr21
    vbitsel.v vr5, vr23, vr5, vr15  // out p0
    vssrlrni.bu.h vr10, vr12, 4
    vssrlrni.bu.h vr23, vr13, 4
    vilvl.b vr10, vr23, vr10  // out q4
    vadd.h vr12, vr12, vr18
    vadd.h vr13, vr13, vr19  // - (p1 + q3) + (q6 + q6)
    vssrlrni.bu.h vr11, vr12, 4
    vssrlrni.bu.h vr12, vr13, 4
    vilvl.b vr11, vr12, vr11  // out q5
    vbitsel.v vr6, vr24, vr6, vr15
    vbitsel.v vr7, vr25, vr7, vr15
    vbitsel.v vr8, vr26, vr8, vr15
    vbitsel.v vr9, vr27, vr9, vr15
    vbitsel.v vr10, vr28, vr10, vr15
    vbitsel.v vr11, vr29, vr11, vr15
.endif
    li.w t4, 0
    jirl zero, ra, 0x00
.if \wd == 16
7:
    // Return to a shorter epilogue, writing only the inner 6 pixels
    li.w t4, 1 << 6
    jirl zero, ra, 0x00
.endif
.if \wd >= 8
8:
    // Return to a shorter epilogue, writing only the inner 4 pixels
    li.w t4, 1 << 4
    jirl zero, ra, 0x00
.endif
endfuncl
.endm

FILTER 16
FILTER 8
FILTER 6
FILTER 4

.macro LPF_16_WD16
    move t7, ra
    bl lpf_16_wd16_lsx
    move ra, t7
    beqz t4, 1f
    andi t5, t4, 1 << 6
    bnez t5, 7f
    andi t5, t4, 1 << 4
    bnez t5, 8f
    jirl zero, ra, 0x00
1:
.endm

.macro LPF_16_WD8
    move t7, ra
    bl lpf_16_wd8_lsx
    move ra, t7
    beqz t4, 1f
    andi t5, t4, 1 << 4
    bnez t5, 8f
    jirl zero, ra, 0x00
1:
.endm

.macro LPF_16_WD6
    move t7, ra
    bl lpf_16_wd6_lsx
    move ra, t7
    beqz t4, 1f
    jirl zero, ra, 0x00
1:
.endm

.macro LPF_16_WD4
    move t7, ra
    bl lpf_16_wd4_lsx
    move ra, t7
    beqz t4, 1f
    jirl zero, ra, 0x00
1:
.endm

functionl lpf_v_4_16_lsx
    slli.d t3, a1, 1
    sub.d t3, a0, t3
    vld vr22, t3, 0  // p1
    vldx vr23, t3, a1  // p0
    vld vr24, a0, 0  // q0
    vldx vr25, a0, a1  // q1

    LPF_16_WD4

    vst vr22, t3, 0  // p1
    vstx vr23, t3, a1  // p0
    vst vr24, a0, 0  // q0
    vstx vr25, a0, a1  // q1
endfuncl

functionl lpf_h_4_16_lsx
    addi.d t3, a0, -2
    fld.s f22, t3, 0
    fldx.s f23, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f24, t3, 0
    fldx.s f25, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f17, t3, 0
    fldx.s f18, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f19, t3, 0
    fldx.s f20, t3, a1
    alsl.d t3, a1, t3, 1
    vilvl.w vr22, vr17, vr22
    vilvl.w vr23, vr18, vr23
    vilvl.w vr24, vr19, vr24
    vilvl.w vr25, vr20, vr25
    fld.s f17, t3, 0
    fldx.s f18, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f19, t3, 0
    fldx.s f20, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f26, t3, 0
    fldx.s f27, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f28, t3, 0
    fldx.s f29, t3, a1
    alsl.d t3, a1, t3, 1
    vilvl.w vr17, vr26, vr17
    vilvl.w vr18, vr27, vr18
    vilvl.w vr19, vr28, vr19
    vilvl.w vr20, vr29, vr20
    vilvl.d vr22, vr17, vr22
    vilvl.d vr23, vr18, vr23
    vilvl.d vr24, vr19, vr24
    vilvl.d vr25, vr20, vr25
    addi.d a0, t3, 2

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD4

    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 1
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 2
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 3
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 2
endfuncl

functionl lpf_v_6_16_lsx
    slli.d t3, a1, 1
    sub.d t3, a0, t3
    sub.d s0, t3, a1
    vld vr21, s0, 0  // p2
    vldx vr22, s0, a1  // p1
    alsl.d s0, a1, s0, 1
    vld vr23, s0, 0  // p0
    vldx vr24, s0, a1  // q0
    alsl.d s0, a1, s0, 1
    vld vr25, s0, 0  // q1
    vldx vr26, s0, a1  // q2

    LPF_16_WD6

    vst vr22, t3, 0  // p1
    vstx vr23, t3, a1  // p0
    vst vr24, a0, 0  // q0
    vstx vr25, a0, a1  // q1
endfuncl

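// Horizontal (vertical-edge) variant: gather 16 rows of 8 pixels around the
// edge, transpose them into the p3..q3 column registers (vr20-vr27) used by
// the wd6 filter, then transpose the changed inner 4 pixels back and store.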
functionl lpf_h_6_16_lsx
    addi.d t3, a0, -4
    fld.d f20, t3, 0
    fldx.d f21, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f22, t3, 0
    fldx.d f23, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f24, t3, 0
    fldx.d f25, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f26, t3, 0
    fldx.d f27, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f16, t3, 0
    fldx.d f17, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f18, t3, 0
    fldx.d f19, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f28, t3, 0
    fldx.d f29, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f30, t3, 0
    fldx.d f31, t3, a1
    alsl.d t3, a1, t3, 1

    vilvl.d vr20, vr16, vr20
    vilvl.d vr21, vr17, vr21
    vilvl.d vr22, vr18, vr22
    vilvl.d vr23, vr19, vr23
    vilvl.d vr24, vr28, vr24
    vilvl.d vr25, vr29, vr25
    vilvl.d vr26, vr30, vr26
    vilvl.d vr27, vr31, vr27
    addi.d a0, t3, 4

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD6

    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 1
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 2
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 3
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 2
endfuncl

functionl lpf_v_8_16_lsx
    slli.d t3, a1, 2
    sub.d s0, a0, t3
    vld vr20, s0, 0  // p3
    vldx vr21, s0, a1  // p2
    alsl.d s0, a1, s0, 1
    vld vr22, s0, 0  // p1
    vldx vr23, s0, a1  // p0
    alsl.d s0, a1, s0, 1
    vld vr24, s0, 0  // q0
    vldx vr25, s0, a1  // q1
    alsl.d s0, a1, s0, 1
    vld vr26, s0, 0  // q2
    vldx vr27, s0, a1  // q3

    LPF_16_WD8

    sub.d t3, a0, t3
    add.d t3, t3, a1  // -3
    vst vr21, t3, 0  // p2
    vstx vr22, t3, a1  // p1
    alsl.d t3, a1, t3, 1
    vst vr23, t3, 0  // p0
    vstx vr24, t3, a1  // q0
    alsl.d t3, a1, t3, 1
    vst vr25, t3, 0  // q1
    vstx vr26, t3, a1  // q2
    jirl zero, ra, 0x00
8:
    slli.d t3, a1, 1
    sub.d t3, a0, t3
    vst vr22, t3, 0  // p1
    vstx vr23, t3, a1  // p0
    alsl.d t3, a1, t3, 1
    vst vr24, t3, 0  // q0
    vstx vr25, t3, a1  // q1
endfuncl

functionl lpf_h_8_16_lsx
    addi.d t3, a0, -4
    fld.d f20, t3, 0
    fldx.d f21, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f22, t3, 0
    fldx.d f23, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f24, t3, 0
    fldx.d f25, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f26, t3, 0
    fldx.d f27, t3, a1

    alsl.d t3, a1, t3, 1
    fld.d f16, t3, 0
    fldx.d f17, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f18, t3, 0
    fldx.d f19, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f28, t3, 0
    fldx.d f29, t3, a1
    alsl.d t3, a1, t3, 1
    fld.d f30, t3, 0
    fldx.d f31, t3, a1
    alsl.d t3, a1, t3, 1

    vilvl.d vr20, vr16, vr20
    vilvl.d vr21, vr17, vr21
    vilvl.d vr22, vr18, vr22
    vilvl.d vr23, vr19, vr23
    vilvl.d vr24, vr28, vr24
    vilvl.d vr25, vr29, vr25
    vilvl.d vr26, vr30, vr26
    vilvl.d vr27, vr31, vr27
    addi.d a0, t3, 4

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD8

    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d \i, a0, 0, 1
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 4
    jirl zero, ra, 0x00

8:
    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 1
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 2
    add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w \i, a0, 0, 3
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 2
endfuncl

functionl lpf_v_16_16_lsx
    slli.d t3, a1, 3
    sub.d s0, a0, t3
    add.d s0, s0, a1
    vld vr17, s0, 0  // p6
    vldx vr18, s0, a1  // p5
    alsl.d s0, a1, s0, 1
    vld vr19, s0, 0  // p4
    vldx vr20, s0, a1  // p3
    alsl.d s0, a1, s0, 1
    vld vr21, s0, 0  // p2
    vldx vr22, s0, a1  // p1
    alsl.d s0, a1, s0, 1
    vld vr23, s0, 0  // p0
    vldx vr24, s0, a1  // q0
    alsl.d s0, a1, s0, 1
    vld vr25, s0, 0  // q1
    vldx vr26, s0, a1  // q2
    alsl.d s0, a1, s0, 1
    vld vr27, s0, 0  // q3
    vldx vr28, s0, a1  // q4
    alsl.d s0, a1, s0, 1
    vld vr29, s0, 0  // q5
    vldx vr30, s0, a1  // q6

    LPF_16_WD16

    sub.d s0, a0, t3
    alsl.d s0, a1, s0, 1
    vst vr0, s0, 0  // p5
    vstx vr1, s0, a1  // p4
    alsl.d s0, a1, s0, 1
    vst vr2, s0, 0  // p3
    vstx vr3, s0, a1  // p2
    alsl.d s0, a1, s0, 1
    vst vr4, s0, 0  // p1
    vstx vr5, s0, a1  // p0
    alsl.d s0, a1, s0, 1
    vst vr6, s0, 0  // q0
    vstx vr7, s0, a1  // q1
    alsl.d s0, a1, s0, 1
    vst vr8, s0, 0  // q2
    vstx vr9, s0, a1  // q3
    alsl.d s0, a1, s0, 1
    vst vr10, s0, 0  // q4
    vstx vr11, s0, a1  // q5
    jirl zero, ra, 0x00
7:
    slli.d t3, a1, 1
    add.d t3, t3, a1
    sub.d s0, a0, t3
    vst vr21, s0, 0  // p2
    vstx vr22, s0, a1  // p1
    alsl.d s0, a1, s0, 1
    vst vr23, s0, 0  // p0
    vstx vr24, s0, a1  // q0
    alsl.d s0, a1, s0, 1
    vst vr25, s0, 0  // q1
    vstx vr26, s0, a1  // q2
    jirl zero, ra, 0x00
8:
    slli.d t3, a1, 1
    sub.d s0, a0, t3
    vst vr22, s0, 0  // p1
    vstx vr23, s0, a1  // p0
    alsl.d s0, a1, s0, 1
    vst vr24, s0, 0  // q0
    vstx vr25, s0, a1  // q1
endfuncl

functionl lpf_h_16_16_lsx
    addi.d t3, a0, -8
    vld vr16, t3, 0
    vldx vr17, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr18, t3, 0
    vldx vr19, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr20, t3, 0
    vldx vr21, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr22, t3, 0
    vldx vr23, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr24, t3, 0
    vldx vr25, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr26, t3, 0
    vldx vr27, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr28, t3, 0
    vldx vr29, t3, a1
    alsl.d t3, a1, t3, 1
    vld vr30, t3, 0
    vldx vr31, t3, a1
    alsl.d t3, a1, t3, 1
.macro SWAPD in0, in1
    vaddi.bu vr0, \in0, 0
    vilvl.d \in0, \in1, \in0
    vilvh.d \in1, \in1, vr0
.endm
    SWAPD vr16, vr24
    SWAPD vr17, vr25
    SWAPD vr18, vr26
    SWAPD vr19, vr27
    SWAPD vr20, vr28
    SWAPD vr21, vr29
    SWAPD vr22, vr30
    SWAPD vr23, vr31
    addi.d a0, t3, 8

    TRANSPOSE_8x16B vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr0, vr1
    TRANSPOSE_8x16B vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, vr0, vr1

    LPF_16_WD16

    slli.d t3, a1, 4
    sub.d a0, a0, t3

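    // Full wd16 path: transpose the columns back to rows before storing.
    // vr16/vr17 (p7/p6) and vr30/vr31 (q6/q7) still hold the unmodified
    // outer columns; vr0..vr11 hold the filtered p5..q5.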
    TRANSPOSE_8x16B vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5, vr18, vr19
    TRANSPOSE_8x16B vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31, vr18, vr19

    addi.d t3, a0, -8
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
    vstelm.d \i, t3, 0, 0
    add.d t3, t3, a1
.endr
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
    vstelm.d \i, t3, 0, 1
    add.d t3, t3, a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
    vstelm.d \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
    vstelm.d \i, a0, 0, 1
    add.d a0, a0, a1
.endr
    jirl zero, ra, 0x00

7:
    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d \i, a0, 0, 0
    add.d a0, a0, a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d \i, a0, 0, 1
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 4
    jirl zero, ra, 0x00
8:

    slli.d t3, a1, 4
    sub.d a0, a0, t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d a0, a0, -2
.irp i, 0, 1, 2, 3
    vstelm.w vr22, a0, 0, \i
    add.d a0, a0, a1
    vstelm.w vr23, a0, 0, \i
    add.d a0, a0, a1
    vstelm.w vr24, a0, 0, \i
    add.d a0, a0, a1
    vstelm.w vr25, a0, 0, \i
    add.d a0, a0, a1
.endr
    addi.d a0, a0, 2
endfuncl

.macro PUSH_REG
    addi.d sp, sp, -64-8
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
    st.d s0, sp, 64
.endm
.macro POP_REG
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    ld.d s0, sp, 64
    addi.d sp, sp, 64+8
.endm

const mask_1248
.word 1, 2, 4, 8
endconst

.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
    PUSH_REG
    move t8, ra
    vld vr0, a2, 0  // vmask
    vpickve2gr.wu t0, vr0, 0
    vpickve2gr.wu t1, vr0, 1
.ifc \TYPE, y
    vpickve2gr.wu t2, vr0, 2
.endif
    addi.d a5, a5, 128  // Move to sharp part of lut
.ifc \TYPE, y
    or t1, t1, t2  // vmask[1] |= vmask[2]
.endif
    slli.d a4, a4, 2
.ifc \DIR, v
    sub.d a4, a3, a4
.else
    addi.d a3, a3, -4
.endif
    or t0, t0, t1  // vmask[0] |= vmask[1]
1:
    andi t3, t0, 0x0f
.ifc \DIR, v
    vld vr0, a4, 0  // l[-b4_stride][]
    addi.d a4, a4, 16
    vld vr1, a3, 0  // l[0][]
    addi.d a3, a3, 16
.else
    fld.d f0, a3, 0
    fldx.d f1, a3, a4
    alsl.d a3, a4, a3, 1
    fld.d f2, a3, 0
    fldx.d f3, a3, a4
    alsl.d a3, a4, a3, 1
    vilvl.w vr1, vr1, vr0
    vilvl.w vr2, vr3, vr2
    vilvl.d vr0, vr2, vr1
    vilvh.d vr1, vr2, vr1
.endif
    beqz t3, 7f
    // l[0][] ? l[0][] : l[-b4_stride][]
    vseqi.b vr2, vr1, 0
    vbitsel.v vr1, vr1, vr0, vr2
    li.w t3, 0xff
    vreplgr2vr.w vr3, t3
    vand.v vr1, vr1, vr3
    vshuf4i.b vr1, vr1, 0x00  // L -- 1 0 2 0
    vseqi.w vr2, vr1, 0  // 0 -1 0 -1
    vseqi.w vr2, vr2, 0  // L != 0 -- -1 0 -1 0
    vhaddw.qu.du vr3, vr2, vr2
    vpickve2gr.du t4, vr3, 0
    beqz t4, 7f  // if (!L) continue
    la.local t3, mask_1248  // bits x
    vld vr16, t3, 0
    vreplgr2vr.w vr13, t0  // vmask[0]
    vreplgr2vr.w vr14, t1  // vmask[1]
    vand.v vr13, vr13, vr16
    vseqi.w vr13, vr13, 0
    vseqi.w vr13, vr13, 0  // if (vmask[0] & x)
    vand.v vr13, vr13, vr2  // vmask[0] &= L != 0
    vand.v vr14, vr14, vr16
    vseqi.w vr14, vr14, 0
    vseqi.w vr14, vr14, 0  // if (vmask[1] & x)
.ifc \TYPE, y
    vreplgr2vr.w vr15, t2  // vmask[2]
    vand.v vr15, vr15, vr16
    vseqi.w vr15, vr15, 0
    vseqi.w vr15, vr15, 0  // if (vmask[2] & x)
.endif
    vldrepl.b vr5, a5, 0  // sharp[0]
    addi.d t5, a5, 8
    vldrepl.b vr6, t5, 0  // sharp[1]
    vsrl.b vr3, vr1, vr5  // L >> sharp[0]
    vsrli.b vr12, vr1, 4  // H
    vmin.bu vr3, vr3, vr6  // imin(L >> sharp[0], sharp[1])
    vaddi.bu vr0, vr1, 2  // L + 2
    vmaxi.bu vr11, vr3, 1  // imax(imin(), 1) = limit = I
    vslli.b vr0, vr0, 1  // 2*(L + 2)
    vadd.b vr10, vr0, vr11  // 2*(L + 2) + limit = E
.ifc \TYPE, y
    andi t3, t2, 0x0f
    beqz t3, 2f
    // wd16
    bl lpf_\DIR\()_16_16_lsx
    b 8f
2:
.endif
    andi t3, t1, 0x0f
    beqz t3, 3f
.ifc \TYPE, y
    // wd8
    bl lpf_\DIR\()_8_16_lsx
.else
    // wd6
    bl lpf_\DIR\()_6_16_lsx
.endif
    b 8f
3:
    // wd4
    bl lpf_\DIR\()_4_16_lsx
.ifc \DIR, h
    b 8f
7:
    // For dir h, the functions above increment a0.
    // If the whole function is skipped, increment it here instead.
    alsl.d a0, a1, a0, 4
.else
7:
.endif
8:
    srli.d t0, t0, 4
    srli.d t1, t1, 4
.ifc \TYPE, y
    srli.d t2, t2, 4
.endif
.ifc \DIR, v
    addi.d a0, a0, 16
.else
    // For dir h, a0 is returned incremented
.endif
    bnez t0, 1b
    move ra, t8
    POP_REG
endfunc
.endm

LPF_FUNC h, y
LPF_FUNC v, y
LPF_FUNC h, uv
LPF_FUNC v, uv