/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
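// Reference sketch of the per-pixel computation (not from the original source;
// derived from the code below, names illustrative). A is a weighted 3x3 sum of
// the int16 coefficients (the b argument) with weight 4 on the centre and the
// four edge-adjacent neighbours and weight 3 on the four diagonals; B is the
// same weighting applied to the int32 coefficients (the a argument):
//   A = 4*(b[0] + b[-1] + b[+1] + b[-stride] + b[+stride])
//     + 3*(b[-1-stride] + b[+1-stride] + b[-1+stride] + b[+1+stride])
//   B = same weights applied to a[]
//   tmp[x] = (A * src[x] + B + (1 << 8)) >> 9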
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldr             r6,  [sp, #108]
        sub             r7,  r3,  #(4*SUM_STRIDE)
        add             r8,  r3,  #(4*SUM_STRIDE)
        sub             r9,  r4,  #(2*SUM_STRIDE)
        add             r10, r4,  #(2*SUM_STRIDE)
        mov             r11, #SUM_STRIDE
        mov             r12, #FILTER_OUT_STRIDE
        add             lr,  r5,  #3
        bic             lr,  lr,  #3       // Aligned width
.if \bpc == 8
        sub             r2,  r2,  lr
.else
        sub             r2,  r2,  lr,  lsl #1
.endif
        sub             r12, r12, lr
        sub             r11, r11, lr
        sub             r11, r11, #4       // We read 4 extra elements from both a and b
        mov             lr,  r5
        vmov.i16        q14, #3
        vmov.i32        q15, #3
1:
        vld1.16         {q0},       [r9,  :128]!
        vld1.16         {q1},       [r4,  :128]!
        vld1.16         {q2},       [r10, :128]!
        vld1.32         {q8,  q9},  [r7,  :128]!
        vld1.32         {q10, q11}, [r3,  :128]!
        vld1.32         {q12, q13}, [r8,  :128]!

2:
        subs            r5,  r5,  #4
        vext.8          d6,  d0,  d1,  #2  // -stride
        vext.8          d7,  d2,  d3,  #2  // 0
        vext.8          d8,  d4,  d5,  #2  // +stride
        vext.8          d9,  d0,  d1,  #4  // +1-stride
        vext.8          d10, d2,  d3,  #4  // +1
        vext.8          d11, d4,  d5,  #4  // +1+stride
        vadd.i16        d2,  d2,  d6       // -1, -stride
        vadd.i16        d7,  d7,  d8       // 0, +stride
        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
        vadd.i16        d2,  d2,  d7
        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
        vadd.i16        d2,  d2,  d10      // +1
        vadd.i16        d0,  d0,  d4

        vext.8          q3,  q8,  q9,  #4  // -stride
        vshl.i16        d2,  d2,  #2
        vext.8          q4,  q8,  q9,  #8  // +1-stride
        vext.8          q5,  q10, q11, #4  // 0
        vext.8          q6,  q10, q11, #8  // +1
        vmla.i16        d2,  d0,  d28      // * 3 -> a
        vadd.i32        q3,  q3,  q10      // -stride, -1
        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q6       // 0, +1
        vadd.i32        q8,  q8,  q12      // -1+stride
        vadd.i32        q3,  q3,  q5
        vext.8          q7,  q12, q13, #4  // +stride
        vext.8          q10, q12, q13, #8  // +1+stride
.if \bpc == 8
        vld1.32         {d24[0]}, [r1, :32]! // src
.else
        vld1.16         {d24}, [r1, :64]!  // src
.endif
        vadd.i32        q3,  q3,  q7       // +stride
        vadd.i32        q8,  q8,  q10      // +1+stride
        vshl.i32        q3,  q3,  #2
        vmla.i32        q3,  q8,  q15      // * 3 -> b
.if \bpc == 8
        vmovl.u8        q12, d24           // src
.endif
        vmov            d0,  d1
        vmlal.u16       q3,  d2,  d24      // b + a * src
        vmov            d2,  d3
        vrshrn.i32      d6,  q3,  #9
        vmov            d4,  d5
        vst1.16         {d6}, [r0]!

        ble             3f
        vmov            q8,  q9
        vmov            q10, q11
        vmov            q12, q13
        vld1.16         {d1},  [r9,  :64]!
        vld1.16         {d3},  [r4,  :64]!
        vld1.16         {d5},  [r10, :64]!
        vld1.32         {q9},  [r7,  :128]!
        vld1.32         {q11}, [r3,  :128]!
        vld1.32         {q13}, [r8,  :128]!
        b               2b

3:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        add             r0,  r0,  r12, lsl #1
        add             r1,  r1,  r2
        add             r3,  r3,  r11, lsl #2
        add             r7,  r7,  r11, lsl #2
        add             r8,  r8,  r11, lsl #2
        add             r4,  r4,  r11, lsl #1
        add             r9,  r9,  r11, lsl #1
        add             r10, r10, r11, lsl #1
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
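// Reference sketch of the per-pixel computation (not from the original source;
// derived from the code below, names illustrative). Even rows combine the rows
// above and below, with weight 5 on the four diagonal neighbours and weight 6
// on the vertical neighbours; odd rows use a single row, with weight 6 on the
// centre and weight 5 on the horizontal neighbours:
//   even rows: A = 5*(diagonals of b) + 6*(b[-stride] + b[+stride])
//              tmp[x] = (A * src[x] + B + (1 << 8)) >> 9
//   odd rows:  A = 6*b[0] + 5*(b[-1] + b[+1])
//              tmp[x] = (A * src[x] + B + (1 << 7)) >> 8
// where B is the same weighting applied to the int32 coefficients in a.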
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldr             r6,  [sp, #108]
        add             r7,  r3,  #(4*(SUM_STRIDE))
        sub             r3,  r3,  #(4*(SUM_STRIDE))
        add             r8,  r4,  #(2*(SUM_STRIDE))
        sub             r4,  r4,  #(2*(SUM_STRIDE))
        mov             r9,  #(2*SUM_STRIDE)
        mov             r10, #FILTER_OUT_STRIDE
        add             r11, r5,  #7
        bic             r11, r11, #7       // Aligned width
.if \bpc == 8
        sub             r2,  r2,  r11
.else
        sub             r2,  r2,  r11, lsl #1
.endif
        sub             r10, r10, r11
        sub             r9,  r9,  r11
        sub             r9,  r9,  #4       // We read 4 extra elements from a
        sub             r12, r9,  #4       // We read 8 extra elements from b
        mov             lr,  r5

1:
        vld1.16         {q0,  q1},  [r4, :128]!
        vld1.16         {q2,  q3},  [r8, :128]!
        vld1.32         {q8,  q9},  [r3, :128]!
        vld1.32         {q11, q12}, [r7, :128]!
        vld1.32         {q10},      [r3, :128]!
        vld1.32         {q13},      [r7, :128]!

2:
        vmov.i16        q14, #5
        vmov.i16        q15, #6
        subs            r5,  r5,  #8
        vext.8          q4,  q0,  q1,  #4  // +1-stride
        vext.8          q5,  q2,  q3,  #4  // +1+stride
        vext.8          q6,  q0,  q1,  #2  // -stride
        vext.8          q7,  q2,  q3,  #2  // +stride
        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
        vadd.i16        q2,  q6,  q7       // -stride, +stride
        vadd.i16        q0,  q0,  q5

        vext.8          q4,  q8,  q9,  #8  // +1-stride
        vext.8          q5,  q9,  q10, #8
        vext.8          q6,  q11, q12, #8  // +1+stride
        vext.8          q7,  q12, q13, #8
        vmul.i16        q0,  q0,  q14      // * 5
        vmla.i16        q0,  q2,  q15      // * 6
        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q9
        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
        vadd.i32        q7,  q7,  q12
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q7
        vext.8          q6,  q8,  q9,  #4  // -stride
        vext.8          q7,  q9,  q10, #4
        vext.8          q8,  q11, q12, #4  // +stride
        vext.8          q11, q12, q13, #4

.if \bpc == 8
        vld1.8          {d4}, [r1, :64]!
.else
        vld1.16         {q2}, [r1, :128]!
.endif

        vmov.i32        q14, #5
        vmov.i32        q15, #6

        vadd.i32        q6,  q6,  q8       // -stride, +stride
        vadd.i32        q7,  q7,  q11
        vmul.i32        q4,  q4,  q14      // * 5
        vmla.i32        q4,  q6,  q15      // * 6
        vmul.i32        q5,  q5,  q14      // * 5
        vmla.i32        q5,  q7,  q15      // * 6

.if \bpc == 8
        vmovl.u8        q2,  d4
.endif
        vmlal.u16       q4,  d0,  d4       // b + a * src
        vmlal.u16       q5,  d1,  d5       // b + a * src
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #9
        vrshrn.i32      d9,  q5,  #9
        vmov            q2,  q3
        vst1.16         {q4}, [r0, :128]!

        ble             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.16         {q1},       [r4, :128]!
        vld1.16         {q3},       [r8, :128]!
        vld1.32         {q9,  q10}, [r3, :128]!
        vld1.32         {q12, q13}, [r7, :128]!
        b               2b

3:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        add             r0,  r0,  r10, lsl #1
        add             r1,  r1,  r2
        add             r3,  r3,  r9,  lsl #2
        add             r7,  r7,  r9,  lsl #2
        add             r4,  r4,  r12, lsl #1
        add             r8,  r8,  r12, lsl #1

        vld1.32         {q8, q9},  [r3, :128]!
        vld1.16         {q0, q1},  [r4, :128]!
        vld1.32         {q10},     [r3, :128]!

        vmov.i16        q12, #5
        vmov.i16        q13, #6

4:
        subs            r5,  r5,  #8
        vext.8          q3,  q0,  q1,  #4  // +1
        vext.8          q2,  q0,  q1,  #2  // 0
        vadd.i16        q0,  q0,  q3       // -1, +1

        vext.8          q4,  q8,  q9,  #4  // 0
        vext.8          q5,  q9,  q10, #4
        vext.8          q6,  q8,  q9,  #8  // +1
        vext.8          q7,  q9,  q10, #8
        vmul.i16        q2,  q2,  q13      // * 6
        vmla.i16        q2,  q0,  q12      // * 5 -> a
.if \bpc == 8
        vld1.8          {d22}, [r1, :64]!
.else
        vld1.16         {q11}, [r1, :128]!
.endif
        vadd.i32        q8,  q8,  q6       // -1, +1
        vadd.i32        q9,  q9,  q7
.if \bpc == 8
        vmovl.u8        q11, d22
.endif
        vmul.i32        q4,  q4,  q15      // * 6
        vmla.i32        q4,  q8,  q14      // * 5 -> b
        vmul.i32        q5,  q5,  q15      // * 6
        vmla.i32        q5,  q9,  q14      // * 5 -> b

        vmlal.u16       q4,  d4,  d22      // b + a * src
        vmlal.u16       q5,  d5,  d23
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #8
        vrshrn.i32      d9,  q5,  #8
        vmov            q8,  q10
        vst1.16         {q4}, [r0, :128]!

        ble             5f
        vld1.16         {q1},      [r4, :128]!
        vld1.32         {q9, q10}, [r3, :128]!
        b               4b

5:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
        sub             r4,  r4,  r11, lsl #1
        add             r0,  r0,  r10, lsl #1
        add             r1,  r1,  r2
        sub             r3,  r3,  #16
        sub             r4,  r4,  #16
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int w, const int h,
//                                    const int wt, const int bitdepth_max);
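// Reference sketch of the per-pixel computation (not from the original source;
// derived from the code below, names illustrative). Each pixel is blended
// between the upscaled source and the filtered plane t1 with a 7-bit weight:
//   u      = src[x] << 4
//   v      = (u << 7) + wt * (t1[x] - u)
//   dst[x] = clip((v + (1 << 10)) >> 11)
// clipped to the pixel range (8 bpc via a saturating narrow, 16 bpc against
// bitdepth_max).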
function sgr_weighted1_\bpc\()bpc_neon, export=1
        push            {r4-r9,lr}
        ldrd            r4,  r5,  [sp, #28]
        ldrd            r6,  r7,  [sp, #36]
.if \bpc == 16
        ldr             r8,  [sp, #44]
.endif
        vdup.16         d31, r7
        cmp             r6,  #2
.if \bpc == 16
        vdup.16         q14, r8
.endif
        add             r9,  r0,  r1
        add             r12, r2,  r3
        add             lr,  r4,  #2*FILTER_OUT_STRIDE
        mov             r7,  #(4*FILTER_OUT_STRIDE)
        lsl             r1,  r1,  #1
        lsl             r3,  r3,  #1
        add             r8,  r5,  #7
        bic             r8,  r8,  #7       // Aligned width
.if \bpc == 8
        sub             r1,  r1,  r8
        sub             r3,  r3,  r8
.else
        sub             r1,  r1,  r8,  lsl #1
        sub             r3,  r3,  r8,  lsl #1
.endif
        sub             r7,  r7,  r8,  lsl #1
        mov             r8,  r5
        blt             2f
1:
.if \bpc == 8
        vld1.8          {d0},  [r2,  :64]!
        vld1.8          {d16}, [r12, :64]!
.else
        vld1.16         {q0},  [r2,  :128]!
        vld1.16         {q8},  [r12, :128]!
.endif
        vld1.16         {q1},  [r4,  :128]!
        vld1.16         {q9},  [lr,  :128]!
        subs            r5,  r5,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4       // u
        vshll.u8        q8,  d16, #4       // u
.else
        vshl.i16        q0,  q0,  #4       // u
        vshl.i16        q8,  q8,  #4       // u
.endif
        vsub.i16        q1,  q1,  q0       // t1 - u
        vsub.i16        q9,  q9,  q8       // t1 - u
        vshll.u16       q2,  d0,  #7       // u << 7
        vshll.u16       q3,  d1,  #7       // u << 7
        vshll.u16       q10, d16, #7       // u << 7
        vshll.u16       q11, d17, #7       // u << 7
        vmlal.s16       q2,  d2,  d31      // v
        vmlal.s16       q3,  d3,  d31      // v
        vmlal.s16       q10, d18, d31      // v
        vmlal.s16       q11, d19, d31      // v
.if \bpc == 8
        vrshrn.i32      d4,  q2,  #11
        vrshrn.i32      d5,  q3,  #11
        vrshrn.i32      d20, q10, #11
        vrshrn.i32      d21, q11, #11
        vqmovun.s16     d4,  q2
        vqmovun.s16     d20, q10
        vst1.8          {d4},  [r0, :64]!
        vst1.8          {d20}, [r9, :64]!
.else
        vqrshrun.s32    d4,  q2,  #11
        vqrshrun.s32    d5,  q3,  #11
        vqrshrun.s32    d20, q10, #11
        vqrshrun.s32    d21, q11, #11
        vmin.u16        q2,  q2,  q14
        vmin.u16        q10, q10, q14
        vst1.16         {q2},  [r0, :128]!
        vst1.16         {q10}, [r9, :128]!
.endif
        bgt             1b

        sub             r6,  r6,  #2
        cmp             r6,  #1
        blt             0f
        mov             r5,  r8
        add             r0,  r0,  r1
        add             r9,  r9,  r1
        add             r2,  r2,  r3
        add             r12, r12, r3
        add             r4,  r4,  r7
        add             lr,  lr,  r7
        beq             2f
        b               1b

2:
.if \bpc == 8
        vld1.8          {d0}, [r2, :64]!
.else
        vld1.16         {q0}, [r2, :128]!
.endif
        vld1.16         {q1}, [r4, :128]!
        subs            r5,  r5,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4       // u
.else
        vshl.i16        q0,  q0,  #4       // u
.endif
        vsub.i16        q1,  q1,  q0       // t1 - u
        vshll.u16       q2,  d0,  #7       // u << 7
        vshll.u16       q3,  d1,  #7       // u << 7
        vmlal.s16       q2,  d2,  d31      // v
        vmlal.s16       q3,  d3,  d31      // v
.if \bpc == 8
        vrshrn.i32      d4,  q2,  #11
        vrshrn.i32      d5,  q3,  #11
        vqmovun.s16     d2,  q2
        vst1.8          {d2}, [r0, :64]!
.else
        vqrshrun.s32    d4,  q2,  #11
        vqrshrun.s32    d5,  q3,  #11
        vmin.u16        q2,  q2,  q14
        vst1.16         {q2}, [r0, :128]!
.endif
        bgt             2b
0:
        pop             {r4-r9,pc}
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
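// Reference sketch (not from the original source; derived from the code below,
// names illustrative). Like sgr_weighted1, but blending two filtered planes
// t1/t2 with a pair of 7-bit weights:
//   u      = src[x] << 4
//   v      = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u)
//   dst[x] = clip((v + (1 << 10)) >> 11)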
function sgr_weighted2_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        ldrd            r4,  r5,  [sp, #36]
        ldrd            r6,  r7,  [sp, #44]
.if \bpc == 8
        ldr             r8,  [sp, #52]
.else
        ldrd            r8,  r9,  [sp, #52]
.endif
        cmp             r7,  #2
        add             r10, r0,  r1
        add             r11, r2,  r3
        add             r12, r4,  #2*FILTER_OUT_STRIDE
        add             lr,  r5,  #2*FILTER_OUT_STRIDE
        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
.if \bpc == 16
        vdup.16         q14, r9
.endif
        mov             r8,  #4*FILTER_OUT_STRIDE
        lsl             r1,  r1,  #1
        lsl             r3,  r3,  #1
        add             r9,  r6,  #7
        bic             r9,  r9,  #7       // Aligned width
.if \bpc == 8
        sub             r1,  r1,  r9
        sub             r3,  r3,  r9
.else
        sub             r1,  r1,  r9,  lsl #1
        sub             r3,  r3,  r9,  lsl #1
.endif
        sub             r8,  r8,  r9,  lsl #1
        mov             r9,  r6
        blt             2f
1:
.if \bpc == 8
        vld1.8          {d0},  [r2,  :64]!
        vld1.8          {d16}, [r11, :64]!
.else
        vld1.16         {q0},  [r2,  :128]!
        vld1.16         {q8},  [r11, :128]!
.endif
        vld1.16         {q1},  [r4,  :128]!
        vld1.16         {q9},  [r12, :128]!
        vld1.16         {q2},  [r5,  :128]!
        vld1.16         {q10}, [lr,  :128]!
        subs            r6,  r6,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4       // u
        vshll.u8        q8,  d16, #4       // u
.else
        vshl.i16        q0,  q0,  #4       // u
        vshl.i16        q8,  q8,  #4       // u
.endif
        vsub.i16        q1,  q1,  q0       // t1 - u
        vsub.i16        q2,  q2,  q0       // t2 - u
        vsub.i16        q9,  q9,  q8       // t1 - u
        vsub.i16        q10, q10, q8       // t2 - u
        vshll.u16       q3,  d0,  #7       // u << 7
        vshll.u16       q0,  d1,  #7       // u << 7
        vshll.u16       q11, d16, #7       // u << 7
        vshll.u16       q8,  d17, #7       // u << 7
        vmlal.s16       q3,  d2,  d30      // wt[0] * (t1 - u)
        vmlal.s16       q3,  d4,  d31      // wt[1] * (t2 - u)
        vmlal.s16       q0,  d3,  d30      // wt[0] * (t1 - u)
        vmlal.s16       q0,  d5,  d31      // wt[1] * (t2 - u)
        vmlal.s16       q11, d18, d30      // wt[0] * (t1 - u)
        vmlal.s16       q11, d20, d31      // wt[1] * (t2 - u)
        vmlal.s16       q8,  d19, d30      // wt[0] * (t1 - u)
        vmlal.s16       q8,  d21, d31      // wt[1] * (t2 - u)
.if \bpc == 8
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q0,  #11
        vrshrn.i32      d22, q11, #11
        vrshrn.i32      d23, q8,  #11
        vqmovun.s16     d6,  q3
        vqmovun.s16     d22, q11
        vst1.8          {d6},  [r0,  :64]!
        vst1.8          {d22}, [r10, :64]!
.else
        vqrshrun.s32    d6,  q3,  #11
        vqrshrun.s32    d7,  q0,  #11
        vqrshrun.s32    d22, q11, #11
        vqrshrun.s32    d23, q8,  #11
        vmin.u16        q3,  q3,  q14
        vmin.u16        q11, q11, q14
        vst1.16         {q3},  [r0,  :128]!
        vst1.16         {q11}, [r10, :128]!
.endif
        bgt             1b

        subs            r7,  r7,  #2
        cmp             r7,  #1
        blt             0f
        mov             r6,  r9
        add             r0,  r0,  r1
        add             r10, r10, r1
        add             r2,  r2,  r3
        add             r11, r11, r3
        add             r4,  r4,  r8
        add             r12, r12, r8
        add             r5,  r5,  r8
        add             lr,  lr,  r8
        beq             2f
        b               1b

2:
.if \bpc == 8
        vld1.8          {d0}, [r2, :64]!
.else
        vld1.16         {q0}, [r2, :128]!
.endif
        vld1.16         {q1}, [r4, :128]!
        vld1.16         {q2}, [r5, :128]!
        subs            r6,  r6,  #8
.if \bpc == 8
        vshll.u8        q0,  d0,  #4       // u
.else
        vshl.i16        q0,  q0,  #4       // u
.endif
        vsub.i16        q1,  q1,  q0       // t1 - u
        vsub.i16        q2,  q2,  q0       // t2 - u
        vshll.u16       q3,  d0,  #7       // u << 7
        vshll.u16       q0,  d1,  #7       // u << 7
        vmlal.s16       q3,  d2,  d30      // wt[0] * (t1 - u)
        vmlal.s16       q3,  d4,  d31      // wt[1] * (t2 - u)
        vmlal.s16       q0,  d3,  d30      // wt[0] * (t1 - u)
        vmlal.s16       q0,  d5,  d31      // wt[1] * (t2 - u)
.if \bpc == 8
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q0,  #11
        vqmovun.s16     d6,  q3
        vst1.8          {d6}, [r0, :64]!
.else
        vqrshrun.s32    d6,  q3,  #11
        vqrshrun.s32    d7,  q0,  #11
        vmin.u16        q3,  q3,  q14
        vst1.16         {q3}, [r0, :128]!
.endif
        bgt             2b
0:
        pop             {r4-r11,pc}
endfunc
.endm
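// Note: this template only defines the sgr_funcs macro; it is presumably
// expanded by the bitdepth-specific sources (e.g. as "sgr_funcs 8" and
// "sgr_funcs 16"), which are also expected to provide SUM_STRIDE.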