; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; JMP_TABLE name, w0, w1, ...
;   Defines the label NAME_table and emits one dword per listed width. Each
;   dword is the distance from NAME_table to the local label .wN inside the
;   function mangle(private_prefix_NAME), so a dispatcher can load an entry,
;   add the table address and jump to the result (see splat_mv).
%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

; SAVE_TMVS_TABLE num_entries, w, suffix
;   Emits num_entries identical two-byte records:
;     byte 0: w*3
;     byte 1: offset of .writeW from .write1 inside save_tmvs_SUFFIX
;   save_tmvs indexes the resulting table with a block's bs byte (two bytes per
;   record), adds byte 0 to its candidate index and calls .write1 + byte 1.
%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro

%if ARCH_X86_64
; mv_proj[i] = 16384 / i, truncated (entry 0 is 0); read by load_tmvs.
mv_proj:       dw    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,   963,  910,  862,  819,  780,  744,  712
               dw  682,   655,  630,  606,  585,  564,  546,  528
; byte indices 0..11 repeated cyclically across 64 bytes
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
%endif
; byte indices 0..4 repeated cyclically; save_pack1 continues the cycle where
; save_pack0 stops. save_tmvs uses save_pack0+0 and save_pack0+16 as pshufb
; masks to replicate one 5-byte entry across a vector.
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
; pshufb mask: source bytes 0, 1, 8 and 9, each zero-extended to a dword
; (index -1 has bit 7 set, which makes pshufb write zero).
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
; pshufb index templates with bit 7 set. save_tmvs ANDs them with its two
; per-block condition masks, ORs the results and XORs with pb_128, so a selected
; byte becomes a plain index and an unselected byte becomes 0x80 (zero output).
; When both conditions hold, the OR equals the condition-1 indices.
save_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

; Per-ISA lookup tables for save_tmvs: 2+4+4+5+7 = 22 two-byte records, in runs
; that share a width of 16, 8, 4, 2 and 1 respectively (see SAVE_TMVS_TABLE for
; the record layout).
save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

; Jump table for splat_mv (sse2): one entry per block width 1..32, selected by
; tzcnt(bw4).
JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

; Field offsets used to address a refmvs_frame from assembly (load_tmvs receives
; it as "refmvs_frame *rf").
; NOTE(review): this layout has to match the C definition of the structure;
; verify against the C header whenever either side changes.
struc rf
    .frm_hdr:         resq 1
    .iw4:             resd 1
    .ih4:             resd 1
    .iw8:             resd 1
    .ih8:             resd 1
    .sbsz:            resd 1
    .use_rf_mvs:      resd 1
    .sign_bias:       resb 7
    .mfmv_sign:       resb 7
    .pocdiff:         resb 7
    .mfmv_ref:        resb 3
    .mfmv_ref2cur:    resd 3
    .mfmv_ref2ref:    resd 3*7
    .n_mfmvs:         resd 1
    .n_blocks:        resd 1
    .rp:              resq 1
    .rp_ref:          resq 1
    .rp_proj:         resq 1
    .rp_stride:       resq 1
    .r:               resq 1
    .n_tile_threads:  resd 1
    .n_frame_threads: resd 1
endstruc

SECTION .text

; movif32 dst, src
;   Expands to "mov dst, src" on x86-32 builds and to nothing otherwise. Used
;   for the spills and reloads that only the register-starved 32-bit path needs.
%macro movif32 2
%if ARCH_X86_32
    mov             %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block
; *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
;
; save_tmvs (ssse3)
;   Processes h = row_end8 - row_start8 rows. For each row it takes the block
;   row pointer rr[(2*y) & 30] and walks the columns from col_start8 up to
;   col_end8, writing one 5-byte entry per column to rp (4 mv bytes followed
;   by 1 ref byte). rp advances by stride*5 bytes per row.
;
;   Per candidate block, condition c (c = 0, 1) holds when
;       ref[c] > 0 && ref_sign[ref[c] - 1] && (abs(mv[c].x) | abs(mv[c].y)) < 4096
;   The entry is built from mv[1]/ref[1] if condition 1 holds, else from
;   mv[0]/ref[0] if condition 0 holds, else it is all zero.
;
;   The candidate's bs byte selects a save_tmvs_ssse3_table record, which gives
;   both the step to the next candidate and the .writeN routine that replicates
;   the entry over N columns. Two candidates are evaluated per iteration.
;
;   Column position x is kept as a negative byte offset ((col - col_end8) * 5)
;   relative to rp + col_end8*5, so reaching zero means end of row.
;   On x86-32, ystart/xend/rp live in stack slots and are reloaded as needed
;   because r1/r4/r3 double as r10/r11/rp inside the loop.
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                             xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq            m5, [ref_signq]
    lea        strided, [strided*5]
    mov        stridem, strided
    mov             r3, xstartm
    mov             r1, ystartm
    DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
    ; All constants and the write routines are addressed relative to .write1.
%define base base_reg-.write1
    LEA       base_reg, .write1
%if ARCH_X86_64
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    movq            m5, [ref_signq]
%endif
    movu            m4, [base+save_ref_shuf]
    movddup         m6, [base+save_cond0]
    movddup         m7, [base+save_cond1]
%if ARCH_X86_64
    mova            m8, [base+pb_128]
    mova            m9, [base+save_pack0+ 0]
    mova           m10, [base+save_pack0+16]
%endif
    ; Shift ref_sign up one byte: pshufb index ref then yields ref_sign[ref-1],
    ; and index 0 yields 0.
    psllq           m5, 8
%if ARCH_X86_64
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd     ; h
    add        ystartd, ystartd     ; row pointer index advances by 2 per row
    lea        strideq, [strideq*5]
    sub        xstartq, r9          ; -w5
    add          xendd, r9d         ; xend6
    add            rpq, r9          ; rp+xend5
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea             r0, [xendd*5]   ; xend5
    lea             r3, [r3*5]      ; xstart5
    sub             r3, r0          ; -w5
    mov            r6m, r3
%define xstartq r6m
    add          xendd, r0          ; xend6
    add            r0m, r0          ; rp+xend5
    mov          xendm, xendd
    sub             r5, r1          ; h
    add             r1, r1
    mov            r7m, r1
    mov            r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32    ystartd, r7m
    movif32      xendd, xendm
.loop_y_noload:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*gprsize]
    add        ystartd, 2
    movif32        r7m, ystartd
    lea             bq, [bq+xendq*4] ; b + xend*24 bytes
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul         candq, xq, 0x9999  ; x / 5 * 3
    sar          candq, 16
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            m0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0] ; step (w*3)
    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1] ; write offset
    add            r10, base_reg    ; r10 = write routine for the first block
    add          candq, r11
    jge .calc                       ; first block reaches the row end: no second block
    movu            m1, [bq+candq*8+12]
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add            r11, base_reg    ; r11 = write routine for the second block
.calc:
    movif32        rpq, r0m
    ; ref check
    punpckhqdq      m2, m0, m1
    pshufb          m2, m4          ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb          m3, m5, m2      ; ref > 0 && res_sign[ref - 1]
    ; mv check
    punpcklqdq      m2, m0, m1      ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw           m2, m2
    psrlw           m2, 12          ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd         m3, m2
    pshufd          m2, m3, q2301
    pand            m3, m6          ; b0c0 b0c1 b1c0 b1c1 | ...
    pand            m2, m7          ; b0c1 b0c0 b1c1 b1c0 | ...
    por             m3, m2          ; b0.shuf b1.shuf | ...
    pxor            m3, m8          ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m3
    pshufb          m1, m3
    ; Each write routine ends with "add xq, 5*N; ret". ret leaves the flags
    ; alone, so the branches after the calls test the sign of the updated x.
    call r10
    jge .next_line
    pshufd          m0, m1, q3232   ; second block's entry into the low qword
    call r11
    jl .loop_x
.next_line:
    add            rpq, stridemp
    movif32        r0m, rpq
    dec             hd
    jg .loop_y
    RET
; .writeN: store the 5-byte entry in the low bytes of m0 to N consecutive
; columns at rp+x, then advance x by 5*N (setting the flags the caller tests).
.write1:
    movd    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+1], m0
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+6], m0
    add             xq, 5*2
    ret
.write4:
    pshufb          m0, m9
    movu   [rpq+xq+ 0], m0
    psrlq           m0, 8
    movd   [rpq+xq+16], m0
    add             xq, 5*4
    ret
.write8:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    psrldq          m2, 2
    movq   [rpq+xq+32], m2
    add             xq, 5*8
    ret
.write16:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    shufps          m2, m0, q1032
    movu   [rpq+xq+48], m2
    shufps          m2, m0, q2121
    movu   [rpq+xq+32], m2
    shufps          m0, m2, q1032
    movu   [rpq+xq+64], m0
    add             xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
;
; splat_mv (sse2)
;   For each of the bh4 row pointers in rr, fills the bw4 consecutive 12-byte
;   entries bx4 .. bx4+bw4-1 with a copy of the 12 bytes at a.
;   m0/m1/m2 hold the three 16-byte rotations of that 12-byte pattern, so that
;   storing them back to back yields four whole entries per 48 bytes.
;   Stores are addressed backwards from the end of the span (aq+128 is the
;   byte just past entry bx4+bw4-1); the .wN labels fall through so wider
;   blocks reuse the stores of narrower ones.
;   The source is read with an aligned 16-byte load, and widths of 4 and up
;   use aligned stores.
;   NOTE(review): assumes bw4 is one of 1, 2, 4, 8, 16, 32 (the jump table has
;   exactly those entries, indexed by tzcnt) - confirm with callers.
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d        ; log2(bw4) = jump table index
    mova            m2, [aq]
    LEA             aq, splat_mv_sse2_table
    lea           bx4q, [bx4q*3-32] ; ((bx4+bw4)*12 - 128) / 4
    movsxd        bw4q, [aq+bw4q*4]
    movifnidn     bh4d, bh4m
    pshufd          m0, m2, q0210
    pshufd          m1, m2, q1021
    pshufd          m2, m2, q2102
    add           bw4q, aq          ; absolute address of the .wN entry point
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova    [aq-16*16], m0
    mova    [aq-16*15], m1
    mova    [aq-16*14], m2
    mova    [aq-16*13], m0
    mova    [aq-16*12], m1
    mova    [aq-16*11], m2
    mova    [aq-16*10], m0
    mova    [aq-16* 9], m1
    mova    [aq-16* 8], m2
    mova    [aq-16* 7], m0
    mova    [aq-16* 6], m1
    mova    [aq-16* 5], m2
.w16:
    mova    [aq-16* 4], m0
    mova    [aq-16* 3], m1
    mova    [aq-16* 2], m2
    mova    [aq-16* 1], m0
    mova    [aq+16* 0], m1
    mova    [aq+16* 1], m2
.w8:
    mova    [aq+16* 2], m0
    mova    [aq+16* 3], m1
    mova    [aq+16* 4], m2
.w4:
    mova    [aq+16* 5], m0
    mova    [aq+16* 6], m1
    mova    [aq+16* 7], m2
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], m0          ; 24 bytes ending at aq+128
    movq      [aq+120], m1
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], m0          ; 12 bytes ending at aq+128
    movd      [aq+124], m2
    dec           bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor           r14d, r14d
    cmp dword [rfq+rf.n_tile_threads], 1
Worker mov ih8d, [rfq+rf.ih8] 371*c0909341SAndroid Build Coastguard Worker mov iw8d, [rfq+rf.iw8] 372*c0909341SAndroid Build Coastguard Worker mov xstartd, xstartd 373*c0909341SAndroid Build Coastguard Worker mov xendd, xendd 374*c0909341SAndroid Build Coastguard Worker cmove tridxd, r14d 375*c0909341SAndroid Build Coastguard Worker lea xstartid, [xstartq-8] 376*c0909341SAndroid Build Coastguard Worker lea xendid, [xendq+8] 377*c0909341SAndroid Build Coastguard Worker mov strideq, [rfq+rf.rp_stride] 378*c0909341SAndroid Build Coastguard Worker mov rp_projq, [rfq+rf.rp_proj] 379*c0909341SAndroid Build Coastguard Worker cmp ih8d, yendd 380*c0909341SAndroid Build Coastguard Worker mov [rsp+0x30], strideq 381*c0909341SAndroid Build Coastguard Worker cmovs yendd, ih8d 382*c0909341SAndroid Build Coastguard Worker test xstartid, xstartid 383*c0909341SAndroid Build Coastguard Worker cmovs xstartid, r14d 384*c0909341SAndroid Build Coastguard Worker cmp iw8d, xendid 385*c0909341SAndroid Build Coastguard Worker cmovs xendid, iw8d 386*c0909341SAndroid Build Coastguard Worker mov troffq, strideq 387*c0909341SAndroid Build Coastguard Worker shl troffq, 4 388*c0909341SAndroid Build Coastguard Worker imul troffq, tridxq 389*c0909341SAndroid Build Coastguard Worker mov dstd, ystartd 390*c0909341SAndroid Build Coastguard Worker and dstd, 15 391*c0909341SAndroid Build Coastguard Worker imul dstq, strideq 392*c0909341SAndroid Build Coastguard Worker add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride 393*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq*5] 394*c0909341SAndroid Build Coastguard Worker add dstq, rp_projq 395*c0909341SAndroid Build Coastguard Worker lea troffq, [troffq*5] ; 16 * tridx * stride * 5 396*c0909341SAndroid Build Coastguard Worker lea r13d, [xendq*5] 397*c0909341SAndroid Build Coastguard Worker lea r12, [strideq*5] 398*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \ 399*c0909341SAndroid Build 
Coastguard Worker _, troff, xendi, xstarti, stride5, _, dst 400*c0909341SAndroid Build Coastguard Worker lea w5d, [xstartq*5] 401*c0909341SAndroid Build Coastguard Worker add r7, troffq ; rp_proj + tile_row_offset 402*c0909341SAndroid Build Coastguard Worker mov hd, yendd 403*c0909341SAndroid Build Coastguard Worker mov [rsp+0x28], r7 404*c0909341SAndroid Build Coastguard Worker add dstq, r13 405*c0909341SAndroid Build Coastguard Worker sub w5q, r13 406*c0909341SAndroid Build Coastguard Worker sub hd, ystartd 407*c0909341SAndroid Build Coastguard Worker.init_xloop_start: 408*c0909341SAndroid Build Coastguard Worker mov x5q, w5q 409*c0909341SAndroid Build Coastguard Worker test w5b, 1 410*c0909341SAndroid Build Coastguard Worker jz .init_2blk 411*c0909341SAndroid Build Coastguard Worker mov dword [dstq+x5q], 0x80008000 412*c0909341SAndroid Build Coastguard Worker add x5q, 5 413*c0909341SAndroid Build Coastguard Worker jz .init_next_row 414*c0909341SAndroid Build Coastguard Worker.init_2blk: 415*c0909341SAndroid Build Coastguard Worker mov dword [dstq+x5q+0], 0x80008000 416*c0909341SAndroid Build Coastguard Worker mov dword [dstq+x5q+5], 0x80008000 417*c0909341SAndroid Build Coastguard Worker add x5q, 10 418*c0909341SAndroid Build Coastguard Worker jl .init_2blk 419*c0909341SAndroid Build Coastguard Worker.init_next_row: 420*c0909341SAndroid Build Coastguard Worker add dstq, stride5q 421*c0909341SAndroid Build Coastguard Worker dec hd 422*c0909341SAndroid Build Coastguard Worker jg .init_xloop_start 423*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ 424*c0909341SAndroid Build Coastguard Worker _, _, xendi, xstarti, stride5, _, n 425*c0909341SAndroid Build Coastguard Worker mov r13d, [rfq+rf.n_mfmvs] 426*c0909341SAndroid Build Coastguard Worker test r13d, r13d 427*c0909341SAndroid Build Coastguard Worker jz .ret 428*c0909341SAndroid Build Coastguard Worker mov [rsp+0x0c], r13d 429*c0909341SAndroid Build Coastguard 
Worker mov strideq, [rsp+0x30] 430*c0909341SAndroid Build Coastguard Worker movddup m3, [pq_8192] 431*c0909341SAndroid Build Coastguard Worker mov r9d, ystartd 432*c0909341SAndroid Build Coastguard Worker mov [rsp+0x38], yendd 433*c0909341SAndroid Build Coastguard Worker mov [rsp+0x20], xstartid 434*c0909341SAndroid Build Coastguard Worker xor nd, nd 435*c0909341SAndroid Build Coastguard Worker xor n7d, n7d 436*c0909341SAndroid Build Coastguard Worker imul r9, strideq ; ystart * stride 437*c0909341SAndroid Build Coastguard Worker mov [rsp+0x48], rfq 438*c0909341SAndroid Build Coastguard Worker mov [rsp+0x18], stride5q 439*c0909341SAndroid Build Coastguard Worker lea r7, [r9*5] 440*c0909341SAndroid Build Coastguard Worker mov [rsp+0x24], ystartd 441*c0909341SAndroid Build Coastguard Worker mov [rsp+0x00], r7 442*c0909341SAndroid Build Coastguard Worker.nloop: 443*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ 444*c0909341SAndroid Build Coastguard Worker ref, rp_ref, xendi, xstarti, _, _, n 445*c0909341SAndroid Build Coastguard Worker mov rfq, [rsp+0x48] 446*c0909341SAndroid Build Coastguard Worker mov refd, [rfq+rf.mfmv_ref2cur+nq*4] 447*c0909341SAndroid Build Coastguard Worker cmp refd, 0x80000000 448*c0909341SAndroid Build Coastguard Worker je .next_n 449*c0909341SAndroid Build Coastguard Worker mov [rsp+0x40], refd 450*c0909341SAndroid Build Coastguard Worker mov offq, [rsp+0x00] ; ystart * stride * 5 451*c0909341SAndroid Build Coastguard Worker movzx refd, byte [rfq+rf.mfmv_ref+nq] 452*c0909341SAndroid Build Coastguard Worker lea refsignq, [refq-4] 453*c0909341SAndroid Build Coastguard Worker mov rp_refq, [rfq+rf.rp_ref] 454*c0909341SAndroid Build Coastguard Worker movq m2, refsignq 455*c0909341SAndroid Build Coastguard Worker add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset 456*c0909341SAndroid Build Coastguard Worker mov [rsp+0x14], nd 457*c0909341SAndroid Build Coastguard Worker mov yd, ystartd 
458*c0909341SAndroid Build Coastguard Worker.yloop: 459*c0909341SAndroid Build Coastguard Worker mov r11d, [rsp+0x24] ; ystart 460*c0909341SAndroid Build Coastguard Worker mov r12d, [rsp+0x38] ; yend 461*c0909341SAndroid Build Coastguard Worker mov r14d, yd 462*c0909341SAndroid Build Coastguard Worker and r14d, ~7 ; y_sb_align 463*c0909341SAndroid Build Coastguard Worker cmp r11d, r14d 464*c0909341SAndroid Build Coastguard Worker cmovs r11d, r14d ; imax(y_sb_align, ystart) 465*c0909341SAndroid Build Coastguard Worker mov [rsp+0x44], r11d ; y_proj_start 466*c0909341SAndroid Build Coastguard Worker add r14d, 8 467*c0909341SAndroid Build Coastguard Worker cmp r12d, r14d 468*c0909341SAndroid Build Coastguard Worker cmovs r14d, r12d ; imin(y_sb_align + 8, yend) 469*c0909341SAndroid Build Coastguard Worker mov [rsp+0x3c], r14d ; y_proj_end 470*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \ 471*c0909341SAndroid Build Coastguard Worker ref, x, xendi, mvx, mvy, rb, ref2ref 472*c0909341SAndroid Build Coastguard Worker mov xd, [rsp+0x20] ; xstarti 473*c0909341SAndroid Build Coastguard Worker.xloop: 474*c0909341SAndroid Build Coastguard Worker lea rbd, [xq*5] 475*c0909341SAndroid Build Coastguard Worker add rbq, srcq 476*c0909341SAndroid Build Coastguard Worker movsx refd, byte [rbq+4] 477*c0909341SAndroid Build Coastguard Worker test refd, refd 478*c0909341SAndroid Build Coastguard Worker jz .next_x_bad_ref 479*c0909341SAndroid Build Coastguard Worker mov rfq, [rsp+0x48] 480*c0909341SAndroid Build Coastguard Worker lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1] 481*c0909341SAndroid Build Coastguard Worker mov ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1] 482*c0909341SAndroid Build Coastguard Worker test ref2refd, ref2refd 483*c0909341SAndroid Build Coastguard Worker jz .next_x_bad_ref 484*c0909341SAndroid Build Coastguard Worker lea fracq, [mv_proj] 485*c0909341SAndroid Build Coastguard Worker movzx fracd, word 
[fracq+ref2refq*2] 486*c0909341SAndroid Build Coastguard Worker mov mvd, [rbq] 487*c0909341SAndroid Build Coastguard Worker imul fracd, [rsp+0x40] ; ref2cur 488*c0909341SAndroid Build Coastguard Worker pmovsxwq m0, [rbq] 489*c0909341SAndroid Build Coastguard Worker movd m1, fracd 490*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 491*c0909341SAndroid Build Coastguard Worker pmuldq m0, m1 ; mv * frac 492*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q3311 493*c0909341SAndroid Build Coastguard Worker paddd m0, m3 494*c0909341SAndroid Build Coastguard Worker paddd m0, m1 495*c0909341SAndroid Build Coastguard Worker psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14 496*c0909341SAndroid Build Coastguard Worker pabsd m1, m0 497*c0909341SAndroid Build Coastguard Worker packssdw m0, m0 498*c0909341SAndroid Build Coastguard Worker psrld m1, 6 499*c0909341SAndroid Build Coastguard Worker packuswb m1, m1 500*c0909341SAndroid Build Coastguard Worker pxor m0, m2 ; offset ^ ref_sign 501*c0909341SAndroid Build Coastguard Worker psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign) 502*c0909341SAndroid Build Coastguard Worker movq mvxq, m1 503*c0909341SAndroid Build Coastguard Worker lea mvyd, [mvxq+yq] ; ypos 504*c0909341SAndroid Build Coastguard Worker sar mvxq, 32 505*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \ 506*c0909341SAndroid Build Coastguard Worker ref, x, xendi, mvx, ypos, rb, ref2ref 507*c0909341SAndroid Build Coastguard Worker cmp yposd, [rsp+0x44] ; y_proj_start 508*c0909341SAndroid Build Coastguard Worker jl .next_x_bad_pos_y 509*c0909341SAndroid Build Coastguard Worker cmp yposd, [rsp+0x3c] ; y_proj_end 510*c0909341SAndroid Build Coastguard Worker jge .next_x_bad_pos_y 511*c0909341SAndroid Build Coastguard Worker and yposd, 15 512*c0909341SAndroid Build Coastguard Worker add mvxq, xq ; xpos 513*c0909341SAndroid Build Coastguard Worker imul yposq, [rsp+0x30] ; pos = (ypos & 15) * 
stride 514*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \ 515*c0909341SAndroid Build Coastguard Worker ref, x, xendi, xpos, pos, rb, ref2ref 516*c0909341SAndroid Build Coastguard Worker mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset 517*c0909341SAndroid Build Coastguard Worker add posq, xposq ; pos += xpos 518*c0909341SAndroid Build Coastguard Worker lea posq, [posq*5] 519*c0909341SAndroid Build Coastguard Worker add dstq, posq ; dst += pos5 520*c0909341SAndroid Build Coastguard Worker jmp .write_loop_entry 521*c0909341SAndroid Build Coastguard Worker.write_loop: 522*c0909341SAndroid Build Coastguard Worker add rbq, 5 523*c0909341SAndroid Build Coastguard Worker cmp refb, byte [rbq+4] 524*c0909341SAndroid Build Coastguard Worker jne .xloop 525*c0909341SAndroid Build Coastguard Worker cmp mvd, [rbq] 526*c0909341SAndroid Build Coastguard Worker jne .xloop 527*c0909341SAndroid Build Coastguard Worker add dstq, 5 528*c0909341SAndroid Build Coastguard Worker inc xposd 529*c0909341SAndroid Build Coastguard Worker.write_loop_entry: 530*c0909341SAndroid Build Coastguard Worker mov r12d, xd 531*c0909341SAndroid Build Coastguard Worker and r12d, ~7 532*c0909341SAndroid Build Coastguard Worker lea r5d, [r12-8] 533*c0909341SAndroid Build Coastguard Worker cmp r5d, xstartd 534*c0909341SAndroid Build Coastguard Worker cmovs r5d, xstartd ; x_proj_start 535*c0909341SAndroid Build Coastguard Worker cmp xposd, r5d 536*c0909341SAndroid Build Coastguard Worker jl .next_xpos 537*c0909341SAndroid Build Coastguard Worker add r12d, 16 538*c0909341SAndroid Build Coastguard Worker cmp xendd, r12d 539*c0909341SAndroid Build Coastguard Worker cmovs r12d, xendd ; x_proj_end 540*c0909341SAndroid Build Coastguard Worker cmp xposd, r12d 541*c0909341SAndroid Build Coastguard Worker jge .next_xpos 542*c0909341SAndroid Build Coastguard Worker mov [dstq+0], mvd 543*c0909341SAndroid Build Coastguard Worker mov byte [dstq+4], ref2refb 
544*c0909341SAndroid Build Coastguard Worker.next_xpos: 545*c0909341SAndroid Build Coastguard Worker inc xd 546*c0909341SAndroid Build Coastguard Worker cmp xd, xendid 547*c0909341SAndroid Build Coastguard Worker jl .write_loop 548*c0909341SAndroid Build Coastguard Worker.next_y: 549*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n 550*c0909341SAndroid Build Coastguard Worker add srcq, [rsp+0x18] ; stride5 551*c0909341SAndroid Build Coastguard Worker inc yd 552*c0909341SAndroid Build Coastguard Worker cmp yd, [rsp+0x38] ; yend 553*c0909341SAndroid Build Coastguard Worker jne .yloop 554*c0909341SAndroid Build Coastguard Worker mov nd, [rsp+0x14] 555*c0909341SAndroid Build Coastguard Worker mov ystartd, [rsp+0x24] 556*c0909341SAndroid Build Coastguard Worker.next_n: 557*c0909341SAndroid Build Coastguard Worker add n7d, 7 558*c0909341SAndroid Build Coastguard Worker inc nd 559*c0909341SAndroid Build Coastguard Worker cmp nd, [rsp+0x0c] ; n_mfmvs 560*c0909341SAndroid Build Coastguard Worker jne .nloop 561*c0909341SAndroid Build Coastguard Worker.ret: 562*c0909341SAndroid Build Coastguard Worker RET 563*c0909341SAndroid Build Coastguard Worker.next_x: 564*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _ 565*c0909341SAndroid Build Coastguard Worker add rbq, 5 566*c0909341SAndroid Build Coastguard Worker cmp refb, byte [rbq+4] 567*c0909341SAndroid Build Coastguard Worker jne .xloop 568*c0909341SAndroid Build Coastguard Worker cmp mvd, [rbq] 569*c0909341SAndroid Build Coastguard Worker jne .xloop 570*c0909341SAndroid Build Coastguard Worker.next_x_bad_pos_y: 571*c0909341SAndroid Build Coastguard Worker inc xd 572*c0909341SAndroid Build Coastguard Worker cmp xd, xendid 573*c0909341SAndroid Build Coastguard Worker jl .next_x 574*c0909341SAndroid Build Coastguard Worker jmp .next_y 575*c0909341SAndroid Build Coastguard Worker.next_x_bad_ref: 
; Tail of the .next_x_bad_ref path of the function whose header lies above
; this excerpt: advance x, restart at .xloop while x < xendi (signed compare),
; otherwise continue with .next_y.
    inc             xd
    cmp             xd, xendid
    jl              .xloop
    jmp             .next_y

INIT_YMM avx2
;------------------------------------------------------------------------------
; save_tmvs (AVX2)
;
; C-side arguments, in order:
;   refmvs_temporal_block *rp, ptrdiff_t stride,
;   refmvs_block **rr, uint8_t *ref_sign,
;   int col_end8, int row_end8, int col_start8, int row_start8
;
; Processes (row_end8 - row_start8) rows. Within a row, x is a negative byte
; offset that runs from (col_start8 - col_end8) * 5 up to 0; rp is biased by
; col_end8 * 5 so that [rp+x] addresses the current output entry. Each output
; entry is 5 bytes (see .write1). Up to two candidate blocks are handled per
; .loop_x iteration, one per 128-bit lane of m0.
;
; Register roles after DEFINE_ARGS:
;   rp      output row pointer + col_end8 * 5, advanced by stride * 5 per row
;   x       negative byte offset into the output row, counts up towards 0
;   xend    col_end8 * 6 (scaled by 4 when forming b)
;   h       rows left to process
;   xstart  (col_start8 - col_end8) * 5, the initial x of every row
;   ystart  2 * row index, masked with 30 before indexing rr
;   b       rr[ystart] + xend * 4, base for candidate loads
;   cand    candidate index derived from x (scaled by 8 in addresses)
;   r10/r11 addresses of the write routine for candidate 0 / candidate 1
;   r12     address of .write1; also the base for all constant accesses
;
; The write routines finish with "add xq, 5*N" followed by a plain "ret", so
; the flags of that add are still live after the call: the jge/jl that follow
; "call r10" / "call r11" test whether x has reached 0.
;------------------------------------------------------------------------------
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea             r12, [.write1]
    movifnidn       xendd, xendm
    movifnidn       yendd, yendm
    mov             xstartd, xstartm
    mov             ystartd, ystartm

    ; Vector constants.
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vpbroadcastq    m4, [ref_signq]
    psllq           m4, 8                   ; shift in a zero byte at index 0
    vpbroadcastq    m5, [base+save_cond0]
    vpbroadcastq    m6, [base+save_cond1]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    mova            m9, [base+save_pack1]

    ; Scalar setup (r9 is a temporary here; it becomes cand below).
    lea             strideq, [strideq*5]    ; row pitch of rp in bytes
    lea             r9d, [xendq*5]          ; col_end8 * 5
    lea             xstartd, [xstartq*5]    ; col_start8 * 5
    sub             yendd, ystartd          ; row count (named h below)
    add             ystartd, ystartd        ; 2 * row_start8
    sub             xstartq, r9             ; (col_start8 - col_end8) * 5
    add             xendd, r9d              ; col_end8 * 6
    add             rpq, r9                 ; bias rp by col_end8 * 5
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and             ystartd, 30             ; wrap the row-pointer index
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add             ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul            candq, xq, 0x9999
    sar             candq, 16               ; x / 5 * 3
    movzx           r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            xm0, [bq+candq*8+12]    ; cand_b
    ; Table entry for bs: byte 0 is the step added to cand, byte 1 is the
    ; offset of the matching write routine relative to .write1.
    movzx           r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx           r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add             r10, r12
    add             candq, r11
    jge             .calc                   ; no second candidate in this row
    vinserti128     m0, [bq+candq*8+12], 1  ; second candidate -> high lane
    movzx           r11d, byte [bq+candq*8+22]
    movzx           r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add             r11, r12
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1              ; ref > 0 && res_sign[ref - 1]
    psrlw           m2, 12                  ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd         m1, m2
    pshufd          m2, m1, q2301
    pand            m1, m5                  ; b0.cond0 b1.cond0
    pand            m2, m6                  ; b0.cond1 b1.cond1
    por             m1, m2                  ; b0.shuf b1.shuf
    pxor            m1, m7                  ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m1
    call            r10                     ; store candidate 0, x += 5*N
    jge             .next_line              ; flags come from the add in .writeN
    vextracti128    xm0, m0, 1
    call            r11                     ; store candidate 1, x += 5*N
    jl              .loop_x
.next_line:
    add             rpq, strideq
    dec             hd
    jg              .loop_y
    RET

; Local store routines. Input: xm0 = packed entry data. Each one stores N
; consecutive 5-byte entries at [rp+x], advances x by 5*N and returns with the
; flags of that addition intact.
.write1:                                    ; 4 + 1 bytes
    movd            [rpq+xq+0], xm0
    pextrb          [rpq+xq+4], xm0, 4
    add             xq, 5*1
    ret
.write2:                                    ; bytes 0..9
    movq            [rpq+xq+0], xm0
    psrlq           xm1, xm0, 8
    movd            [rpq+xq+6], xm1
    add             xq, 5*2
    ret
.write4:                                    ; 16 + 4 bytes
    pshufb          xm1, xm0, xm8
    movu            [rpq+xq+0], xm1
    psrlq           xm1, 8
    movd            [rpq+xq+16], xm1
    add             xq, 5*4
    ret
.write8:                                    ; 32 + 8 bytes
    vinserti128     m1, m0, xm0, 1
    pshufb          m1, m8
    movu            [rpq+xq+0], m1
    psrldq          xm1, 2
    movq            [rpq+xq+32], xm1
    add             xq, 5*8
    ret
.write16:                                   ; 32 + 32 + 16 bytes
    vinserti128     m1, m0, xm0, 1
    pshufb          m2, m1, m8
    movu            [rpq+xq+0], m2
    pshufb          m1, m9
    movu            [rpq+xq+32], m1
    shufps          xm2, xm1, q1021
    movu            [rpq+xq+64], xm2
    add             xq, 5*16
    ret

;------------------------------------------------------------------------------
; splat_mv (AVX2)
;
; Arguments as named by cglobal: rr, a, bx4, bw4, bh4.
;   rr   array of row pointers; one pointer is consumed per row
;   a    pointer to the 16 source bytes that get replicated
;   bx4  first column, bw4 column count, bh4 row count
;
; The store width per row is 12 bytes * bw4 (12 bytes for .w1 up to 384 bytes
; for .w32). bw4 is turned into a jump-table index with tzcnt, so it is
; presumably always a power of two between 1 and 32 -- confirm with callers.
;
; Per row, aq is set to row + (bx4 + bw4) * 12 - 128, so every width variant
; ends its stores at aq + 128; the wider variants fall through into the
; narrower ones.
;------------------------------------------------------------------------------
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    ; Build the three 32-byte store patterns from the 16 source bytes.
    vbroadcasti128  m0, [aq]
    pshufb          m0, [splat_mv_shuf]
    pshufd          m1, m0, q2102
    pshufd          m2, m0, q1021
    ; Column offset: ((bx4 + bw4) * 3 - 32) * 4 bytes, applied in .loop.
    add             bx4d, bw4d
    lea             bx4q, [bx4q*3-32]
    ; Resolve the jump target for this width.
    tzcnt           bw4d, bw4d
    lea             aq, [splat_mv_avx2_table]
    movsxd          bw4q, [aq+bw4q*4]
    add             bw4q, aq
    movifnidn       bh4d, bh4m
.loop:
    mov             aq, [rrq]               ; next row pointer
    add             rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp             bw4q
.w32:
    mova            [aq-32*8], m0
    mova            [aq-32*7], m1
    mova            [aq-32*6], m2
    mova            [aq-32*5], m0
    mova            [aq-32*4], m1
    mova            [aq-32*3], m2
.w16:
    mova            [aq-32*2], m0
    mova            [aq-32*1], m1
    mova            [aq+32*0], m2
.w8:
    mova            [aq+32*1], m0
    mova            [aq+32*2], m1
    mova            [aq+32*3], m2
    dec             bh4d
    jg              .loop
    RET
.w4:                                        ; 32 + 16 bytes
    movu            [aq+80], m0
    mova            [aq+112], xm1
    dec             bh4d
    jg              .loop
    RET
.w2:                                        ; 16 + 8 bytes
    movu            [aq+104], xm0
    movq            [aq+120], xm2
    dec             bh4d
    jg              .loop
    RET
.w1:                                        ; 8 + 4 bytes
    movq            [aq+116], xm0
    movd            [aq+124], xm1
    dec             bh4d
    jg              .loop
    RET

INIT_ZMM avx512icl
;------------------------------------------------------------------------------
; save_tmvs (AVX-512 ICL)
;
; C-side arguments, in order:
;   refmvs_temporal_block *rp, ptrdiff_t stride,
;   refmvs_block **rr, uint8_t *ref_sign,
;   int col_end8, int row_end8, int col_start8, int row_start8
;
; Same row/column walk as the AVX2 version above in this file, but up to four
; candidate blocks are gathered per .loop_x iteration, one per 128-bit lane
; of m0.
;   r10..r13  write routine address for candidate 0..3
;   r14       address of .write1; also the base for all constant accesses
;   k2        0x1f: five active elements, so the masked stores in
;             .write1/2/4/8 emit 5 bytes / words / dwords / qwords
;
; As in the AVX2 version, the jge/jl after each "call" consume the flags left
; by the "add xq, 5*N" inside the write routine.
;------------------------------------------------------------------------------
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea             r14, [.write1]
    movifnidn       xendd, xendm
    movifnidn       yendd, yendm
    mov             xstartd, xstartm
    mov             ystartd, ystartm

    ; Vector constants and store mask.
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    psllq           m4, [ref_signq]{bcstq}, 8
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    movu            xm9, [base+save_pack0+4]
    mov             r10d, 0x1f
    kmovb           k2, r10d

    ; Scalar setup (r9 is a temporary here; it becomes cand below).
    lea             strideq, [strideq*5]    ; row pitch of rp in bytes
    lea             r9d, [xendq*5]          ; col_end8 * 5
    lea             xstartd, [xstartq*5]    ; col_start8 * 5
    sub             yendd, ystartd          ; row count (named h below)
    add             ystartd, ystartd        ; 2 * row_start8
    sub             xstartq, r9             ; (col_start8 - col_end8) * 5
    add             xendd, r9d              ; col_end8 * 6
    add             rpq, r9                 ; bias rp by col_end8 * 5
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and             ystartd, 30             ; wrap the row-pointer index
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add             ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul            candq, xq, 0x9999
    sar             candq, 16               ; x / 5 * 3
    ; Candidate 0 -> lane 0, write routine in r10.
    movzx           r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            xm0, [bq+candq*8+12]    ; cand_b
    movzx           r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx           r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add             r10, r14
    add             candq, r11
    jge             .calc
    ; Candidate 1 -> lane 1, write routine in r11.
    movzx           r11d, byte [bq+candq*8+22]
    vinserti32x4    ym0, [bq+candq*8+12], 1
    movzx           r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx           r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add             r11, r14
    add             candq, r12
    jge             .calc
    ; Candidate 2 -> lane 2, write routine in r12.
    movzx           r12d, byte [bq+candq*8+22]
    vinserti32x4    m0, [bq+candq*8+12], 2
    movzx           r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx           r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add             r12, r14
    add             candq, r13
    jge             .calc
    ; Candidate 3 -> lane 3, write routine in r13.
    vinserti32x4    m0, [bq+candq*8+12], 3
    movzx           r13d, byte [bq+candq*8+22]
    movzx           r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add             r13, r14
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1              ; ref > 0 && res_sign[ref - 1]
    psrlw           m2, 12                  ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd           m2, m1
    pshufb          m2, m5                  ; c0 c1 c1 c0
    pand            m2, m6
    punpckhqdq      m1, m2, m2
    vpternlogd      m1, m2, m7, 0x56        ; (c0shuf | c1shuf) ^ 0x80
    pshufb          m2, m0, m1              ; m2 keeps all four lanes
    mova            xm0, xm2
    call            r10                     ; lane 0
    jge             .next_line
    vextracti32x4   xm0, m2, 1
    call            r11                     ; lane 1
    jge             .next_line
    vextracti32x4   xm0, m2, 2
    call            r12                     ; lane 2
    jge             .next_line
    vextracti32x4   xm0, m2, 3
    call            r13                     ; lane 3
    jl              .loop_x
.next_line:
    add             rpq, strideq
    dec             hd
    jg              .loop_y
    RET

; Local store routines. Input: xm0 = packed entry data. Each one stores N
; consecutive 5-byte entries at [rp+x], advances x by 5*N and returns with the
; flags of that addition intact.
.write1:                                    ; 5 bytes
    vmovdqu8        [rpq+xq]{k2}, xm0
    add             xq, 5*1
    ret
.write2:                                    ; 5 words
    pshufb          xm0, xm8
    vmovdqu16       [rpq+xq]{k2}, xm0
    add             xq, 5*2
    ret
.write4:                                    ; 5 dwords
    vpermb          ym0, ym8, ym0
    vmovdqu32       [rpq+xq]{k2}, ym0
    add             xq, 5*4
    ret
.write8:                                    ; 5 qwords
    vpermb          m0, m8, m0
    vmovdqu64       [rpq+xq]{k2}, m0
    add             xq, 5*8
    ret
.write16:                                   ; 64 + 16 bytes
    vpermb          m1, m8, m0
    movu            [rpq+xq+0], m1
    pshufb          xm0, xm9
    movu            [rpq+xq+64], xm0
    add             xq, 5*16
    ret

INIT_ZMM avx512icl
;------------------------------------------------------------------------------
; splat_mv (AVX-512 ICL)
;
; Arguments as named by cglobal: rr, a, bx4, bw4, bh4.
;   rr   array of row pointers
;   a    pointer to the 16 source bytes that get replicated
;   bx4  first column, bw4 column count, bh4 row count
;
; Each row receives 12 bytes * bw4 starting at row + bx4 * 12. rr is advanced
; past the last row and r6 counts from -bh4 up to 0, so [rrq+r6*8] walks the
; row pointers. k1 = 0x3f enables six elements, which makes the masked stores
; in .w1/.w2/.w4 emit 6 words / dwords / qwords.
;
; .w8 and .w16 handle two rows per iteration (r6 += 2); presumably bh4 is
; always even for those widths -- confirm with callers.
;------------------------------------------------------------------------------
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4 m0, [aq]                ; aq is r1, reused as scratch below
    pshufb          m0, [splat_mv_shuf]
    ; Resolve the jump target for this width.
    lea             r1, [splat_mv_avx512icl_table]
    tzcnt           bw4d, bw4d
    movsxd          bw4q, [r1+bw4q*4]
    add             bw4q, r1
    lea             bx4d, [bx4q*3]          ; scaled by 4 in the stores
    ; Row counter: r6 = -bh4, rr points one past the last row pointer.
    mov             r6d, bh4m
    lea             rrq, [rrq+r6*8]
    neg             r6
    mov             r1d, 0x3f
    kmovb           k1, r1d
    jmp             bw4q
.w1:                                        ; 6 words
    mov             r1, [rrq+r6*8]
    vmovdqu16       [r1+bx4q*4]{k1}, xm0
    inc             r6
    jl              .w1
    RET
.w2:                                        ; 6 dwords
    mov             r1, [rrq+r6*8]
    vmovdqu32       [r1+bx4q*4]{k1}, ym0
    inc             r6
    jl              .w2
    RET
.w4:                                        ; 6 qwords
    mov             r1, [rrq+r6*8]
    vmovdqu64       [r1+bx4q*4]{k1}, m0
    inc             r6
    jl              .w4
    RET
.w8:                                        ; 64 + 32 bytes per row
    pshufd          ym1, ym0, q1021
.w8_loop:
    mov             r1, [rrq+r6*8+0]
    mov             r3, [rrq+r6*8+8]
    movu            [r1+bx4q*4+0], m0
    mova            [r1+bx4q*4+64], ym1
    movu            [r3+bx4q*4+0], m0
    mova            [r3+bx4q*4+64], ym1
    add             r6, 2
    jl              .w8_loop
    RET
.w16:                                       ; 3 * 64 bytes per row
    pshufd          m1, m0, q1021
    pshufd          m2, m0, q2102
.w16_loop:
    mov             r1, [rrq+r6*8+0]
    mov             r3, [rrq+r6*8+8]
    mova            [r1+bx4q*4+64*0], m0
    mova            [r1+bx4q*4+64*1], m1
    mova            [r1+bx4q*4+64*2], m2
    mova            [r3+bx4q*4+64*0], m0
    mova            [r3+bx4q*4+64*1], m1
    mova            [r3+bx4q*4+64*2], m2
    add             r6, 2
    jl              .w16_loop
    RET
.w32:                                       ; 6 * 64 bytes per row
    pshufd          m1, m0, q1021
    pshufd          m2, m0, q2102
.w32_loop:
    mov             r1, [rrq+r6*8]
    lea             r1, [r1+bx4q*4]
    mova            [r1+64*0], m0
    mova            [r1+64*1], m1
    mova            [r1+64*2], m2
    mova            [r1+64*3], m0
    mova            [r1+64*4], m1
    mova            [r1+64*5], m2
    inc             r6
    jl              .w32_loop
    RET
%endif ; ARCH_X86_64