xref: /aosp_15_r20/external/libdav1d/src/x86/refmvs.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
30*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, width0 [, width1, ...]
; Emits a table of 32-bit offsets under the label <name>_table. Entry i holds
; the distance from the table label to the local label .w<width_i> inside the
; function mangle(private_prefix_<name>). The sse2 splat_mv in this file reads
; an entry with movsxd and adds the table address back to form a jump target.
31*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 2-*
32*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1)
33*c0909341SAndroid Build Coastguard Worker    %1_table:
34*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_table
    ; one dword per width argument; the rotate shifts the next width into
    ; the second parameter slot for the following iteration
35*c0909341SAndroid Build Coastguard Worker    %rep %0 - 1
36*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .w%2 - %%base
37*c0909341SAndroid Build Coastguard Worker        %rotate 1
38*c0909341SAndroid Build Coastguard Worker    %endrep
39*c0909341SAndroid Build Coastguard Worker%endmacro
40*c0909341SAndroid Build Coastguard Worker
; SAVE_TMVS_TABLE num_entries, w, suffix
; Emits num_entries identical two-byte records for save_tmvs_<suffix>:
;   byte 0: w*3, the amount the ssse3 save_tmvs adds to its candidate index
;   byte 1: distance from .write1 to .write<w>; the ssse3 save_tmvs
;           zero-extends it and adds the address of .write1 to get the routine
;           it calls. Being a single unsigned byte, it can only describe
;           routines located at or after .write1 and less than 256 bytes away.
41*c0909341SAndroid Build Coastguard Worker%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
42*c0909341SAndroid Build Coastguard Worker    %rep %1
43*c0909341SAndroid Build Coastguard Worker        db %2*3
44*c0909341SAndroid Build Coastguard Worker        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
45*c0909341SAndroid Build Coastguard Worker           mangle(private_prefix %+ _save_tmvs_%3).write1
46*c0909341SAndroid Build Coastguard Worker    %endrep
47*c0909341SAndroid Build Coastguard Worker%endmacro
48*c0909341SAndroid Build Coastguard Worker
; Read-only constants. The section is declared with 64-byte alignment, and the
; sizes below keep save_pack0 and pb_128 on 16-byte boundaries in both the
; x86-64 layout (offsets 128 and 240) and the x86-32 layout (offsets 0 and
; 112). The ssse3 save_tmvs depends on that: it loads them with mova on x86-64
; and uses them directly as memory operands on x86-32. Keep this in mind when
; inserting or resizing entries.
;
; mv_proj (x86-64 only): entry n equals 16384 / n rounded down for n = 1..31;
; entry 0 is 0. Not referenced in the visible part of this file; presumably
; used by the load_tmvs code further down - TODO confirm.
;
; splat_mv_shuf (x86-64 only): 64 byte indices cycling through 0..11, i.e. a
; 12-byte pattern repeated across 64 bytes. Not referenced in the visible part
; of this file; presumably a shuffle mask for the wider splat_mv variants -
; TODO confirm.
49*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
50*c0909341SAndroid Build Coastguard Workermv_proj:       dw    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
51*c0909341SAndroid Build Coastguard Worker               dw 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
52*c0909341SAndroid Build Coastguard Worker               dw 1024,   963,  910,  862,  819,  780,  744,  712
53*c0909341SAndroid Build Coastguard Worker               dw  682,   655,  630,  606,  585,  564,  546,  528
54*c0909341SAndroid Build Coastguard Workersplat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
55*c0909341SAndroid Build Coastguard Worker               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
56*c0909341SAndroid Build Coastguard Worker               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
57*c0909341SAndroid Build Coastguard Worker               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
58*c0909341SAndroid Build Coastguard Worker%endif
; save_pack0 + save_pack1: 64 byte indices cycling through 0..4 (period 5).
; The ssse3 save_tmvs uses save_pack0+0 and save_pack0+16 as pshufb masks to
; repeat a 5-byte entry across a vector. save_pack1 continues the same cycle;
; it is not referenced in the visible part of this file.
59*c0909341SAndroid Build Coastguard Workersave_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
60*c0909341SAndroid Build Coastguard Worker               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
61*c0909341SAndroid Build Coastguard Workersave_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
62*c0909341SAndroid Build Coastguard Worker               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
; save_ref_shuf: pshufb mask that places source bytes 0, 1, 8 and 9 in the low
; byte of dwords 0..3 and zeroes every other byte (-1 has the top bit set).
63*c0909341SAndroid Build Coastguard Workersave_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
; cond_shuf512: not referenced in the visible part of this file; the name
; suggests it belongs to the AVX-512 save_tmvs - TODO confirm.
64*c0909341SAndroid Build Coastguard Workercond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
; save_cond0 / save_cond1: byte patterns that save_tmvs ANDs with its per-mv
; condition masks and ORs together. After the XOR with pb_128, a surviving
; pattern byte becomes a pshufb source index (0x80+i -> i), while a byte that
; was masked to 0 becomes 0x80 and therefore makes pshufb write zero.
65*c0909341SAndroid Build Coastguard Workersave_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
66*c0909341SAndroid Build Coastguard Workersave_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
; pb_128: sixteen bytes of 0x80, the XOR constant described above.
67*c0909341SAndroid Build Coastguard Workerpb_128:        times 16 db 128
; pq_8192: one qword holding 8192; load_tmvs broadcasts it into m3 with movddup.
68*c0909341SAndroid Build Coastguard Workerpq_8192:       dq 8192
69*c0909341SAndroid Build Coastguard Worker
; Dispatch tables.
;
; Each save_tmvs_<isa>_table consists of 2 + 4 + 4 + 5 + 7 = 22 two-byte
; records (see SAVE_TMVS_TABLE). The ssse3 save_tmvs indexes its table with the
; byte it reads as cand_b->bs, so record k describes block-size value k: the
; first 2 values select .write16, the next 4 .write8, the next 4 .write4, the
; next 5 .write2 and the last 7 .write1.
70*c0909341SAndroid Build Coastguard Workersave_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
71*c0909341SAndroid Build Coastguard Worker                       SAVE_TMVS_TABLE 4,  8, ssse3
72*c0909341SAndroid Build Coastguard Worker                       SAVE_TMVS_TABLE 4,  4, ssse3
73*c0909341SAndroid Build Coastguard Worker                       SAVE_TMVS_TABLE 5,  2, ssse3
74*c0909341SAndroid Build Coastguard Worker                       SAVE_TMVS_TABLE 7,  1, ssse3
75*c0909341SAndroid Build Coastguard Worker
; The AVX2 and AVX-512 tables are only emitted for x86-64 builds.
76*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
77*c0909341SAndroid Build Coastguard Workersave_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
78*c0909341SAndroid Build Coastguard Worker                      SAVE_TMVS_TABLE 4,  8, avx2
79*c0909341SAndroid Build Coastguard Worker                      SAVE_TMVS_TABLE 4,  4, avx2
80*c0909341SAndroid Build Coastguard Worker                      SAVE_TMVS_TABLE 5,  2, avx2
81*c0909341SAndroid Build Coastguard Worker                      SAVE_TMVS_TABLE 7,  1, avx2
82*c0909341SAndroid Build Coastguard Worker
83*c0909341SAndroid Build Coastguard Workersave_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
84*c0909341SAndroid Build Coastguard Worker                           SAVE_TMVS_TABLE 4,  8, avx512icl
85*c0909341SAndroid Build Coastguard Worker                           SAVE_TMVS_TABLE 4,  4, avx512icl
86*c0909341SAndroid Build Coastguard Worker                           SAVE_TMVS_TABLE 5,  2, avx512icl
87*c0909341SAndroid Build Coastguard Worker                           SAVE_TMVS_TABLE 7,  1, avx512icl
88*c0909341SAndroid Build Coastguard Worker
; splat_mv jump tables: entry i is the offset of the .w<1 << i> label. The sse2
; splat_mv computes i as tzcnt(bw4).
89*c0909341SAndroid Build Coastguard WorkerJMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
90*c0909341SAndroid Build Coastguard WorkerJMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
91*c0909341SAndroid Build Coastguard Worker%endif
92*c0909341SAndroid Build Coastguard Worker
93*c0909341SAndroid Build Coastguard WorkerJMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32
94*c0909341SAndroid Build Coastguard Worker
; Field offsets for the structure that load_tmvs receives as its first
; argument (documented there as "refmvs_frame *rf"). Presumably this mirrors
; the C definition of refmvs_frame and must be kept in sync with it - verify
; against the C header when either side changes.
; Layout notes derived from the sizes below: the four byte arrays
; (7 + 7 + 7 + 3 = 24 bytes) start at offset 32, so mfmv_ref2cur lands on
; offset 56 and .rp on offset 160 without any explicit padding; the total size
; is 208 bytes.
95*c0909341SAndroid Build Coastguard Workerstruc rf
96*c0909341SAndroid Build Coastguard Worker    .frm_hdr:         resq 1
97*c0909341SAndroid Build Coastguard Worker    .iw4:             resd 1
98*c0909341SAndroid Build Coastguard Worker    .ih4:             resd 1
99*c0909341SAndroid Build Coastguard Worker    .iw8:             resd 1
100*c0909341SAndroid Build Coastguard Worker    .ih8:             resd 1
101*c0909341SAndroid Build Coastguard Worker    .sbsz:            resd 1
102*c0909341SAndroid Build Coastguard Worker    .use_rf_mvs:      resd 1
103*c0909341SAndroid Build Coastguard Worker    .sign_bias:       resb 7
104*c0909341SAndroid Build Coastguard Worker    .mfmv_sign:       resb 7
105*c0909341SAndroid Build Coastguard Worker    .pocdiff:         resb 7
106*c0909341SAndroid Build Coastguard Worker    .mfmv_ref:        resb 3
107*c0909341SAndroid Build Coastguard Worker    .mfmv_ref2cur:    resd 3
108*c0909341SAndroid Build Coastguard Worker    .mfmv_ref2ref:    resd 3*7
109*c0909341SAndroid Build Coastguard Worker    .n_mfmvs:         resd 1
110*c0909341SAndroid Build Coastguard Worker    .n_blocks:        resd 1
111*c0909341SAndroid Build Coastguard Worker    .rp:              resq 1
112*c0909341SAndroid Build Coastguard Worker    .rp_ref:          resq 1
113*c0909341SAndroid Build Coastguard Worker    .rp_proj:         resq 1
114*c0909341SAndroid Build Coastguard Worker    .rp_stride:       resq 1
115*c0909341SAndroid Build Coastguard Worker    .r:               resq 1
116*c0909341SAndroid Build Coastguard Worker    .n_tile_threads:  resd 1
117*c0909341SAndroid Build Coastguard Worker    .n_frame_threads: resd 1
118*c0909341SAndroid Build Coastguard Workerendstruc
119*c0909341SAndroid Build Coastguard Worker
120*c0909341SAndroid Build Coastguard WorkerSECTION .text
121*c0909341SAndroid Build Coastguard Worker
; movif32 dst, src
; Emits "mov dst, src" on x86-32 builds and nothing on x86-64. Used for values
; that live in registers on x86-64 but must be reloaded from or spilled to
; stack slots on x86-32.
122*c0909341SAndroid Build Coastguard Worker%macro movif32 2
123*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
124*c0909341SAndroid Build Coastguard Worker    mov             %1, %2
125*c0909341SAndroid Build Coastguard Worker%endif
126*c0909341SAndroid Build Coastguard Worker%endmacro
127*c0909341SAndroid Build Coastguard Worker
; save_tmvs (SSSE3)
;
; Walks row_end8 - row_start8 rows. For every row it reads candidate blocks
; from the row pointer rr[(2 * row8) & 30], where row8 starts at row_start8
; and advances by one per row, and writes packed 5-byte entries to rp for
; columns col_start8 .. col_end8 - 1. rp advances by stride * 5 bytes per row.
;
; Each output entry is 4 mv bytes followed by 1 ref byte. For a candidate the
; second (mv1, ref1) pair is used if it passes the checks below, otherwise the
; first pair, otherwise the entry is all zero. The entry is then replicated
; over the candidate's width by one of the .write1 .. .write16 routines, which
; is selected through save_tmvs_ssse3_table using the candidate's bs byte.
;
; Loop bookkeeping: x is a negative byte offset that counts up to 0, rp is
; biased by col_end8 * 5 so that [rp + x] is the current entry, and the
; candidate pointer is biased the same way. Two candidates are evaluated per
; .loop_x iteration (m0 and m1).
;
; The .writeN routines are reached with "call" and finish with "add xq, 5*N"
; followed by "ret"; the caller branches on the flags left by that add, so
; nothing that modifies flags may be placed between the add and the ret.
128*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
129*c0909341SAndroid Build Coastguard Worker; refmvs_temporal_block *rp, ptrdiff_t stride,
130*c0909341SAndroid Build Coastguard Worker; refmvs_block **rr, uint8_t *ref_sign,
131*c0909341SAndroid Build Coastguard Worker; int col_end8, int row_end8, int col_start8, int row_start8
132*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
133*c0909341SAndroid Build Coastguard Workercglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
134*c0909341SAndroid Build Coastguard Worker                             xend, yend, xstart, ystart
135*c0909341SAndroid Build Coastguard Worker%define base_reg r12
136*c0909341SAndroid Build Coastguard Worker%else
137*c0909341SAndroid Build Coastguard Workercglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
138*c0909341SAndroid Build Coastguard Worker                            xend, yend, xstart, ystart
    ; x86-32 has only 7 GPRs here: consume ref_sign right away, park stride*5
    ; in its own argument slot, and fetch the two start coordinates into the
    ; registers that held stride and ref_sign.
139*c0909341SAndroid Build Coastguard Worker    movq            m5, [ref_signq]
140*c0909341SAndroid Build Coastguard Worker    lea        strided, [strided*5]
141*c0909341SAndroid Build Coastguard Worker    mov        stridem, strided
142*c0909341SAndroid Build Coastguard Worker    mov             r3, xstartm
143*c0909341SAndroid Build Coastguard Worker    mov             r1, ystartm
144*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS b, ystart, rr, cand, xend, x
    ; stride*5 is read back from argument slot 1; m8..m10 become memory
    ; operands because only 8 XMM registers exist on x86-32.
145*c0909341SAndroid Build Coastguard Worker%define stridemp r1m
146*c0909341SAndroid Build Coastguard Worker%define m8  [base+pb_128]
147*c0909341SAndroid Build Coastguard Worker%define m9  [base+save_pack0+ 0]
148*c0909341SAndroid Build Coastguard Worker%define m10 [base+save_pack0+16]
149*c0909341SAndroid Build Coastguard Worker%define base_reg r6
150*c0909341SAndroid Build Coastguard Worker%endif
    ; base_reg holds the address of .write1. [base+sym] therefore addresses
    ; sym relative to that label, and the same register is added to the
    ; table's routine offsets to form call targets.
151*c0909341SAndroid Build Coastguard Worker%define base base_reg-.write1
152*c0909341SAndroid Build Coastguard Worker    LEA       base_reg, .write1
153*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
154*c0909341SAndroid Build Coastguard Worker    movifnidn    xendd, xendm
155*c0909341SAndroid Build Coastguard Worker    movifnidn    yendd, yendm
156*c0909341SAndroid Build Coastguard Worker    mov        xstartd, xstartm
157*c0909341SAndroid Build Coastguard Worker    mov        ystartd, ystartm
158*c0909341SAndroid Build Coastguard Worker    movq            m5, [ref_signq]
159*c0909341SAndroid Build Coastguard Worker%endif
160*c0909341SAndroid Build Coastguard Worker    movu            m4, [base+save_ref_shuf]
161*c0909341SAndroid Build Coastguard Worker    movddup         m6, [base+save_cond0]
162*c0909341SAndroid Build Coastguard Worker    movddup         m7, [base+save_cond1]
163*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
164*c0909341SAndroid Build Coastguard Worker    mova            m8, [base+pb_128]
165*c0909341SAndroid Build Coastguard Worker    mova            m9, [base+save_pack0+ 0]
166*c0909341SAndroid Build Coastguard Worker    mova           m10, [base+save_pack0+16]
167*c0909341SAndroid Build Coastguard Worker%endif
    ; Shift the 8 loaded ref_sign bytes up by one byte: used as a pshufb table,
    ; index 0 now yields 0 and index ref yields ref_sign[ref - 1].
168*c0909341SAndroid Build Coastguard Worker    psllq           m5, 8
169*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
170*c0909341SAndroid Build Coastguard Worker    lea            r9d, [xendq*5]           ; xend * 5
171*c0909341SAndroid Build Coastguard Worker    lea        xstartd, [xstartq*5]
172*c0909341SAndroid Build Coastguard Worker    sub          yendd, ystartd             ; h = number of rows
173*c0909341SAndroid Build Coastguard Worker    add        ystartd, ystartd             ; row8 * 2, the rr[] index
174*c0909341SAndroid Build Coastguard Worker    lea        strideq, [strideq*5]         ; row pitch of rp in bytes
175*c0909341SAndroid Build Coastguard Worker    sub        xstartq, r9                  ; (xstart - xend) * 5, initial x
176*c0909341SAndroid Build Coastguard Worker    add          xendd, r9d                 ; xend * 6
177*c0909341SAndroid Build Coastguard Worker    add            rpq, r9                  ; rp + xend * 5
178*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
179*c0909341SAndroid Build Coastguard Worker%else
180*c0909341SAndroid Build Coastguard Worker    lea             r0, [xendd*5]   ; xend5
181*c0909341SAndroid Build Coastguard Worker    lea             r3, [r3*5]      ; xstart5
182*c0909341SAndroid Build Coastguard Worker    sub             r3, r0          ; -w5
183*c0909341SAndroid Build Coastguard Worker    mov            r6m, r3
184*c0909341SAndroid Build Coastguard Worker%define xstartq r6m
185*c0909341SAndroid Build Coastguard Worker    add          xendd, r0          ; xend6
186*c0909341SAndroid Build Coastguard Worker    add            r0m, r0          ; rp+xend5
187*c0909341SAndroid Build Coastguard Worker    mov          xendm, xendd
188*c0909341SAndroid Build Coastguard Worker    sub             r5, r1          ; h
189*c0909341SAndroid Build Coastguard Worker    add             r1, r1
190*c0909341SAndroid Build Coastguard Worker    mov            r7m, r1
191*c0909341SAndroid Build Coastguard Worker    mov            r5m, r5
192*c0909341SAndroid Build Coastguard Worker%define hd r5mp
    ; ystart and xend are still live in registers for the first row, so skip
    ; the reloads at .loop_y
193*c0909341SAndroid Build Coastguard Worker    jmp .loop_y_noload
194*c0909341SAndroid Build Coastguard Worker%endif
195*c0909341SAndroid Build Coastguard Worker.loop_y:
196*c0909341SAndroid Build Coastguard Worker    movif32    ystartd, r7m
197*c0909341SAndroid Build Coastguard Worker    movif32      xendd, xendm
198*c0909341SAndroid Build Coastguard Worker.loop_y_noload:
    ; rr[] is indexed with an even value in 0..30, which then advances by 2 per
    ; row; b is biased by xend * 24 bytes to match the negative x offset
199*c0909341SAndroid Build Coastguard Worker    and        ystartd, 30
200*c0909341SAndroid Build Coastguard Worker    mov             xq, xstartq
201*c0909341SAndroid Build Coastguard Worker    mov             bq, [rrq+ystartq*gprsize]
202*c0909341SAndroid Build Coastguard Worker    add        ystartd, 2
203*c0909341SAndroid Build Coastguard Worker    movif32        r7m, ystartd
204*c0909341SAndroid Build Coastguard Worker    lea             bq, [bq+xendq*4]
205*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; x86-32 register reuse: rp is reloaded into r3 at .calc, and r10/r11 map
    ; onto r1/r4 (ystart and xend, both saved to the stack and reloaded at
    ; .loop_y)
206*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
207*c0909341SAndroid Build Coastguard Worker%define rpq  r3
208*c0909341SAndroid Build Coastguard Worker%define r10  r1
209*c0909341SAndroid Build Coastguard Worker%define r10d r1
210*c0909341SAndroid Build Coastguard Worker%define r11  r4
211*c0909341SAndroid Build Coastguard Worker%define r11d r4
212*c0909341SAndroid Build Coastguard Worker%endif
    ; 0x9999 / 65536 is just under 3/5; with the arithmetic shift this gives
    ; exactly x / 5 * 3 for the non-positive multiples of 5 that x takes.
    ; cand * 8 is then a byte offset of 24 per 5-byte output entry.
213*c0909341SAndroid Build Coastguard Worker    imul         candq, xq, 0x9999  ; x / 5 * 3
214*c0909341SAndroid Build Coastguard Worker    sar          candq, 16
215*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
216*c0909341SAndroid Build Coastguard Worker    movu            m0, [bq+candq*8+12]      ; cand_b
    ; table byte 0 = cand advance (w*3), byte 1 = offset of .write<w>
217*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
218*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
219*c0909341SAndroid Build Coastguard Worker    add            r10, base_reg
    ; If the first candidate already reaches the end of the row (cand >= 0),
    ; skip loading a second one. In that case m1 and r11 are not set up, but
    ; the first write routine then also leaves x >= 0, so the jge after
    ; "call r10" exits before r11 is used.
220*c0909341SAndroid Build Coastguard Worker    add          candq, r11
221*c0909341SAndroid Build Coastguard Worker    jge .calc
222*c0909341SAndroid Build Coastguard Worker    movu            m1, [bq+candq*8+12]
223*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [bq+candq*8+22]
224*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
225*c0909341SAndroid Build Coastguard Worker    add            r11, base_reg
226*c0909341SAndroid Build Coastguard Worker.calc:
227*c0909341SAndroid Build Coastguard Worker    movif32        rpq, r0m
228*c0909341SAndroid Build Coastguard Worker    ; ref check
229*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m2, m0, m1
230*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m4      ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
231*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m2  ; ref > 0 && ref_sign[ref - 1]
232*c0909341SAndroid Build Coastguard Worker    ; mv check
233*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
234*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m2
235*c0909341SAndroid Build Coastguard Worker    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
236*c0909341SAndroid Build Coastguard Worker    ; res
    ; A dword of m3 becomes all-ones when its ref_sign lookup value is greater
    ; than the shifted mv magnitudes (signed dword compare). The cond0/cond1
    ; patterns are then combined so that each qword holds the pshufb indices
    ; for one candidate; when both mvs qualify the OR of the two patterns
    ; selects the second mv/ref pair. Byte 5 of the result repeats byte 0 of
    ; the chosen entry, which .write1/.write2 rely on.
237*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m2
238*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m3, q2301
239*c0909341SAndroid Build Coastguard Worker    pand            m3, m6      ; b0c0 b0c1 b1c0 b1c1 | ...
240*c0909341SAndroid Build Coastguard Worker    pand            m2, m7      ; b0c1 b0c0 b1c1 b1c0 | ...
241*c0909341SAndroid Build Coastguard Worker    por             m3, m2      ; b0.shuf b1.shuf | ...
242*c0909341SAndroid Build Coastguard Worker    pxor            m3, m8      ; if cond0|cond1 == 0 => zero out
243*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3
244*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m3
    ; first candidate: packed entry is in the low qword of m0
245*c0909341SAndroid Build Coastguard Worker    call           r10
246*c0909341SAndroid Build Coastguard Worker    jge .next_line              ; flags from "add xq" in the write routine
    ; second candidate: its packed entry is in the high qword of m1
247*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m1, q3232
248*c0909341SAndroid Build Coastguard Worker    call           r11
249*c0909341SAndroid Build Coastguard Worker    jl .loop_x
250*c0909341SAndroid Build Coastguard Worker.next_line:
251*c0909341SAndroid Build Coastguard Worker    add            rpq, stridemp
252*c0909341SAndroid Build Coastguard Worker    movif32        r0m, rpq
253*c0909341SAndroid Build Coastguard Worker    dec             hd
254*c0909341SAndroid Build Coastguard Worker    jg .loop_y
255*c0909341SAndroid Build Coastguard Worker    RET
; Write routines. Input: low 6 bytes of m0 = e0 e1 e2 e3 e4 e0 (one 5-byte
; entry plus a repeat of its first byte), bytes 6-7 zero. Each routine stores
; N consecutive copies of the entry at [rp + x], advances x by 5 * N and
; returns with the flags of that addition.
; .write1: 4 bytes at +0, then bytes 1..4 at +1 -> 5 bytes
256*c0909341SAndroid Build Coastguard Worker.write1:
257*c0909341SAndroid Build Coastguard Worker    movd    [rpq+xq+0], m0
258*c0909341SAndroid Build Coastguard Worker    psrlq           m0, 8
259*c0909341SAndroid Build Coastguard Worker    movd    [rpq+xq+1], m0
260*c0909341SAndroid Build Coastguard Worker    add             xq, 5*1
261*c0909341SAndroid Build Coastguard Worker    ret
; .write2: 8 bytes at +0 (entry, e0, two zero bytes), then e1..e4 at +6 -> 10 bytes
262*c0909341SAndroid Build Coastguard Worker.write2:
263*c0909341SAndroid Build Coastguard Worker    movq    [rpq+xq+0], m0
264*c0909341SAndroid Build Coastguard Worker    psrlq           m0, 8
265*c0909341SAndroid Build Coastguard Worker    movd    [rpq+xq+6], m0
266*c0909341SAndroid Build Coastguard Worker    add             xq, 5*2
267*c0909341SAndroid Build Coastguard Worker    ret
; .write4: 16 pattern bytes, then e1..e4 at +16 -> 20 bytes
268*c0909341SAndroid Build Coastguard Worker.write4:
269*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m9
270*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m0
271*c0909341SAndroid Build Coastguard Worker    psrlq           m0, 8
272*c0909341SAndroid Build Coastguard Worker    movd   [rpq+xq+16], m0
273*c0909341SAndroid Build Coastguard Worker    add             xq, 5*4
274*c0909341SAndroid Build Coastguard Worker    ret
; .write8: pattern bytes 0..15 and 16..31, then 8 bytes starting at phase 2
; (32 mod 5) taken from the first vector -> 40 bytes
275*c0909341SAndroid Build Coastguard Worker.write8:
276*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m0, m9
277*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m2
278*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m10
279*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+16], m0
280*c0909341SAndroid Build Coastguard Worker    psrldq          m2, 2
281*c0909341SAndroid Build Coastguard Worker    movq   [rpq+xq+32], m2
282*c0909341SAndroid Build Coastguard Worker    add             xq, 5*8
283*c0909341SAndroid Build Coastguard Worker    ret
; .write16: 80 bytes. The vectors for offsets 48, 32 and 64 are assembled with
; shufps from pattern bytes 0..31 so that each starts at the right phase of
; the 5-byte cycle (48 mod 5 = 3, 32 mod 5 = 2, 64 mod 5 = 4). Each shufps
; consumes the result of the previous one, so their order matters.
284*c0909341SAndroid Build Coastguard Worker.write16:
285*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m0, m9
286*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m2
287*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m10
288*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+16], m0
289*c0909341SAndroid Build Coastguard Worker    shufps          m2, m0, q1032
290*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+48], m2
291*c0909341SAndroid Build Coastguard Worker    shufps          m2, m0, q2121
292*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+32], m2
293*c0909341SAndroid Build Coastguard Worker    shufps          m0, m2, q1032
294*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+64], m0
295*c0909341SAndroid Build Coastguard Worker    add             xq, 5*16
296*c0909341SAndroid Build Coastguard Worker    ret
297*c0909341SAndroid Build Coastguard Worker
; splat_mv (SSE2)
;
; For each of bh4 row pointers read from rr[], stores bw4 back-to-back copies
; of the first 12 bytes of *a into the row, covering byte offsets
; bx4 * 12 .. (bx4 + bw4) * 12 - 1. bw4 is reduced to a table index with
; tzcnt, so it is expected to be one of 1, 2, 4, 8, 16, 32.
;
; Addressing: inside the loop aq = row + (bx4 + bw4) * 12 - 128, i.e. 128 bytes
; before the end of the region; every .wN entry point stores from
; aq + 128 - N * 12 up to aq + 127.
;
; NOTE(review): *a is loaded with mova and widths >= 4 are stored with mova, so
; a and those store addresses must be 16-byte aligned - presumably guaranteed
; by the callers (row base alignment and bx4 being a multiple of bw4); confirm.
298*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse2
299*c0909341SAndroid Build Coastguard Worker; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
300*c0909341SAndroid Build Coastguard Workercglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
301*c0909341SAndroid Build Coastguard Worker    add           bx4d, bw4d                ; end column = bx4 + bw4
302*c0909341SAndroid Build Coastguard Worker    tzcnt         bw4d, bw4d                ; jump table index
303*c0909341SAndroid Build Coastguard Worker    mova            m2, [aq]
304*c0909341SAndroid Build Coastguard Worker    LEA             aq, splat_mv_sse2_table
305*c0909341SAndroid Build Coastguard Worker    lea           bx4q, [bx4q*3-32]         ; times 4 below: end * 12 - 128
306*c0909341SAndroid Build Coastguard Worker    movsxd        bw4q, [aq+bw4q*4]
307*c0909341SAndroid Build Coastguard Worker    movifnidn     bh4d, bh4m
    ; Three rotations of dwords 0..2 of the source: stored back to back,
    ; m0, m1, m2 give 48 bytes = four consecutive 12-byte copies.
308*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m2, q0210           ; d0 d1 d2 d0
309*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m2, q1021           ; d1 d2 d0 d1
310*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m2, q2102           ; d2 d0 d1 d2
311*c0909341SAndroid Build Coastguard Worker    add           bw4q, aq                  ; absolute address of .w<bw4>
312*c0909341SAndroid Build Coastguard Worker.loop:
    ; fetch the next row pointer, then jump to the width-specific stores
313*c0909341SAndroid Build Coastguard Worker    mov             aq, [rrq]
314*c0909341SAndroid Build Coastguard Worker    add            rrq, gprsize
315*c0909341SAndroid Build Coastguard Worker    lea             aq, [aq+bx4q*4]
316*c0909341SAndroid Build Coastguard Worker    jmp           bw4q
; .w32 falls through .w16, .w8 and .w4, each adding the stores for the
; remaining tail of the region.
317*c0909341SAndroid Build Coastguard Worker.w32:
318*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*16], m0
319*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*15], m1
320*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*14], m2
321*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*13], m0
322*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*12], m1
323*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*11], m2
324*c0909341SAndroid Build Coastguard Worker    mova    [aq-16*10], m0
325*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 9], m1
326*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 8], m2
327*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 7], m0
328*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 6], m1
329*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 5], m2
330*c0909341SAndroid Build Coastguard Worker.w16:
331*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 4], m0
332*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 3], m1
333*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 2], m2
334*c0909341SAndroid Build Coastguard Worker    mova    [aq-16* 1], m0
335*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 0], m1
336*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 1], m2
337*c0909341SAndroid Build Coastguard Worker.w8:
338*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 2], m0
339*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 3], m1
340*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 4], m2
341*c0909341SAndroid Build Coastguard Worker.w4:
342*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 5], m0
343*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 6], m1
344*c0909341SAndroid Build Coastguard Worker    mova    [aq+16* 7], m2
345*c0909341SAndroid Build Coastguard Worker    dec           bh4d
346*c0909341SAndroid Build Coastguard Worker    jg .loop
347*c0909341SAndroid Build Coastguard Worker    RET
; .w2: 24 bytes at aq+104 (16 from m0, then the low 8 of m1), unaligned
348*c0909341SAndroid Build Coastguard Worker.w2:
349*c0909341SAndroid Build Coastguard Worker    movu      [aq+104], m0
350*c0909341SAndroid Build Coastguard Worker    movq      [aq+120], m1
351*c0909341SAndroid Build Coastguard Worker    dec           bh4d
352*c0909341SAndroid Build Coastguard Worker    jg .loop
353*c0909341SAndroid Build Coastguard Worker    RET
; .w1: 12 bytes at aq+116 (low 8 of m0, then the low 4 of m2)
354*c0909341SAndroid Build Coastguard Worker.w1:
355*c0909341SAndroid Build Coastguard Worker    movq      [aq+116], m0
356*c0909341SAndroid Build Coastguard Worker    movd      [aq+124], m2
357*c0909341SAndroid Build Coastguard Worker    dec           bh4d
358*c0909341SAndroid Build Coastguard Worker    jg .loop
359*c0909341SAndroid Build Coastguard Worker    RET
360*c0909341SAndroid Build Coastguard Worker
361*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
362*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse4
363*c0909341SAndroid Build Coastguard Worker; refmvs_frame *rf, int tile_row_idx,
364*c0909341SAndroid Build Coastguard Worker; int col_start8, int col_end8, int row_start8, int row_end8
365*c0909341SAndroid Build Coastguard Workercglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
366*c0909341SAndroid Build Coastguard Worker                                    stride, rp_proj, roff, troff, \
367*c0909341SAndroid Build Coastguard Worker                                    xendi, xstarti, iw8, ih8, dst
368*c0909341SAndroid Build Coastguard Worker    xor           r14d, r14d
369*c0909341SAndroid Build Coastguard Worker    cmp dword [rfq+rf.n_tile_threads], 1
370*c0909341SAndroid Build Coastguard Worker    mov           ih8d, [rfq+rf.ih8]
371*c0909341SAndroid Build Coastguard Worker    mov           iw8d, [rfq+rf.iw8]
372*c0909341SAndroid Build Coastguard Worker    mov        xstartd, xstartd
373*c0909341SAndroid Build Coastguard Worker    mov          xendd, xendd
374*c0909341SAndroid Build Coastguard Worker    cmove       tridxd, r14d
375*c0909341SAndroid Build Coastguard Worker    lea       xstartid, [xstartq-8]
376*c0909341SAndroid Build Coastguard Worker    lea         xendid, [xendq+8]
377*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rfq+rf.rp_stride]
378*c0909341SAndroid Build Coastguard Worker    mov       rp_projq, [rfq+rf.rp_proj]
379*c0909341SAndroid Build Coastguard Worker    cmp           ih8d, yendd
380*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x30], strideq
381*c0909341SAndroid Build Coastguard Worker    cmovs        yendd, ih8d
382*c0909341SAndroid Build Coastguard Worker    test      xstartid, xstartid
383*c0909341SAndroid Build Coastguard Worker    cmovs     xstartid, r14d
384*c0909341SAndroid Build Coastguard Worker    cmp           iw8d, xendid
385*c0909341SAndroid Build Coastguard Worker    cmovs       xendid, iw8d
386*c0909341SAndroid Build Coastguard Worker    mov         troffq, strideq
387*c0909341SAndroid Build Coastguard Worker    shl         troffq, 4
388*c0909341SAndroid Build Coastguard Worker    imul        troffq, tridxq
389*c0909341SAndroid Build Coastguard Worker    mov           dstd, ystartd
390*c0909341SAndroid Build Coastguard Worker    and           dstd, 15
391*c0909341SAndroid Build Coastguard Worker    imul          dstq, strideq
392*c0909341SAndroid Build Coastguard Worker    add           dstq, troffq      ; (16 * tridx + (ystart & 15)) * stride
393*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq*5]
394*c0909341SAndroid Build Coastguard Worker    add           dstq, rp_projq
395*c0909341SAndroid Build Coastguard Worker    lea         troffq, [troffq*5]  ; 16 * tridx * stride * 5
396*c0909341SAndroid Build Coastguard Worker    lea           r13d, [xendq*5]
397*c0909341SAndroid Build Coastguard Worker    lea            r12, [strideq*5]
398*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
399*c0909341SAndroid Build Coastguard Worker             _, troff, xendi, xstarti, stride5, _, dst
400*c0909341SAndroid Build Coastguard Worker    lea            w5d, [xstartq*5]
401*c0909341SAndroid Build Coastguard Worker    add             r7, troffq      ; rp_proj + tile_row_offset
402*c0909341SAndroid Build Coastguard Worker    mov             hd, yendd
403*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x28], r7
404*c0909341SAndroid Build Coastguard Worker    add           dstq, r13
405*c0909341SAndroid Build Coastguard Worker    sub            w5q, r13
406*c0909341SAndroid Build Coastguard Worker    sub             hd, ystartd
407*c0909341SAndroid Build Coastguard Worker.init_xloop_start:
408*c0909341SAndroid Build Coastguard Worker    mov            x5q, w5q
409*c0909341SAndroid Build Coastguard Worker    test           w5b, 1
410*c0909341SAndroid Build Coastguard Worker    jz .init_2blk
411*c0909341SAndroid Build Coastguard Worker    mov dword [dstq+x5q], 0x80008000
412*c0909341SAndroid Build Coastguard Worker    add            x5q, 5
413*c0909341SAndroid Build Coastguard Worker    jz .init_next_row
414*c0909341SAndroid Build Coastguard Worker.init_2blk:
415*c0909341SAndroid Build Coastguard Worker    mov dword [dstq+x5q+0], 0x80008000
416*c0909341SAndroid Build Coastguard Worker    mov dword [dstq+x5q+5], 0x80008000
417*c0909341SAndroid Build Coastguard Worker    add            x5q, 10
418*c0909341SAndroid Build Coastguard Worker    jl .init_2blk
419*c0909341SAndroid Build Coastguard Worker.init_next_row:
420*c0909341SAndroid Build Coastguard Worker    add           dstq, stride5q
421*c0909341SAndroid Build Coastguard Worker    dec             hd
422*c0909341SAndroid Build Coastguard Worker    jg .init_xloop_start
423*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
424*c0909341SAndroid Build Coastguard Worker             _, _, xendi, xstarti, stride5, _, n
425*c0909341SAndroid Build Coastguard Worker    mov           r13d, [rfq+rf.n_mfmvs]
426*c0909341SAndroid Build Coastguard Worker    test          r13d, r13d
427*c0909341SAndroid Build Coastguard Worker    jz .ret
428*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x0c], r13d
429*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rsp+0x30]
430*c0909341SAndroid Build Coastguard Worker    movddup         m3, [pq_8192]
431*c0909341SAndroid Build Coastguard Worker    mov            r9d, ystartd
432*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x38], yendd
433*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x20], xstartid
434*c0909341SAndroid Build Coastguard Worker    xor             nd, nd
435*c0909341SAndroid Build Coastguard Worker    xor            n7d, n7d
436*c0909341SAndroid Build Coastguard Worker    imul            r9, strideq     ; ystart * stride
437*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x48], rfq
438*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x18], stride5q
439*c0909341SAndroid Build Coastguard Worker    lea             r7, [r9*5]
440*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x24], ystartd
441*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x00], r7
442*c0909341SAndroid Build Coastguard Worker.nloop:
443*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
444*c0909341SAndroid Build Coastguard Worker             ref, rp_ref, xendi, xstarti, _, _, n
445*c0909341SAndroid Build Coastguard Worker    mov            rfq, [rsp+0x48]
446*c0909341SAndroid Build Coastguard Worker    mov           refd, [rfq+rf.mfmv_ref2cur+nq*4]
447*c0909341SAndroid Build Coastguard Worker    cmp           refd, 0x80000000
448*c0909341SAndroid Build Coastguard Worker    je .next_n
449*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x40], refd
450*c0909341SAndroid Build Coastguard Worker    mov           offq, [rsp+0x00]          ; ystart * stride * 5
451*c0909341SAndroid Build Coastguard Worker    movzx         refd, byte [rfq+rf.mfmv_ref+nq]
452*c0909341SAndroid Build Coastguard Worker    lea       refsignq, [refq-4]
453*c0909341SAndroid Build Coastguard Worker    mov        rp_refq, [rfq+rf.rp_ref]
454*c0909341SAndroid Build Coastguard Worker    movq            m2, refsignq
455*c0909341SAndroid Build Coastguard Worker    add           offq, [rp_refq+refq*8]    ; r = rp_ref[ref] + row_offset
456*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x14], nd
457*c0909341SAndroid Build Coastguard Worker    mov             yd, ystartd
458*c0909341SAndroid Build Coastguard Worker.yloop:
459*c0909341SAndroid Build Coastguard Worker    mov           r11d, [rsp+0x24]          ; ystart
460*c0909341SAndroid Build Coastguard Worker    mov           r12d, [rsp+0x38]          ; yend
461*c0909341SAndroid Build Coastguard Worker    mov           r14d, yd
462*c0909341SAndroid Build Coastguard Worker    and           r14d, ~7                  ; y_sb_align
463*c0909341SAndroid Build Coastguard Worker    cmp           r11d, r14d
464*c0909341SAndroid Build Coastguard Worker    cmovs         r11d, r14d                ; imax(y_sb_align, ystart)
465*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x44], r11d                ; y_proj_start
466*c0909341SAndroid Build Coastguard Worker    add           r14d, 8
467*c0909341SAndroid Build Coastguard Worker    cmp           r12d, r14d
468*c0909341SAndroid Build Coastguard Worker    cmovs         r14d, r12d                ; imin(y_sb_align + 8, yend)
469*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x3c], r14d                ; y_proj_end
470*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
471*c0909341SAndroid Build Coastguard Worker             ref, x, xendi, mvx, mvy, rb, ref2ref
472*c0909341SAndroid Build Coastguard Worker    mov             xd, [rsp+0x20] ; xstarti
473*c0909341SAndroid Build Coastguard Worker.xloop:
474*c0909341SAndroid Build Coastguard Worker    lea            rbd, [xq*5]
475*c0909341SAndroid Build Coastguard Worker    add            rbq, srcq
476*c0909341SAndroid Build Coastguard Worker    movsx         refd, byte [rbq+4]
477*c0909341SAndroid Build Coastguard Worker    test          refd, refd
478*c0909341SAndroid Build Coastguard Worker    jz .next_x_bad_ref
479*c0909341SAndroid Build Coastguard Worker    mov            rfq, [rsp+0x48]
480*c0909341SAndroid Build Coastguard Worker    lea       ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
481*c0909341SAndroid Build Coastguard Worker    mov       ref2refd, [rfq+ref2refq*4]    ; rf->mfmv_ref2ref[n][b_ref-1]
482*c0909341SAndroid Build Coastguard Worker    test      ref2refd, ref2refd
483*c0909341SAndroid Build Coastguard Worker    jz .next_x_bad_ref
484*c0909341SAndroid Build Coastguard Worker    lea          fracq, [mv_proj]
485*c0909341SAndroid Build Coastguard Worker    movzx        fracd, word [fracq+ref2refq*2]
486*c0909341SAndroid Build Coastguard Worker    mov            mvd, [rbq]
487*c0909341SAndroid Build Coastguard Worker    imul         fracd, [rsp+0x40] ; ref2cur
488*c0909341SAndroid Build Coastguard Worker    pmovsxwq        m0, [rbq]
489*c0909341SAndroid Build Coastguard Worker    movd            m1, fracd
490*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m1, m1
491*c0909341SAndroid Build Coastguard Worker    pmuldq          m0, m1          ; mv * frac
492*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m0, q3311
493*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
494*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1
495*c0909341SAndroid Build Coastguard Worker    psrad           m0, 14          ; offset = (xy + (xy >> 31) + 8192) >> 14
496*c0909341SAndroid Build Coastguard Worker    pabsd           m1, m0
497*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m0
498*c0909341SAndroid Build Coastguard Worker    psrld           m1, 6
499*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m1
500*c0909341SAndroid Build Coastguard Worker    pxor            m0, m2          ; offset ^ ref_sign
501*c0909341SAndroid Build Coastguard Worker    psignd          m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
502*c0909341SAndroid Build Coastguard Worker    movq          mvxq, m1
503*c0909341SAndroid Build Coastguard Worker    lea           mvyd, [mvxq+yq]   ; ypos
504*c0909341SAndroid Build Coastguard Worker    sar           mvxq, 32
505*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
506*c0909341SAndroid Build Coastguard Worker             ref, x, xendi, mvx, ypos, rb, ref2ref
507*c0909341SAndroid Build Coastguard Worker    cmp          yposd, [rsp+0x44] ; y_proj_start
508*c0909341SAndroid Build Coastguard Worker    jl .next_x_bad_pos_y
509*c0909341SAndroid Build Coastguard Worker    cmp          yposd, [rsp+0x3c] ; y_proj_end
510*c0909341SAndroid Build Coastguard Worker    jge .next_x_bad_pos_y
511*c0909341SAndroid Build Coastguard Worker    and          yposd, 15
512*c0909341SAndroid Build Coastguard Worker    add           mvxq, xq          ; xpos
513*c0909341SAndroid Build Coastguard Worker    imul         yposq, [rsp+0x30]  ; pos = (ypos & 15) * stride
514*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
515*c0909341SAndroid Build Coastguard Worker             ref, x, xendi, xpos, pos, rb, ref2ref
516*c0909341SAndroid Build Coastguard Worker    mov           dstq, [rsp+0x28]  ; dst = rp_proj + tile_row_offset
517*c0909341SAndroid Build Coastguard Worker    add           posq, xposq       ; pos += xpos
518*c0909341SAndroid Build Coastguard Worker    lea           posq, [posq*5]
519*c0909341SAndroid Build Coastguard Worker    add           dstq, posq        ; dst += pos5
520*c0909341SAndroid Build Coastguard Worker    jmp .write_loop_entry
521*c0909341SAndroid Build Coastguard Worker.write_loop:
522*c0909341SAndroid Build Coastguard Worker    add            rbq, 5
523*c0909341SAndroid Build Coastguard Worker    cmp           refb, byte [rbq+4]
524*c0909341SAndroid Build Coastguard Worker    jne .xloop
525*c0909341SAndroid Build Coastguard Worker    cmp            mvd, [rbq]
526*c0909341SAndroid Build Coastguard Worker    jne .xloop
527*c0909341SAndroid Build Coastguard Worker    add           dstq, 5
528*c0909341SAndroid Build Coastguard Worker    inc          xposd
529*c0909341SAndroid Build Coastguard Worker.write_loop_entry:
530*c0909341SAndroid Build Coastguard Worker    mov           r12d, xd
531*c0909341SAndroid Build Coastguard Worker    and           r12d, ~7
532*c0909341SAndroid Build Coastguard Worker    lea            r5d, [r12-8]
533*c0909341SAndroid Build Coastguard Worker    cmp            r5d, xstartd
534*c0909341SAndroid Build Coastguard Worker    cmovs          r5d, xstartd     ; x_proj_start
535*c0909341SAndroid Build Coastguard Worker    cmp          xposd, r5d
536*c0909341SAndroid Build Coastguard Worker    jl .next_xpos
537*c0909341SAndroid Build Coastguard Worker    add           r12d, 16
538*c0909341SAndroid Build Coastguard Worker    cmp          xendd, r12d
539*c0909341SAndroid Build Coastguard Worker    cmovs         r12d, xendd       ; x_proj_end
540*c0909341SAndroid Build Coastguard Worker    cmp          xposd, r12d
541*c0909341SAndroid Build Coastguard Worker    jge .next_xpos
542*c0909341SAndroid Build Coastguard Worker    mov       [dstq+0], mvd
543*c0909341SAndroid Build Coastguard Worker    mov  byte [dstq+4], ref2refb
544*c0909341SAndroid Build Coastguard Worker.next_xpos:
545*c0909341SAndroid Build Coastguard Worker    inc             xd
546*c0909341SAndroid Build Coastguard Worker    cmp             xd, xendid
547*c0909341SAndroid Build Coastguard Worker    jl .write_loop
548*c0909341SAndroid Build Coastguard Worker.next_y:
549*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
550*c0909341SAndroid Build Coastguard Worker    add           srcq, [rsp+0x18] ; stride5
551*c0909341SAndroid Build Coastguard Worker    inc             yd
552*c0909341SAndroid Build Coastguard Worker    cmp             yd, [rsp+0x38] ; yend
553*c0909341SAndroid Build Coastguard Worker    jne .yloop
554*c0909341SAndroid Build Coastguard Worker    mov             nd, [rsp+0x14]
555*c0909341SAndroid Build Coastguard Worker    mov        ystartd, [rsp+0x24]
556*c0909341SAndroid Build Coastguard Worker.next_n:
557*c0909341SAndroid Build Coastguard Worker    add            n7d, 7
558*c0909341SAndroid Build Coastguard Worker    inc             nd
559*c0909341SAndroid Build Coastguard Worker    cmp             nd, [rsp+0x0c] ; n_mfmvs
560*c0909341SAndroid Build Coastguard Worker    jne .nloop
561*c0909341SAndroid Build Coastguard Worker.ret:
562*c0909341SAndroid Build Coastguard Worker    RET
563*c0909341SAndroid Build Coastguard Worker.next_x:
564*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
565*c0909341SAndroid Build Coastguard Worker    add            rbq, 5
566*c0909341SAndroid Build Coastguard Worker    cmp           refb, byte [rbq+4]
567*c0909341SAndroid Build Coastguard Worker    jne .xloop
568*c0909341SAndroid Build Coastguard Worker    cmp            mvd, [rbq]
569*c0909341SAndroid Build Coastguard Worker    jne .xloop
570*c0909341SAndroid Build Coastguard Worker.next_x_bad_pos_y:
571*c0909341SAndroid Build Coastguard Worker    inc             xd
572*c0909341SAndroid Build Coastguard Worker    cmp             xd, xendid
573*c0909341SAndroid Build Coastguard Worker    jl .next_x
574*c0909341SAndroid Build Coastguard Worker    jmp .next_y
575*c0909341SAndroid Build Coastguard Worker.next_x_bad_ref:
576*c0909341SAndroid Build Coastguard Worker    inc             xd
577*c0909341SAndroid Build Coastguard Worker    cmp             xd, xendid
578*c0909341SAndroid Build Coastguard Worker    jl .xloop
579*c0909341SAndroid Build Coastguard Worker    jmp .next_y
580*c0909341SAndroid Build Coastguard Worker
581*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
582*c0909341SAndroid Build Coastguard Worker; refmvs_temporal_block *rp, ptrdiff_t stride,
583*c0909341SAndroid Build Coastguard Worker; refmvs_block **rr, uint8_t *ref_sign,
584*c0909341SAndroid Build Coastguard Worker; int col_end8, int row_end8, int col_start8, int row_start8
;
; save_tmvs, AVX2 version.
; For (row_end8 - row_start8) rows, walk the candidate blocks of one rr[] row
; from col_start8 up to col_end8, filter every candidate (see .calc) and store
; the result into rp as packed 5-byte entries. Up to two candidates are handled
; per .loop_x iteration, one per 128-bit lane of m0.
;
; Register roles once the setup code has run:
;   rp     = rp + col_end8*5, so rows are addressed with a negative x offset
;   x      = byte offset into the rp row, starts at (col_start8-col_end8)*5 and
;            counts up; the row is finished once it is >= 0
;   xend   = col_end8*6 (used as b + xend*4, i.e. col_end8*24 bytes)
;   stride = stride*5 (distance between rp rows in bytes)
;   h      = row_end8 - row_start8 (rows left)
;   ystart = row_start8*2, masked to 0..30 before it indexes rr[]
;   r12    = address of .write1; constants are addressed as base+label and the
;            write routines as r12 + a byte read from save_tmvs_avx2_table
585*c0909341SAndroid Build Coastguard Workercglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
586*c0909341SAndroid Build Coastguard Worker                              xend, yend, xstart, ystart
; "base" lets every constant below be reached relative to r12 (= .write1).
587*c0909341SAndroid Build Coastguard Worker%define base r12-.write1
588*c0909341SAndroid Build Coastguard Worker    lea            r12, [.write1]
589*c0909341SAndroid Build Coastguard Worker    movifnidn    xendd, xendm
590*c0909341SAndroid Build Coastguard Worker    movifnidn    yendd, yendm
591*c0909341SAndroid Build Coastguard Worker    mov        xstartd, xstartm
592*c0909341SAndroid Build Coastguard Worker    mov        ystartd, ystartm
593*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m4, [ref_signq]      ; 8 ref_sign bytes in every qword
594*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m3, [base+save_ref_shuf+8]
595*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m5, [base+save_cond0]
596*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m6, [base+save_cond1]
597*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [base+pb_128]
598*c0909341SAndroid Build Coastguard Worker    mova            m8, [base+save_pack0]
599*c0909341SAndroid Build Coastguard Worker    mova            m9, [base+save_pack1]
600*c0909341SAndroid Build Coastguard Worker    psllq           m4, 8                ; byte 0 = 0, byte i = ref_sign[i-1]
601*c0909341SAndroid Build Coastguard Worker    lea            r9d, [xendq*5]        ; r9 = col_end8 * 5
602*c0909341SAndroid Build Coastguard Worker    lea        xstartd, [xstartq*5]
603*c0909341SAndroid Build Coastguard Worker    sub          yendd, ystartd          ; number of rows (becomes h)
604*c0909341SAndroid Build Coastguard Worker    add        ystartd, ystartd          ; row_start8 * 2
605*c0909341SAndroid Build Coastguard Worker    lea        strideq, [strideq*5]
606*c0909341SAndroid Build Coastguard Worker    sub        xstartq, r9               ; (col_start8 - col_end8) * 5, negative offset
607*c0909341SAndroid Build Coastguard Worker    add          xendd, r9d              ; col_end8 * 6
608*c0909341SAndroid Build Coastguard Worker    add            rpq, r9               ; rp += col_end8 * 5
; From here on: ref_sign's register is reused as x, yend's as h, and the
; scratch register r9 as cand.
609*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
610*c0909341SAndroid Build Coastguard Worker.loop_y:
611*c0909341SAndroid Build Coastguard Worker    and        ystartd, 30               ; wrap the doubled row index to 0..30
612*c0909341SAndroid Build Coastguard Worker    mov             xq, xstartq          ; restart x at the left edge
613*c0909341SAndroid Build Coastguard Worker    mov             bq, [rrq+ystartq*8]  ; b = rr[ystart] (8-byte pointers)
614*c0909341SAndroid Build Coastguard Worker    add        ystartd, 2                ; index for the next row
615*c0909341SAndroid Build Coastguard Worker    lea             bq, [bq+xendq*4]     ; b += col_end8 * 24 bytes
616*c0909341SAndroid Build Coastguard Worker.loop_x:
; cand*8 equals (x/5)*24, so [bq+candq*8+12] is 12 bytes into the 24-byte group
; of the current column. The table holds two bytes per bs value: byte 0 is
; added to cand (advance to the next candidate), byte 1 is the offset of the
; matching .writeN routine relative to .write1.
617*c0909341SAndroid Build Coastguard Worker    imul         candq, xq, 0x9999
618*c0909341SAndroid Build Coastguard Worker    sar          candq, 16                   ; x / 5 * 3
619*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
620*c0909341SAndroid Build Coastguard Worker    movu           xm0, [bq+candq*8+12]      ; cand_b
621*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
622*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
623*c0909341SAndroid Build Coastguard Worker    add            r10, r12                  ; r10 = write routine for candidate 0
624*c0909341SAndroid Build Coastguard Worker    add          candq, r11                  ; step to the next candidate
625*c0909341SAndroid Build Coastguard Worker    jge .calc                                ; cand >= 0: no second candidate in this row
626*c0909341SAndroid Build Coastguard Worker    vinserti128     m0, [bq+candq*8+12], 1   ; second candidate goes in the upper lane
627*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [bq+candq*8+22]
628*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
629*c0909341SAndroid Build Coastguard Worker    add            r11, r12                  ; r11 = write routine for candidate 1
630*c0909341SAndroid Build Coastguard Worker.calc:
; Build a per-lane pshufb control in m1 from the two conditions noted below and
; apply it to m0. The exact byte selection depends on the save_ref_shuf,
; save_cond0/1 and pb_128 constants, which are defined elsewhere in the file.
631*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m0, m3
632*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m0
633*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m1  ; ref > 0 && ref_sign[ref - 1]
634*c0909341SAndroid Build Coastguard Worker    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
635*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m1, m2
636*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m1, q2301
637*c0909341SAndroid Build Coastguard Worker    pand            m1, m5      ; b0.cond0 b1.cond0
638*c0909341SAndroid Build Coastguard Worker    pand            m2, m6      ; b0.cond1 b1.cond1
639*c0909341SAndroid Build Coastguard Worker    por             m1, m2      ; b0.shuf b1.shuf
640*c0909341SAndroid Build Coastguard Worker    pxor            m1, m7      ; if cond0|cond1 == 0 => zero out
641*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m1
; Every .writeN routine ends with "add xq, 5*N" followed by ret, so the flags
; tested after each call reflect the updated x.
642*c0909341SAndroid Build Coastguard Worker    call           r10
643*c0909341SAndroid Build Coastguard Worker    jge .next_line              ; x >= 0: row complete
644*c0909341SAndroid Build Coastguard Worker    vextracti128   xm0, m0, 1   ; bring candidate 1 into the low lane
645*c0909341SAndroid Build Coastguard Worker    call           r11
646*c0909341SAndroid Build Coastguard Worker    jl .loop_x                  ; x < 0: more columns left in this row
647*c0909341SAndroid Build Coastguard Worker.next_line:
648*c0909341SAndroid Build Coastguard Worker    add            rpq, strideq
649*c0909341SAndroid Build Coastguard Worker    dec             hd
650*c0909341SAndroid Build Coastguard Worker    jg .loop_y
651*c0909341SAndroid Build Coastguard Worker    RET
; ---- write routines: reached only through "call r10" / "call r11" ----
; Input is xm0; each one stores N*5 bytes at [rpq+xq] and advances xq by N*5.
; xm1/xm2 are used as scratch; m0 itself is left untouched.
652*c0909341SAndroid Build Coastguard Worker.write1:
; 4 + 1 bytes = one 5-byte entry
653*c0909341SAndroid Build Coastguard Worker    movd   [rpq+xq+ 0], xm0
654*c0909341SAndroid Build Coastguard Worker    pextrb [rpq+xq+ 4], xm0, 4
655*c0909341SAndroid Build Coastguard Worker    add             xq, 5*1
656*c0909341SAndroid Build Coastguard Worker    ret
657*c0909341SAndroid Build Coastguard Worker.write2:
; 8 bytes at +0 and 4 bytes at +6 (overlapping) cover 10 bytes
658*c0909341SAndroid Build Coastguard Worker    movq    [rpq+xq+0], xm0
659*c0909341SAndroid Build Coastguard Worker    psrlq          xm1, xm0, 8
660*c0909341SAndroid Build Coastguard Worker    movd    [rpq+xq+6], xm1
661*c0909341SAndroid Build Coastguard Worker    add             xq, 5*2
662*c0909341SAndroid Build Coastguard Worker    ret
663*c0909341SAndroid Build Coastguard Worker.write4:
; 16 bytes at +0 and 4 bytes at +16 cover 20 bytes
664*c0909341SAndroid Build Coastguard Worker    pshufb         xm1, xm0, xm8
665*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], xm1
666*c0909341SAndroid Build Coastguard Worker    psrlq          xm1, 8
667*c0909341SAndroid Build Coastguard Worker    movd   [rpq+xq+16], xm1
668*c0909341SAndroid Build Coastguard Worker    add             xq, 5*4
669*c0909341SAndroid Build Coastguard Worker    ret
670*c0909341SAndroid Build Coastguard Worker.write8:
; duplicate xm0 into both lanes, then 32 bytes at +0 and 8 bytes at +32 = 40 bytes
671*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m0, xm0, 1
672*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m8
673*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m1
674*c0909341SAndroid Build Coastguard Worker    psrldq         xm1, 2
675*c0909341SAndroid Build Coastguard Worker    movq   [rpq+xq+32], xm1
676*c0909341SAndroid Build Coastguard Worker    add             xq, 5*8
677*c0909341SAndroid Build Coastguard Worker    ret
678*c0909341SAndroid Build Coastguard Worker.write16:
; duplicate xm0 into both lanes, then 32 + 32 + 16 bytes = 80 bytes
679*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m0, xm0, 1
680*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m1, m8
681*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m2
682*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m9
683*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+32], m1
684*c0909341SAndroid Build Coastguard Worker    shufps         xm2, xm1, q1021
685*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+64], xm2
686*c0909341SAndroid Build Coastguard Worker    add             xq, 5*16
687*c0909341SAndroid Build Coastguard Worker    ret
688*c0909341SAndroid Build Coastguard Worker
; splat_mv, AVX2 version (still assembled under the INIT_YMM avx2 set above).
; Arguments in order: rr, a, bx4, bw4, bh4.
; For each of bh4 row pointers read from rr[], fill the bw4 consecutive 12-byte
; slots that end at column bx4+bw4 with a repeating pattern built from the
; 16 bytes at [a]. The pattern itself is produced by splat_mv_shuf, which is
; defined elsewhere; presumably it repeats the first 12 bytes of [a] - confirm
; against that table.
689*c0909341SAndroid Build Coastguard Workercglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
690*c0909341SAndroid Build Coastguard Worker    add           bx4d, bw4d             ; bx4 = end column (bx4 + bw4)
691*c0909341SAndroid Build Coastguard Worker    tzcnt         bw4d, bw4d             ; trailing-zero count = log2(bw4) for power-of-two widths
692*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m0, [aq]             ; 16 source bytes in both lanes
693*c0909341SAndroid Build Coastguard Worker    lea             aq, [splat_mv_avx2_table] ; a is consumed, reuse its register
694*c0909341SAndroid Build Coastguard Worker    lea           bx4q, [bx4q*3-32]      ; scaled by 4 below: end*12 - 128 bytes
695*c0909341SAndroid Build Coastguard Worker    movsxd        bw4q, [aq+bw4q*4]      ; 32-bit table entry, relative to the table
696*c0909341SAndroid Build Coastguard Worker    pshufb          m0, [splat_mv_shuf]
697*c0909341SAndroid Build Coastguard Worker    movifnidn     bh4d, bh4m
; m1 and m2 are dword rotations of m0, so storing m0, m1, m2 back to back
; presumably continues the 12-byte pattern across 96 bytes (depends on
; splat_mv_shuf).
698*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m0, q2102
699*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m0, q1021
700*c0909341SAndroid Build Coastguard Worker    add           bw4q, aq               ; bw4 = absolute address of the width handler
701*c0909341SAndroid Build Coastguard Worker.loop:
702*c0909341SAndroid Build Coastguard Worker    mov             aq, [rrq]            ; next row pointer
703*c0909341SAndroid Build Coastguard Worker    add            rrq, gprsize
704*c0909341SAndroid Build Coastguard Worker    lea             aq, [aq+bx4q*4]      ; row + (bx4+bw4)*12 - 128: region ends at aq+128
705*c0909341SAndroid Build Coastguard Worker    jmp           bw4q                   ; dispatch to one of the .wN labels
; .w32 falls through .w16 into .w8: 12 / 6 / 3 stores of 32 bytes, i.e.
; 384 / 192 / 96 bytes, all ending at aq+128. These use aligned stores (mova).
706*c0909341SAndroid Build Coastguard Worker.w32:
707*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*8], m0
708*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*7], m1
709*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*6], m2
710*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*5], m0
711*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*4], m1
712*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*3], m2
713*c0909341SAndroid Build Coastguard Worker.w16:
714*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*2], m0
715*c0909341SAndroid Build Coastguard Worker    mova     [aq-32*1], m1
716*c0909341SAndroid Build Coastguard Worker    mova     [aq+32*0], m2
717*c0909341SAndroid Build Coastguard Worker.w8:
718*c0909341SAndroid Build Coastguard Worker    mova     [aq+32*1], m0
719*c0909341SAndroid Build Coastguard Worker    mova     [aq+32*2], m1
720*c0909341SAndroid Build Coastguard Worker    mova     [aq+32*3], m2
721*c0909341SAndroid Build Coastguard Worker    dec           bh4d
722*c0909341SAndroid Build Coastguard Worker    jg .loop
723*c0909341SAndroid Build Coastguard Worker    RET
724*c0909341SAndroid Build Coastguard Worker.w4:
; 32 + 16 bytes = 48 bytes ending at aq+128
725*c0909341SAndroid Build Coastguard Worker    movu      [aq+ 80], m0
726*c0909341SAndroid Build Coastguard Worker    mova      [aq+112], xm1
727*c0909341SAndroid Build Coastguard Worker    dec           bh4d
728*c0909341SAndroid Build Coastguard Worker    jg .loop
729*c0909341SAndroid Build Coastguard Worker    RET
730*c0909341SAndroid Build Coastguard Worker.w2:
; 16 + 8 bytes = 24 bytes ending at aq+128
731*c0909341SAndroid Build Coastguard Worker    movu      [aq+104], xm0
732*c0909341SAndroid Build Coastguard Worker    movq      [aq+120], xm2
733*c0909341SAndroid Build Coastguard Worker    dec           bh4d
734*c0909341SAndroid Build Coastguard Worker    jg .loop
735*c0909341SAndroid Build Coastguard Worker    RET
736*c0909341SAndroid Build Coastguard Worker.w1:
; 8 + 4 bytes = 12 bytes ending at aq+128
737*c0909341SAndroid Build Coastguard Worker    movq      [aq+116], xm0
738*c0909341SAndroid Build Coastguard Worker    movd      [aq+124], xm1
739*c0909341SAndroid Build Coastguard Worker    dec           bh4d
740*c0909341SAndroid Build Coastguard Worker    jg .loop
741*c0909341SAndroid Build Coastguard Worker    RET
742*c0909341SAndroid Build Coastguard Worker
743*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
744*c0909341SAndroid Build Coastguard Worker; refmvs_temporal_block *rp, ptrdiff_t stride,
745*c0909341SAndroid Build Coastguard Worker; refmvs_block **rr, uint8_t *ref_sign,
746*c0909341SAndroid Build Coastguard Worker; int col_end8, int row_end8, int col_start8, int row_start8
;
; save_tmvs, AVX-512 (Ice Lake) version.
; For (row_end8 - row_start8) rows, walk the candidate blocks of one rr[] row
; from col_start8 up to col_end8, filter every candidate (see .calc) and store
; the result into rp as packed 5-byte entries. Up to four candidates are
; handled per .loop_x iteration, one per 128-bit lane.
;
; Register roles once the setup code has run:
;   rp     = rp + col_end8*5, so rows are addressed with a negative x offset
;   x      = byte offset into the rp row, starts at (col_start8-col_end8)*5 and
;            counts up; the row is finished once it is >= 0
;   xend   = col_end8*6 (used as b + xend*4, i.e. col_end8*24 bytes)
;   stride = stride*5 (distance between rp rows in bytes)
;   h      = row_end8 - row_start8 (rows left)
;   ystart = row_start8*2, masked to 0..30 before it indexes rr[]
;   r14    = address of .write1; constants are addressed as base+label and the
;            write routines as r14 + a byte read from save_tmvs_avx512icl_table
;   r10-r13 = write-routine addresses for candidates 0-3
;   k2     = 0x1f, i.e. the low five elements of a masked store
747*c0909341SAndroid Build Coastguard Workercglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
748*c0909341SAndroid Build Coastguard Worker                              xend, yend, xstart, ystart
; "base" lets every constant below be reached relative to r14 (= .write1).
749*c0909341SAndroid Build Coastguard Worker%define base r14-.write1
750*c0909341SAndroid Build Coastguard Worker    lea            r14, [.write1]
751*c0909341SAndroid Build Coastguard Worker    movifnidn    xendd, xendm
752*c0909341SAndroid Build Coastguard Worker    movifnidn    yendd, yendm
753*c0909341SAndroid Build Coastguard Worker    mov        xstartd, xstartm
754*c0909341SAndroid Build Coastguard Worker    mov        ystartd, ystartm
755*c0909341SAndroid Build Coastguard Worker    psllq           m4, [ref_signq]{bcstq}, 8 ; per qword: byte 0 = 0, byte i = ref_sign[i-1]
756*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m3, [base+save_ref_shuf+8]
757*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m5, [base+cond_shuf512]
758*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m6, [base+save_cond0]
759*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [base+pb_128]
760*c0909341SAndroid Build Coastguard Worker    mova            m8, [base+save_pack0]
761*c0909341SAndroid Build Coastguard Worker    movu           xm9, [base+save_pack0+4]
762*c0909341SAndroid Build Coastguard Worker    lea            r9d, [xendq*5]        ; r9 = col_end8 * 5
763*c0909341SAndroid Build Coastguard Worker    lea        xstartd, [xstartq*5]
764*c0909341SAndroid Build Coastguard Worker    sub          yendd, ystartd          ; number of rows (becomes h)
765*c0909341SAndroid Build Coastguard Worker    add        ystartd, ystartd          ; row_start8 * 2
766*c0909341SAndroid Build Coastguard Worker    lea        strideq, [strideq*5]
767*c0909341SAndroid Build Coastguard Worker    sub        xstartq, r9               ; (col_start8 - col_end8) * 5, negative offset
768*c0909341SAndroid Build Coastguard Worker    add          xendd, r9d              ; col_end8 * 6
769*c0909341SAndroid Build Coastguard Worker    add            rpq, r9               ; rp += col_end8 * 5
770*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0x1f
771*c0909341SAndroid Build Coastguard Worker    kmovb           k2, r10d             ; mask selecting the low 5 elements
; From here on: ref_sign's register is reused as x, yend's as h, and the
; scratch register r9 as cand.
772*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
773*c0909341SAndroid Build Coastguard Worker.loop_y:
774*c0909341SAndroid Build Coastguard Worker    and        ystartd, 30               ; wrap the doubled row index to 0..30
775*c0909341SAndroid Build Coastguard Worker    mov             xq, xstartq          ; restart x at the left edge
776*c0909341SAndroid Build Coastguard Worker    mov             bq, [rrq+ystartq*8]  ; b = rr[ystart] (8-byte pointers)
777*c0909341SAndroid Build Coastguard Worker    add        ystartd, 2                ; index for the next row
778*c0909341SAndroid Build Coastguard Worker    lea             bq, [bq+xendq*4]     ; b += col_end8 * 24 bytes
779*c0909341SAndroid Build Coastguard Worker.loop_x:
; cand*8 equals (x/5)*24, so [bq+candq*8+12] is 12 bytes into the 24-byte group
; of the current column. The table holds two bytes per bs value: byte 0 is
; added to cand (advance to the next candidate), byte 1 is the offset of the
; matching .writeN routine relative to .write1. Candidates are gathered until
; four lanes are filled or cand reaches the end of the row (>= 0).
780*c0909341SAndroid Build Coastguard Worker    imul         candq, xq, 0x9999
781*c0909341SAndroid Build Coastguard Worker    sar          candq, 16                   ; x / 5 * 3
782*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
783*c0909341SAndroid Build Coastguard Worker    movu           xm0, [bq+candq*8+12]      ; cand_b
784*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
785*c0909341SAndroid Build Coastguard Worker    movzx         r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
786*c0909341SAndroid Build Coastguard Worker    add            r10, r14                  ; r10 = write routine for candidate 0
787*c0909341SAndroid Build Coastguard Worker    add          candq, r11                  ; step to the next candidate
788*c0909341SAndroid Build Coastguard Worker    jge .calc                                ; end of row: only one candidate
789*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [bq+candq*8+22]
790*c0909341SAndroid Build Coastguard Worker    vinserti32x4   ym0, [bq+candq*8+12], 1   ; candidate 1 -> lane 1
791*c0909341SAndroid Build Coastguard Worker    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
792*c0909341SAndroid Build Coastguard Worker    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
793*c0909341SAndroid Build Coastguard Worker    add            r11, r14                  ; r11 = write routine for candidate 1
794*c0909341SAndroid Build Coastguard Worker    add          candq, r12
795*c0909341SAndroid Build Coastguard Worker    jge .calc                                ; end of row: two candidates
796*c0909341SAndroid Build Coastguard Worker    movzx         r12d, byte [bq+candq*8+22]
797*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m0, [bq+candq*8+12], 2   ; candidate 2 -> lane 2
798*c0909341SAndroid Build Coastguard Worker    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
799*c0909341SAndroid Build Coastguard Worker    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
800*c0909341SAndroid Build Coastguard Worker    add            r12, r14                  ; r12 = write routine for candidate 2
801*c0909341SAndroid Build Coastguard Worker    add          candq, r13
802*c0909341SAndroid Build Coastguard Worker    jge .calc                                ; end of row: three candidates
803*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m0, [bq+candq*8+12], 3   ; candidate 3 -> lane 3
804*c0909341SAndroid Build Coastguard Worker    movzx         r13d, byte [bq+candq*8+22]
805*c0909341SAndroid Build Coastguard Worker    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
806*c0909341SAndroid Build Coastguard Worker    add            r13, r14                  ; r13 = write routine for candidate 3
807*c0909341SAndroid Build Coastguard Worker.calc:
; Build a per-lane pshufb control in m1 from the two conditions noted below and
; apply it to m0. The exact byte selection depends on the save_ref_shuf,
; cond_shuf512, save_cond0 and pb_128 constants, defined elsewhere in the file.
; The shuffled result is kept in m2 because several write routines overwrite m0.
808*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m0, m3
809*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m0
810*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m1      ; ref > 0 && ref_sign[ref - 1]
811*c0909341SAndroid Build Coastguard Worker    psrlw           m2, 12          ; (abs(mv.x) | abs(mv.y)) < 4096
812*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1
813*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5           ; c0 c1 c1 c0
814*c0909341SAndroid Build Coastguard Worker    pand            m2, m6
815*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m1, m2, m2
816*c0909341SAndroid Build Coastguard Worker    vpternlogd      m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
817*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m0, m1
818*c0909341SAndroid Build Coastguard Worker    mova           xm0, xm2         ; candidate 0 in xm0
; Every .writeN routine ends with "add xq, 5*N" followed by ret, so the flags
; tested after each call reflect the updated x (x >= 0 means the row is done).
819*c0909341SAndroid Build Coastguard Worker    call           r10
820*c0909341SAndroid Build Coastguard Worker    jge .next_line
821*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm0, m2, 1       ; candidate 1
822*c0909341SAndroid Build Coastguard Worker    call           r11
823*c0909341SAndroid Build Coastguard Worker    jge .next_line
824*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm0, m2, 2       ; candidate 2
825*c0909341SAndroid Build Coastguard Worker    call           r12
826*c0909341SAndroid Build Coastguard Worker    jge .next_line
827*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm0, m2, 3       ; candidate 3
828*c0909341SAndroid Build Coastguard Worker    call           r13
829*c0909341SAndroid Build Coastguard Worker    jl .loop_x                      ; x < 0: more columns left in this row
830*c0909341SAndroid Build Coastguard Worker.next_line:
831*c0909341SAndroid Build Coastguard Worker    add            rpq, strideq
832*c0909341SAndroid Build Coastguard Worker    dec             hd
833*c0909341SAndroid Build Coastguard Worker    jg .loop_y
834*c0909341SAndroid Build Coastguard Worker    RET
; ---- write routines: reached only through "call r10" .. "call r13" ----
; Input is xm0; each one stores N*5 bytes at [rpq+xq] and advances xq by N*5.
; With k2 = 0x1f a masked store writes five elements, so the element size
; (byte/word/dword/qword) selects 5/10/20/40 bytes.
835*c0909341SAndroid Build Coastguard Worker.write1:
836*c0909341SAndroid Build Coastguard Worker    vmovdqu8 [rpq+xq]{k2}, xm0      ; 5 bytes
837*c0909341SAndroid Build Coastguard Worker    add             xq, 5*1
838*c0909341SAndroid Build Coastguard Worker    ret
839*c0909341SAndroid Build Coastguard Worker.write2:
840*c0909341SAndroid Build Coastguard Worker    pshufb         xm0, xm8
841*c0909341SAndroid Build Coastguard Worker    vmovdqu16 [rpq+xq]{k2}, xm0     ; 5 words = 10 bytes
842*c0909341SAndroid Build Coastguard Worker    add             xq, 5*2
843*c0909341SAndroid Build Coastguard Worker    ret
844*c0909341SAndroid Build Coastguard Worker.write4:
845*c0909341SAndroid Build Coastguard Worker    vpermb         ym0, ym8, ym0
846*c0909341SAndroid Build Coastguard Worker    vmovdqu32 [rpq+xq]{k2}, ym0     ; 5 dwords = 20 bytes
847*c0909341SAndroid Build Coastguard Worker    add             xq, 5*4
848*c0909341SAndroid Build Coastguard Worker    ret
849*c0909341SAndroid Build Coastguard Worker.write8:
850*c0909341SAndroid Build Coastguard Worker    vpermb          m0, m8, m0
851*c0909341SAndroid Build Coastguard Worker    vmovdqu64 [rpq+xq]{k2}, m0      ; 5 qwords = 40 bytes
852*c0909341SAndroid Build Coastguard Worker    add             xq, 5*8
853*c0909341SAndroid Build Coastguard Worker    ret
854*c0909341SAndroid Build Coastguard Worker.write16:
; unmasked: 64 bytes at +0 and 16 bytes at +64 = 80 bytes
855*c0909341SAndroid Build Coastguard Worker    vpermb          m1, m8, m0
856*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+ 0], m1
857*c0909341SAndroid Build Coastguard Worker    pshufb         xm0, xm9
858*c0909341SAndroid Build Coastguard Worker    movu   [rpq+xq+64], xm0
859*c0909341SAndroid Build Coastguard Worker    add             xq, 5*16
860*c0909341SAndroid Build Coastguard Worker    ret
861*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; splat_mv, AVX-512 (avx512icl) version.
;
; Named arguments, in cglobal order: rr, a, bx4, bw4, bh4.
;   rr  - array of 8-byte row pointers (read as [rrq+row*8])
;   a   - 16 bytes are loaded from here and used as the fill pattern
;   bx4 - column index; turned into a byte offset of bx4*12 below
;   bw4 - width; tzcnt(bw4) indexes splat_mv_avx512icl_table to pick one
;         of the .wN paths (presumably bw4 is a power of two and the
;         table maps log2(bw4) to .w<bw4> - the table is defined outside
;         this chunk, TODO confirm)
;   bh4 - number of rows to fill
;
; Under that assumption each row receives bw4*12 bytes starting at
; rr[row] + bx4*12: 12/24/48 bytes via masked stores for .w1/.w2/.w4 and
; 96/192/384 bytes via full-register stores for .w8/.w16/.w32.
;
; All row loops are do-while shaped (store first, test afterwards), so
; bh4 is assumed to be >= 1.
;
; cglobal parameters: 4 arguments are loaded into registers, 7 GPRs and
; 3 vector registers are used. bh4 is the 5th argument and is therefore
; fetched explicitly through bh4m.
;-----------------------------------------------------------------------
862*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
863*c0909341SAndroid Build Coastguard Workercglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    ; Replicate the 16 source bytes into all four 128-bit lanes of m0.
864*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4    m0, [aq]
    ; r1 (the register that held a) is free from here on; use it as the
    ; jump-table base.
865*c0909341SAndroid Build Coastguard Worker    lea                r1, [splat_mv_avx512icl_table]
    ; bw4 -> index of its lowest set bit (log2 for a power of two).
866*c0909341SAndroid Build Coastguard Worker    tzcnt            bw4d, bw4d
    ; bx4 *= 3; every later address uses bx4q*4, giving bx4*12 bytes.
867*c0909341SAndroid Build Coastguard Worker    lea              bx4d, [bx4q*3]
    ; Per-byte shuffle of m0 by the splat_mv_shuf constant (defined
    ; outside this chunk). Presumably this lays the 12-byte pattern out
    ; so that it repeats contiguously across the register - TODO confirm.
868*c0909341SAndroid Build Coastguard Worker    pshufb             m0, [splat_mv_shuf]
    ; Table entries are signed 32-bit offsets relative to the table base;
    ; sign-extend the entry and add the base to get the branch target.
869*c0909341SAndroid Build Coastguard Worker    movsxd           bw4q, [r1+bw4q*4]
870*c0909341SAndroid Build Coastguard Worker    mov               r6d, bh4m
871*c0909341SAndroid Build Coastguard Worker    add              bw4q, r1
    ; Point rrq just past the last row pointer and count r6 from -bh4 up
    ; to 0, so [rrq+r6*8] walks the rows in order and the loops can end
    ; on the flags of the increment alone.
872*c0909341SAndroid Build Coastguard Worker    lea               rrq, [rrq+r6*8]
    ; k1 = 0b111111: enables the low 6 elements of a masked store. With
    ; 2/4/8-byte elements that is 12/24/48 bytes (used by .w1/.w2/.w4).
873*c0909341SAndroid Build Coastguard Worker    mov               r1d, 0x3f
874*c0909341SAndroid Build Coastguard Worker    neg                r6
875*c0909341SAndroid Build Coastguard Worker    kmovb              k1, r1d
876*c0909341SAndroid Build Coastguard Worker    jmp              bw4q
    ; --- 12 bytes per row: 6 masked words from xm0 ---
877*c0909341SAndroid Build Coastguard Worker.w1:
    ; r1 = destination pointer for the current row.
878*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8]
879*c0909341SAndroid Build Coastguard Worker    vmovdqu16 [r1+bx4q*4]{k1}, xm0
880*c0909341SAndroid Build Coastguard Worker    inc                r6
    ; Loop while r6 is still negative, i.e. rows remain.
881*c0909341SAndroid Build Coastguard Worker    jl .w1
882*c0909341SAndroid Build Coastguard Worker    RET
    ; --- 24 bytes per row: 6 masked dwords from ym0 ---
883*c0909341SAndroid Build Coastguard Worker.w2:
884*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8]
885*c0909341SAndroid Build Coastguard Worker    vmovdqu32 [r1+bx4q*4]{k1}, ym0
886*c0909341SAndroid Build Coastguard Worker    inc                r6
887*c0909341SAndroid Build Coastguard Worker    jl .w2
888*c0909341SAndroid Build Coastguard Worker    RET
    ; --- 48 bytes per row: 6 masked qwords from m0 ---
889*c0909341SAndroid Build Coastguard Worker.w4:
890*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8]
891*c0909341SAndroid Build Coastguard Worker    vmovdqu64 [r1+bx4q*4]{k1}, m0
892*c0909341SAndroid Build Coastguard Worker    inc                r6
893*c0909341SAndroid Build Coastguard Worker    jl .w4
894*c0909341SAndroid Build Coastguard Worker    RET
    ; --- 96 bytes per row: 64 bytes of m0 followed by 32 bytes of ym1 ---
895*c0909341SAndroid Build Coastguard Worker.w8:
    ; ym1 = per-lane dword rotation of the low half of m0, used for
    ; bytes 64..95 of each row (presumably the continuation of the
    ; repeating pattern - depends on splat_mv_shuf, TODO confirm).
896*c0909341SAndroid Build Coastguard Worker    pshufd            ym1, ym0, q1021
    ; Two rows per iteration (r6 += 2). If bh4 were odd, the final pass
    ; would read one row pointer past the end of rr.
    ; NOTE(review): callers presumably guarantee an even bh4 for this
    ; width - confirm.
897*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; r3 (the register that held bw4) is dead after the dispatch jump
    ; and is reused for the second row pointer.
898*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8+0]
899*c0909341SAndroid Build Coastguard Worker    mov                r3, [rrq+r6*8+8]
    ; The 64-byte store is unaligned (movu); the trailing 32-byte store
    ; uses the aligned form (mova), so destination+64 is assumed to be
    ; 32-byte aligned here.
900*c0909341SAndroid Build Coastguard Worker    movu   [r1+bx4q*4+ 0], m0
901*c0909341SAndroid Build Coastguard Worker    mova   [r1+bx4q*4+64], ym1
902*c0909341SAndroid Build Coastguard Worker    movu   [r3+bx4q*4+ 0], m0
903*c0909341SAndroid Build Coastguard Worker    mova   [r3+bx4q*4+64], ym1
904*c0909341SAndroid Build Coastguard Worker    add                r6, 2
905*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
906*c0909341SAndroid Build Coastguard Worker    RET
    ; --- 192 bytes per row: m0, m1, m2 back to back ---
907*c0909341SAndroid Build Coastguard Worker.w16:
    ; m1/m2 = two further per-lane dword rotations of m0, stored after
    ; m0 to form bytes 64..127 and 128..191 of each row.
908*c0909341SAndroid Build Coastguard Worker    pshufd             m1, m0, q1021
909*c0909341SAndroid Build Coastguard Worker    pshufd             m2, m0, q2102
    ; Two rows per iteration, so the same even-bh4 assumption as .w8
    ; applies. All stores are aligned (mova): the destination is assumed
    ; to be 64-byte aligned - NOTE(review): confirm against callers.
910*c0909341SAndroid Build Coastguard Worker.w16_loop:
911*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8+0]
912*c0909341SAndroid Build Coastguard Worker    mov                r3, [rrq+r6*8+8]
913*c0909341SAndroid Build Coastguard Worker    mova [r1+bx4q*4+64*0], m0
914*c0909341SAndroid Build Coastguard Worker    mova [r1+bx4q*4+64*1], m1
915*c0909341SAndroid Build Coastguard Worker    mova [r1+bx4q*4+64*2], m2
916*c0909341SAndroid Build Coastguard Worker    mova [r3+bx4q*4+64*0], m0
917*c0909341SAndroid Build Coastguard Worker    mova [r3+bx4q*4+64*1], m1
918*c0909341SAndroid Build Coastguard Worker    mova [r3+bx4q*4+64*2], m2
919*c0909341SAndroid Build Coastguard Worker    add                r6, 2
920*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
921*c0909341SAndroid Build Coastguard Worker    RET
    ; --- 384 bytes per row: the m0, m1, m2 sequence written twice ---
922*c0909341SAndroid Build Coastguard Worker.w32:
923*c0909341SAndroid Build Coastguard Worker    pshufd             m1, m0, q1021
924*c0909341SAndroid Build Coastguard Worker    pshufd             m2, m0, q2102
    ; One row per iteration. The destination address is computed once
    ; with lea and then addressed with constant offsets only. Stores are
    ; aligned (mova), so the destination is assumed 64-byte aligned.
925*c0909341SAndroid Build Coastguard Worker.w32_loop:
926*c0909341SAndroid Build Coastguard Worker    mov                r1, [rrq+r6*8]
927*c0909341SAndroid Build Coastguard Worker    lea                r1, [r1+bx4q*4]
928*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*0], m0
929*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*1], m1
930*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*2], m2
931*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*3], m0
932*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*4], m1
933*c0909341SAndroid Build Coastguard Worker    mova        [r1+64*5], m2
934*c0909341SAndroid Build Coastguard Worker    inc                r6
935*c0909341SAndroid Build Coastguard Worker    jl .w32_loop
936*c0909341SAndroid Build Coastguard Worker    RET
937*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
938