xref: /aosp_15_r20/external/libdav1d/src/x86/ipred16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA
30
; Read-only tables shared by the 16bpc SSSE3 intra predictors below.
; The db tables are pshufb control masks and per-block-size edge-filter
; threshold lists; the dw tables hold 1/64th-pel step values, filter
; kernel rows and small word constants.
31filter_shuf:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
32pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
33z_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
34z_base_inc_z2: dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64
35z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
36z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1,  8,  9,  8,  9, 10, 11, 12, 13
37               db  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
38z2_top_shufA:  db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
39z2_top_shufB:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
40z2_left_shufA: db 14, 15, 12, 13, 10, 11,  8,  9, 12, 13, 10, 11,  8,  9,  6,  7
41z2_left_shufB: db 14, 15, 10, 11,  6,  7,  2,  3, 12, 13,  8,  9,  4,  5,  0,  1
42z_filt_wh16:   db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
43z_filt_t_w48:  db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
44               db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
45z_filt_t_w16:  db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
46z_filt_wh4:    db  7,  7, 19,  7,
47z_filt_wh8:    db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
48ALIGN 8
; The 8-byte constants below are broadcast to 128 bits with movddup.
49pb_2_3:   times 4 db 2, 3
50z2_dy_offset:     dw 96*64, 96*64, 95*64, 95*64
51z_filt_k: times 4 dw 8
52          times 4 dw 6
53          times 4 dw 4
54          times 4 dw 5
55pw_m3584: times 4 dw -3584
56pw_m3072: times 4 dw -3072
57pw_m2560: times 4 dw -2560
58pw_m2048: times 4 dw -2048
59pw_m1536: times 4 dw -1536
60pw_m1024: times 4 dw -1024
61pw_m512:  times 4 dw -512
62pw_1:     times 4 dw 1
63pw_2:     times 4 dw 2
64pw_3:     times 4 dw 3
65pw_62:    times 4 dw 62
66pw_256:   times 4 dw 256
67pw_512:   times 4 dw 512
68pw_2048:  times 4 dw 2048
69
; Aliases to avoid duplicating data: pw_4/pw_8 reuse rows of z_filt_k,
; and pw_m1to4 reuses the first 8 bytes of z2_upsample_l, which read as
; the words -1, -2, -3, -4.
70%define pw_4 (z_filt_k+8*2)
71%define pw_8 (z_filt_k+8*0)
72%define pw_m1to4 z2_upsample_l
73
; JMP_TABLE name, suffix, label...
; Emits a jump table of 32-bit entries, each the offset of
; name_suffix.label measured from (%%table - 2*4), and %xdefines
; name_suffix_table as that biased base so dispatchers can do
; movsxd + add of the table address.
74%macro JMP_TABLE 3-*
75    %xdefine %1_%2_table (%%table - 2*4)
76    %xdefine %%base mangle(private_prefix %+ _%1_%2)
77    %%table:
78    %rep %0 - 2
79        dd %%base %+ .%3 - (%%table - 2*4)
80        %rotate 1
81    %endrep
82%endmacro
83
; Derived table aliases: the splat-store and dc_128 entry groups live
; inside the ipred_dc / ipred_cfl tables at fixed entry offsets
; (hence the -10*4 / -15*4 / -8*4 biased labels in the tables below).
84%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
85%define ipred_dc_128_16bpc_ssse3_table   (ipred_dc_16bpc_ssse3_table + 15*4)
86%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
87
88JMP_TABLE ipred_dc_left_16bpc,    ssse3, h4, h8, h16, h32, h64
89JMP_TABLE ipred_dc_16bpc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
90                                         s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
91                                         s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
92JMP_TABLE ipred_h_16bpc,          ssse3, w4, w8, w16, w32, w64
93JMP_TABLE ipred_z1_16bpc,         ssse3, w4, w8, w16, w32, w64
94JMP_TABLE ipred_z2_16bpc,         ssse3, w4, w8, w16, w32, w64
95JMP_TABLE ipred_z3_16bpc,         ssse3, h4, h8, h16, h32, h64
96JMP_TABLE ipred_cfl_16bpc,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
97                                         s4-8*4, s8-8*4, s16-8*4, s32-8*4
98JMP_TABLE ipred_cfl_left_16bpc,   ssse3, h4, h8, h16, h32
99JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
100JMP_TABLE pal_pred_16bpc,         ssse3, w4, w8, w16, w32, w64
101
; Tables defined elsewhere in the library.
102cextern smooth_weights_1d_16bpc
103cextern smooth_weights_2d_16bpc
104cextern dr_intra_derivative
105cextern filter_intra_taps
106
107SECTION .text
108
109INIT_XMM ssse3
;---------------------------------------------------------------------
; ipred_dc_top: DC prediction from the top edge only.
; Sets m3 = 0, m4 = (w+1)>>1 rounding bias, m5 = log2(w) shift, loads
; the first 8 top pixels into m0, then jumps (r6) into the
; ipred_dc_left .h* accumulation code indexed by log2(w); the final
; store target (wq) is taken from the ipred_dc_128 table group.
;---------------------------------------------------------------------
110cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
111    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
112    movd                 m4, wm
113    tzcnt                wd, wm
114    add                 tlq, 2              ; skip the topleft pixel -> top row
115    movifnidn            hd, hm
116    pxor                 m3, m3
117    pavgw                m4, m3              ; m4 = (w+1)>>1 rounding bias
118    movd                 m5, wd              ; m5 = log2(w)
119    movu                 m0, [tlq]
120    movsxd               r6, [r5+wq*4]
121    add                  r6, r5
122    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
123    movsxd               wq, [r5+wq*4]
124    add                  wq, r5
125    jmp                  r6
126
;---------------------------------------------------------------------
; ipred_dc_left: DC prediction from the left edge only.
; tlq is moved back by 2*h bytes (two subs of hq on the 2-byte/pixel
; buffer) to the start of the left column; m3 = 0, m4 = (h+1)>>1
; rounding bias, m5 = log2(h) shift.  .h64/.h32/.h16 sum the column
; 8 words at a time into m0, .h8/.h4 reduce horizontally, then the
; broadcast DC goes out through the shared store code at wq.
; ipred_dc_top jumps into the .h* labels with its own m0/m4/m5 setup.
;---------------------------------------------------------------------
127cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
128    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
129    mov                  hd, hm
130    movd                 m4, hm
131    tzcnt               r6d, hd
132    sub                 tlq, hq
133    tzcnt                wd, wm
134    pxor                 m3, m3
135    sub                 tlq, hq             ; tlq -= 2*h total (bytes)
136    pavgw                m4, m3              ; m4 = (h+1)>>1 rounding bias
137    movd                 m5, r6d             ; m5 = log2(h)
138    movu                 m0, [tlq]
139    movsxd               r6, [r5+r6*4]
140    add                  r6, r5
141    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
142    movsxd               wq, [r5+wq*4]
143    add                  wq, r5
144    jmp                  r6
145.h64:
146    movu                 m2, [tlq+112]
147    movu                 m1, [tlq+ 96]
148    paddw                m0, m2
149    movu                 m2, [tlq+ 80]
150    paddw                m1, m2
151    movu                 m2, [tlq+ 64]
152    paddw                m0, m2
153    paddw                m0, m1
154.h32:
155    movu                 m1, [tlq+ 48]
156    movu                 m2, [tlq+ 32]
157    paddw                m1, m2
158    paddw                m0, m1
159.h16:
160    movu                 m1, [tlq+ 16]
161    paddw                m0, m1
162.h8:
163    movhlps              m1, m0
164    paddw                m0, m1
165.h4:
; widen to dwords, finish the horizontal reduction, then
; dc = (sum + bias) >> log2(n), broadcast to all 8 words of m0
166    punpcklwd            m0, m3
167    paddd                m4, m0
168    punpckhqdq           m0, m0
169    paddd                m0, m4
170    pshuflw              m4, m0, q1032
171    paddd                m0, m4
172    psrld                m0, m5
173    lea            stride3q, [strideq*3]
174    pshuflw              m0, m0, q0000
175    punpcklqdq           m0, m0
176    jmp                  wq
177
;---------------------------------------------------------------------
; ipred_dc: full DC prediction from both edges.
; r6 dispatches to .h<height> (load/sum the left column), which then
; jumps to wq = .w<width> (add the top row, divide by w+h) and falls
; into the .s* store loops, which are shared with dc_top/dc_left/
; dc_128 (via the biased table entries) and ipred_v.
; m3 = 0, m4 = (w+h)>>1 rounding bias, m5 = log2(w+h).
; For rectangular blocks w+h is 3*2^k or 5*2^k, so after the power-of-
; two shift the division is finished with a reciprocal multiply:
; 0xAAAB/2^17 ~= 1/3 and 0x6667/2^17 ~= 1/5 (pmulhuw then psrlw 1).
;---------------------------------------------------------------------
178cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
179    movifnidn            hd, hm
180    tzcnt               r6d, hd
181    lea                 r5d, [wq+hq]
182    movd                 m4, r5d
183    tzcnt               r5d, r5d
184    movd                 m5, r5d             ; m5 = log2(w+h)
185    LEA                  r5, ipred_dc_16bpc_ssse3_table
186    tzcnt                wd, wd
187    movsxd               r6, [r5+r6*4]
188    movsxd               wq, [r5+wq*4+5*4]
189    pxor                 m3, m3
190    psrlw                m4, 1               ; m4 = (w+h)>>1 rounding bias
191    add                  r6, r5
192    add                  wq, r5
193    lea            stride3q, [strideq*3]
194    jmp                  r6
195.h4:                                         ; sum of 4 left pixels -> m0
196    movq                 m0, [tlq-8]
197    jmp                  wq
198.w4:                                         ; add 4 top pixels, divide
199    movq                 m1, [tlq+2]
200    paddw                m1, m0
201    punpckhwd            m0, m3
202    punpcklwd            m1, m3
203    paddd                m0, m1
204    paddd                m4, m0
205    punpckhqdq           m0, m0
206    paddd                m0, m4
207    pshuflw              m1, m0, q1032
208    paddd                m0, m1
209    cmp                  hd, 4
210    jg .w4_mul
211    psrlw                m0, 3               ; w == h == 4: plain >>3
212    jmp .w4_end
213.w4_mul:                                     ; w+h = 12 -> /3, 20 -> /5
214    mov                 r2d, 0xAAAB
215    mov                 r3d, 0x6667
216    cmp                  hd, 16
217    cmove               r2d, r3d
218    psrld                m0, 2
219    movd                 m1, r2d
220    pmulhuw              m0, m1
221    psrlw                m0, 1
222.w4_end:
223    pshuflw              m0, m0, q0000
224.s4:                                         ; store 4-wide rows, 4 at a time
225    movq   [dstq+strideq*0], m0
226    movq   [dstq+strideq*1], m0
227    movq   [dstq+strideq*2], m0
228    movq   [dstq+stride3q ], m0
229    lea                dstq, [dstq+strideq*4]
230    sub                  hd, 4
231    jg .s4
232    RET
233.h8:                                         ; sum of 8 left pixels -> m0
234    mova                 m0, [tlq-16]
235    jmp                  wq
236.w8:
237    movu                 m1, [tlq+2]
238    paddw                m0, m1
239    punpcklwd            m1, m0, m3
240    punpckhwd            m0, m3
241    paddd                m0, m1
242    paddd                m4, m0
243    punpckhqdq           m0, m0
244    paddd                m0, m4
245    pshuflw              m1, m0, q1032
246    paddd                m0, m1
247    psrld                m0, m5
248    cmp                  hd, 8
249    je .w8_end
250    mov                 r2d, 0xAAAB          ; w+h = 12/24 -> /3
251    mov                 r3d, 0x6667          ; w+h = 40 -> /5
252    cmp                  hd, 32
253    cmove               r2d, r3d
254    movd                 m1, r2d
255    pmulhuw              m0, m1
256    psrlw                m0, 1
257.w8_end:
258    pshuflw              m0, m0, q0000
259    punpcklqdq           m0, m0
260.s8:                                         ; store 8-wide rows, 4 at a time
261    mova   [dstq+strideq*0], m0
262    mova   [dstq+strideq*1], m0
263    mova   [dstq+strideq*2], m0
264    mova   [dstq+stride3q ], m0
265    lea                dstq, [dstq+strideq*4]
266    sub                  hd, 4
267    jg .s8
268    RET
269.h16:
270    mova                 m0, [tlq-32]
271    paddw                m0, [tlq-16]
272    jmp                  wq
273.w16:
274    movu                 m1, [tlq+ 2]
275    movu                 m2, [tlq+18]
276    paddw                m1, m2
277    paddw                m0, m1
278    punpckhwd            m1, m0, m3
279    punpcklwd            m0, m3
280    paddd                m0, m1
281    paddd                m4, m0
282    punpckhqdq           m0, m0
283    paddd                m0, m4
284    pshuflw              m1, m0, q1032
285    paddd                m0, m1
286    psrld                m0, m5
287    cmp                  hd, 16
288    je .w16_end
289    mov                 r2d, 0xAAAB
290    mov                 r3d, 0x6667
291    test                 hd, 8|32             ; w+h = 24/48 -> /3, 20/80 -> /5
292    cmovz               r2d, r3d
293    movd                 m1, r2d
294    pmulhuw              m0, m1
295    psrlw                m0, 1
296.w16_end:
297    pshuflw              m0, m0, q0000
298    punpcklqdq           m0, m0
299.s16c:                                       ; splat entry: only m0 valid
300    mova                 m1, m0
301.s16:
302    mova [dstq+strideq*0+16*0], m0
303    mova [dstq+strideq*0+16*1], m1
304    mova [dstq+strideq*1+16*0], m0
305    mova [dstq+strideq*1+16*1], m1
306    mova [dstq+strideq*2+16*0], m0
307    mova [dstq+strideq*2+16*1], m1
308    mova [dstq+stride3q +16*0], m0
309    mova [dstq+stride3q +16*1], m1
310    lea                dstq, [dstq+strideq*4]
311    sub                  hd, 4
312    jg .s16
313    RET
314.h32:
315    mova                 m0, [tlq-64]
316    paddw                m0, [tlq-48]
317    paddw                m0, [tlq-32]
318    paddw                m0, [tlq-16]
319    jmp                  wq
320.w32:
321    movu                 m1, [tlq+ 2]
322    movu                 m2, [tlq+18]
323    paddw                m1, m2
324    movu                 m2, [tlq+34]
325    paddw                m0, m2
326    movu                 m2, [tlq+50]
327    paddw                m1, m2
328    paddw                m0, m1
329    punpcklwd            m1, m0, m3
330    punpckhwd            m0, m3
331    paddd                m0, m1
332    paddd                m4, m0
333    punpckhqdq           m0, m0
334    paddd                m0, m4
335    pshuflw              m1, m0, q1032
336    paddd                m0, m1
337    psrld                m0, m5
338    cmp                  hd, 32
339    je .w32_end
340    mov                 r2d, 0xAAAB          ; w+h = 48/96 -> /3
341    mov                 r3d, 0x6667          ; w+h = 40 -> /5
342    cmp                  hd, 8
343    cmove               r2d, r3d
344    movd                 m1, r2d
345    pmulhuw              m0, m1
346    psrlw                m0, 1
347.w32_end:
348    pshuflw              m0, m0, q0000
349    punpcklqdq           m0, m0
350.s32c:                                       ; splat entry: only m0 valid
351    mova                 m1, m0
352    mova                 m2, m0
353    mova                 m3, m0
354.s32:
355    mova [dstq+strideq*0+16*0], m0
356    mova [dstq+strideq*0+16*1], m1
357    mova [dstq+strideq*0+16*2], m2
358    mova [dstq+strideq*0+16*3], m3
359    mova [dstq+strideq*1+16*0], m0
360    mova [dstq+strideq*1+16*1], m1
361    mova [dstq+strideq*1+16*2], m2
362    mova [dstq+strideq*1+16*3], m3
363    lea                dstq, [dstq+strideq*2]
364    sub                  hd, 2
365    jg .s32
366    RET
367.h64:
368    mova                 m0, [tlq-128]
369    mova                 m1, [tlq-112]
370    paddw                m0, [tlq- 96]
371    paddw                m1, [tlq- 80]
372    paddw                m0, [tlq- 64]
373    paddw                m1, [tlq- 48]
374    paddw                m0, [tlq- 32]
375    paddw                m1, [tlq- 16]
376    paddw                m0, m1
377    jmp                  wq
378.w64:
379    movu                 m1, [tlq+  2]
380    movu                 m2, [tlq+ 18]
381    paddw                m1, m2
382    movu                 m2, [tlq+ 34]
383    paddw                m0, m2
384    movu                 m2, [tlq+ 50]
385    paddw                m1, m2
386    movu                 m2, [tlq+ 66]
387    paddw                m0, m2
388    movu                 m2, [tlq+ 82]
389    paddw                m1, m2
390    movu                 m2, [tlq+ 98]
391    paddw                m0, m2
392    movu                 m2, [tlq+114]
393    paddw                m1, m2
394    paddw                m0, m1
395    punpcklwd            m1, m0, m3
396    punpckhwd            m0, m3
397    paddd                m0, m1
398    paddd                m4, m0
399    punpckhqdq           m0, m0
400    paddd                m0, m4
401    pshuflw              m1, m0, q1032
402    paddd                m0, m1
403    psrld                m0, m5
404    cmp                  hd, 64
405    je .w64_end
406    mov                 r2d, 0xAAAB          ; w+h = 96 -> /3
407    mov                 r3d, 0x6667          ; w+h = 80 -> /5
408    cmp                  hd, 16
409    cmove               r2d, r3d
410    movd                 m1, r2d
411    pmulhuw              m0, m1
412    psrlw                m0, 1
413.w64_end:
414    pshuflw              m0, m0, q0000
415    punpcklqdq           m0, m0
416.s64:                                        ; store a full 64-wide row per iter
417    mova        [dstq+16*0], m0
418    mova        [dstq+16*1], m0
419    mova        [dstq+16*2], m0
420    mova        [dstq+16*3], m0
421    mova        [dstq+16*4], m0
422    mova        [dstq+16*5], m0
423    mova        [dstq+16*6], m0
424    mova        [dstq+16*7], m0
425    add                dstq, strideq
426    dec                  hd
427    jg .s64
428    RET
429
;---------------------------------------------------------------------
; ipred_dc_128: fill the block with the mid-range value.
; r8m >> 11 yields 0 or 1 and selects pw_512 or pw_2048 as the splat
; value (NOTE(review): r8m is presumably bitdepth_max, 1023 or 4095,
; giving 1 << (bitdepth-1) — confirm against the caller).  wq then
; dispatches into the shared ipred_dc .s* store loops.
;---------------------------------------------------------------------
430cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
431    mov                 r6d, r8m
432    LEA                  r5, ipred_dc_128_16bpc_ssse3_table
433    tzcnt                wd, wm
434    shr                 r6d, 11             ; 0 or 1
435    movifnidn            hd, hm
436    movsxd               wq, [r5+wq*4]
437    movddup              m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
438    add                  wq, r5
439    lea            stride3q, [strideq*3]
440    jmp                  wq
441
;---------------------------------------------------------------------
; ipred_v: vertical prediction — replicate the top row down the block.
; m0..m3 preload the first 64 bytes of the top row; for w <= 32 the
; routine jumps into the shared ipred_dc splat stores (which use
; m0..m3 as the row data), while w == 64 keeps the whole 128-byte row
; in m0..m7 and stores it directly.
;---------------------------------------------------------------------
442cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
443    LEA                  r5, ipred_dc_splat_16bpc_ssse3_table
444    movifnidn            hd, hm
445    movu                 m0, [tlq+  2]
446    movu                 m1, [tlq+ 18]
447    movu                 m2, [tlq+ 34]
448    movu                 m3, [tlq+ 50]
449    cmp                  wd, 64
450    je .w64
451    tzcnt                wd, wd
452    movsxd               wq, [r5+wq*4]
453    add                  wq, r5
454    lea            stride3q, [strideq*3]
455    jmp                  wq
456.w64:
457    WIN64_SPILL_XMM 8
458    movu                 m4, [tlq+ 66]
459    movu                 m5, [tlq+ 82]
460    movu                 m6, [tlq+ 98]
461    movu                 m7, [tlq+114]
462.w64_loop:
463    mova        [dstq+16*0], m0
464    mova        [dstq+16*1], m1
465    mova        [dstq+16*2], m2
466    mova        [dstq+16*3], m3
467    mova        [dstq+16*4], m4
468    mova        [dstq+16*5], m5
469    mova        [dstq+16*6], m6
470    mova        [dstq+16*7], m7
471    add                dstq, strideq
472    dec                  hd
473    jg .w64_loop
474    RET
475
;---------------------------------------------------------------------
; ipred_h: horizontal prediction — broadcast each left-edge pixel
; across its row.  m2 = pw_256 (bytes {0,1,0,1,...}) and m3 = pb_2_3
; (bytes {2,3,2,3,...}) are pshufb masks that broadcast word 0 /
; word 1 of a register; only the w >= 16 paths use them (.w4/.w8
; overwrite m2/m3 with pixel data).  The left column sits just below
; tlq in memory, so tlq is walked backwards.
;---------------------------------------------------------------------
476cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
477%define base r5-ipred_h_16bpc_ssse3_table
478    tzcnt                wd, wm
479    LEA                  r5, ipred_h_16bpc_ssse3_table
480    movifnidn            hd, hm
481    movsxd               wq, [r5+wq*4]
482    movddup              m2, [base+pw_256]
483    movddup              m3, [base+pb_2_3]
484    add                  wq, r5
485    lea            stride3q, [strideq*3]
486    jmp                  wq
487.w4:                                         ; 4 left pixels -> 4 rows per iter
488    sub                 tlq, 8
489    movq                 m3, [tlq]
490    pshuflw              m0, m3, q3333
491    pshuflw              m1, m3, q2222
492    pshuflw              m2, m3, q1111
493    pshuflw              m3, m3, q0000
494    movq   [dstq+strideq*0], m0
495    movq   [dstq+strideq*1], m1
496    movq   [dstq+strideq*2], m2
497    movq   [dstq+stride3q ], m3
498    lea                dstq, [dstq+strideq*4]
499    sub                  hd, 4
500    jg .w4
501    RET
502.w8:
503    sub                 tlq, 8
504    movq                 m3, [tlq]
505    punpcklwd            m3, m3
506    pshufd               m0, m3, q3333
507    pshufd               m1, m3, q2222
508    pshufd               m2, m3, q1111
509    pshufd               m3, m3, q0000
510    mova   [dstq+strideq*0], m0
511    mova   [dstq+strideq*1], m1
512    mova   [dstq+strideq*2], m2
513    mova   [dstq+stride3q ], m3
514    lea                dstq, [dstq+strideq*4]
515    sub                  hd, 4
516    jg .w8
517    RET
518.w16:                                        ; 2 left pixels -> 2 rows per iter
; the higher-addressed word (broadcast by m3) belongs to the upper row
519    sub                 tlq, 4
520    movd                 m1, [tlq]
521    pshufb               m0, m1, m3
522    pshufb               m1, m2
523    mova [dstq+strideq*0+16*0], m0
524    mova [dstq+strideq*0+16*1], m0
525    mova [dstq+strideq*1+16*0], m1
526    mova [dstq+strideq*1+16*1], m1
527    lea                dstq, [dstq+strideq*2]
528    sub                  hd, 2
529    jg .w16
530    RET
531.w32:
532    sub                 tlq, 4
533    movd                 m1, [tlq]
534    pshufb               m0, m1, m3
535    pshufb               m1, m2
536    mova [dstq+strideq*0+16*0], m0
537    mova [dstq+strideq*0+16*1], m0
538    mova [dstq+strideq*0+16*2], m0
539    mova [dstq+strideq*0+16*3], m0
540    mova [dstq+strideq*1+16*0], m1
541    mova [dstq+strideq*1+16*1], m1
542    mova [dstq+strideq*1+16*2], m1
543    mova [dstq+strideq*1+16*3], m1
544    lea                dstq, [dstq+strideq*2]
545    sub                  hd, 2
546    jg .w32
547    RET
548.w64:                                        ; 1 left pixel -> 1 row per iter
549    sub                 tlq, 2
550    movd                 m0, [tlq]
551    pshufb               m0, m2
552    mova        [dstq+16*0], m0
553    mova        [dstq+16*1], m0
554    mova        [dstq+16*2], m0
555    mova        [dstq+16*3], m0
556    mova        [dstq+16*4], m0
557    mova        [dstq+16*5], m0
558    mova        [dstq+16*6], m0
559    mova        [dstq+16*7], m0
560    add                dstq, strideq
561    dec                  hd
562    jg .w64
563    RET
564
;---------------------------------------------------------------------
; ipred_paeth: per pixel, pick left, top or topleft — whichever is
; closest to base = left + top - topleft.
; m4 = topleft broadcast, m5 = top, m6 = top - topleft and
; m7 = |top - topleft| = |base - left| are loop invariants per column
; group.  hd counts bytes (h*2) and the left column is read backwards
; from leftq+hq.  w == 4 falls through (and wd, ~7 leaves 0); wider
; blocks iterate 8 columns at a time in .w8_loop0.
;---------------------------------------------------------------------
565cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
566%define base r5-ipred_paeth_16bpc_ssse3_table
567    movifnidn            hd, hm
568    pshuflw              m4, [tlq], q0000
569    mov               leftq, tlq
570    add                  hd, hd              ; hd = h*2 (byte count)
571    punpcklqdq           m4, m4      ; topleft
572    sub               leftq, hq
573    and                  wd, ~7
574    jnz .w8
575    movddup              m5, [tlq+2] ; top
576    psubw                m6, m5, m4
577    pabsw                m7, m6
578.w4_loop:
579    movd                 m1, [leftq+hq-4]
580    punpcklwd            m1, m1
581    punpckldq            m1, m1      ; left
; PAETH: in m1 = left, m5 = top, m4 = topleft, m6/m7 as above;
; out m0 = selected predictor.  m0 = base, m2 = |base - topleft|,
; then m0 = |base - top|; top wins ties against topleft, and left is
; chosen when |base - left| (m7) is not greater than the other two.
582%macro PAETH 0
583    paddw                m0, m6, m1
584    psubw                m2, m4, m0  ; tldiff
585    psubw                m0, m5      ; tdiff
586    pabsw                m2, m2
587    pabsw                m0, m0
588    pminsw               m2, m0
589    pcmpeqw              m0, m2
590    pand                 m3, m5, m0
591    pandn                m0, m4
592    por                  m0, m3
593    pcmpgtw              m3, m7, m2
594    pand                 m0, m3
595    pandn                m3, m1
596    por                  m0, m3
597%endmacro
598    PAETH
599    movhps [dstq+strideq*0], m0
600    movq   [dstq+strideq*1], m0
601    lea                dstq, [dstq+strideq*2]
602    sub                  hd, 2*2
603    jg .w4_loop
604    RET
605.w8:
606%if ARCH_X86_32
607    PUSH                 r6
608    %define             r7d  hm
609    %assign regs_used     7
610%elif WIN64
; m8 is outside the declared xmm count, so it is spilled/restored
; manually via the r4 home slot on Windows
611    movaps              r4m, m8
612    PUSH                 r7
613    %assign regs_used     8
614%endif
615%if ARCH_X86_64
616    movddup              m8, [pw_256]       ; pshufb mask: broadcast word 0
617%endif
618    lea                 tlq, [tlq+wq*2+2]
619    neg                  wq
620    mov                 r7d, hd
621.w8_loop0:
622    movu                 m5, [tlq+wq*2]
623    mov                  r6, dstq
624    add                dstq, 16
625    psubw                m6, m5, m4
626    pabsw                m7, m6
627.w8_loop:
628    movd                 m1, [leftq+hq-2]
629%if ARCH_X86_64
630    pshufb               m1, m8
631%else
632    pshuflw              m1, m1, q0000
633    punpcklqdq           m1, m1
634%endif
635    PAETH
636    mova               [r6], m0
637    add                  r6, strideq
638    sub                  hd, 1*2
639    jg .w8_loop
640    mov                  hd, r7d
641    add                  wq, 8
642    jl .w8_loop0
643%if WIN64
644    movaps               m8, r4m
645%endif
646    RET
647
; t0 is a scratch register for the smooth predictors:
; r7 on x86-64, r4 on x86-32.
648%if ARCH_X86_64
649DECLARE_REG_TMP 7
650%else
651DECLARE_REG_TMP 4
652%endif
653
;---------------------------------------------------------------------
; ipred_smooth_v: pred = bottom + weight[y] * (top - bottom), using
; pmulhrsw against the 1-D smooth weight table.  m5 = broadcast of the
; pixel at tlq-2*h (below the left column, labelled "bottom"); hq is
; negative and counts up to 0.  w == 4 handles four rows per
; iteration; wider blocks process 8-column strips in .w8_loop0.
;---------------------------------------------------------------------
654cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
655    LEA            weightsq, smooth_weights_1d_16bpc
656    mov                  hd, hm
657    lea            weightsq, [weightsq+hq*4]
658    neg                  hq
659    movd                 m5, [tlq+hq*2] ; bottom
660    pshuflw              m5, m5, q0000
661    punpcklqdq           m5, m5
662    cmp                  wd, 4
663    jne .w8
664    movddup              m4, [tlq+2]    ; top
665    lea                  r3, [strideq*3]
666    psubw                m4, m5         ; top - bottom
667.w4_loop:
668    movq                 m1, [weightsq+hq*2]
669    punpcklwd            m1, m1
670    pshufd               m0, m1, q1100
671    punpckhdq            m1, m1
672    pmulhrsw             m0, m4
673    pmulhrsw             m1, m4
674    paddw                m0, m5
675    paddw                m1, m5
676    movq   [dstq+strideq*0], m0
677    movhps [dstq+strideq*1], m0
678    movq   [dstq+strideq*2], m1
679    movhps [dstq+r3       ], m1
680    lea                dstq, [dstq+strideq*4]
681    add                  hq, 4
682    jl .w4_loop
683    RET
684.w8:
685%if ARCH_X86_32
686    PUSH                 r6
687    %assign regs_used     7
688    mov                  hm, hq
689    %define              hq  hm             ; keep h in memory on x86-32
690%elif WIN64
691    PUSH                 r7
692    %assign regs_used     8
693%endif
694.w8_loop0:                                   ; per 8-column strip
695    mov                  t0, hq
696    movu                 m4, [tlq+2]
697    add                 tlq, 16
698    mov                  r6, dstq
699    add                dstq, 16
700    psubw                m4, m5              ; top - bottom for this strip
701.w8_loop:                                    ; 4 rows per iteration
702    movq                 m3, [weightsq+t0*2]
703    punpcklwd            m3, m3
704    pshufd               m0, m3, q0000
705    pshufd               m1, m3, q1111
706    pshufd               m2, m3, q2222
707    pshufd               m3, m3, q3333
708    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
709    REPX   {paddw    x, m5}, m0, m1, m2, m3
710    mova     [r6+strideq*0], m0
711    mova     [r6+strideq*1], m1
712    lea                  r6, [r6+strideq*2]
713    mova     [r6+strideq*0], m2
714    mova     [r6+strideq*1], m3
715    lea                  r6, [r6+strideq*2]
716    add                  t0, 4
717    jl .w8_loop
718    sub                  wd, 8
719    jg .w8_loop0
720    RET
721
;---------------------------------------------------------------------
; ipred_smooth_h: pred = right + weight[x] * (left - right), the
; horizontal counterpart of ipred_smooth_v.  m5 = broadcast of the
; pixel at tlq+2*w (labelled "right").  hd is doubled to a byte count
; and the left column below tlq is read backwards via tlq+hq.  In the
; .w8 path t0 counts bytes on x86-64 (h already doubled) but pixels on
; x86-32, which is why the address scale is (1+ARCH_X86_32) and the
; decrement is 4*(1+ARCH_X86_64).
;---------------------------------------------------------------------
722cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
723    LEA            weightsq, smooth_weights_1d_16bpc
724    mov                  wd, wm
725    movifnidn            hd, hm
726    movd                 m5, [tlq+wq*2] ; right
727    sub                 tlq, 8
728    add                  hd, hd              ; hd = h*2 (byte count)
729    pshuflw              m5, m5, q0000
730    sub                 tlq, hq
731    punpcklqdq           m5, m5
732    cmp                  wd, 4
733    jne .w8
734    movddup              m4, [weightsq+4*2]
735    lea                  r3, [strideq*3]
736.w4_loop:                                    ; 4 rows per iteration
737    movq                 m1, [tlq+hq]   ; left
738    punpcklwd            m1, m1
739    psubw                m1, m5         ; left - right
740    pshufd               m0, m1, q3322
741    punpckldq            m1, m1
742    pmulhrsw             m0, m4
743    pmulhrsw             m1, m4
744    paddw                m0, m5
745    paddw                m1, m5
746    movhps [dstq+strideq*0], m0
747    movq   [dstq+strideq*1], m0
748    movhps [dstq+strideq*2], m1
749    movq   [dstq+r3       ], m1
750    lea                dstq, [dstq+strideq*4]
751    sub                  hd, 4*2
752    jg .w4_loop
753    RET
754.w8:
755    lea            weightsq, [weightsq+wq*4]
756    neg                  wq
757%if ARCH_X86_32
758    PUSH                 r6
759    %assign regs_used     7
760    %define              hd  hm
761%elif WIN64
762    PUSH                 r7
763    %assign regs_used     8
764%endif
765.w8_loop0:                                   ; per 8-column strip
766    mov                 t0d, hd
767    mova                 m4, [weightsq+wq*2]
768    mov                  r6, dstq
769    add                dstq, 16
770.w8_loop:                                    ; 4 rows per iteration
771    movq                 m3, [tlq+t0*(1+ARCH_X86_32)]
772    punpcklwd            m3, m3
773    psubw                m3, m5
774    pshufd               m0, m3, q3333
775    pshufd               m1, m3, q2222
776    pshufd               m2, m3, q1111
777    pshufd               m3, m3, q0000
778    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
779    REPX   {paddw    x, m5}, m0, m1, m2, m3
780    mova     [r6+strideq*0], m0
781    mova     [r6+strideq*1], m1
782    lea                  r6, [r6+strideq*2]
783    mova     [r6+strideq*0], m2
784    mova     [r6+strideq*1], m3
785    lea                  r6, [r6+strideq*2]
786    sub                 t0d, 4*(1+ARCH_X86_64)
787    jg .w8_loop
788    add                  wq, 8
789    jl .w8_loop0
790    RET
791
; t0 scratch for ipred_smooth: r10 on x86-64, r3 (stride3 slot) on x86-32.
792%if ARCH_X86_64
793DECLARE_REG_TMP 10
794%else
795DECLARE_REG_TMP 3
796%endif
797
;---------------------------------------------------------------------
; ipred_smooth: 2-D smooth prediction.  Each output word is
;   (w_v[y]*(top,bottom) + w_h[x]*(left,right)) summed with pmaddwd
; on interleaved (value, complement) word pairs, then >>8, packed and
; rounded with pavgw against zero — i.e. a final (x+1)>>1, for >>9
; total.  m7 = right pixel, m6 = bottom pixel (broadcast).  The .w8
; path keeps the per-strip horizontal weights in m8/m9 (memory
; operands on x86-32) and preserves tl/h across strips in r8/r9 on
; x86-64 or in stack slots on x86-32.
;---------------------------------------------------------------------
798cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
799                                     h_weights, v_weights, top
800    LEA          h_weightsq, smooth_weights_2d_16bpc
801    mov                  wd, wm
802    mov                  hd, hm
803    movd                 m7, [tlq+wq*2] ; right
804    lea          v_weightsq, [h_weightsq+hq*8]
805    neg                  hq
806    movd                 m6, [tlq+hq*2] ; bottom
807    pshuflw              m7, m7, q0000
808    pshuflw              m6, m6, q0000
809    cmp                  wd, 4
810    jne .w8
811    movq                 m4, [tlq+2]    ; top
812    mova                 m5, [h_weightsq+4*4]
813    punpcklwd            m4, m6         ; top, bottom
814    pxor                 m6, m6
815.w4_loop:                                    ; 2 rows per iteration
816    movq                 m1, [v_weightsq+hq*4]
817    sub                 tlq, 4
818    movd                 m3, [tlq]      ; left
819    pshufd               m0, m1, q0000
820    pshufd               m1, m1, q1111
821    pmaddwd              m0, m4
822    punpcklwd            m3, m7         ; left, right
823    pmaddwd              m1, m4
824    pshufd               m2, m3, q1111
825    pshufd               m3, m3, q0000
826    pmaddwd              m2, m5
827    pmaddwd              m3, m5
828    paddd                m0, m2
829    paddd                m1, m3
830    psrld                m0, 8
831    psrld                m1, 8
832    packssdw             m0, m1
833    pavgw                m0, m6              ; round: (x+1)>>1
834    movq   [dstq+strideq*0], m0
835    movhps [dstq+strideq*1], m0
836    lea                dstq, [dstq+strideq*2]
837    add                  hq, 2
838    jl .w4_loop
839    RET
840.w8:
841%if ARCH_X86_32
842    lea          h_weightsq, [h_weightsq+wq*4]
843    mov                  t0, tlq
844    mov                 r1m, tlq             ; save tl/h for strip restarts
845    mov                 r2m, hq
846    %define              m8  [h_weightsq+16*0]
847    %define              m9  [h_weightsq+16*1]
848%else
849%if WIN64
; m8/m9 are outside the declared xmm count: spill to arg home slots
850    movaps              r4m, m8
851    movaps              r6m, m9
852    PUSH                 r7
853    PUSH                 r8
854%endif
855    PUSH                 r9
856    PUSH                r10
857    %assign       regs_used  11
858    lea          h_weightsq, [h_weightsq+wq*8]
859    lea                topq, [tlq+wq*2]
860    neg                  wq
861    mov                  r8, tlq             ; save tl/h for strip restarts
862    mov                  r9, hq
863%endif
864    punpcklqdq           m6, m6
865.w8_loop0:                                   ; per 8-column strip
866%if ARCH_X86_32
867    movu                 m5, [t0+2]
868    add                  t0, 16
869    mov                 r0m, t0
870%else
871    movu                 m5, [topq+wq*2+2]
872    mova                 m8, [h_weightsq+wq*4+16*0]
873    mova                 m9, [h_weightsq+wq*4+16*1]
874%endif
875    mov                  t0, dstq
876    add                dstq, 16
877    punpcklwd            m4, m5, m6          ; (top, bottom) pairs, cols 0-3
878    punpckhwd            m5, m6              ; (top, bottom) pairs, cols 4-7
879.w8_loop:                                    ; 1 row per iteration
880    movd                 m1, [v_weightsq+hq*4]
881    sub                 tlq, 2
882    movd                 m3, [tlq]      ; left
883    pshufd               m1, m1, q0000
884    pmaddwd              m0, m4, m1
885    pshuflw              m3, m3, q0000
886    pmaddwd              m1, m5
887    punpcklwd            m3, m7         ; left, right
888    pmaddwd              m2, m8, m3
889    pmaddwd              m3, m9
890    paddd                m0, m2
891    paddd                m1, m3
892    psrld                m0, 8
893    psrld                m1, 8
894    packssdw             m0, m1
895    pxor                 m1, m1
896    pavgw                m0, m1              ; round: (x+1)>>1
897    mova               [t0], m0
898    add                  t0, strideq
899    inc                  hq
900    jl .w8_loop
901%if ARCH_X86_32
902    mov                  t0, r0m
903    mov                 tlq, r1m
904    add          h_weightsq, 16*2
905    mov                  hq, r2m
906    sub            dword wm, 8
907    jg .w8_loop0
908%else
909    mov                 tlq, r8
910    mov                  hq, r9
911    add                  wq, 8
912    jl .w8_loop0
913%endif
914%if WIN64
915    movaps               m8, r4m
916    movaps               m9, r6m
917%endif
918    RET
919
;-----------------------------------------------------------------------
; ipred_z1_16bpc, SSSE3 — directional ("Z1") intra prediction, 16 bpc.
; cglobal args: dst, stride, tl (edge pointer), w, h, angle, dx; the
; bitdepth maximum is read from the 9th argument slot (r8m / bdmaxm).
; Each output row is projected from the edge above: a 6-bit fixed-point
; x position advances by dx per row; base = xpos>>6 picks an edge pixel
; pair and the fraction (masked with pw_62) blends them via pmulhrsw.
; Depending on (angle, w, h, is_sm), the edge may first be upsampled
; (2x) or smoothed in a stack scratch buffer (16*18 bytes reserved by
; cglobal). Dispatches on log2(w) via ipred_z1_16bpc_ssse3_table.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx
    %define            base  r7-$$
    %define          bdmaxm  r8m
    lea                  r7, [$$]
%else
; x86-32: stride/bdmax are kept in stack slots since registers are scarce.
cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx
    %define            base  r1-$$
    %define        stridemp  [rsp+4*0]
    %define          bdmaxm  [rsp+4*1]
    mov                  r3, r8m
    mov            stridemp, r1
    mov              bdmaxm, r3
    LEA                  r1, $$
%endif
    tzcnt                wd, wm         ; wd = log2(width), jump-table index
    movifnidn        angled, anglem
    movifnidn            hd, hm
    add                 tlq, 2          ; skip top-left; tlq now points at top[0]
    movsxd               wq, [base+ipred_z1_16bpc_ssse3_table+wq*4]
    mov                 dxd, angled
    movddup              m0, [base+pw_256]
    and                 dxd, 0x7e
    movddup              m7, [base+pw_62] ; fraction mask for xpos & 0x3e
    add              angled, 165 ; ~90
    lea                  wq, [base+wq+ipred_z1_16bpc_ssse3_table]
    movzx               dxd, word [base+dr_intra_derivative+dxq]
    xor              angled, 0x4ff ; d = 90 - angle
    jmp                  wq
; ---- w = 4: edge is either upsampled (2x) or optionally filtered ----
.w4:
    lea                 r3d, [angleq+88]
    test                r3d, 0x480
    jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar                 r3d, 9
    add                 r3d, hd
    cmp                 r3d, 8
    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
    ; 2x upsample of the edge with a [-1 9 9 -1]/16 filter, clamped to
    ; [0, bdmax]; the interleaved result is stored on the stack.
    movd                 m3, [tlq+14]
    movu                 m2, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
    movd                 m1, bdmaxm
    pshufb               m3, m0
    palignr              m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8
    paddw                m4, [tlq- 2]  ; 0 1 2 3 4 5 6 7
    add                 dxd, dxd       ; dx doubled to index the 2x edge
    mova           [rsp+32], m3
    palignr              m3, m2, 2     ; 2 3 4 5 6 7 8 8
    pshufb               m1, m0
    paddw                m3, m2        ; -1 * a + 9 * b + 9 * c + -1 * d
    psubw                m5, m3, m4    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
    movd                 m4, dxd
    psraw                m5, 3         ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
    paddw                m3, m5
    pxor                 m5, m5
    pmaxsw               m3, m5
    mov                 r3d, dxd
    pavgw                m3, m5
    pshufb               m4, m0
    pminsw               m3, m1
    punpcklwd            m1, m2, m3
    punpckhwd            m2, m3
    mova                 m3, [base+z_upsample]
    movifnidn       strideq, stridemp
    mova           [rsp+ 0], m1
    paddw                m5, m4, m4
    mova           [rsp+16], m2
    punpcklqdq           m4, m5 ; xpos0 xpos1
; two rows per iteration; bases address the upsampled copy on the stack
.w4_upsample_loop:
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6 ; base0
    movu                 m1, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6 ; base1
    movu                 m2, [rsp+r2*2]
    pshufb               m1, m3
    pshufb               m2, m3
    punpcklqdq           m0, m1, m2
    punpckhqdq           m1, m2
    pand                 m2, m7, m4 ; frac
    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
    paddw                m4, m5     ; xpos += dx
    paddw                m0, m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_upsample_loop
    RET
; derive filter_strength from (h, angle, is_sm) via table compares; if
; nonzero, copy the edge into the stack buffer and call .filter_edge
.w4_no_upsample:
    mov                 r3d, 7     ; max_base
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea                 r3d, [hq+3]
    movd                 m1, r3d
    movd                 m3, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m2, m2
    pshufb               m1, m2
    pshufb               m3, m2
    pcmpeqb              m1, [base+z_filt_wh4]
    pand                 m1, m3
    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r3d, 7
    test                r5d, r5d
    jz .w4_main ; filter_strength == 0
    pshuflw              m1, [tlq-2], q0000
    movu                 m2, [tlq+16*0]
    imul                r5d, 0x55555555 ; reduce pmovmskb mask to 2 bits
    movd                 m3, [tlq+r3*2]
    shr                 r5d, 30 ; filter_strength
    movd           [rsp+12], m1  ; replicate pixel before the edge (padding)
    pshuflw              m3, m3, q0000
    mova         [rsp+16*1], m2
    lea                 r2d, [r3+2]
    movq      [rsp+r3*2+18], m3  ; replicate last edge pixel (padding)
    cmp                  hd, 8
    cmovae              r3d, r2d
    lea                 tlq, [rsp+16*1]
    call .filter_edge
.w4_main:
    lea                 tlq, [tlq+r3*2]
    movd                 m4, dxd
    movddup              m1, [base+z_base_inc] ; base_inc << 6
    movd                 m6, [tlq] ; top[max_base_x]
    shl                 r3d, 6
    movd                 m3, r3d
    pshufb               m4, m0
    mov                 r5d, dxd ; xpos
    pshufb               m6, m0
    sub                  r5, r3
    pshufb               m3, m0
    paddw                m5, m4, m4
    psubw                m3, m1 ; max_base_x
    punpcklqdq           m4, m5 ; xpos0 xpos1
    movifnidn       strideq, stridemp
; two rows per iteration: base = xpos>>6 selects the edge pixel pair,
; the 6-bit fraction blends them; lanes past max_base_x take m6
.w4_loop:
    lea                  r3, [r5+dxq]
    sar                  r5, 6      ; base0
    movq                 m0, [tlq+r5*2+0]
    movq                 m1, [tlq+r5*2+2]
    lea                  r5, [r3+dxq]
    sar                  r3, 6      ; base1
    movhps               m0, [tlq+r3*2+0]
    movhps               m1, [tlq+r3*2+2]
    pand                 m2, m7, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m3, m4 ; xpos < max_base_x
    paddw                m4, m5     ; xpos += dx
    paddw                m0, m1
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    sub                  hd, 2
    jz .w4_end
    lea                dstq, [dstq+strideq*2]
    test                r5d, r5d
    jl .w4_loop
; xpos ran past max_base_x: remaining rows are just top[max_base_x]
.w4_end_loop:
    movq   [dstq+strideq*0], m6
    movq   [dstq+strideq*1], m6
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_end_loop
.w4_end:
    RET
; ---- w = 8: same structure as .w4, 16-sample (2x upsampled) edge ----
.w8:
    lea                 r3d, [angleq+88]
    and                 r3d, ~0x7f
    or                  r3d, hd
    cmp                 r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    movu                 m1, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
    movu                 m5, [tlq+ 2]  ; 2 3 4 5 6 7 8 9
    movu                 m3, [tlq+ 4]  ; 3 4 5 6 7 8 9 a
    paddw                m5, m1
    paddw                m3, [tlq- 2]  ; 0 1 2 3 4 5 6 7
    psubw                m2, m5, m3
    movu                 m6, [tlq+18]  ; a b c d e f g _
    psraw                m2, 3
    movu                 m3, [tlq+20]  ; b c d e f g _ _
    paddw                m5, m2
    movu                 m2, [tlq+16]  ; 9 a b c d e f g
    paddw                m6, m2
    add                 dxd, dxd
    cmp                  hd, 4
    jne .w8_upsample_h8 ; awkward single-pixel edge case
    pshuflw              m3, m3, q1110 ; b c c _ _ _ _ _
.w8_upsample_h8:
    paddw                m3, [tlq+14]  ; 8 9 a b c d e f
    psubw                m4, m6, m3
    movd                 m3, bdmaxm
    psraw                m4, 3
    mov                 r3d, dxd
    paddw                m6, m4
    pxor                 m4, m4
    pmaxsw               m5, m4
    pmaxsw               m6, m4
    pshufb               m3, m0
    pavgw                m5, m4
    pavgw                m6, m4
    movd                 m4, dxd
    pminsw               m5, m3       ; clamp interpolated pixels to bdmax
    pminsw               m6, m3
    mova                 m3, [base+z_upsample]
    pshufb               m4, m0
    movifnidn       strideq, stridemp
    punpcklwd            m0, m1, m5   ; interleave original/interpolated
    mova           [rsp+ 0], m0
    punpckhwd            m1, m5
    mova           [rsp+16], m1
    punpcklwd            m0, m2, m6
    mova           [rsp+32], m0
    punpckhwd            m2, m6
    mova           [rsp+48], m2
    mova                 m5, m4
.w8_upsample_loop:
    mov                 r2d, r3d
    shr                 r2d, 6
    movu                 m1, [rsp+r2*2+ 0]
    movu                 m2, [rsp+r2*2+16]
    add                 r3d, dxd
    pshufb               m1, m3
    pshufb               m2, m3
    punpcklqdq           m0, m1, m2
    punpckhqdq           m1, m2
    pand                 m2, m7, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m4, m5
    paddw                m0, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w8_upsample_loop
    RET
.w8_no_upsample:
    lea                 r3d, [hq+7]
    movd                 m1, r3d
    and                 r3d, 7
    or                  r3d, 8 ; imin(h+7, 15)
    test             angled, 0x400
    jnz .w8_main
    movd                 m3, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m2, m2
    pshufb               m1, m2
    pshufb               m3, m2
    movu                 m2, [base+z_filt_wh8]
    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
    pcmpeqb              m2, m1
    pand                 m2, m3
    pcmpgtb              m2, m4
    pmovmskb            r5d, m2
    test                r5d, r5d
    jz .w8_main ; filter_strength == 0
    pshuflw              m1, [tlq-2], q0000
    movu                 m2, [tlq+16*0]
    imul                r5d, 0x55555555 ; reduce pmovmskb mask to 2 bits
    movu                 m3, [tlq+16*1]
    movd                 m4, [tlq+r3*2]
    shr                 r5d, 30 ; filter_strength
    movd           [rsp+12], m1
    mova         [rsp+16*1], m2
    pshuflw              m4, m4, q0000
    mova         [rsp+16*2], m3
    lea                 r2d, [r3+2]
    movq      [rsp+r3*2+18], m4
    cmp                  hd, 16
    cmovae              r3d, r2d
    lea                 tlq, [rsp+16*1]
    call .filter_edge
.w8_main:
    lea                 tlq, [tlq+r3*2]
    movd                 m5, dxd
    mova                 m4, [base+z_base_inc]
    shl                 r3d, 6
    movd                 m6, [tlq] ; top[max_base_x]
    movd                 m1, r3d
    pshufb               m5, m0
    mov                 r5d, dxd ; xpos
    pshufb               m1, m0
    sub                  r5, r3
    psubw                m4, m1 ; max_base_x
    pshufb               m6, m0
    paddw                m4, m5
    movifnidn       strideq, stridemp
; one full 8-pixel row per iteration
.w8_loop:
    mov                  r3, r5
    sar                  r3, 6
    movu                 m0, [tlq+r3*2+0]
    movu                 m1, [tlq+r3*2+2]
    pand                 m2, m7, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    psraw                m2, m4, 15 ; xpos < max_base_x
    paddw                m4, m5     ; xpos += dx
    paddw                m0, m1
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    mova             [dstq], m0
    dec                  hd
    jz .w8_end
    add                dstq, strideq
    add                  r5, dxq
    jl .w8_loop
; remaining rows: replicate top[max_base_x]
.w8_end_loop:
    mova             [dstq], m6
    add                dstq, strideq
    dec                  hd
    jg .w8_end_loop
.w8_end:
    RET
; ---- w = 16: filter_strength packed 3 bits/entry (0x24924924 mul) ----
.w16:
%if ARCH_X86_32
    %define         strideq  r3
%endif
    lea                 r3d, [hq+15]
    movd                 m1, r3d
    and                 r3d, 15
    or                  r3d, 16 ; imin(h+15, 31)
    test             angled, 0x400
    jnz .w16_main
    movd                 m3, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m2, m2
    pshufb               m1, m2
    pshufb               m3, m2
    movq                 m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb              m1, [base+z_filt_wh16]
    pand                 m1, m3
    pcmpgtb              m1, m4
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w16_main ; filter_strength == 0
    pshuflw              m1, [tlq-2], q0000
    movu                 m2, [tlq+16*0]
    imul                r5d, 0x24924924
    movu                 m3, [tlq+16*1]
    movu                 m4, [tlq+16*2]
    shr                 r5d, 30
    movu                 m5, [tlq+16*3]
    movd                 m6, [tlq+r3*2]
    adc                 r5d, -1 ; filter_strength (uses CF from the shr)
    movd           [rsp+12], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    pshuflw              m6, m6, q0000
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m5
    lea                 r2d, [r3+2]
    movq      [rsp+r3*2+18], m6
    cmp                  hd, 32
    cmovae              r3d, r2d
    lea                 tlq, [rsp+16*1]
    call .filter_edge
.w16_main:
    lea                 tlq, [tlq+r3*2]
    movd                 m5, dxd
    mova                 m4, [base+z_base_inc]
    shl                 r3d, 6
    movd                 m6, [tlq] ; top[max_base_x]
    movd                 m1, r3d
    pshufb               m5, m0
    mov                 r5d, dxd ; xpos
    pshufb               m1, m0
    sub                  r5, r3
    psubw                m4, m1 ; max_base_x
    pshufb               m6, m0
    paddw                m4, m5
; one 16-pixel row per iteration; the second 8-lane group uses the
; pw_m512 threshold (xpos offset by 8 positions) for the edge clamp
.w16_loop:
    mov                  r3, r5
    sar                  r3, 6
    movu                 m0, [tlq+r3*2+ 0]
    movu                 m2, [tlq+r3*2+ 2]
    pand                 m3, m7, m4
    psllw                m3, 9
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+16]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+18]
    psubw                m2, m1
    pmulhrsw             m2, m3
    movddup              m3, [base+pw_m512]
    paddw                m1, m2
    psraw                m2, m4, 15
    pcmpgtw              m3, m4
    paddw                m4, m5
    pand                 m0, m2
    pandn                m2, m6
    pand                 m1, m3
    pandn                m3, m6
    por                  m0, m2
    mova        [dstq+16*0], m0
    por                  m1, m3
    mova        [dstq+16*1], m1
    dec                  hd
    jz .w16_end
    movifnidn       strideq, stridemp
    add                dstq, strideq
    add                  r5, dxq
    jl .w16_loop
.w16_end_loop:
    mova        [dstq+16*0], m6
    mova        [dstq+16*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w16_end_loop
.w16_end:
    RET
; ---- w = 32: edge smoothing (if enabled) always uses strength 3 ----
.w32:
    lea                 r3d, [hq+31]
    and                 r3d, 31
    or                  r3d, 32    ; imin(h+31, 63)
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    call .filter_copy
    lea                 r5d, [r3+2]
    cmp                  hd, 64
    cmove               r3d, r5d
    call .filter_edge_s3
.w32_main:
    lea                 tlq, [tlq+r3*2]
    movd                 m5, dxd
    mova                 m4, [base+z_base_inc]
    shl                 r3d, 6
    movd                 m6, [tlq] ; top[max_base_x]
    movd                 m1, r3d
    pshufb               m5, m0
    mov                 r5d, dxd ; xpos
    pshufb               m1, m0
    sub                  r5, r3
    psubw                m4, m1 ; max_base_x
    pshufb               m6, m0
    paddw                m4, m5
; one 32-pixel row per iteration; each successive 8-lane group clamps
; against a threshold shifted by 8 positions (pw_m512/m1024/m1536)
.w32_loop:
    mov                  r3, r5
    sar                  r3, 6
    movu                 m0, [tlq+r3*2+ 0]
    movu                 m2, [tlq+r3*2+ 2]
    pand                 m3, m7, m4
    psllw                m3, 9
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+16]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+18]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    psraw                m2, m4, 15
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    movddup              m2, [base+pw_m512]
    pcmpgtw              m2, m4
    pand                 m1, m2
    pandn                m2, m6
    mova        [dstq+16*0], m0
    por                  m1, m2
    mova        [dstq+16*1], m1
    movu                 m0, [tlq+r3*2+32]
    movu                 m2, [tlq+r3*2+34]
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+48]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+50]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    movddup              m2, [base+pw_m1024]
    movddup              m3, [base+pw_m1536]
    pcmpgtw              m2, m4
    pcmpgtw              m3, m4
    paddw                m4, m5
    pand                 m0, m2
    pandn                m2, m6
    pand                 m1, m3
    pandn                m3, m6
    por                  m0, m2
    mova        [dstq+16*2], m0
    por                  m1, m3
    mova        [dstq+16*3], m1
    dec                  hd
    jz .w32_end
    movifnidn       strideq, stridemp
    add                dstq, strideq
    add                  r5, dxq
    jl .w32_loop
.w32_end_loop:
    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3
    add                dstq, strideq
    dec                  hd
    jg .w32_end_loop
.w32_end:
    RET
; ---- w = 64: as .w32 extended to eight 16-byte stores per row, with
; per-group thresholds pw_m512 .. pw_m3584 ----
.w64:
    lea                 r3d, [hq+63]
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    call .filter_copy
    call .filter_edge_s3
.w64_main:
    lea                 tlq, [tlq+r3*2]
    movd                 m5, dxd
    mova                 m4, [base+z_base_inc]
    shl                 r3d, 6
    movd                 m6, [tlq] ; top[max_base_x]
    movd                 m1, r3d
    pshufb               m5, m0
    mov                 r5d, dxd ; xpos
    pshufb               m1, m0
    sub                  r5, r3
    psubw                m4, m1 ; max_base_x
    pshufb               m6, m0
    paddw                m4, m5
.w64_loop:
    mov                  r3, r5
    sar                  r3, 6
    movu                 m0, [tlq+r3*2+ 0]
    movu                 m2, [tlq+r3*2+ 2]
    pand                 m3, m7, m4
    psllw                m3, 9
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+16]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+18]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    psraw                m2, m4, 15
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    movddup              m2, [base+pw_m512]
    pcmpgtw              m2, m4
    pand                 m1, m2
    pandn                m2, m6
    mova        [dstq+16*0], m0
    por                  m1, m2
    mova        [dstq+16*1], m1
    movu                 m0, [tlq+r3*2+32]
    movu                 m2, [tlq+r3*2+34]
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+48]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+50]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    movddup              m2, [base+pw_m1024]
    pcmpgtw              m2, m4
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    movddup              m2, [base+pw_m1536]
    pcmpgtw              m2, m4
    pand                 m1, m2
    pandn                m2, m6
    mova        [dstq+16*2], m0
    por                  m1, m2
    mova        [dstq+16*3], m1
    movu                 m0, [tlq+r3*2+64]
    movu                 m2, [tlq+r3*2+66]
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+80]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+82]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    movddup              m2, [base+pw_m2048]
    pcmpgtw              m2, m4
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    movddup              m2, [base+pw_m2560]
    pcmpgtw              m2, m4
    pand                 m1, m2
    pandn                m2, m6
    mova        [dstq+16*4], m0
    por                  m1, m2
    mova        [dstq+16*5], m1
    movu                 m0, [tlq+r3*2+96]
    movu                 m2, [tlq+r3*2+98]
    psubw                m2, m0
    pmulhrsw             m2, m3
    movu                 m1, [tlq+r3*2+112]
    paddw                m0, m2
    movu                 m2, [tlq+r3*2+114]
    psubw                m2, m1
    pmulhrsw             m2, m3
    paddw                m1, m2
    movddup              m2, [base+pw_m3072]
    movddup              m3, [base+pw_m3584]
    pcmpgtw              m2, m4
    pcmpgtw              m3, m4
    paddw                m4, m5
    pand                 m0, m2
    pandn                m2, m6
    pand                 m1, m3
    pandn                m3, m6
    por                  m0, m2
    mova        [dstq+16*6], m0
    por                  m1, m3
    mova        [dstq+16*7], m1
    dec                  hd
    jz .w64_end
    movifnidn       strideq, stridemp
    add                dstq, strideq
    add                  r5, dxq
    jl .w64_loop
.w64_end_loop:
    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    add                dstq, strideq
    dec                  hd
    jg .w64_end_loop
.w64_end:
    RET
ALIGN function_align
; Copy the edge into the stack scratch buffer, replicating the pixel
; before the edge ([tlq-2]) and the last edge pixel ([tlq+r3*2]) as
; padding, so the in-place filters can read one tap past either end.
; In:  tlq = edge, r3d = last edge index.  Out: tlq -> stack copy.
; Clobbers r5d, m1-m3.  (gprsize accounts for the return address.)
.filter_copy:
    pshuflw              m2, [tlq-2], q0000
    pshuflw              m3, [tlq+r3*2], q0000
    xor                 r5d, r5d
    movd   [rsp+gprsize+12], m2
.filter_copy_loop:
    movu                 m1, [tlq+r5*2+16*0]
    movu                 m2, [tlq+r5*2+16*1]
    add                 r5d, 16
    mova [rsp+r5*2+gprsize-16*1], m1
    mova [rsp+r5*2+gprsize-16*0], m2
    cmp                 r5d, r3d
    jle .filter_copy_loop
    lea                 tlq, [rsp+gprsize+16*1]
    movq       [tlq+r3*2+2], m3
    ret
; In-place edge smoothing for filter_strength in r5d (1 or 2; 3 tails
; into .filter_edge_s3).  Per pixel, with coefficients from z_filt_k:
; out = (k0*cur + k1*(left + right) + 8) >> 4, 8 pixels per iteration.
.filter_edge:
    cmp                 r5d, 3
    je .filter_edge_s3
    movddup              m4, [base+z_filt_k+r5*8-8] ; center coefficient
    movddup              m5, [base+z_filt_k+r5*8+8] ; neighbor coefficient
    xor                 r5d, r5d
    movddup              m6, [base+pw_8]
    movu                 m2, [tlq-2]
    jmp .filter_edge_start
.filter_edge_loop:
    movu                 m2, [tlq+r5*2-2]
    mova      [tlq+r5*2-16], m1 ; store previous result (delayed one
.filter_edge_start:             ; iteration so inputs stay unfiltered)
    pmullw               m1, m4, [tlq+r5*2]
    movu                 m3, [tlq+r5*2+2]
    paddw                m2, m3
    pmullw               m2, m5
    add                 r5d, 8
    paddw                m1, m6
    paddw                m1, m2
    psrlw                m1, 4
    cmp                 r5d, r3d
    jl .filter_edge_loop
    mova      [tlq+r5*2-16], m1
    ret
; Strength-3 in-place edge filter (5 taps):
; out = (a + 2*b + 2*c + 2*d + e + 4) >> 3, where the (a + e + 4) >> 1
; term is formed with pavgw (pw_3 + the implicit pavgw rounding bit).
.filter_edge_s3:
    movddup              m5, [base+pw_3]
    xor                 r5d, r5d
    movu                 m2, [tlq-2]
    movu                 m3, [tlq-4]
    jmp .filter_edge_s3_start
.filter_edge_s3_loop:
    movu                 m2, [tlq+r5*2-2]
    movu                 m3, [tlq+r5*2-4]
    mova      [tlq+r5*2-16], m1 ; delayed store, as in .filter_edge
.filter_edge_s3_start:
    paddw                m2, [tlq+r5*2+0]
    paddw                m3, m5
    movu                 m1, [tlq+r5*2+2]
    movu                 m4, [tlq+r5*2+4]
    add                 r5d, 8
    paddw                m1, m2
    pavgw                m3, m4
    paddw                m1, m3
    psrlw                m1, 2
    cmp                 r5d, r3d
    jl .filter_edge_s3_loop
    mova      [tlq+r5*2-16], m1
    ret
1618
1619%if ARCH_X86_64
1620cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
1621    %define            base  r7-$$
1622    %define           maxwm  r6m
1623    %define           maxhm  r7m
1624    %define          bdmaxm  r8m
1625    lea                  r7, [$$]
1626    mov                  hd, hm
1627    movddup              m8, [base+pw_62]
1628    lea                 r9d, [wq-4]
1629    shl                 r9d, 6
1630    mova                 m9, [base+z2_top_shufA]
1631    or                  r9d, hd
1632    mova                m10, [base+z2_left_shufA]
1633%else
1634cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
1635    %define            base  r1-$$
1636    %define             r9b  byte  [rsp+16*26+4*0]
1637    %define             r9d  dword [rsp+16*26+4*0]
1638    %define            r10d  dword [rsp+16*26+4*1]
1639    %define            r11d  dword [rsp+16*26+4*2]
1640    %define           maxwm  [rsp+16*2+4*0]
1641    %define           maxhm  [rsp+16*2+4*1]
1642    %define          bdmaxm  [rsp+16*2+4*2]
1643    %define        stridemp  [rsp+16*26+4*3]
1644    %define         strideq  r3
1645    %define             dyd  r4
1646    %define             dyq  r4
1647    mov            stridemp, r1
1648    mov                 r1d, r6m
1649    mov                 r4d, r7m
1650    mov                 r5d, r8m
1651    mov               maxwm, r1d
1652    mov               maxhm, r4d
1653    mov              bdmaxm, r5d
1654    LEA                  r1, $$
1655    lea                  hd, [wq-4]
1656    mova                 m0, [base+z2_top_shufA]
1657    shl                  hd, 6
1658    mova                 m1, [base+z2_left_shufA]
1659    or                   hd, hm
1660    mova        [rsp+16*24], m0
1661    mov                 r9d, hd
1662    mova        [rsp+16*25], m1
1663%endif
1664    tzcnt                wd, wd
1665    movifnidn        angled, anglem
1666    mova                 m0, [tlq-16*8]
1667    mova                 m1, [tlq-16*7]
1668    mova                 m2, [tlq-16*6]
1669    mova                 m3, [tlq-16*5]
1670    movsxd               wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
1671%if ARCH_X86_64
1672    movzx               dxd, angleb
1673%else
1674    movzx               dxd, byte anglem
1675%endif
1676    mova                 m4, [tlq-16*4]
1677    mova                 m5, [tlq-16*3]
1678    mova                 m6, [tlq-16*2]
1679    mova                 m7, [tlq-16*1]
1680    mova        [rsp+16* 5], m0
1681    xor              angled, 0x400
1682    mova        [rsp+16* 6], m1
1683    mov                 dyd, dxd
1684    mova        [rsp+16* 7], m2
1685    neg                 dxq
1686    mova        [rsp+16* 8], m3
1687    and                 dyd, ~1
1688    mova        [rsp+16* 9], m4
1689    and                 dxq, ~1
1690    mova        [rsp+16*10], m5
1691    lea                  wq, [base+ipred_z2_16bpc_ssse3_table+wq]
1692    mova        [rsp+16*11], m6
1693    pxor                 m3, m3
1694    mova        [rsp+16*12], m7
1695    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
1696    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
1697    movddup              m0, [base+pw_256] ; 4<<6
1698    movd                 m4, [tlq]
1699    movu                 m5, [tlq+16*0+2]
1700    movu                 m6, [tlq+16*1+2]
1701    movsldup             m1, [base+z2_dy_offset]
1702    pshufb               m4, m0
1703    movq                 m7, [base+z_base_inc+2]
1704    mov                r11d, (112-4)<<6
1705    mova        [rsp+16*13], m4
1706    neg                 dxd
1707    mova        [rsp+16*14], m5
1708    or                  dyd, 4<<16
1709    mova        [rsp+16*15], m6
1710%if ARCH_X86_64
1711    lea                r10d, [dxq+(112<<6)] ; xpos
1712%else
1713    mov           [rsp+8*3], dyd
1714    lea                 r4d, [dxq+(112<<6)]
1715    mov                r10d, r4d
1716    movzx                hd, r9b
1717%endif
1718    movq          [rsp+8*0], m1
1719    movq          [rsp+8*1], m0
1720    movq          [rsp+8*2], m7
1721    jmp                  wq
.w4:
    ; Width-4 entry: decide whether the top edge gets upsampled or
    ; filtered, then fall through to the shared left-edge filtering.
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea                 r3d, [hq+2]
    add              angled, 1022
    pshuflw              m1, m5, q3333
    shl                 r3d, 6
    movq      [rsp+16*14+8], m1    ; replicate rightmost top pixel
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    call .upsample_above
    sub              angled, 1075 ; angle - 53
    lea                 r3d, [hq+3]
    xor              angled, 0x7f ; 180 - angle
    movd                 m2, r3d
    movd                 m7, angled
    shr              angled, 8 ; is_sm << 1
    pshufb               m2, m3    ; broadcast h+3 (m3 is zero here)
    pshufb               m7, m3
    pcmpeqb              m2, [base+z_filt_wh4]
    pand                 m7, m2
    pcmpgtb              m7, [base+z_filt_t_w48+angleq*8]
    jmp .w8_filter_left
.upsample_above: ; w4/w8
    ; Upsample the top edge 2x with the (-1, 9, 9, -1)/16 filter:
    ; m2 = a+b (adjacent pair sum), m4 = c+d (outer neighbours),
    ; result = clip(((a+b) + ((a+b-c-d)>>3) + 1) >> 1, 0, bdmax)
    ; which equals (9*(a+b) - (c+d) + 8) >> 4 up to rounding order.
    ; All [rsp+...] accesses are offset by gprsize for the return address.
    paddw                m2, m5, [tlq]
    movu                 m1, [rsp+gprsize+16*14+2]
    movu                 m4, [rsp+gprsize+16*14-4]
%if ARCH_X86_64
    movd                 m6, r9m ; bdmax, offset due to call
%else
    movd                 m6, [rsp+gprsize+16*2+4*2]
%endif
    paddw                m4, m1
    psubw                m1, m2, m4
    pshufb               m6, m0    ; broadcast bdmax
    psraw                m1, 3
    paddw                m2, m1
    add                 dxd, dxd   ; double dx to account for upsampling
    pmaxsw               m2, m3    ; clamp below at 0 (m3 is zero)
    paddw                m7, m7
    pavgw                m2, m3    ; (x+1)>>1 rounding step
    pminsw               m2, m6    ; clamp above at bdmax
%if ARCH_X86_64
    mova                 m9, [base+z2_top_shufB]
    lea                r10d, [dxq+(113<<6)]
    mov                r11d, (112-7)<<6
%else
    ; x86-32: shuffle mask and xpos limits live in stack slots
    mova                 m1, [base+z2_top_shufB]
    lea                 r3d, [dxq+(113<<6)]
    mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
    mov [rsp+gprsize+16*26+4*1], r3d
    mova [rsp+gprsize+16*24], m1
%endif
    ; interleave original and interpolated pixels back into the buffer
    punpcklwd            m1, m2, m5
    punpckhwd            m2, m5
    movq  [rsp+gprsize+8*2], m7
    mova [rsp+gprsize+16*14], m1
    mova [rsp+gprsize+16*15], m2
    ret
.w4_no_upsample_above:
    ; Top edge is not upsampled: compute filter-strength inputs and run
    ; the shared top-edge filter, then decide about left-edge upsampling.
    lea                 r3d, [hq+3]
    mov          [rsp+16*4], angled  ; save angle across the call
    sub              angled, 1112 ; angle - 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    mova                 m4, [base+z_filt_wh4]
    movd                 m7, r3d
    mova                 m5, [base+z_filt_t_w48+angleq*8]
    mov                 r3d, 4
    call .w8_filter_top
    mov              angled, [rsp+16*4]
    lea                 r3d, [hq+2]
    sub              angled, 139
    shl                 r3d, 6
    test                r3d, angled
    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
.upsample_left: ; w4/w8
    ; Upsample the left edge 2x with the same (-1, 9, 9, -1)/16 filter
    ; as .upsample_above, then interleave original and interpolated
    ; pixels into the left scratch buffer at [rsp+16*11..16*12].
    mova                 m2, [tlq-16]
    lea                 r3d, [hq-4]
    movu                 m3, [tlq-14]
    movu                 m4, [rsp+16*12+4]
    pshufb               m1, m2, [base+z2_upsample_l+r3*4]
    movd                 m6, bdmaxm
    pxor                 m5, m5
    paddw                m3, m2     ; adjacent pair sum
    paddw                m4, m1     ; outer neighbour sum
    psubw                m1, m3, m4
    movshdup             m4, [base+z2_dy_offset]
    psraw                m1, 3
    pshufb               m6, m0     ; broadcast bdmax
    paddw                m3, m1
    pmaxsw               m3, m5     ; clamp to [0, bdmax] with rounding
    pavgw                m3, m5
    pminsw               m3, m6
%if ARCH_X86_64
    mova                m10, [base+z2_left_shufB]
    add                 dyd, dyd   ; double dy to account for upsampling
%else
    mova                 m1, [base+z2_left_shufB]
    shl     dword [rsp+8*3], 1     ; dy lives in a stack slot on x86-32
    mova        [rsp+16*25], m1
%endif
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    movq          [rsp+8*0], m4
    mova        [rsp+16*12], m1
    mova        [rsp+16*11], m2
.w4_main:
    ; Main w4 setup: build per-lane xpos (m6/m7) and per-row ypos (m4)
    ; vectors; .w4_loop0 snapshots them per 4-column strip.
    movd                 m6, dxd
%if ARCH_X86_64
    movd                 m3, dyd
%else
    movd                 m3, [rsp+8*3]
%endif
    pshufb               m6, m0     ; broadcast dx
    movddup              m0, [rsp+8*2]
    paddw                m7, m6, m6
    movq                 m5, [base+pw_m1to4]
    pshuflw              m4, m3, q0000
    punpcklqdq           m6, m7
    pmullw               m4, m5     ; dy * {-1, 2, 3, 4}
    pshuflw              m3, m3, q1111
    paddw                m6, m0
    mov                 r2d, r10d
    pshuflw              m0, m4, q3333
    psubw                m4, [rsp+8*0]
    movq          [rsp+8*3], m3
    movq          [rsp+8*5], m0 ; dy*4
    mov                  r5, dstq   ; remember strip origin for .w4_end
.w4_loop0:
    ; Per-strip: save xpos/ypos state, split ypos into integer part
    ; (base_y, stored as bytes) and fractional part (frac_y << 9).
    mova         [rsp+16*4], m6
    movq          [rsp+8*4], m4
%if ARCH_X86_64
    pand                 m0, m8, m4
%else
    movq                 m0, [base+pw_62]
    pand                 m0, m4
%endif
    psraw                m4, 6
    psllw                m0, 9 ; frac_y << 9
    movq          [rsp+8*7], m0
    pabsw                m4, m4
    movq          [rsp+8*6], m4
    movzx                hd, r9b
.w4_loop:
    ; Predict 4 rows: gather 4 top-edge rows by base_x, interpolate with
    ; frac_x; where base_x crosses left of the top-left, blend in the
    ; left-edge (base_y) prediction instead.
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6        ; base_x0
    movu                 m2, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6        ; base_x1
    movu                 m1, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6        ; base_x2
    movu                 m3, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6        ; base_x3
    movu                 m4, [rsp+r3*2]
%if ARCH_X86_64
    REPX     {pshufb x, m9}, m2, m1, m3, m4
%else
    mova                 m0, [rsp+16*24]
    REPX     {pshufb x, m0}, m2, m1, m3, m4
%endif
    ; separate even/odd pixels for the linear interpolation
    punpcklqdq           m0, m2, m1
    punpckhqdq           m2, m1
    punpcklqdq           m1, m3, m4
    punpckhqdq           m3, m4
%if ARCH_X86_64
    pand                 m5, m8, m6
%else
    movddup              m5, [base+pw_62]
    pand                 m5, m6
%endif
    psllw                m5, 9        ; frac_x << 9 for pmulhrsw
    psubw                m2, m0
    pmulhrsw             m2, m5
    paddw                m5, m6, m7
    psubw                m3, m1
    paddw                m0, m2
%if ARCH_X86_64
    pand                 m2, m8, m5
%else
    movddup              m2, [base+pw_62]
    pand                 m2, m5
%endif
    psllw                m2, 9
    pmulhrsw             m3, m2
    paddw                m1, m3
    cmp                 r3d, 111 ; topleft
    jge .w4_toponly
    ; Left-edge path: gather per-row left pixels by base_y and
    ; interpolate with frac_y, then select per lane by sign of xpos.
    mova        [rsp+16*22], m0
    mova        [rsp+16*23], m1
    movzx               r3d, byte [rsp+8*6+0] ; base_y0
    movu                 m3, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+2] ; base_y1
    movu                 m2, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+4] ; base_y2
    movu                 m4, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+6] ; base_y3
    movu                 m0, [rsp+r3*2]
%if ARCH_X86_64
    REPX    {pshufb x, m10}, m3, m2, m4, m0
%else
    mova                 m1, [rsp+16*25]
    REPX     {pshufb x, m1}, m3, m2, m4, m0
%endif
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2     ; 01
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0     ; 23
    punpckldq            m0, m1, m2 ; y0 y1
    punpckhdq            m1, m2     ; y2 y3
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    movddup              m4, [rsp+8*7]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    psraw                m6, 15       ; base_x < topleft
    psraw                m4, m5, 15
    paddw                m0, m2
    paddw                m1, m3
    ; mask-select: left-edge result where xpos went negative,
    ; otherwise the saved top-edge result
    pand                 m0, m6
    pandn                m6, [rsp+16*22]
    pand                 m1, m4
    pandn                m4, [rsp+16*23]
    por                  m0, m6
    por                  m1, m4
.w4_toponly:
    ; Store 4 predicted rows (2 pixels-wide qword halves per row pair),
    ; advance positions, and loop; once xpos passes the limit, switch to
    ; the cheaper left-only loop.
    movifnidn       strideq, stridemp
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jz .w4_end
    movq                 m4, [rsp+8*6]
    paddsw               m6, m5, m7   ; xpos += dx
    movq                 m5, [rsp+8*3]
    psubw                m4, m5       ; base_y -= dy (stored negated)
    lea                dstq, [dstq+strideq*2]
    movq          [rsp+8*6], m4
    cmp                 r2d, r11d
    jge .w4_loop
.w4_leftonly_loop:
    ; All remaining rows come from the left edge only: same base_y
    ; gather + frac_y interpolation as above, no top/left select needed.
    movzx               r2d, byte [rsp+8*6+0] ; base_y0
    movu                 m3, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+2] ; base_y1
    movu                 m2, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+4] ; base_y2
    movu                 m6, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+6] ; base_y3
    movu                 m0, [rsp+r2*2]
    psubw                m4, m5
%if ARCH_X86_64
    REPX    {pshufb x, m10}, m3, m2, m6, m0
%else
    mova                 m1, [rsp+16*25]
    REPX     {pshufb x, m1}, m3, m2, m6, m0
%endif
    movq          [rsp+8*6], m4
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m6, m0
    punpckhwd            m6, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m6
    punpckhdq            m3, m6
    movddup              m6, [rsp+8*7]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m3, m6
    paddw                m0, m2
    paddw                m1, m3
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w4_leftonly_loop
.w4_end:
    ; Advance to the next 4-column strip (w > 4 reuses this path);
    ; r9d's high bits count remaining strips.
    sub                 r9d, 1<<8
    jl .w4_ret
    movq                 m4, [rsp+8*5]
    add                  r5, 8
    mov                dstq, r5
    paddw                m4, [rsp+8*4] ; base_y += 4*dy
    movzx               r2d, word [rsp+8*1]
    movddup              m6, [rsp+8*1]
    paddw                m6, [rsp+16*4] ; base_x += (4 << upsample_above)
    add                 r2d, r10d
    mov                r10d, r2d
    jmp .w4_loop0
.w4_ret:
    RET
.w8:
    ; Width-8 entry: same structure as .w4 but with the w8 filter tables;
    ; shares .w4_main, .upsample_above and .w8_filter_left.
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea                 r3d, [angleq+126]
    pshufhw              m1, m5, q3333
%if ARCH_X86_64
    mov                 r3b, hb
%else
    xor                 r3b, r3b    ; no byte-addressable h reg on x86-32
    or                  r3d, hd
%endif
    movhps      [rsp+16*15], m1     ; replicate rightmost top pixel
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    call .upsample_above
    sub              angled, 53
    lea                 r3d, [hq+7]
    xor              angled, 0x7f ; 180 - angle
    movu                 m1, [base+z_filt_wh8]
    movd                 m2, r3d
    movd                 m7, angled
    shr              angled, 8 ; is_sm << 1
    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
    pshufb               m2, m3
    pshufb               m7, m3
    pcmpeqb              m2, m1
    movq                 m1, [base+pw_512]
    pand                 m7, m2
    pcmpgtb              m7, m4
    movq          [rsp+8*1], m1 ; 8<<6
    jmp .w8_filter_left
.w8_no_upsample_above:
    ; w8 top edge without upsampling: run the shared top filter, then
    ; decide whether the left edge qualifies for upsampling instead.
    lea                 r3d, [hq+7]
    mov          [rsp+16*4], angled  ; save angle across the call
    sub              angled, 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movu                 m4, [base+z_filt_wh8]
    movd                 m7, r3d
    psrldq               m5, [base+z_filt_t_w48+angleq*8], 4
    mov                 r3d, 8
    call .w8_filter_top
    mov                 r3d, [rsp+16*4]
    sub                 r3d, 141
%if ARCH_X86_64
    mov                 r3b, hb
%else
    xor                 r3b, r3b
    or                  r3d, hd
%endif
    cmp                 r3d, 8
    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
.w8_filter_left:
    ; Shared left-edge filter (w4/w8): extract filter strength from the
    ; per-lane mask in m7 (imul 0x55555555 + shr 30 folds the 2-bit
    ; lane fields into a single strength value), pad the edge, then
    ; reuse the z3 edge-filter routine on the stack copy.
    pmovmskb            r5d, m7
    test                r5d, r5d
    jz .w4_main                     ; filter_strength == 0
    imul                r5d, 0x55555555
    neg                  hq
    mov                  r3, tlq
    movd                 m1, [tlq+hq*2]
    shr                 r5d, 30 ; filter_strength
    lea                 tlq, [rsp+16*13-2]
    pshuflw              m1, m1, q0000
    movq       [tlq+hq*2-6], m1     ; replicate bottom-most left pixel
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
.w8_filter_top:
    ; Shared top-edge filter helper (called from w4/w8 paths).
    ; In: m1/m2/m7 filter-decision vectors, m4/m5 threshold tables,
    ;     r3d = width. Filters the stack copy of the top edge via the
    ;     z1 edge-filter routine, then restores pixels beyond maxw.
    ; [dstq] is borrowed as a spill slot for tlq across the call.
    REPX     {pshufb x, m3}, m2, m1, m7
    pcmpeqb              m2, m4
    pand                 m1, m2
    pand                 m7, m2
    pcmpgtb              m1, m5
    pcmpgtb              m7, m5
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w8_filter_top_end ; filter_strength == 0
    imul                r5d, 0x55555555
    mov              [dstq], tlq
    lea                 tlq, [rsp+16*14+gprsize]
    shr                 r5d, 30 ; filter_strength
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
%if ARCH_X86_64
    mov                 r3d, r7m ; maxw, offset due to call
%else
    mov                 r3d, [rsp+16*2+4*1]
%endif
    mov                 tlq, [dstq]
    cmp                 r3d, 8
    jge .w8_filter_top_end
    ; maxw < 8: re-copy the unfiltered pixels past maxw
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14+gprsize], m1
    movu [rsp+r3*2+16*15+gprsize], m2
.w8_filter_top_end:
    ret
.w16:
    ; Width-16 entry: no upsampling at this size, only edge filtering
    ; (w16 filter tables, 3-bit strength fields hence imul 0x24924924).
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea                 r3d, [hq+15]
    sub              angled, 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movd                 m7, r3d
    REPX     {pshufb x, m3}, m2, m1, m7
    movq                 m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb              m2, [base+z_filt_wh16]
    pand                 m1, m2
    pand                 m7, m2
    pcmpgtb              m1, m4
    pcmpgtb              m7, m4
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w16_filter_left ; filter_strength == 0
    imul                r5d, 0x24924924
    pshufhw              m6, m6, q3333
    mov              [dstq], tlq    ; spill tlq across the call
    lea                 tlq, [rsp+16*14]
    shr                 r5d, 30
    movhps       [tlq+16*2], m6     ; replicate rightmost top pixel
    adc                 r5d, -1 ; filter_strength
    mov                 r3d, 16
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 16
    jge .w16_filter_left
    ; maxw < 16: restore unfiltered pixels past maxw
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu   [rsp+r3*2+16*14], m1
    movu   [rsp+r3*2+16*15], m2
.w16_filter_left:
    ; Left-edge filter for w16+ (3-bit strength fields, cf. the 2-bit
    ; variant in .w8_filter_left), reusing the z3 edge-filter routine.
    pmovmskb            r5d, m7
    test                r5d, r5d
    jz .w4_main                     ; filter_strength == 0
    imul                r5d, 0x24924924
    neg                  hq
    mov                  r3, tlq
    movd                 m1, [tlq+hq*2]
    shr                 r5d, 30
    lea                 tlq, [rsp+16*13-2]
    pshuflw              m1, m1, q0000
    adc                 r5d, -1 ; filter_strength
    movq       [tlq+hq*2-6], m1     ; replicate bottom-most left pixel
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
.w32:
    ; Width-32 entry: copy the extra top pixels to the stack buffer,
    ; always use the strength-3 edge filter (when filtering is enabled),
    ; restoring any pixels past maxw afterwards.
    movu                 m1, [tlq+16*2+2]
    movu                 m2, [tlq+16*3+2]
    mova        [rsp+16*16], m1
    mova        [rsp+16*17], m2
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    mov              [dstq], tlq    ; spill tlq across the call
    lea                 tlq, [rsp+16*14]
    pshufhw              m2, m2, q3333
    mov                 r3d, 32
    movhps       [tlq+16*4], m2     ; replicate rightmost top pixel
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 32
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu   [rsp+r3*2+16*14], m1
    movu   [rsp+r3*2+16*15], m2
    cmp                 r3d, 16
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*2+2]
    movu                 m2, [tlq+r3*2+16*3+2]
    movu   [rsp+r3*2+16*16], m1
    movu   [rsp+r3*2+16*17], m2
.filter_left:
    ; Strength-3 left-edge filter for w32/w64, then restore unfiltered
    ; left pixels beyond maxh in 16-pixel steps (unrolled ladder).
    neg                  hq
    mov                  r3, tlq
    pshuflw              m1, [tlq+hq*2], q0000
    lea                 tlq, [rsp+16*13-2]
    movq       [tlq+hq*2-6], m1     ; replicate bottom-most left pixel
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3
.filter_left_end:
    mov                 r2d, maxhm
    cmp                 r2d, hd
    jge .w4_main
    neg                  r2
    movu                 m1, [r3+r2*2-16*1]
    movu                 m2, [r3+r2*2-16*2]
    movu   [rsp+r2*2+16*12], m1
    movu   [rsp+r2*2+16*11], m2
    cmp                 r2d, -48
    jle .w4_main
    movu                 m1, [r3+r2*2-16*3]
    movu                 m2, [r3+r2*2-16*4]
    movu   [rsp+r2*2+16*10], m1
    movu   [rsp+r2*2+16* 9], m2
    cmp                 r2d, -32
    jle .w4_main
    movu                 m1, [r3+r2*2-16*5]
    movu                 m2, [r3+r2*2-16*6]
    movu   [rsp+r2*2+16* 8], m1
    movu   [rsp+r2*2+16* 7], m2
    cmp                 r2d, -16
    jle .w4_main
    movu                 m1, [r3+r2*2-16*7]
    movu                 m2, [r3+r2*2-16*8]
    movu   [rsp+r2*2+16* 6], m1
    movu   [rsp+r2*2+16* 5], m2
    jmp .w4_main
.w64:
    ; Width-64 entry: copy all extra top pixels to the stack buffer,
    ; apply the strength-3 edge filter, restore pixels past maxw in
    ; 16-pixel steps, then share the left-edge path with w32.
    movu                 m1, [tlq+16*2+2]
    movu                 m2, [tlq+16*3+2]
    movu                 m3, [tlq+16*4+2]
    movu                 m4, [tlq+16*5+2]
    movu                 m5, [tlq+16*6+2]
    movu                 m6, [tlq+16*7+2]
    mov              [dstq], tlq    ; spill tlq across the call
    lea                 tlq, [rsp+16*14]
    mova         [tlq+16*2], m1
    mova         [tlq+16*3], m2
    mova         [tlq+16*4], m3
    mova         [tlq+16*5], m4
    mova         [tlq+16*6], m5
    mova         [tlq+16*7], m6
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    pshufhw              m6, m6, q3333
    mov                 r3d, 64
    movhps       [tlq+16*8], m6     ; replicate rightmost top pixel
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 64
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu   [rsp+r3*2+16*14], m1
    movu   [rsp+r3*2+16*15], m2
    cmp                 r3d, 48
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*2+2]
    movu                 m2, [tlq+r3*2+16*3+2]
    movu   [rsp+r3*2+16*16], m1
    movu   [rsp+r3*2+16*17], m2
    cmp                 r3d, 32
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*4+2]
    movu                 m2, [tlq+r3*2+16*5+2]
    movu   [rsp+r3*2+16*18], m1
    movu   [rsp+r3*2+16*19], m2
    cmp                 r3d, 16
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*6+2]
    movu                 m2, [tlq+r3*2+16*7+2]
    movu   [rsp+r3*2+16*20], m1
    movu   [rsp+r3*2+16*21], m2
    jmp .filter_left
2287
; z3 (left-edge-only directional prediction, angle > 180): prediction is
; computed column-wise into a stack buffer and transposed on output.
; Dispatches on block height via ipred_z3_16bpc_ssse3_table.
%if ARCH_X86_64
cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
    %define            base  r7-$$
    lea                  r7, [$$]
    mov              org_wd, wd
%else
cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
    %define            base  r1-$$
    %define          org_wd  r5
    %define          org_wq  r5
    ; x86-32: stash stride/w in the (not yet written) dst buffer
    movd                 m6, r8m ; pixel_max
    mov          [dstq+4*0], strideq
    LEA                  r1, $$
    mov          [dstq+4*1], wd
%endif
    tzcnt                hd, hm
    movifnidn        angled, anglem
    sub                 tlq, 2
    movsxd               hq, [base+ipred_z3_16bpc_ssse3_table+hq*4]
    sub              angled, 180
    movddup              m0, [base+pw_256]
    mov                 dyd, angled
    neg                 dyd
    xor              angled, 0x400
    movddup              m7, [base+pw_62]  ; fractional-position mask
    or                  dyq, ~0x7e
    lea                  hq, [base+ipred_z3_16bpc_ssse3_table+hq]
    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
    jmp                  hq
.h4:
    ; Height-4: optionally 2x-upsample the left edge with the
    ; (-1, 9, 9, -1)/16 filter, then predict columns into the stack
    ; buffer (transposed at .end_transpose, outside this view).
    lea                 r4d, [angleq+88]
    test                r4d, 0x480
    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar                 r4d, 9
    add                 r4d, wd
    cmp                 r4d, 8
    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
    mova                 m2, [tlq-14]   ; 7 6 5 4 3 2 1 0
    movu                 m3, [tlq-12]   ; 8 7 6 5 4 3 2 1
%if ARCH_X86_64
    movd                 m6, r8m        ; pixel_max
%endif
    pshufb               m4, m2, m0
    mov                 tlq, rsp
    palignr              m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2
    add                 dyd, dyd        ; double dy to account for upsampling
    palignr              m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3
    paddw                m1, m2
    paddw                m3, m5
    psubw                m5, m1, m3
    mova                 m3, [base+z_upsample]
    mova           [tlq+ 0], m4
    movd                 m4, dyd
    psraw                m5, 3
    neg                 dyd
    paddw                m1, m5
    pxor                 m5, m5
    lea                 r5d, [dyq+(16<<6)+63] ; ypos
    pmaxsw               m1, m5          ; clamp to [0, pixel_max]
    pshufb               m6, m0
    shl                  wd, 3
    pavgw                m1, m5
    pshufb               m4, m0
    pminsw               m1, m6
    sub                 rsp, wq          ; column output buffer
    punpckhwd            m0, m1, m2      ; interleave interpolated/original
    paddw                m5, m4, m4
    punpcklwd            m1, m2
    mova           [tlq+32], m0
    movsd                m4, m5
    mova           [tlq+16], m1
.h4_upsample_loop:
    ; Two columns per iteration: gather by ypos>>6, interpolate with
    ; the fractional part ((ypos & 62) << 9 for pmulhrsw).
    lea                 r4d, [r5+dyq]
    sar                 r5d, 6
    movu                 m2, [tlq+r5*2]
    lea                 r5d, [r4+dyq]
    sar                 r4d, 6
    movu                 m1, [tlq+r4*2]
    pshufb               m2, m3
    pshufb               m1, m3
    punpckhqdq           m0, m1, m2
    punpcklqdq           m1, m2
    pand                 m2, m7, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m4, m5
    paddw                m0, m1
    mova        [rsp+wq-16], m0
    sub                  wd, 16
    jg .h4_upsample_loop
    or                  r3d, 4*2
    jmp .end_transpose
.h4_no_upsample:
    ; No upsampling: decide edge-filter strength from the w/h/angle
    ; tables, build a padded edge copy on the stack, and filter it.
    mov                 r4d, 7
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h4_main
    lea                 r4d, [wq+3]
    movd                 m1, r4d
    movd                 m3, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m2, m2
    pshufb               m1, m2     ; broadcast w+3
    pshufb               m3, m2
    pcmpeqb              m1, [base+z_filt_wh4]
    pand                 m1, m3
    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r4d, 7      ; r4 was clobbered above; reset edge count
    test                r5d, r5d
    jz .h4_main ; filter_strength == 0
    pshuflw              m1, [tlq+2], q0000
    imul                r5d, 0x55555555
    mova                 m2, [tlq-14]
    neg                  r4
    movd                 m3, [tlq+r4*2]
    shr                 r5d, 30     ; filter_strength
    movd        [rsp+16*17], m1
    pshuflw              m3, m3, q0000
    mova        [rsp+16*16], m2
    lea                  r2, [r4-2]
    movq [rsp+16*17+r4*2-10], m3    ; replicate outermost left pixel
    cmp                  wd, 8
    cmovae               r4, r2     ; wider blocks read two extra pixels
    lea                 tlq, [rsp+16*17-2]
    call .filter_edge
.h4_main:
    ; Column prediction: m4 = per-lane ypos (two columns interleaved),
    ; m5 = 2*dy step, m3 = max_base_y limit, m6 = broadcast of the
    ; bottom-most edge pixel used once ypos runs past the edge.
    movd                 m4, dyd
    sub                 tlq, r4
    movddup              m1, [base+z_base_inc_z2+8] ; base_inc << 6
    sub                 tlq, r4
    shl                 r4d, 6
    movd                 m6, [tlq]
    movd                 m3, r4d
    pshufb               m4, m0
    neg                 dyq
    pshufb               m6, m0
    lea                  r5, [dyq+r4+63] ; ypos
    pshufb               m3, m0
    shl                  wd, 3
    paddw                m5, m4, m4
    sub                 rsp, wq          ; column output buffer
    psubw                m3, m1 ; max_base_y
    movsd                m4, m5 ; ypos1 ypos0
.h4_loop:
    ; Two columns per iteration; out-of-range lanes are replaced by the
    ; broadcast edge pixel via the pcmpgtw/pand/pandn select.
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movddup              m0, [tlq+r5*2-6]
    movddup              m1, [tlq+r5*2-8]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movlps               m0, [tlq+r4*2-6]
    movlps               m1, [tlq+r4*2-8]
    pand                 m2, m7, m4      ; frac_y
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m3, m4      ; ypos still within the edge?
    paddw                m4, m5
    paddw                m0, m1
    pand                 m0, m2
    pandn                m2, m6
    por                  m0, m2
    mova        [rsp+wq-16], m0
    sub                  wd, 16
    jz .h4_transpose
    test                r5d, r5d
    jg .h4_loop
.h4_end_loop:
    ; Remaining columns are entirely past the edge: fill with m6.
    mova        [rsp+wq-16], m6
    sub                  wd, 16
    jg .h4_end_loop
.h4_transpose:
    or                  r3d, 4*2
    jmp .end_transpose
.h8:
    ; Height-8 with optional 2x left-edge upsampling; same
    ; (-1, 9, 9, -1)/16 filter as .h4 but over 16 edge pixels.
    ; The letter comments (0..9, a..g) label edge pixel positions.
    lea                 r4d, [angleq+88]
    and                 r4d, ~0x7f
    or                  r4d, wd
    cmp                 r4d, 8
    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    mova                 m2, [tlq-30]  ; g f e d c b a 9
    movu                 m1, [tlq-32]  ; _ g f e d c b a
    movu                 m3, [tlq-16]  ; 9 8 7 6 5 4 3 2
    paddw                m3, [tlq-14]  ; 8 7 6 5 4 3 2 1
    pshufd               m4, m2, q2100 ; _ _ g f e d c b
    paddw                m1, m2
    movu                 m5, [tlq-28]  ; f e d c b a 9 8
    add                 dyd, dyd       ; double dy to account for upsampling
    cmp                  wd, 8
    je .h8_upsample_w8
    pshufhw              m4, m2, q1000 ; _ _ _ _ c c c b
.h8_upsample_w8:
    paddw                m4, m5
    psubw                m5, m1, m4
    movu                 m4, [tlq-18]  ; a 9 8 7 6 5 4 3
    psraw                m5, 3
    paddw                m1, m5
    movu                 m5, [tlq-12]  ; 7 6 5 4 3 2 1 0
%if ARCH_X86_64
    movd                 m6, r8m ; pixel_max
%endif
    paddw                m4, m5
    shl                  wd, 4
    psubw                m5, m3, m4
    movd                 m4, dyd
    psraw                m5, 3
    neg                 dyd
    paddw                m3, m5
    pshufb               m6, m0
    mova                 m5, [tlq-14]
    pshufb               m4, m0
    pxor                 m0, m0
    pmaxsw               m1, m0         ; clamp both halves to [0, pixel_max]
    pmaxsw               m3, m0
    mov                 tlq, rsp
    pavgw                m1, m0
    pavgw                m3, m0
    sub                 rsp, wq         ; column output buffer
    pminsw               m1, m6
    pminsw               m6, m3
    mova                 m3, [base+z_upsample]
    lea                 r5d, [dyq+(16<<6)+63] ; ypos
    ; interleave interpolated/original pixels into 4 blocks of 8
    punpcklwd            m0, m1, m2
    mova         [tlq+16*0], m0
    punpckhwd            m1, m2
    mova         [tlq+16*1], m1
    punpcklwd            m0, m6, m5
    mova         [tlq+16*2], m0
    punpckhwd            m6, m5
    mova         [tlq+16*3], m6
    mova                 m5, m4
.h8_upsample_loop:
    ; One column (8 pixels) per iteration, gathered by ypos>>6 and
    ; interpolated with the fractional part.
    mov                 r4d, r5d
    sar                 r4d, 6
    movu                 m1, [tlq+r4*2+16*0]
    movu                 m2, [tlq+r4*2+16*1]
    add                 r5d, dyd
    pshufb               m2, m3
    pshufb               m1, m3
    punpckhqdq           m0, m1, m2
    punpcklqdq           m1, m2
    pand                 m2, m7, m4
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m4, m5
    paddw                m0, m1
    mova        [rsp+wq-16], m0
    sub                  wd, 16
    jg .h8_upsample_loop
    or                  r3d, 8*2
    jmp .end_transpose
.h8_no_upsample:
    ; No upsampling: compute filter strength from the w8 tables, build a
    ; padded stack copy of the edge, and filter it in place.
    lea                 r4d, [wq+7]
    movd                 m1, r4d
    and                 r4d, 7
    or                  r4d, 8 ; imin(w+7, 15)
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h8_main
    movd                 m3, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m2, m2
    pshufb               m1, m2     ; broadcast w+7
    pshufb               m3, m2
    movu                 m2, [base+z_filt_wh8]
    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
    pcmpeqb              m2, m1
    pand                 m2, m3
    pcmpgtb              m2, m4
    pmovmskb            r5d, m2
    test                r5d, r5d
    jz .h8_main ; filter_strength == 0
    pshuflw              m1, [tlq+2], q0000
    imul                r5d, 0x55555555
    mova                 m2, [tlq-16*1+2]
    neg                  r4
    mova                 m3, [tlq-16*2+2]
    shr                 r5d, 30     ; filter_strength
    movd                 m4, [tlq+r4*2]
    movd        [rsp+16*17], m1
    mova        [rsp+16*16], m2
    pshuflw              m4, m4, q0000
    mova        [rsp+16*15], m3
    lea                  r2, [r4-2]
    movq [rsp+16*17+r4*2-10], m4    ; replicate outermost left pixel
    cmp                  wd, 16
    cmovae               r4, r2     ; wider blocks read two extra pixels
    lea                 tlq, [rsp+16*17-2]
    call .filter_edge
2578.h8_main:
2579    sub                 tlq, r4
2580    movd                 m4, dyd
2581    sub                 tlq, r4
2582    shl                 r4d, 6
2583    movd                 m6, [tlq]
2584    movd                 m3, r4d
2585    pshufb               m4, m0
2586    neg                 dyq
2587    pshufb               m6, m0
2588    lea                  r5, [dyq+r4+63]
2589    pshufb               m3, m0
2590    shl                  wd, 4
2591    mova                 m5, m4
2592    sub                 rsp, wq
2593    psubw                m3, [base+z_base_inc_z2]
2594.h8_loop:
2595    mov                  r4, r5
2596    sar                  r4, 6
2597    movu                 m0, [tlq+r4*2-14]
2598    movu                 m1, [tlq+r4*2-16]
2599    pand                 m2, m7, m4
2600    psllw                m2, 9
2601    psubw                m1, m0
2602    pmulhrsw             m1, m2
2603    pcmpgtw              m2, m3, m4
2604    paddw                m4, m5
2605    paddw                m0, m1
2606    pand                 m0, m2
2607    pandn                m2, m6
2608    por                  m0, m2
2609    mova        [rsp+wq-16], m0
2610    sub                  wd, 8*2
2611    jz .h8_transpose
2612    add                  r5, dyq
2613    jg .h8_loop
2614.h8_end_loop:
2615    mova        [rsp+wq-16], m6
2616    sub                  wd, 8*2
2617    jg .h8_end_loop
2618.h8_transpose:
2619    or                  r3d, 8*2
2620    jmp .end_transpose
2621.h16:
2622    lea                 r4d, [wq+15]
2623    movd                 m1, r4d
2624    and                 r4d, 15
2625    or                  r4d, 16 ; imin(w+15, 31)
2626    test             angled, 0x400
2627    jnz .h16_main
2628    movd                 m3, angled
2629    shr              angled, 8 ; is_sm << 1
2630    pxor                 m2, m2
2631    pshufb               m1, m2
2632    pshufb               m3, m2
2633    movq                 m4, [base+z_filt_t_w16+angleq*4]
2634    pcmpeqb              m1, [base+z_filt_wh16]
2635    pand                 m1, m3
2636    pcmpgtb              m1, m4
2637    pmovmskb            r5d, m1
2638    test                r5d, r5d
2639    jz .h16_main ; filter_strength == 0
2640    pshuflw              m1, [tlq+2], q0000
2641    mova                 m2, [tlq-16*1+2]
2642    imul                r5d, 0x24924924
2643    mova                 m3, [tlq-16*2+2]
2644    neg                  r4
2645    mova                 m4, [tlq-16*3+2]
2646    shr                 r5d, 30
2647    mova                 m5, [tlq-16*4+2]
2648    movd                 m6, [tlq+r4*2]
2649    adc                 r5d, -1 ; filter_strength
2650    movd        [rsp+16*17], m1
2651    mova        [rsp+16*16], m2
2652    mova        [rsp+16*15], m3
2653    pshuflw              m6, m6, q0000
2654    mova        [rsp+16*14], m4
2655    mova        [rsp+16*13], m5
2656    lea                  r2, [r4-2]
2657    movq [rsp+16*17+r4*2-10], m6
2658    cmp                  wd, 32
2659    cmovae               r4, r2
2660    lea                 tlq, [rsp+16*17-2]
2661    call .filter_edge
2662.h16_main:
2663    sub                 tlq, r4
2664    movd                 m5, dyd
2665    sub                 tlq, r4
2666    shl                 r4d, 6
2667    movd                 m6, [tlq]
2668    movd                 m3, r4d
2669    pshufb               m5, m0
2670    neg                 dyq
2671    pshufb               m6, m0
2672    lea                  r5, [dyq+r4+63]
2673    pshufb               m3, m0
2674    shl                  wd, 5
2675    paddw                m4, m5, [base+z_base_inc_z2]
2676    sub                 rsp, wq
2677    psubw                m4, m3
2678.h16_loop:
2679    mov                  r4, r5
2680    sar                  r4, 6
2681    movu                 m0, [tlq+r4*2-14]
2682    movu                 m2, [tlq+r4*2-16]
2683    pand                 m3, m7, m4
2684    psllw                m3, 9
2685    psubw                m2, m0
2686    pmulhrsw             m2, m3
2687    movu                 m1, [tlq+r4*2-30]
2688    paddw                m0, m2
2689    movu                 m2, [tlq+r4*2-32]
2690    psubw                m2, m1
2691    pmulhrsw             m2, m3
2692    movddup              m3, [base+pw_m512]
2693    paddw                m1, m2
2694    psraw                m2, m4, 15
2695    pcmpgtw              m3, m4
2696    paddw                m4, m5
2697    pand                 m0, m2
2698    pandn                m2, m6
2699    pand                 m1, m3
2700    pandn                m3, m6
2701    por                  m0, m2
2702    mova      [rsp+wq-16*1], m0
2703    por                  m1, m3
2704    mova      [rsp+wq-16*2], m1
2705    sub                  wd, 16*2
2706    jz .h16_transpose
2707    add                  r5, dyq
2708    jg .h16_loop
2709.h16_end_loop:
2710    mova      [rsp+wq-16*1], m6
2711    mova      [rsp+wq-16*2], m6
2712    sub                  wd, 16*2
2713    jg .h16_end_loop
2714.h16_transpose:
2715    or                  r3d, 16*2
2716    jmp .end_transpose
2717.h32:
2718    lea                 r4d, [wq+31]
2719    and                 r4d, 31
2720    or                  r4d, 32 ; imin(w+31, 63)
2721    test             angled, 0x400 ; !enable_intra_edge_filter
2722    jnz .h32_main
2723    call .filter_copy
2724    lea                  r5, [r4-2]
2725    cmp                  wd, 64
2726    cmove                r4, r5
2727    call .filter_edge_s3
2728.h32_main:
2729    sub                 tlq, r4
2730    movd                 m5, dyd
2731    sub                 tlq, r4
2732    shl                 r4d, 6
2733    movd                 m6, [tlq]
2734    movd                 m3, r4d
2735    pshufb               m5, m0
2736    neg                 dyq
2737    pshufb               m6, m0
2738    lea                  r5, [dyq+r4+63]
2739    pshufb               m3, m0
2740    paddw                m4, m5, [base+z_base_inc_z2]
2741    psubw                m4, m3
2742.h32_loop:
2743    mov                  r4, r5
2744    sar                  r4, 6
2745    movu                 m0, [tlq+r4*2-14]
2746    movu                 m3, [tlq+r4*2-16]
2747    pand                 m2, m7, m4
2748    psllw                m2, 9
2749    psubw                m3, m0
2750    pmulhrsw             m3, m2
2751    movu                 m1, [tlq+r4*2-30]
2752    paddw                m0, m3
2753    movu                 m3, [tlq+r4*2-32]
2754    psubw                m3, m1
2755    pmulhrsw             m3, m2
2756    sub                 rsp, 16*4
2757    paddw                m1, m3
2758    psraw                m3, m4, 15
2759    pand                 m0, m3
2760    pandn                m3, m6
2761    por                  m0, m3
2762    movddup              m3, [base+pw_m512]
2763    pcmpgtw              m3, m4
2764    pand                 m1, m3
2765    pandn                m3, m6
2766    mova         [rsp+16*3], m0
2767    por                  m1, m3
2768    mova         [rsp+16*2], m1
2769    movu                 m0, [tlq+r4*2-46]
2770    movu                 m3, [tlq+r4*2-48]
2771    psubw                m3, m0
2772    pmulhrsw             m3, m2
2773    movu                 m1, [tlq+r4*2-62]
2774    paddw                m0, m3
2775    movu                 m3, [tlq+r4*2-64]
2776    psubw                m3, m1
2777    pmulhrsw             m3, m2
2778    movddup              m2, [base+pw_m1024]
2779    paddw                m1, m3
2780    movddup              m3, [base+pw_m1536]
2781    pcmpgtw              m2, m4
2782    pcmpgtw              m3, m4
2783    paddw                m4, m5
2784    pand                 m0, m2
2785    pandn                m2, m6
2786    pand                 m1, m3
2787    pandn                m3, m6
2788    por                  m0, m2
2789    mova         [rsp+16*1], m0
2790    por                  m1, m3
2791    mova         [rsp+16*0], m1
2792    dec                  wd
2793    jz .h32_transpose
2794    add                  r5, dyq
2795    jg .h32_loop
2796.h32_end_loop:
2797    sub                 rsp, 16*4
2798    REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0
2799    dec                  wd
2800    jg .h32_end_loop
2801.h32_transpose:
2802    or                  r3d, 32*2
2803    jmp .end_transpose
2804.h64:
2805    lea                 r4d, [wq+63]
2806    test             angled, 0x400 ; !enable_intra_edge_filter
2807    jnz .h64_main
2808    call .filter_copy
2809    call .filter_edge_s3
2810.h64_main:
2811    sub                 tlq, r4
2812    movd                 m5, dyd
2813    sub                 tlq, r4
2814    shl                 r4d, 6
2815    movd                 m6, [tlq]
2816    movd                 m3, r4d
2817    pshufb               m5, m0
2818    neg                 dyq
2819    pshufb               m6, m0
2820    lea                  r5, [dyq+r4+63]
2821    pshufb               m3, m0
2822    paddw                m4, m5, [base+z_base_inc_z2]
2823    psubw                m4, m3
2824.h64_loop:
2825    mov                  r4, r5
2826    sar                  r4, 6
2827    movu                 m0, [tlq+r4*2- 14]
2828    movu                 m3, [tlq+r4*2- 16]
2829    pand                 m2, m7, m4
2830    psllw                m2, 9
2831    psubw                m3, m0
2832    pmulhrsw             m3, m2
2833    movu                 m1, [tlq+r4*2- 30]
2834    paddw                m0, m3
2835    movu                 m3, [tlq+r4*2- 32]
2836    psubw                m3, m1
2837    pmulhrsw             m3, m2
2838    sub                 rsp, 16*8
2839    paddw                m1, m3
2840    psraw                m3, m4, 15
2841    pand                 m0, m3
2842    pandn                m3, m6
2843    por                  m0, m3
2844    movddup              m3, [base+pw_m512]
2845    pcmpgtw              m3, m4
2846    pand                 m1, m3
2847    pandn                m3, m6
2848    mova         [rsp+16*7], m0
2849    por                  m1, m3
2850    mova         [rsp+16*6], m1
2851    movu                 m0, [tlq+r4*2- 46]
2852    movu                 m3, [tlq+r4*2- 48]
2853    psubw                m3, m0
2854    pmulhrsw             m3, m2
2855    movu                 m1, [tlq+r4*2- 62]
2856    paddw                m0, m3
2857    movu                 m3, [tlq+r4*2- 64]
2858    psubw                m3, m1
2859    pmulhrsw             m3, m2
2860    paddw                m1, m3
2861    movddup              m3, [base+pw_m1024]
2862    pcmpgtw              m3, m4
2863    pand                 m0, m3
2864    pandn                m3, m6
2865    por                  m0, m3
2866    movddup              m3, [base+pw_m1536]
2867    pcmpgtw              m3, m4
2868    pand                 m1, m3
2869    pandn                m3, m6
2870    mova         [rsp+16*5], m0
2871    por                  m1, m3
2872    mova         [rsp+16*4], m1
2873    movu                 m0, [tlq+r4*2- 78]
2874    movu                 m3, [tlq+r4*2- 80]
2875    psubw                m3, m0
2876    pmulhrsw             m3, m2
2877    movu                 m1, [tlq+r4*2- 94]
2878    paddw                m0, m3
2879    movu                 m3, [tlq+r4*2- 96]
2880    psubw                m3, m1
2881    pmulhrsw             m3, m2
2882    paddw                m1, m3
2883    movddup              m3, [base+pw_m2048]
2884    pcmpgtw              m3, m4
2885    pand                 m0, m3
2886    pandn                m3, m6
2887    por                  m0, m3
2888    movddup              m3, [base+pw_m2560]
2889    pcmpgtw              m3, m4
2890    pand                 m1, m3
2891    pandn                m3, m6
2892    mova         [rsp+16*3], m0
2893    por                  m1, m3
2894    mova         [rsp+16*2], m1
2895    movu                 m0, [tlq+r4*2-110]
2896    movu                 m3, [tlq+r4*2-112]
2897    psubw                m3, m0
2898    pmulhrsw             m3, m2
2899    movu                 m1, [tlq+r4*2-126]
2900    paddw                m0, m3
2901    movu                 m3, [tlq+r4*2-128]
2902    psubw                m3, m1
2903    pmulhrsw             m3, m2
2904    movddup              m2, [base+pw_m3072]
2905    paddw                m1, m3
2906    movddup              m3, [base+pw_m3584]
2907    pcmpgtw              m2, m4
2908    pcmpgtw              m3, m4
2909    paddw                m4, m5
2910    pand                 m0, m2
2911    pandn                m2, m6
2912    pand                 m1, m3
2913    pandn                m3, m6
2914    por                  m0, m2
2915    mova         [rsp+16*1], m0
2916    por                  m1, m3
2917    mova         [rsp+16*0], m1
2918    dec                  wd
2919    jz .h64_transpose
2920    add                  r5, dyq
2921    jg .h64_loop
2922.h64_end_loop:
2923    sub                 rsp, 16*8
2924    REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0
2925    dec                  wd
2926    jg .h64_end_loop
2927.h64_transpose:
2928    add                 r3d, 64*2
2929.end_transpose:
2930%if ARCH_X86_64
2931    lea                  r7, [strideq*3]
2932%else
2933    mov             strideq, [dstq+4*0]
2934    mov              org_wd, [dstq+4*1]
2935%endif
2936    lea                 r4d, [r3*3]
2937.end_transpose_loop:
2938    lea                  r2, [rsp+r3-8]
2939    lea                  r6, [dstq+org_wq*2-8]
2940.end_transpose_loop_y:
2941    movq                 m0, [r2+r4  ]
2942    movq                 m1, [r2+r3*2]
2943    movq                 m2, [r2+r3*1]
2944    movq                 m3, [r2+r3*0]
2945    sub                  r2, 8
2946    punpcklwd            m0, m1
2947    punpcklwd            m2, m3
2948    punpckhdq            m1, m0, m2
2949    punpckldq            m0, m2
2950    movhps   [r6+strideq*0], m1
2951    movq     [r6+strideq*1], m1
2952%if ARCH_X86_64
2953    movhps   [r6+strideq*2], m0
2954    movq     [r6+r7       ], m0
2955    lea                  r6, [r6+strideq*4]
2956%else
2957    lea                  r6, [r6+strideq*2]
2958    movhps   [r6+strideq*0], m0
2959    movq     [r6+strideq*1], m0
2960    lea                  r6, [r6+strideq*2]
2961%endif
2962    cmp                  r2, rsp
2963    jae .end_transpose_loop_y
2964    lea                 rsp, [rsp+r3*4]
2965    sub              org_wd, 4
2966    jg .end_transpose_loop
2967    RET
2968.filter_copy:
2969    neg                  r4
2970    pshuflw              m2, [tlq+2], q0000
2971    xor                 r5d, r5d
2972    pshuflw              m3, [tlq+r4*2], q0000
2973    movq [rsp+gprsize+16*17], m2
2974.filter_copy_loop:
2975    mova                 m1, [tlq+r5*2-16*1+2]
2976    mova                 m2, [tlq+r5*2-16*2+2]
2977    sub                  r5, 16
2978    mova [rsp+r5*2+gprsize+16*18], m1
2979    mova [rsp+r5*2+gprsize+16*17], m2
2980    cmp                 r5d, r4d
2981    jg .filter_copy_loop
2982    lea                 tlq, [rsp+gprsize+16*17-2]
2983    movq       [tlq+r4*2-8], m3
2984    ret
2985.filter_edge:
2986    cmp                 r5d, 3
2987    je .filter_edge_s3
2988    movddup              m4, [base+z_filt_k+r5*8-8]
2989    movddup              m5, [base+z_filt_k+r5*8+8]
2990    xor                 r5d, r5d
2991    movddup              m6, [base+pw_8]
2992    movu                 m2, [tlq-12]
2993    jmp .filter_edge_start
2994.filter_edge_loop:
2995    movu                 m2, [tlq+r5*2-12]
2996    mova       [tlq+r5*2+2], m1
2997.filter_edge_start:
2998    pmullw               m1, m4, [tlq+r5*2-14]
2999    movu                 m3, [tlq+r5*2-16]
3000    sub                  r5, 8
3001    paddw                m2, m3
3002    pmullw               m2, m5
3003    paddw                m1, m6
3004    paddw                m1, m2
3005    psrlw                m1, 4
3006    cmp                 r5d, r4d
3007    jg .filter_edge_loop
3008    mova       [tlq+r5*2+2], m1
3009    neg                 r4d
3010    ret
3011.filter_edge_s3:
3012    movddup              m5, [base+pw_3]
3013    xor                 r5d, r5d
3014    movu                 m2, [tlq-12]
3015    movu                 m3, [tlq-10]
3016    jmp .filter_edge_s3_start
3017.filter_edge_s3_loop:
3018    movu                 m2, [tlq+r5*2-12]
3019    movu                 m3, [tlq+r5*2-10]
3020    mova       [tlq+r5*2+2], m1
3021.filter_edge_s3_start:
3022    paddw                m2, [tlq+r5*2-14]
3023    paddw                m3, m5
3024    movu                 m1, [tlq+r5*2-16]
3025    movu                 m4, [tlq+r5*2-18]
3026    sub                  r5, 8
3027    paddw                m1, m2
3028    pavgw                m3, m4
3029    paddw                m1, m3
3030    psrlw                m1, 2
3031    cmp                 r5d, r4d
3032    jg .filter_edge_s3_loop
3033    mova       [tlq+r5*2+2], m1
3034    neg                 r4d
3035    ret
3036
;-----------------------------------------------------------------------
; FILTER_PRED intra prediction, 16 bpc (SSSE3).
; Processes the block in 4x2 tiles: first a column down the left edge
; (.left_loop), then the remaining 4-wide columns left to right
; (.right_loop0/.right_loop), each tile feeding its outputs into the
; next tile's inputs.
; In:  dstq, strideq, tlq (top-left edge pointer), wd, hm,
;      filterm (filter index), r8m = bitdepth_max
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
%else
; 32-bit: only 8 XMM registers, so keep the 8 coefficient vectors in
; stack slots and address them through these aliases.
cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
%define  m8 [esp+16*0]
%define  m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%endif
%define base r6-$$
    movifnidn            hd, hm
    movd                 m6, r8m     ; bitdepth_max
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    LEA                  r6, $$
    shl             filterd, 6       ; 4 tap vectors * 16 bytes per filter
    movu                 m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
    mova                 m1, [base+filter_intra_taps+filterq+16*0]
    mova                 m2, [base+filter_intra_taps+filterq+16*1]
    mova                 m3, [base+filter_intra_taps+filterq+16*2]
    mova                 m4, [base+filter_intra_taps+filterq+16*3]
    pxor                 m5, m5
%if ARCH_X86_64
    punpcklbw            m8, m5, m1  ; place 8-bit coefficients in the upper
    punpckhbw            m9, m5, m1  ; half of each 16-bit word to avoid
    punpcklbw           m10, m5, m2  ; having to perform sign-extension.
    punpckhbw           m11, m5, m2
    punpcklbw           m12, m5, m3
    punpckhbw           m13, m5, m3
    punpcklbw           m14, m5, m4
    punpckhbw           m15, m5, m4
%else
    ; same widening as above, staged through m7 into the stack aliases
    punpcklbw            m7, m5, m1
    mova                 m8, m7
    punpckhbw            m7, m5, m1
    mova                 m9, m7
    punpcklbw            m7, m5, m2
    mova                m10, m7
    punpckhbw            m7, m5, m2
    mova                m11, m7
    punpcklbw            m7, m5, m3
    mova                m12, m7
    punpckhbw            m7, m5, m3
    mova                m13, m7
    punpcklbw            m7, m5, m4
    mova                m14, m7
    punpckhbw            m7, m5, m4
    mova                m15, m7
%endif
    mova                 m7, [base+filter_shuf]
    add                  hd, hd      ; h *= 2 (bytes per pixel column)
    mov                  r5, dstq    ; remember first column's dst
    pshuflw              m6, m6, q0000
    mov                  r6, tlq     ; remember original tl pointer
    punpcklqdq           m6, m6      ; m6 = broadcast bitdepth_max
    sub                 tlq, hq
.left_loop:
    pshufb               m0, m7      ; tl t0 t1 t2 t3 l0 l1 __
    pshufd               m1, m0, q0000
    pmaddwd              m2, m8, m1
    pmaddwd              m1, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m4, m0, q2222
    pmaddwd              m3, m12, m4
    pmaddwd              m4, m13
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m3, m0, q3333
    pmaddwd              m0, m14, m3
    pmaddwd              m3, m15
    paddd                m0, m2
    paddd                m1, m3
    ; taps were pre-shifted left by 8 (bytes in high halves), so the
    ; net >>4 rounding is done as >>11 plus a rounding average below.
    psrad                m0, 11     ; x >> 3
    psrad                m1, 11
    packssdw             m0, m1
    pmaxsw               m0, m5     ; clamp negative results to 0 first
    pavgw                m0, m5     ; (x + 8) >> 4
    pminsw               m0, m6     ; clamp to bitdepth_max
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movlps               m0, [tlq+hq-10] ; next two left-edge pixels
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2*2
    jg .left_loop
    sub                  wd, 4
    jz .end
    sub                 tld, r6d     ; -h*2
    sub                  r6, r5      ; tl-dst
.right_loop0:
    add                  r5, 8
    mov                  hd, tld     ; hd counts up from -h*2 to 0
    movu                 m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
    mov                dstq, r5
.right_loop:
    pshufd               m2, m0, q0000
    pmaddwd              m1, m8, m2
    pmaddwd              m2, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    ; left neighbors come from the previously written column to the left
    pinsrw               m0, [dstq+strideq*0-2], 5
    paddd                m1, m3
    paddd                m2, m4
    pshufd               m0, m0, q2222
    movddup              m4, [dstq+strideq*1-8]
    pmaddwd              m3, m12, m0
    pmaddwd              m0, m13
    paddd                m1, m3
    paddd                m0, m2
    pshuflw              m2, m4, q3333
    punpcklwd            m2, m5
    pmaddwd              m3, m14, m2
    pmaddwd              m2, m15
    paddd                m1, m3
    paddd                m0, m2
    psrad                m1, 11
    psrad                m0, 11
    packssdw             m0, m1
    pmaxsw               m0, m5
    pavgw                m0, m5     ; (x + 8) >> 4, as in .left_loop
    pminsw               m0, m6
    movhps [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    palignr              m0, m4, 14 ; build next iteration's input row
    lea                dstq, [dstq+strideq*2]
    add                  hd, 2*2
    jl .right_loop
    sub                  wd, 4
    jg .right_loop0
.end:
    RET
3179
; Scratch register for table pointers (t0) in the cfl functions below:
; r7 on UNIX64 (r5 elsewhere) so it does not alias any argument register.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
3185
;-----------------------------------------------------------------------
; CfL prediction with DC taken from the top edge only, 16 bpc.
; Sets up m4 = w (sample count), m5 = log2(w), points tlq at the top
; row, then reuses the shared .start of ipred_cfl_left below.
;-----------------------------------------------------------------------
cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    movd                 m4, wd
    tzcnt                wd, wd      ; wd = log2(w)
    movifnidn            hd, hm
    add                 tlq, 2       ; skip top-left, point at top row
    movsxd               r6, [t0+wq*4]
    movd                 m5, wd      ; shift amount for the final average
    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
3195
;-----------------------------------------------------------------------
; CfL prediction with DC taken from the left edge only, 16 bpc.
; .h4/.h8/.h16/.h32 horizontally reduce h left-edge samples, then the
; DC = (sum + h/2) >> log2(h) is broadcast and control jumps to the
; width-specific splat/apply code (shared ipred_cfl splat table).
;-----------------------------------------------------------------------
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    tzcnt                wd, wm
    lea                 r6d, [hq*2]
    movd                 m4, hd      ; m4 = h (sample count)
    sub                 tlq, r6     ; tlq -= h*2: start of left edge
    tzcnt               r6d, hd     ; log2(h)
    movd                 m5, r6d    ; shift amount for the final average
    movsxd               r6, [t0+r6*4]
.start:
    movd                 m7, r7m    ; bitdepth_max
    movu                 m0, [tlq]
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
    movsxd               wq, [t0+wq*4]
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    pcmpeqw              m3, m3     ; m3 = all -1 words (for pmaddwd sums)
    add                  wq, t0
    movifnidn           acq, acmp
    pavgw                m4, m6     ; m4 = h/2, the rounding bias
    punpcklqdq           m7, m7     ; broadcast bitdepth_max
    jmp                  r6
.h32:
    movu                 m1, [tlq+48]
    movu                 m2, [tlq+32]
    paddw                m0, m1
    paddw                m0, m2
.h16:
    movu                 m1, [tlq+16]
    paddw                m0, m1
.h8:
    pshufd               m1, m0, q1032
    paddw                m0, m1
.h4:
    pmaddwd              m0, m3     ; m0 = -(pairwise sums), since m3 = -1
    psubd                m4, m0     ; m4 = h/2 + sum
    pshuflw              m0, m4, q1032
    paddd                m0, m4     ; finish the horizontal add
    psrld                m0, m5     ; DC = (sum + h/2) >> log2(h)
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0     ; broadcast DC
    jmp                  wq
3240
; dst = clamp(dc + round(ac * alpha >> 6), 0, bitdepth_max)
; Expects: m0 = broadcast DC, m1 = broadcast alpha,
;          m2 = abs(alpha) << 9 (so pmulhrsw yields a rounded >>6),
;          m6 = 0, m7 = broadcast bitdepth_max.
; The pabsw/psignw pair applies the product's sign without a signed
; multiply: sign(ac*alpha) = sign(ac) ^ sign(alpha).
%macro IPRED_CFL 2 ; dst, src
    pabsw               m%1, m%2
    pmulhrsw            m%1, m2     ; |ac|*|alpha| rounded >> 6
    psignw              m%2, m1     ; fold alpha's sign into ac
    psignw              m%1, m%2    ; restore the product's sign
    paddw               m%1, m0    ; + DC
    pmaxsw              m%1, m6    ; clamp to [0, bitdepth_max]
    pminsw              m%1, m7
%endmacro
3250
;-----------------------------------------------------------------------
; CfL prediction with DC from both the top and left edges, 16 bpc.
; .hN sums the left edge, .wN adds the top edge and finishes the DC,
; .sN applies IPRED_CFL to the AC coefficients row by row.
; DC = (sum + (w+h)/2) / (w+h); when w != h the total is 4/5 * 2^k or
; 3/4 * 2^k samples, so the division is done as a shift followed by a
; pmulhuw with 0xAAAB (~2/3) or 0x6667 (~2/5) and a final >>1.
;-----------------------------------------------------------------------
cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d    ; m4 = w+h
    tzcnt               t0d, t0d
    movd                 m5, t0d    ; m5 = log2(w+h) (exact shift if w == h)
    LEA                  t0, ipred_cfl_16bpc_ssse3_table
    tzcnt                wd, wd
    movd                 m7, r7m    ; bitdepth_max
    movsxd               r6, [t0+r6*4]       ; height handler (.hN)
    movsxd               wq, [t0+wq*4+4*4]   ; width handler (.wN), after the 4 height entries
    psrlw                m4, 1      ; m4 = (w+h)/2, the rounding bias
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    pcmpeqw              m3, m3     ; m3 = all -1 words (for pmaddwd sums)
    punpcklqdq           m7, m7     ; broadcast bitdepth_max
    jmp                  r6
.h4:
    movq                 m0, [tlq-8]
    jmp                  wq
.w4:
    movq                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3     ; m0 = -(pairwise sums)
    psubd                m4, m0     ; bias + sum
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4     ; horizontal add complete
    cmp                  hd, 4
    jg .w4_mul
    psrld                m0, 3      ; w == h == 4: divide by 8
    jmp .w4_end
.w4_mul:
    ; total samples: h=8 -> 12 = 4*3, h=16 -> 20 = 4*5
    mov                 r6d, 0xAAAB ; ~2/3 in 0.16 fixed point
    mov                 r2d, 0x6667 ; ~2/5 in 0.16 fixed point
    cmp                  hd, 16
    cmove               r6d, r2d
    movd                 m1, r6d
    psrld                m0, 2
    pmulhuw              m0, m1
    psrlw                m0, 1      ; complete the /3 or /5
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0     ; broadcast DC
.s4:
    movd                 m1, alpham
    lea                  r6, [strideq*3]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1     ; broadcast alpha
    pabsw                m2, m1
    psllw                m2, 9      ; m2 = |alpha| << 9 for IPRED_CFL
.s4_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    movq   [dstq+strideq*0], m3
    movhps [dstq+strideq*1], m3
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4_loop
    RET
.h8:
    mova                 m0, [tlq-16]
    jmp                  wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5     ; exact when w == h; pre-shift otherwise
    cmp                  hd, 8
    je .w8_end
    ; total samples: h=4 -> 12, h=16 -> 24 (/3); h=32 -> 40 (/5)
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0     ; broadcast DC
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9      ; m2 = |alpha| << 9 for IPRED_CFL
.s8_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova   [dstq+strideq*0], m3
    mova   [dstq+strideq*1], m4
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s8_loop
    RET
.h16:
    mova                 m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    ; h=8 -> 24, h=32 -> 48 (/3); h=4 -> 20, h=64 -> 80 (/5)
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0     ; broadcast DC
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9      ; m2 = |alpha| << 9 for IPRED_CFL
.s16_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    add                dstq, strideq
    dec                  hd
    jg .s16_loop
    RET
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m1, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    ; h=16 -> 48, h=64 -> 96 (/3); h=8 -> 40 (/5)
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 8
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0     ; broadcast DC
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9      ; m2 = |alpha| << 9 for IPRED_CFL
.s32_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    mova                 m4, [acq+16*2]
    mova                 m5, [acq+16*3]
    add                 acq, 16*4
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*2], m3
    mova        [dstq+16*3], m4
    add                dstq, strideq
    dec                  hd
    jg .s32_loop
    RET
3468
;-----------------------------------------------------------------------
; ipred_cfl_128_16bpc(dst, stride, topleft, width, height, ac, alpha, bd_max)
; CFL prediction with a fixed mid-range DC value: instead of averaging
; the border pixels, the DC (m0) is loaded from the pw_512 table and
; control jumps into the shared ipred_cfl_splat width-dispatch table.
; NOTE(review): r7m looks like bitdepth_max (0x3ff/0xfff); r7m>>11 then
; selects the 10- vs 12-bit table entry — confirm against the splat code.
;-----------------------------------------------------------------------
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
    tzcnt                wd, wm          ; wd = log2(width)
    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
    mov                 r6d, r7m         ; r6d = bitdepth_max
    movifnidn            hd, hm
    shr                 r6d, 11          ; 0 when max = 0x3ff, 1 when 0xfff
    movd                 m7, r7m         ; m7 = bitdepth_max (pixel clamp)
    movsxd               wq, [t0+wq*4]   ; width-indexed jump-table offset
    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] ; m0 = broadcast DC value
    pshuflw              m7, m7, q0000
    pxor                 m6, m6          ; m6 = 0
    add                  wq, t0
    movifnidn           acq, acmp
    punpcklqdq           m7, m7          ; broadcast clamp to all 8 words
    jmp                  wq              ; tail-jump into the splat code
3484
;-----------------------------------------------------------------------
; ipred_cfl_ac_420_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Derives CFL AC coefficients from 4:2:0 luma: each int16 output is
; 2*(sum of a 2x2 luma block), i.e. 8x the block average. wpad/hpad
; give the right/bottom padding in units of 4 output pixels/rows;
; padded area replicates the last real column/row. Finally the rounded
; mean of all coefficients is subtracted (.dc) so the buffer is
; zero-mean. The .dc and .hpad entry points are shared with the
; 422/444 variants below (reached via mangle()).
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; synthesize the pw_2 constant without a PIC data load: -1 -> 1 -> 2
    pcmpeqw              m5, m5
    pabsw                m5, m5
    paddw                m5, m5
%else
    movddup              m5, [pw_2]
%endif
    mov                  hd, hm
    shl               hpadd, 2           ; hpad in output rows
    pxor                 m4, m4          ; m4 = running sum of all AC values
    sub                  hd, hpadd       ; hd = real (unpadded) output rows
    cmp            dword wm, 8
    mov                  r5, acq         ; remember buffer start for .dc
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    ; pmaddwd with weight 2 gives 2*(horizontal luma pair sum); adding
    ; two luma rows completes 2*(2x2 block sum) per output pixel
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    pmaddwd              m2, m5, [ypxq+strideq*2]
    pmaddwd              m3, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m0, m1          ; first output row (4 dwords)
    paddd                m2, m3          ; second output row
    paddd                m4, m0
    packssdw             m0, m2          ; pack both rows into 8 words
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    punpckhqdq           m0, m0          ; replicate the last output row
    pslld                m2, 2           ; its sum x4 = 4 padded rows/iteration
.w4_hpad:
    mova         [acq+16*0], m0
    paddd                m4, m2
    mova         [acq+16*1], m0
    add                 acq, 16*2
    sub               hpadd, 4
    jg .w4_hpad
    jmp .dc
.w8:
%if ARCH_X86_32
    cmp         dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m2
    paddd                m1, m3
    paddd                m2, m0, m1      ; m2 = row sum (kept live for hpad)
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    pslld                m2, 2           ; last row sum x4 (4 rows/iteration)
    mova                 m1, m0
    jmp .hpad
.w8_wpad1:
    ; width 4 padded to 8: compute the left half, replicate column 3
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1
    pshufd               m1, m0, q3333   ; broadcast rightmost real value
    paddd                m2, m0, m1
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:                              ; only leftmost 4-pixel chunk is real
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:                              ; leftmost 2 chunks real
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:                              ; leftmost 3 chunks real
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM       7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
    paddd                m0, m6
    ; wpad is 0-3; flags from this cmp survive the SSE ops below
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
    paddd                m3, m6
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
    paddd                m1, m6
    ; parity trick: low byte of wpad-2 is 0xff (even parity, PF=1) iff
    ; wpad == 1, and 0xfe (PF=0) for wpad == 0
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
    paddd                m2, m6
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    paddd                m6, m0, m3
    packssdw             m0, m3
    paddd                m6, m1
    mova         [acq+16*0], m0
    packssdw             m1, m2
    paddd                m2, m6          ; m2 = full row sum (kept for hpad)
    mova         [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd       ; .hpad stores 2 rows/iteration here
    jz .dc
    paddd                m2, m2
.hpad:
    ; replicate the last output row (m0:m1) into the padded area
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    paddd                m4, m2
    mova         [acq+16*2], m0
    mova         [acq+16*3], m1
    add                 acq, 16*4
    sub               hpadd, 4
    jg .hpad
.dc:
    ; subtract the rounded mean so the coefficients become zero-mean
    sub                  r5, acq ; -w*h*2
    pshufd               m2, m4, q1032   ; horizontal-add the 4 dword sums
    tzcnt               r1d, r5d         ; = log2(w*h*2); negation preserves trailing zeros
    paddd                m2, m4
    sub                 r1d, 2           ; shift count -> sum >> log2(w*h/2)
    pshufd               m4, m2, q2301
    movd                 m0, r1d
    paddd                m2, m4          ; total sum in every dword lane
    psrld                m2, m0          ; = 2*mean (floored)
    pxor                 m0, m0
    pavgw                m2, m0          ; (2*mean+1)>>1 = rounded mean
    packssdw             m2, m2
.dc_loop:
    ; walk the buffer back-to-front via the negative offset in r5
    mova                 m0, [acq+r5+16*0]
    mova                 m1, [acq+r5+16*1]
    psubw                m0, m2
    psubw                m1, m2
    mova      [acq+r5+16*0], m0
    mova      [acq+r5+16*1], m1
    add                  r5, 16*2
    jl .dc_loop
    RET
3653
;-----------------------------------------------------------------------
; ipred_cfl_ac_422_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Derives CFL AC coefficients from 4:2:2 luma (horizontal subsampling
; only): each int16 output is 4*(sum of a horizontal luma pair), i.e.
; 8x the pair average, one output row per luma row. Right/bottom
; padding replicates the last real column/row. The final mean
; subtraction and bottom-padding loops are shared with the 420 variant
; (jumps to its .dc/.hpad labels via mangle()).
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; synthesize the pw_4 constant without a PIC data load: -1 -> 1 -> 4
    pcmpeqw              m5, m5
    pabsw                m5, m5
    psllw                m5, 2
%else
    movddup              m5, [pw_4]
%endif
    mov                  hd, hm
    shl               hpadd, 2           ; hpad in output rows
    pxor                 m4, m4          ; m4 = running sum of all AC values
    sub                  hd, hpadd       ; hd = real (unpadded) output rows
    cmp            dword wm, 8
    mov                  r5, acq         ; remember buffer start for .dc
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    ; pmaddwd with weight 4 gives 4*(luma pair sum) = one output row of 4
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m3, m5, [ypxq+strideq*1]
    pmaddwd              m1, m5, [ypxq+strideq*2]
    pmaddwd              m2, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m4, m0
    packssdw             m0, m3          ; rows 0-1 packed to words
    paddd                m3, m1
    packssdw             m1, m2          ; rows 2-3 packed to words
    paddd                m4, m2
    paddd                m4, m3
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1          ; replicate the last output row
    pslld                m2, 3           ; its sum x8 = the 8 padded rows below
    mova         [acq+16*0], m1
    mova         [acq+16*1], m1
    paddd                m4, m2
    mova         [acq+16*2], m1
    mova         [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
    cmp         dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m1, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova         [acq+16*0], m0
    paddd                m2, m1, m3      ; m2 = last row sum (kept for hpad)
    packssdw             m1, m3
    paddd                m4, m2
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2           ; last row sum x4 (4 rows/iteration)
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w8_wpad1:
    ; width 4 padded to 8: compute the left half, replicate column 3
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    pshufd               m2, m0, q3333   ; broadcast rightmost real value
    pshufd               m3, m1, q3333
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    paddd                m2, m1, m3
    packssdw             m1, m3
    paddd                m4, m2
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:                              ; only leftmost 4-pixel chunk is real
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:                              ; leftmost 2 chunks real
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:                              ; leftmost 3 chunks real
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM       7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+16*0]
    ; wpad is 0-3; flags from this cmp survive the SSE ops below
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+16*1]
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+16*2]
    ; parity trick (as in the 420 variant): PF set iff wpad == 1
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+16*3]
.w16_wpad_end:
    add                ypxq, strideq
    paddd                m6, m0, m3
    packssdw             m0, m3
    mova         [acq+16*0], m0
    paddd                m6, m1
    packssdw             m1, m2
    paddd                m2, m6          ; m2 = full row sum (kept for hpad)
    mova         [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd       ; shared .hpad stores 2 rows/iteration
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
3790
;-----------------------------------------------------------------------
; ipred_cfl_ac_444_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Derives CFL AC coefficients from 4:4:4 luma (no subsampling): each
; int16 output is simply the luma pixel << 3, matching the 8x-average
; scaling of the 420/422 variants. pw_1 is used with pmaddwd only to
; fold horizontal pairs into the dword sum accumulator (m4). Padding
; replicates the last real column/row; the mean subtraction and
; bottom-padding loops are shared with the 420 variant via mangle().
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    LEA                  r6, ipred_cfl_ac_444_16bpc_ssse3_table
    tzcnt                wd, wm          ; wd = log2(width)
    movifnidn         hpadd, hpadm
    pxor                 m4, m4          ; m4 = running sum of all AC values
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pw_1]
    add                  wq, r6
    mov                  hd, hm
    shl               hpadd, 2           ; hpad in output rows
    sub                  hd, hpadd       ; hd = real (unpadded) output rows
    jmp                  wq
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq         ; remember buffer start for .dc
.w4_loop:
    movq                 m0, [ypxq+strideq*0]
    movhps               m0, [ypxq+strideq*1]
    movq                 m1, [ypxq+strideq*2]
    movhps               m1, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    psllw                m0, 3           ; AC = luma << 3
    psllw                m1, 3
    mova         [acq+16*0], m0
    pmaddwd              m0, m5          ; pair sums for the accumulator
    mova         [acq+16*1], m1
    pmaddwd              m2, m5, m1
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1          ; replicate the last output row
    mova         [acq+16*0], m1
    pslld                m2, 2           ; last row's pair sums x4...
    mova         [acq+16*1], m1
    punpckhqdq           m2, m2          ; ...duplicated = 8 padded rows' worth
    mova         [acq+16*2], m1
    paddd                m4, m2
    mova         [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
    mov                  r5, acq         ; remember buffer start for .dc
.w8_loop:
    mova                 m0, [ypxq+strideq*0]
    mova                 m1, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    psllw                m0, 3           ; AC = luma << 3
    psllw                m1, 3
    mova         [acq+16*0], m0
    pmaddwd              m0, m5
    mova         [acq+16*1], m1
    pmaddwd              m2, m5, m1      ; m2 = last row sum (kept for hpad)
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2           ; last row sum x4 (4 rows/iteration)
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w16_wpad2:
    ; right half padded: broadcast the last real pixel (word 7) of each row
    pshufhw              m3, m2, q3333
    pshufhw              m1, m0, q3333
    punpckhqdq           m3, m3
    punpckhqdq           m1, m1
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    mov                  r5, acq         ; remember buffer start for .dc
.w16_loop:
    mova                 m2, [ypxq+strideq*0+16*0]
    mova                 m0, [ypxq+strideq*1+16*0]
    psllw                m2, 3           ; AC = luma << 3
    psllw                m0, 3
    test              wpadd, wpadd
    jnz .w16_wpad2
    mova                 m3, [ypxq+strideq*0+16*1]
    mova                 m1, [ypxq+strideq*1+16*1]
    psllw                m3, 3
    psllw                m1, 3
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    mova         [acq+16*0], m2
    pmaddwd              m2, m5          ; pair sums for the accumulator
    mova         [acq+16*1], m3
    pmaddwd              m3, m5
    paddd                m4, m2
    pmaddwd              m2, m5, m0
    mova         [acq+16*2], m0
    paddd                m4, m3
    pmaddwd              m3, m5, m1
    mova         [acq+16*3], m1
    add                 acq, 16*4
    paddd                m2, m3          ; m2 = last row sum (kept for hpad)
    paddd                m4, m2
    sub                  hd, 2
    jg .w16_loop
    add               hpadd, hpadd       ; shared .hpad stores 2 rows/iteration
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w32_wpad6:                              ; only leftmost 8-pixel chunk is real
    pshufhw              m1, m0, q3333
    punpckhqdq           m1, m1
    mova                 m2, m1
    mova                 m3, m1
    jmp .w32_wpad_end
.w32_wpad4:                              ; leftmost 2 chunks real
    pshufhw              m2, m1, q3333
    punpckhqdq           m2, m2
    mova                 m3, m2
    jmp .w32_wpad_end
.w32_wpad2:                              ; leftmost 3 chunks real
    pshufhw              m3, m2, q3333
    punpckhqdq           m3, m3
    jmp .w32_wpad_end
.w32:
    movifnidn         wpadd, wpadm
    mov                  r5, acq         ; remember buffer start for .dc
    WIN64_SPILL_XMM       8
.w32_loop:
    mova                 m0, [ypxq+16*0]
    psllw                m0, 3           ; AC = luma << 3
    ; wpad dispatch; flags from this cmp survive the SSE ops below
    cmp               wpadd, 4
    jg .w32_wpad6
    mova                 m1, [ypxq+16*1]
    psllw                m1, 1*3
    je .w32_wpad4
    mova                 m2, [ypxq+16*2]
    psllw                m2, 3
    ; parity trick: PF clear iff wpad == 2 (low byte of wpad-4 = 0xfe);
    ; presumably only even wpad values occur here — confirm at call sites
    jnp .w32_wpad2
    mova                 m3, [ypxq+16*3]
    psllw                m3, 3
.w32_wpad_end:
    add                ypxq, strideq
    pmaddwd              m6, m5, m0      ; accumulate the row sum in m6
    mova         [acq+16*0], m0
    pmaddwd              m7, m5, m1
    mova         [acq+16*1], m1
    paddd                m6, m7
    pmaddwd              m7, m5, m2
    mova         [acq+16*2], m2
    paddd                m6, m7
    pmaddwd              m7, m5, m3
    mova         [acq+16*3], m3
    add                 acq, 16*4
    paddd                m6, m7
    paddd                m4, m6
    dec                  hd
    jg .w32_loop
%if WIN64
    ; keep the last row sum (m6) alive across the xmm6 restore by
    ; copying it to a volatile reg and renaming at the macro level
    mova                 m5, m6
    WIN64_RESTORE_XMM
    SWAP                  5, 6
%endif
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w32_hpad_loop:
    ; replicate the last output row (m0-m3), one row per iteration
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    paddd                m4, m6
    mova         [acq+16*2], m2
    mova         [acq+16*3], m3
    add                 acq, 16*4
    dec               hpadd
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3966
;-----------------------------------------------------------------------
; pal_pred_16bpc(dst, stride, pal, idx, w, h)
; Palette prediction for 16-bit pixels. idx holds packed 4-bit palette
; indices (two per byte, low nibble = first pixel); pal holds 8 16-bit
; palette entries. pal_pred_shuf splits the entries into a low-byte
; table (m4: lo bytes | hi bytes) and m5 = the qword-swapped copy, so
; two pshufb lookups fetch the low and high byte of every output pixel
; and punpcklbw/punpckhbw recombine them into little-endian words.
; pshufb only uses the low 4 index bits, so the garbage upper nibbles
; left by the unpack below are harmless.
;-----------------------------------------------------------------------
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
    %define              hd  r2d         ; r2 (table base) is reused as height
%endif
    mova                 m4, [palq]
    LEA                  r2, pal_pred_16bpc_ssse3_table
    tzcnt                wd, wm          ; wd = log2(width)
    pshufb               m4, [base+pal_pred_shuf] ; m4 = [lo bytes | hi bytes]
    movsxd               wq, [r2+wq*4]
    pshufd               m5, m4, q1032   ; m5 = [hi bytes | lo bytes]
    add                  wq, r2
    movifnidn            hd, hm
    jmp                  wq
.w4:
    movq                 m0, [idxq]      ; 16 indices = 4 rows of 4
    add                idxq, 8
    psrlw                m1, m0, 4       ; bring the high nibbles down
    punpcklbw            m0, m1          ; -> one index per byte (low 4 bits)
    pshufb               m1, m4, m0      ; low byte of each pixel
    pshufb               m2, m5, m0      ; high byte of each pixel
    punpcklbw            m0, m1, m2      ; recombine into 16-bit pixels
    punpckhbw            m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    movu                 m3, [idxq]      ; 32 indices = 4 rows of 8
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1      ; rows 0-1 indices, one per byte
    punpckhbw            m3, m1          ; rows 2-3 indices
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                 m3, [idxq]      ; 32 indices = 2 rows of 16
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova          [dstq+ 0], m0
    mova          [dstq+16], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova  [dstq+strideq+ 0], m0
    mova  [dstq+strideq+16], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    movu                 m3, [idxq]      ; 32 indices = 1 row of 32
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64:
    movu                 m3, [idxq+16*0] ; 64 indices = 1 row of 64
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    movu                 m3, [idxq+16*1] ; start the second half's load early
    add                idxq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET
4104