xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 64
32
33%macro SMOOTH_WEIGHT_TABLE 1-*
34    %rep %0
35        db %1-128, 127-%1
36        %rotate 1
37    %endrep
38%endmacro
39
40; sm_weights[], but modified to precalculate x and 256-x with offsets to
41; enable efficient use of pmaddubsw (which requires signed values)
42smooth_weights: SMOOTH_WEIGHT_TABLE         \
43      0,   0, 255, 128, 255, 149,  85,  64, \
44    255, 197, 146, 105,  73,  50,  37,  32, \
45    255, 225, 196, 170, 145, 123, 102,  84, \
46     68,  54,  43,  33,  26,  20,  17,  16, \
47    255, 240, 225, 210, 196, 182, 169, 157, \
48    145, 133, 122, 111, 101,  92,  83,  74, \
49     66,  59,  52,  45,  39,  34,  29,  25, \
50     21,  17,  14,  12,  10,   9,   8,   8, \
51    255, 248, 240, 233, 225, 218, 210, 203, \
52    196, 189, 182, 176, 169, 163, 156, 150, \
53    144, 138, 133, 127, 121, 116, 111, 106, \
54    101,  96,  91,  86,  82,  77,  73,  69, \
55     65,  61,  57,  54,  50,  47,  44,  41, \
56     38,  35,  32,  29,  27,  25,  22,  20, \
57     18,  16,  15,  13,  12,  10,   9,   8, \
58      7,   6,   6,   5,   5,   4,   4,   4
59
60pb_1to32:     db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
61              db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
62pb_32to1:     db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
63pb_16to1:     db 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
64z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
65              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
66z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
67              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
68              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
69const \
70z_filter_s,   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
71              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
72              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
73pb_128:       times 4 db 128 ; those are just placed here for alignment.
74pb_36_m4:     times 2 db 36, -4
75z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
76z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
77z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
78z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
79z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
80z2_upsample:  db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
81z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
82z2_shuf_h2:   db  3,  2,  7,  6, 11, 10, 15, 14,  2,  1,  6,  5, 10,  9, 14, 13
83z2_shuf_h4:   db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
84z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
85z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
86z_base_inc:   dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
87              dw  16*64,  17*64,  18*64,  19*64,  20*64,  21*64,  22*64,  23*64
88z2_base_inc:  dw   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64,   8*64
89              dw   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64,  16*64
90z2_ymul:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
91z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
92              db 32, 32, 32, 32, 12, 12, 12, 12,  1,  0,  1,  0,  5, -1, -1, -1 ; 0, 4, 1, 5
93; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
94filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
95              db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
96filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
97filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
98pb_127_m127:  times 2 db 127, -127
99ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
100              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
101ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
102              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
103pw_64:        times 2 dw 64
104
105cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
106                             times 9 db 7, -1
107cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
108                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
109                        ; w=8, w_pad=1 as well as second half of previous one
110cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
111                        times 5 db 6, 7
112                        ; w=16,w_pad=2
113                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
114                        times 8 db 14, 15
115                        ; w=16,w_pad=3
116                        db 0, 1, 2, 3, 4, 5
117                        times 13 db 6, 7
118pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
119
120%define pb_0to15 cfl_ac_w16_pad_shuffle
121%define pb_1  (ipred_h_shuf+12)
122%define pb_2  (ipred_h_shuf+20)
123%define pb_3  (ipred_h_shuf+ 4)
124%define pb_4  (ipred_h_shuf+24)
125%define pb_5  (ipred_h_shuf+ 8)
126%define pb_7  (ipred_h_shuf+ 0)
127%define pb_8  (z_upsample2 +12)
128%define pb_12 (z2_y_shuf_h4+20)
129%define pb_14 (z2_y_shuf_h4+ 4)
130%define pb_15 (z_filter_s  +32)
131%define pb_27 (z2_y_shuf_h4+ 8)
132%define pb_31 (z2_y_shuf_h4+12)
133%define pb_32 (z2_y_shuf_h4+16)
134%define pb_90 (z2_y_shuf_h4+ 0)
135%define pw_1  (z2_y_shuf_h4+24)
136%define pw_8  (z_filter_k  +32)
137
138pw_62:    times 2 dw 62
139pw_128:   times 2 dw 128
140pw_255:   times 2 dw 255
141pw_512:   times 2 dw 512
142
143%macro JMP_TABLE 3-*
144    %xdefine %1_%2_table (%%table - 2*4)
145    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
146    %%table:
147    %rep %0 - 2
148        dd %%base %+ .%3 - (%%table - 2*4)
149        %rotate 1
150    %endrep
151%endmacro
152
153%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
154%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
155
156JMP_TABLE ipred_smooth,     avx2, w4, w8, w16, w32, w64
157JMP_TABLE ipred_smooth_v,   avx2, w4, w8, w16, w32, w64
158JMP_TABLE ipred_smooth_h,   avx2, w4, w8, w16, w32, w64
159JMP_TABLE ipred_paeth,      avx2, w4, w8, w16, w32, w64
160JMP_TABLE ipred_filter,     avx2, w4, w8, w16, w32
161JMP_TABLE ipred_dc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
162                                  s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
163JMP_TABLE ipred_dc_left,    avx2, h4, h8, h16, h32, h64
164JMP_TABLE ipred_h,          avx2, w4, w8, w16, w32, w64
165JMP_TABLE ipred_z1,         avx2, w4, w8, w16, w32, w64
166JMP_TABLE ipred_z2,         avx2, w4, w8, w16, w32, w64
167JMP_TABLE ipred_z3,         avx2, h4, h8, h16, h32, h64
168JMP_TABLE ipred_cfl,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
169                                  s4-8*4, s8-8*4, s16-8*4, s32-8*4
170JMP_TABLE ipred_cfl_left,   avx2, h4, h8, h16, h32
171JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
172JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
173JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
174JMP_TABLE pal_pred,         avx2, w4, w8, w16, w32, w64
175
176cextern dr_intra_derivative
177cextern filter_intra_taps
178
179SECTION .text
180
181INIT_YMM avx2
182cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
183    lea                  r5, [ipred_dc_left_avx2_table]
184    tzcnt                wd, wm
185    inc                 tlq
186    movu                 m0, [tlq]
187    movifnidn            hd, hm
188    mov                 r6d, 0x8000
189    shrx                r6d, r6d, wd
190    movd                xm3, r6d
191    movsxd               r6, [r5+wq*4]
192    pcmpeqd              m2, m2
193    pmaddubsw            m0, m2
194    add                  r6, r5
195    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
196    movsxd               wq, [r5+wq*4]
197    add                  wq, r5
198    jmp                  r6
199
200cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
201    mov                  hd, hm ; zero upper half
202    tzcnt               r6d, hd
203    sub                 tlq, hq
204    tzcnt                wd, wm
205    movu                 m0, [tlq]
206    mov                 r5d, 0x8000
207    shrx                r5d, r5d, r6d
208    movd                xm3, r5d
209    lea                  r5, [ipred_dc_left_avx2_table]
210    movsxd               r6, [r5+r6*4]
211    pcmpeqd              m2, m2
212    pmaddubsw            m0, m2
213    add                  r6, r5
214    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
215    movsxd               wq, [r5+wq*4]
216    add                  wq, r5
217    jmp                  r6
218.h64:
219    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
220    pmaddubsw            m1, m2
221    paddw                m0, m1
222.h32:
223    vextracti128        xm1, m0, 1
224    paddw               xm0, xm1
225.h16:
226    punpckhqdq          xm1, xm0, xm0
227    paddw               xm0, xm1
228.h8:
229    psrlq               xm1, xm0, 32
230    paddw               xm0, xm1
231.h4:
232    pmaddwd             xm0, xm2
233    pmulhrsw            xm0, xm3
234    lea            stride3q, [strideq*3]
235    vpbroadcastb         m0, xm0
236    mova                 m1, m0
237    jmp                  wq
238
239cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
240    movifnidn            hd, hm
241    movifnidn            wd, wm
242    tzcnt               r6d, hd
243    lea                 r5d, [wq+hq]
244    movd                xm4, r5d
245    tzcnt               r5d, r5d
246    movd                xm5, r5d
247    lea                  r5, [ipred_dc_avx2_table]
248    tzcnt                wd, wd
249    movsxd               r6, [r5+r6*4]
250    movsxd               wq, [r5+wq*4+5*4]
251    pcmpeqd              m3, m3
252    psrlw               xm4, 1
253    add                  r6, r5
254    add                  wq, r5
255    lea            stride3q, [strideq*3]
256    jmp                  r6
257.h4:
258    movd                xm0, [tlq-4]
259    pmaddubsw           xm0, xm3
260    jmp                  wq
261.w4:
262    movd                xm1, [tlq+1]
263    pmaddubsw           xm1, xm3
264    psubw               xm0, xm4
265    paddw               xm0, xm1
266    pmaddwd             xm0, xm3
267    cmp                  hd, 4
268    jg .w4_mul
269    psrlw               xm0, 3
270    jmp .w4_end
271.w4_mul:
272    punpckhqdq          xm1, xm0, xm0
273    lea                 r2d, [hq*2]
274    mov                 r6d, 0x55563334
275    paddw               xm0, xm1
276    shrx                r6d, r6d, r2d
277    psrlq               xm1, xm0, 32
278    paddw               xm0, xm1
279    movd                xm1, r6d
280    psrlw               xm0, 2
281    pmulhuw             xm0, xm1
282.w4_end:
283    vpbroadcastb        xm0, xm0
284.s4:
285    movd   [dstq+strideq*0], xm0
286    movd   [dstq+strideq*1], xm0
287    movd   [dstq+strideq*2], xm0
288    movd   [dstq+stride3q ], xm0
289    lea                dstq, [dstq+strideq*4]
290    sub                  hd, 4
291    jg .s4
292    RET
293ALIGN function_align
294.h8:
295    movq                xm0, [tlq-8]
296    pmaddubsw           xm0, xm3
297    jmp                  wq
298.w8:
299    movq                xm1, [tlq+1]
300    vextracti128        xm2, m0, 1
301    pmaddubsw           xm1, xm3
302    psubw               xm0, xm4
303    paddw               xm0, xm2
304    punpckhqdq          xm2, xm0, xm0
305    paddw               xm0, xm2
306    paddw               xm0, xm1
307    psrlq               xm1, xm0, 32
308    paddw               xm0, xm1
309    pmaddwd             xm0, xm3
310    psrlw               xm0, xm5
311    cmp                  hd, 8
312    je .w8_end
313    mov                 r6d, 0x5556
314    mov                 r2d, 0x3334
315    cmp                  hd, 32
316    cmove               r6d, r2d
317    movd                xm1, r6d
318    pmulhuw             xm0, xm1
319.w8_end:
320    vpbroadcastb        xm0, xm0
321.s8:
322    movq   [dstq+strideq*0], xm0
323    movq   [dstq+strideq*1], xm0
324    movq   [dstq+strideq*2], xm0
325    movq   [dstq+stride3q ], xm0
326    lea                dstq, [dstq+strideq*4]
327    sub                  hd, 4
328    jg .s8
329    RET
330ALIGN function_align
331.h16:
332    mova                xm0, [tlq-16]
333    pmaddubsw           xm0, xm3
334    jmp                  wq
335.w16:
336    movu                xm1, [tlq+1]
337    vextracti128        xm2, m0, 1
338    pmaddubsw           xm1, xm3
339    psubw               xm0, xm4
340    paddw               xm0, xm2
341    paddw               xm0, xm1
342    punpckhqdq          xm1, xm0, xm0
343    paddw               xm0, xm1
344    psrlq               xm1, xm0, 32
345    paddw               xm0, xm1
346    pmaddwd             xm0, xm3
347    psrlw               xm0, xm5
348    cmp                  hd, 16
349    je .w16_end
350    mov                 r6d, 0x5556
351    mov                 r2d, 0x3334
352    test                 hb, 8|32
353    cmovz               r6d, r2d
354    movd                xm1, r6d
355    pmulhuw             xm0, xm1
356.w16_end:
357    vpbroadcastb        xm0, xm0
358.s16:
359    mova   [dstq+strideq*0], xm0
360    mova   [dstq+strideq*1], xm0
361    mova   [dstq+strideq*2], xm0
362    mova   [dstq+stride3q ], xm0
363    lea                dstq, [dstq+strideq*4]
364    sub                  hd, 4
365    jg .s16
366    RET
367ALIGN function_align
368.h32:
369    mova                 m0, [tlq-32]
370    pmaddubsw            m0, m3
371    jmp                  wq
372.w32:
373    movu                 m1, [tlq+1]
374    pmaddubsw            m1, m3
375    paddw                m0, m1
376    vextracti128        xm1, m0, 1
377    psubw               xm0, xm4
378    paddw               xm0, xm1
379    punpckhqdq          xm1, xm0, xm0
380    paddw               xm0, xm1
381    psrlq               xm1, xm0, 32
382    paddw               xm0, xm1
383    pmaddwd             xm0, xm3
384    psrlw               xm0, xm5
385    cmp                  hd, 32
386    je .w32_end
387    lea                 r2d, [hq*2]
388    mov                 r6d, 0x33345556
389    shrx                r6d, r6d, r2d
390    movd                xm1, r6d
391    pmulhuw             xm0, xm1
392.w32_end:
393    vpbroadcastb         m0, xm0
394.s32:
395    mova   [dstq+strideq*0], m0
396    mova   [dstq+strideq*1], m0
397    mova   [dstq+strideq*2], m0
398    mova   [dstq+stride3q ], m0
399    lea                dstq, [dstq+strideq*4]
400    sub                  hd, 4
401    jg .s32
402    RET
403ALIGN function_align
404.h64:
405    mova                 m0, [tlq-64]
406    mova                 m1, [tlq-32]
407    pmaddubsw            m0, m3
408    pmaddubsw            m1, m3
409    paddw                m0, m1
410    jmp                  wq
411.w64:
412    movu                 m1, [tlq+ 1]
413    movu                 m2, [tlq+33]
414    pmaddubsw            m1, m3
415    pmaddubsw            m2, m3
416    paddw                m0, m1
417    paddw                m0, m2
418    vextracti128        xm1, m0, 1
419    psubw               xm0, xm4
420    paddw               xm0, xm1
421    punpckhqdq          xm1, xm0, xm0
422    paddw               xm0, xm1
423    psrlq               xm1, xm0, 32
424    paddw               xm0, xm1
425    pmaddwd             xm0, xm3
426    psrlw               xm0, xm5
427    cmp                  hd, 64
428    je .w64_end
429    mov                 r6d, 0x33345556
430    shrx                r6d, r6d, hd
431    movd                xm1, r6d
432    pmulhuw             xm0, xm1
433.w64_end:
434    vpbroadcastb         m0, xm0
435    mova                 m1, m0
436.s64:
437    mova [dstq+strideq*0+32*0], m0
438    mova [dstq+strideq*0+32*1], m1
439    mova [dstq+strideq*1+32*0], m0
440    mova [dstq+strideq*1+32*1], m1
441    mova [dstq+strideq*2+32*0], m0
442    mova [dstq+strideq*2+32*1], m1
443    mova [dstq+stride3q +32*0], m0
444    mova [dstq+stride3q +32*1], m1
445    lea                dstq, [dstq+strideq*4]
446    sub                  hd, 4
447    jg .s64
448    RET
449
450cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
451    lea                  r5, [ipred_dc_splat_avx2_table]
452    tzcnt                wd, wm
453    movifnidn            hd, hm
454    movsxd               wq, [r5+wq*4]
455    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128]
456    mova                 m1, m0
457    add                  wq, r5
458    lea            stride3q, [strideq*3]
459    jmp                  wq
460
461cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
462    lea                  r5, [ipred_dc_splat_avx2_table]
463    tzcnt                wd, wm
464    movu                 m0, [tlq+ 1]
465    movu                 m1, [tlq+33]
466    movifnidn            hd, hm
467    movsxd               wq, [r5+wq*4]
468    add                  wq, r5
469    lea            stride3q, [strideq*3]
470    jmp                  wq
471
472%macro IPRED_H 2 ; w, store_type
473    vpbroadcastb         m0, [tlq-1]
474    vpbroadcastb         m1, [tlq-2]
475    vpbroadcastb         m2, [tlq-3]
476    sub                 tlq, 4
477    vpbroadcastb         m3, [tlq+0]
478    mov%2  [dstq+strideq*0], m0
479    mov%2  [dstq+strideq*1], m1
480    mov%2  [dstq+strideq*2], m2
481    mov%2  [dstq+stride3q ], m3
482    lea                dstq, [dstq+strideq*4]
483    sub                  hd, 4
484    jg .w%1
485    RET
486ALIGN function_align
487%endmacro
488
489INIT_XMM avx2
490cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
491    lea                  r5, [ipred_h_avx2_table]
492    tzcnt                wd, wm
493    movifnidn            hd, hm
494    movsxd               wq, [r5+wq*4]
495    add                  wq, r5
496    lea            stride3q, [strideq*3]
497    jmp                  wq
498.w4:
499    IPRED_H               4, d
500.w8:
501    IPRED_H               8, q
502.w16:
503    IPRED_H              16, a
504INIT_YMM avx2
505.w32:
506    IPRED_H              32, a
507.w64:
508    vpbroadcastb         m0, [tlq-1]
509    vpbroadcastb         m1, [tlq-2]
510    vpbroadcastb         m2, [tlq-3]
511    sub                 tlq, 4
512    vpbroadcastb         m3, [tlq+0]
513    mova [dstq+strideq*0+32*0], m0
514    mova [dstq+strideq*0+32*1], m0
515    mova [dstq+strideq*1+32*0], m1
516    mova [dstq+strideq*1+32*1], m1
517    mova [dstq+strideq*2+32*0], m2
518    mova [dstq+strideq*2+32*1], m2
519    mova [dstq+stride3q +32*0], m3
520    mova [dstq+stride3q +32*1], m3
521    lea                dstq, [dstq+strideq*4]
522    sub                  hd, 4
523    jg .w64
524    RET
525
526%macro PAETH 2 ; top, ldiff
527    pavgb                m1, m%1, m3 ; Calculating tldiff normally requires
528    pxor                 m0, m%1, m3 ; 10-bit intermediates, but we can do it
529    pand                 m0, m4      ; in 8-bit with some tricks which avoids
530    psubusb              m2, m5, m1  ; having to unpack everything to 16-bit.
531    psubb                m1, m0
532    psubusb              m1, m5
533    por                  m1, m2
534    paddusb              m1, m1
535    por                  m1, m0      ; min(tldiff, 255)
536    psubusb              m2, m5, m3
537    psubusb              m0, m3, m5
538    por                  m2, m0      ; tdiff
539    pminub               m2, m%2
540    pcmpeqb              m0, m%2, m2 ; ldiff <= tdiff
541    vpblendvb            m0, m%1, m3, m0
542    pminub               m1, m2
543    pcmpeqb              m1, m2      ; ldiff <= tldiff || tdiff <= tldiff
544    vpblendvb            m0, m5, m0, m1
545%endmacro
546
547cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
548%define base r5-ipred_paeth_avx2_table
549    lea                  r5, [ipred_paeth_avx2_table]
550    tzcnt                wd, wm
551    vpbroadcastb         m5, [tlq]   ; topleft
552    movifnidn            hd, hm
553    movsxd               wq, [r5+wq*4]
554    vpbroadcastd         m4, [base+pb_1]
555    add                  wq, r5
556    jmp                  wq
557.w4:
558    vpbroadcastd         m6, [tlq+1] ; top
559    mova                 m8, [base+ipred_h_shuf]
560    lea                  r3, [strideq*3]
561    psubusb              m7, m5, m6
562    psubusb              m0, m6, m5
563    por                  m7, m0      ; ldiff
564.w4_loop:
565    sub                 tlq, 8
566    vpbroadcastq         m3, [tlq]
567    pshufb               m3, m8      ; left
568    PAETH                 6, 7
569    vextracti128        xm1, m0, 1
570    movd   [dstq+strideq*0], xm0
571    movd   [dstq+strideq*1], xm1
572    pextrd [dstq+strideq*2], xm0, 2
573    pextrd [dstq+r3       ], xm1, 2
574    cmp                  hd, 4
575    je .ret
576    lea                dstq, [dstq+strideq*4]
577    pextrd [dstq+strideq*0], xm0, 1
578    pextrd [dstq+strideq*1], xm1, 1
579    pextrd [dstq+strideq*2], xm0, 3
580    pextrd [dstq+r3       ], xm1, 3
581    lea                dstq, [dstq+strideq*4]
582    sub                  hd, 8
583    jg .w4_loop
584.ret:
585    RET
586ALIGN function_align
587.w8:
588    vpbroadcastq         m6, [tlq+1]
589    mova                 m8, [base+ipred_h_shuf]
590    lea                  r3, [strideq*3]
591    psubusb              m7, m5, m6
592    psubusb              m0, m6, m5
593    por                  m7, m0
594.w8_loop:
595    sub                 tlq, 4
596    vpbroadcastd         m3, [tlq]
597    pshufb               m3, m8
598    PAETH                 6, 7
599    vextracti128        xm1, m0, 1
600    movq   [dstq+strideq*0], xm0
601    movq   [dstq+strideq*1], xm1
602    movhps [dstq+strideq*2], xm0
603    movhps [dstq+r3       ], xm1
604    lea                dstq, [dstq+strideq*4]
605    sub                  hd, 4
606    jg .w8_loop
607    RET
608ALIGN function_align
609.w16:
610    vbroadcasti128       m6, [tlq+1]
611    mova                xm8, xm4 ; lower half = 1, upper half = 0
612    psubusb              m7, m5, m6
613    psubusb              m0, m6, m5
614    por                  m7, m0
615.w16_loop:
616    sub                 tlq, 2
617    vpbroadcastd         m3, [tlq]
618    pshufb               m3, m8
619    PAETH                 6, 7
620    mova         [dstq+strideq*0], xm0
621    vextracti128 [dstq+strideq*1], m0, 1
622    lea                dstq, [dstq+strideq*2]
623    sub                  hd, 2
624    jg .w16_loop
625    RET
626ALIGN function_align
627.w32:
628    movu                 m6, [tlq+1]
629    psubusb              m7, m5, m6
630    psubusb              m0, m6, m5
631    por                  m7, m0
632.w32_loop:
633    dec                 tlq
634    vpbroadcastb         m3, [tlq]
635    PAETH                 6, 7
636    mova             [dstq], m0
637    add                dstq, strideq
638    dec                  hd
639    jg .w32_loop
640    RET
641ALIGN function_align
642.w64:
643    movu                 m6, [tlq+ 1]
644    movu                 m7, [tlq+33]
645%if WIN64
646    movaps              r4m, xmm9
647%endif
648    psubusb              m8, m5, m6
649    psubusb              m0, m6, m5
650    psubusb              m9, m5, m7
651    psubusb              m1, m7, m5
652    por                  m8, m0
653    por                  m9, m1
654.w64_loop:
655    dec                 tlq
656    vpbroadcastb         m3, [tlq]
657    PAETH                 6, 8
658    mova        [dstq+32*0], m0
659    PAETH                 7, 9
660    mova        [dstq+32*1], m0
661    add                dstq, strideq
662    dec                  hd
663    jg .w64_loop
664%if WIN64
665    movaps             xmm9, r4m
666%endif
667    RET
668
669%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
670    ; w * a         = (w - 128) * a + 128 * a
671    ; (256 - w) * b = (127 - w) * b + 129 * b
672    pmaddubsw            m0, m%3, m%1
673    pmaddubsw            m1, m%4, m%2
674    paddw                m0, m%5
675    paddw                m1, m%6
676    psrlw                m0, 8
677    psrlw                m1, 8
678    packuswb             m0, m1
679%endmacro
680
681cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
682%define base r6-ipred_smooth_v_avx2_table
683    lea                  r6, [ipred_smooth_v_avx2_table]
684    tzcnt                wd, wm
685    mov                  hd, hm
686    movsxd               wq, [r6+wq*4]
687    vpbroadcastd         m0, [base+pb_127_m127]
688    vpbroadcastd         m1, [base+pw_128]
689    lea            weightsq, [base+smooth_weights+hq*4]
690    neg                  hq
691    vpbroadcastb         m5, [tlq+hq] ; bottom
692    add                  wq, r6
693    jmp                  wq
694.w4:
695    vpbroadcastd         m2, [tlq+1]
696    punpcklbw            m2, m5 ; top, bottom
697    mova                 m5, [base+ipred_v_shuf]
698    lea                  r3, [strideq*3]
699    punpckldq            m4, m5, m5
700    punpckhdq            m5, m5
701    pmaddubsw            m3, m2, m0
702    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
703    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
704.w4_loop:
705    vbroadcasti128       m1, [weightsq+hq*2]
706    pshufb               m0, m1, m4
707    pshufb               m1, m5
708    SMOOTH                0, 1, 2, 2, 3, 3
709    vextracti128        xm1, m0, 1
710    movd   [dstq+strideq*0], xm0
711    movd   [dstq+strideq*1], xm1
712    pextrd [dstq+strideq*2], xm0, 1
713    pextrd [dstq+r3       ], xm1, 1
714    cmp                  hd, -4
715    je .ret
716    lea                dstq, [dstq+strideq*4]
717    pextrd [dstq+strideq*0], xm0, 2
718    pextrd [dstq+strideq*1], xm1, 2
719    pextrd [dstq+strideq*2], xm0, 3
720    pextrd [dstq+r3       ], xm1, 3
721    lea                dstq, [dstq+strideq*4]
722    add                  hq, 8
723    jl .w4_loop
724.ret:
725    RET
726ALIGN function_align
727.w8:
728    vpbroadcastq         m2, [tlq+1]
729    punpcklbw            m2, m5
730    mova                 m5, [base+ipred_v_shuf]
731    lea                  r3, [strideq*3]
732    pshufd               m4, m5, q0000
733    pshufd               m5, m5, q1111
734    pmaddubsw            m3, m2, m0
735    paddw                m1, m2
736    paddw                m3, m1
737.w8_loop:
738    vpbroadcastq         m1, [weightsq+hq*2]
739    pshufb               m0, m1, m4
740    pshufb               m1, m5
741    SMOOTH                0, 1, 2, 2, 3, 3
742    vextracti128        xm1, m0, 1
743    movq   [dstq+strideq*0], xm0
744    movq   [dstq+strideq*1], xm1
745    movhps [dstq+strideq*2], xm0
746    movhps [dstq+r3       ], xm1
747    lea                dstq, [dstq+strideq*4]
748    add                  hq, 4
749    jl .w8_loop
750    RET
751ALIGN function_align
752.w16:
753    WIN64_SPILL_XMM       7
754    vbroadcasti128       m3, [tlq+1]
755    mova                 m6, [base+ipred_v_shuf]
756    punpcklbw            m2, m3, m5
757    punpckhbw            m3, m5
758    pmaddubsw            m4, m2, m0
759    pmaddubsw            m5, m3, m0
760    paddw                m0, m1, m2
761    paddw                m1, m3
762    paddw                m4, m0
763    paddw                m5, m1
764.w16_loop:
765    vpbroadcastd         m1, [weightsq+hq*2]
766    pshufb               m1, m6
767    SMOOTH                1, 1, 2, 3, 4, 5
768    mova         [dstq+strideq*0], xm0
769    vextracti128 [dstq+strideq*1], m0, 1
770    lea                dstq, [dstq+strideq*2]
771    add                  hq, 2
772    jl .w16_loop
773    RET
774ALIGN function_align
775.w32:
776    WIN64_SPILL_XMM       6
777    movu                 m3, [tlq+1]
778    punpcklbw            m2, m3, m5
779    punpckhbw            m3, m5
780    pmaddubsw            m4, m2, m0
781    pmaddubsw            m5, m3, m0
782    paddw                m0, m1, m2
783    paddw                m1, m3
784    paddw                m4, m0
785    paddw                m5, m1
786.w32_loop:
787    vpbroadcastw         m1, [weightsq+hq*2]
788    SMOOTH                1, 1, 2, 3, 4, 5
789    mova             [dstq], m0
790    add                dstq, strideq
791    inc                  hq
792    jl .w32_loop
793    RET
794ALIGN function_align
795.w64:
796    WIN64_SPILL_XMM      11
797    movu                 m4, [tlq+ 1]
798    movu                 m8, [tlq+33]
799    punpcklbw            m3, m4, m5
800    punpckhbw            m4, m5
801    punpcklbw            m7, m8, m5
802    punpckhbw            m8, m5
803    pmaddubsw            m5, m3, m0
804    pmaddubsw            m6, m4, m0
805    pmaddubsw            m9, m7, m0
806    pmaddubsw           m10, m8, m0
807    paddw                m2, m1, m3
808    paddw                m5, m2
809    paddw                m2, m1, m4
810    paddw                m6, m2
811    paddw                m0, m1, m7
812    paddw                m9, m0
813    paddw                m1, m8
814    paddw               m10, m1
815.w64_loop:
816    vpbroadcastw         m2, [weightsq+hq*2]
817    SMOOTH                2, 2, 3, 4, 5, 6
818    mova        [dstq+32*0], m0
819    SMOOTH                2, 2, 7, 8, 9, 10
820    mova        [dstq+32*1], m0
821    add                dstq, strideq
822    inc                  hq
823    jl .w64_loop
824    RET
825
;---------------------------------------------------------------------------------------
; SMOOTH_H intra prediction, 8 bpc, AVX2.
; Args: dst, stride, tl (pointer to the top-left edge pixel array), w, h.
; Each output pixel blends the left-edge pixel of its row with the top-right
; pixel ("right"), weighted by smooth_weights[x]:
;   pred[y][x] = (w[x]*left[y] + (256-w[x])*right + 128) >> 8
; smooth_weights[] stores (w-128, 127-w) byte pairs (see SMOOTH_WEIGHT_TABLE at the
; top of the file) so unsigned pixels can be fed to pmaddubsw, which needs one
; signed operand; the missing 128*left + 129*right portion is reconstructed with
; separate adds (see the arithmetic comments in .w4_loop and .prep).
; Register roles after setup: m3 = right pixel (broadcast), m4 = pb_127_m127,
; m5 = pw_128.
;---------------------------------------------------------------------------------------
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
%define base r5-ipred_smooth_h_avx2_table
    lea                  r5, [ipred_smooth_h_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m3, [tlq+wq] ; right
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, [r5+wq*4]      ; dispatch on log2(width)
    vpbroadcastd         m4, [base+pb_127_m127]
    vpbroadcastd         m5, [base+pw_128]
    add                  wq, r5
    jmp                  wq
.w4:
    WIN64_SPILL_XMM       8
    vpbroadcastq         m6, [base+smooth_weights+4*2] ; horizontal weights for w=4
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 8
    sub                 tlq, hq              ; bias tlq so [tlq+hq] walks the left edge
    lea                  r3, [strideq*3]
.w4_loop:
    vpbroadcastq         m2, [tlq+hq]        ; 8 left-edge pixels for this iteration
    pshufb               m2, m7              ; splat each left pixel across its row
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1     ; 128 * left + 129 * right
    pmaddubsw            m1, m6     ; (w-128)*left + (127-w)*right
    paddw                m1, m5     ; + 128 (rounding)
    paddw                m0, m1     ; w*left + (256-w)*right + 128
    pmaddubsw            m1, m2, m4 ; same for the other 4 rows
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    ; row order across lanes/dwords is fixed by ipred_h_shuf above
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8: ; same scheme as .w4 but 4 rows per iteration, 8 pixels per row
    WIN64_SPILL_XMM       8
    vbroadcasti128       m6, [base+smooth_weights+8*2] ; horizontal weights for w=8
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
.w8_loop:
    vpbroadcastd         m2, [tlq+hq]        ; 4 left-edge pixels
    pshufb               m2, m7
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m0, m1
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    ALLOC_STACK        32*4, 8
    ; .prep precomputes the 128*left + 129*right + 128 words for every row into
    ; the 128-byte stack buffer; r3 indexes it by hq*2 (-4: dword reads, 2 rows/iter)
    lea                  r3, [rsp+64*2-4]
    call .prep ; only worthwhile for w16 and above
    sub                 tlq, 2
    vpbroadcastd        xm6, [base+pb_1]
    mova                xm7, [base+ipred_v_shuf+16]
    vinserti128          m7, [base+ipred_v_shuf+ 0], 1
    vbroadcasti128       m4, [base+smooth_weights+16*2] ; horizontal weights for w=16,
    vbroadcasti128       m5, [base+smooth_weights+16*3] ; two halves of the row
.w16_loop:
    vpbroadcastd         m1, [tlq+hq]        ; raw left pixels for 2 rows
    vpbroadcastd         m2, [r3+hq*2]       ; precomputed per-row terms
    pshufb               m1, m6              ; select one left pixel per lane
    punpcklbw            m1, m3              ; left, right
    pshufb               m2, m7
    SMOOTH                4, 5, 1, 1, 2, 2   ; blend macro (defined earlier); result in m0
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    ALLOC_STACK        32*4
    lea                  r3, [rsp+64*2-2]    ; -2: word reads, 1 row per iteration
    call .prep
    dec                 tlq
    mova                xm4, [base+smooth_weights+16*4] ; horizontal weights for w=32,
    vinserti128          m4, [base+smooth_weights+16*6], 1
    mova                xm5, [base+smooth_weights+16*5]
    vinserti128          m5, [base+smooth_weights+16*7], 1
.w32_loop:
    vpbroadcastb         m1, [tlq+hq]        ; this row's left pixel
    punpcklbw            m1, m3              ; left, right
    vpbroadcastw         m2, [r3+hq*2]       ; precomputed row term
    SMOOTH                4, 5, 1, 1, 2, 2
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    ALLOC_STACK        32*4, 9
    lea                  r3, [rsp+64*2-2]
    call .prep
    ; repurpose r5 as a base into the w64 half of the weight table
    add                  r5, smooth_weights+16*15-ipred_smooth_h_avx2_table
    dec                 tlq
    mova                xm5, [r5-16*7]       ; horizontal weights for w=64, four
    vinserti128          m5, [r5-16*5], 1    ; 16-pixel column groups
    mova                xm6, [r5-16*6]
    vinserti128          m6, [r5-16*4], 1
    mova                xm7, [r5-16*3]
    vinserti128          m7, [r5-16*1], 1
    mova                xm8, [r5-16*2]
    vinserti128          m8, [r5-16*0], 1
.w64_loop:
    vpbroadcastb         m2, [tlq+hq]
    punpcklbw            m2, m3
    vpbroadcastw         m4, [r3+hq*2]
    SMOOTH                5, 6, 2, 2, 4, 4   ; left 32 pixels
    mova        [dstq+32*0], m0
    SMOOTH                7, 8, 2, 2, 4, 4   ; right 32 pixels
    mova        [dstq+32*1], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
ALIGN function_align
.prep: ; precompute 128*left + 129*right + 128 for up to 64 rows into [rsp]
    vpermq               m2, [tlq-32*1], q3120 ; qword order chosen for the unpacks below
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m1, m5     ;   1 * left + 256 * right + 128
    paddw                m0, m1     ; 128 * left + 129 * right + 128
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    vpermq               m2, [tlq-32*2], q3120
    ; gprsize compensates for the return address pushed by the call
    mova [rsp+gprsize+32*3], m0
    mova [rsp+gprsize+32*2], m1
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*0], m1
    ; bias r3 by -2*h (word entries) and tlq by -h so +hq indexing works in the loops
    sub                  r3, hq
    sub                 tlq, hq
    sub                  r3, hq
    ret
1014
; Combine the vertical and horizontal halves of the 2-D SMOOTH predictor into
; final packed pixels.
;   m%1/m%2 : vertical weight bytes (as used below: (w-128, 127-w) pairs from
;             smooth_weights[])
;   m%3/m%4 : interleaved (top, bottom) unsigned pixel pairs
;   %5/%6   : precomputed vertical correction terms; either a register number
;             or a memory operand (hence the %ifnum branches)
; Implicit inputs: m2/m3 must already hold the horizontal blend terms computed
; by the caller; pavgw then averages the vertical and horizontal halves with
; rounding before the final >>8 and pack.
; Output: packed 8-bit pixels in m0. Clobbers m0, m1.
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddubsw            m0, m%3, m%1
    pmaddubsw            m1, m%4, m%2
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6
%endif
    pavgw                m0, m2
    pavgw                m1, m3
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro
1034
;---------------------------------------------------------------------------------------
; SMOOTH (2-D) intra prediction, 8 bpc, AVX2.
; Args: dst, stride, tl (top-left edge pixels), w, h; v_weights is a scratch GPR.
; Each output pixel is the rounded average of a vertical blend (top row pixel vs.
; the pixel below the block, weighted by smooth_weights[y]) and a horizontal blend
; (left pixel vs. the pixel right of the block, weighted by smooth_weights[x]);
; the halves are combined by pavgw inside SMOOTH_2D_END.
; Register roles after setup: m4 = right pixel, m0 = bottom pixel,
; m5 = pb_127_m127, m3 = pw_255 (consumed into the per-width correction terms),
; v_weightsq = &smooth_weights[h*2] (vertical weight byte pairs).
;---------------------------------------------------------------------------------------
cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
    lea                  r6, [ipred_smooth_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m4, [tlq+wq] ; right
    tzcnt                wd, wd
    mov                  hd, hm
    mov                  r5, tlq
    sub                  r5, hq
    movsxd               wq, [r6+wq*4]      ; dispatch on log2(width)
    vpbroadcastd         m5, [base+pb_127_m127]
    vpbroadcastb         m0, [r5] ; bottom
    vpbroadcastd         m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]
    jmp                  wq
.w4:
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vpbroadcastq        m11, [base+smooth_weights+4*2] ; horizontal weights for w=4
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastd         m8, [tlq+1]
    sub                 tlq, 8
    lea                  r3, [strideq*3]
    sub                 tlq, hq              ; bias tlq so [tlq+hq] walks the left edge
    punpcklbw            m8, m0 ; top, bottom
    pshufd               m6, m7, q2200       ; v_weight shuffles for even/odd rows
    pshufd               m7, m7, q3311
    pmaddubsw            m9, m8, m5
    paddw                m3, m8 ;   1 * top + 255 * bottom + 255
    paddw                m9, m3 ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq         m1, [tlq+hq]        ; 8 left-edge pixels
    pshufb               m1, m10             ; splat each left pixel across its row
    punpcklbw            m0, m1, m4 ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0     ; 128 * left + 129 * right
    paddw                m3, m1
    pmaddubsw            m0, m11    ; apply horizontal weights
    pmaddubsw            m1, m11
    paddw                m2, m0     ; horizontal half, left in m2/m3 for SMOOTH_2D_END
    paddw                m3, m1
    vbroadcasti128       m1, [v_weightsq]
    add          v_weightsq, 16
    pshufb               m0, m1, m6          ; per-row vertical weight pairs
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8: ; same scheme as .w4 but 4 rows per iteration, 8 pixels per row
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vbroadcasti128      m11, [base+smooth_weights+8*2] ; horizontal weights for w=8
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastq         m8, [tlq+1]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m8, m0               ; top, bottom
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m9, m8, m5
    paddw                m3, m8               ;   1 * top + 255 * bottom + 255
    paddw                m9, m3               ; 128 * top + 129 * bottom + 255
.w8_loop:
    vpbroadcastd         m1, [tlq+hq]
    pshufb               m1, m10
    punpcklbw            m0, m1, m4           ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5
    pmaddubsw            m3, m1, m5
    paddw                m2, m0
    paddw                m3, m1
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    paddw                m2, m0
    paddw                m3, m1
    vpbroadcastq         m1, [v_weightsq]
    add          v_weightsq, 8
    pshufb               m0, m1, m6
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    ; NOTE(review): the regs_used reassignment around ALLOC_STACK is x86inc
    ; bookkeeping for the stack realignment (negative size => align) -- confirm
    ; against x86inc.asm before touching
    %assign regs_used 4
    ALLOC_STACK       -32*4, 14
    %assign regs_used 7
    vbroadcasti128      m11, [tlq+1]
    lea                  r3, [rsp+64*2-4]    ; horizontal-term buffer filled by .prep_v
    punpcklbw           m10, m11, m0 ; top, bottom
    punpckhbw           m11, m0
    call .prep_v
    sub                 tlq, 2
    pmaddubsw           m12, m10, m5
    pmaddubsw           m13, m11, m5
    vpbroadcastd        xm5, [base+pb_1]
    mova                 m9, [base+ipred_v_shuf]
    vbroadcasti128       m6, [base+smooth_weights+16*2] ; horizontal weights for w=16
    vbroadcasti128       m7, [base+smooth_weights+16*3]
    vperm2i128           m8, m9, m9, 0x01    ; lane-swapped copy of the shuffle
    paddw                m0, m10, m3
    paddw                m3, m11
    paddw               m12, m0              ; 128 * top + 129 * bottom + 255
    paddw               m13, m3
.w16_loop:
    vpbroadcastd         m3, [tlq+hq]        ; raw left pixels for 2 rows
    vpbroadcastd         m0, [r3+hq*2]       ; precomputed per-row terms
    vpbroadcastd         m1, [v_weightsq]
    add          v_weightsq, 4
    pshufb               m3, m5              ; one left pixel per lane
    punpcklbw            m3, m4 ; left, right
    pmaddubsw            m2, m3, m6
    pmaddubsw            m3, m7
    pshufb               m0, m8
    pshufb               m1, m9
    paddw                m2, m0              ; horizontal half in m2/m3
    paddw                m3, m0
    SMOOTH_2D_END         1, 1, 10, 11, 12, 13
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    %assign regs_used 4
    ALLOC_STACK       -32*4, 11
    %assign regs_used 7
    movu                 m8, [tlq+1]
    lea                  r3, [rsp+64*2-2]    ; -2: word reads, 1 row per iteration
    punpcklbw            m7, m8, m0          ; top, bottom
    punpckhbw            m8, m0
    call .prep_v
    dec                 tlq
    pmaddubsw            m9, m7, m5
    pmaddubsw           m10, m8, m5
    mova                xm5, [base+smooth_weights+16*4] ; horizontal weights for w=32
    vinserti128          m5, [base+smooth_weights+16*6], 1
    mova                xm6, [base+smooth_weights+16*5]
    vinserti128          m6, [base+smooth_weights+16*7], 1
    paddw                m0, m7, m3
    paddw                m3, m8
    paddw                m9, m0              ; 128 * top + 129 * bottom + 255
    paddw               m10, m3
.w32_loop:
    vpbroadcastb         m3, [tlq+hq]        ; this row's left pixel
    punpcklbw            m3, m4              ; left, right
    vpbroadcastw         m0, [r3+hq*2]       ; precomputed row term
    vpbroadcastw         m1, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw            m2, m3, m5
    pmaddubsw            m3, m6
    paddw                m2, m0
    paddw                m3, m0
    SMOOTH_2D_END         1, 1, 7, 8, 9, 10
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    %assign regs_used 4
    ALLOC_STACK       -32*8, 16
    %assign regs_used 7
    movu                m13, [tlq+1 ]
    movu                m15, [tlq+33]
    ; repurpose r6 as a base into the w64 half of the weight table
    add                  r6, smooth_weights+16*15-ipred_smooth_avx2_table
    lea                  r3, [rsp+64*2-2]
    punpcklbw           m12, m13, m0         ; top, bottom (left 32 columns)
    punpckhbw           m13, m0
    punpcklbw           m14, m15, m0         ; top, bottom (right 32 columns)
    punpckhbw           m15, m0
    call .prep_v
    dec                 tlq
    pmaddubsw            m0, m12, m5
    pmaddubsw            m1, m13, m5
    pmaddubsw            m2, m14, m5
    pmaddubsw            m5, m15, m5
    mova                xm8, [r6-16*7]       ; horizontal weights for w=64, four
    vinserti128          m8, [r6-16*5], 1    ; 16-pixel column groups
    mova                xm9, [r6-16*6]
    vinserti128          m9, [r6-16*4], 1
    mova               xm10, [r6-16*3]
    vinserti128         m10, [r6-16*1], 1
    mova               xm11, [r6-16*2]
    vinserti128         m11, [r6-16*0], 1
    lea                  r6, [rsp+32*4]      ; spill vertical terms (not enough regs)
    paddw                m0, m3
    paddw                m1, m3
    paddw                m2, m3
    paddw                m3, m5
    paddw                m0, m12             ; 128 * top + 129 * bottom + 255
    paddw                m1, m13
    paddw                m2, m14
    paddw                m3, m15
    mova          [r6+32*0], m0
    mova          [r6+32*1], m1
    mova          [r6+32*2], m2
    mova          [r6+32*3], m3
.w64_loop:
    vpbroadcastb         m5, [tlq+hq]
    punpcklbw            m5, m4              ; left, right
    vpbroadcastw         m6, [r3+hq*2]
    vpbroadcastw         m7, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw            m2, m5, m8
    pmaddubsw            m3, m5, m9
    paddw                m2, m6
    paddw                m3, m6
    SMOOTH_2D_END         7, 7, 12, 13, [r6+32*0], [r6+32*1] ; left 32 pixels
    mova        [dstq+32*0], m0
    pmaddubsw            m2, m5, m10
    pmaddubsw            m3, m5, m11
    paddw                m2, m6
    paddw                m3, m6
    SMOOTH_2D_END         7, 7, 14, 15, [r6+32*2], [r6+32*3] ; right 32 pixels
    mova        [dstq+32*1], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
ALIGN function_align
.prep_v: ; precompute 128*left + 129*right per row into [rsp]; unlike smooth_h's
         ; .prep, no +128 term is added here (rounding comes from pavgw later)
    vpermq               m2, [tlq-32*1], q3120 ; qword order chosen for the unpacks
    punpckhbw            m1, m2, m4
    punpcklbw            m2, m4
    pmaddubsw            m0, m1, m5 ; 127 * left - 127 * right
    paddw                m0, m1     ; 128 * left + 129 * right
    pmaddubsw            m1, m2, m5
    paddw                m1, m2
    vpermq               m2, [tlq-32*2], q3120
    ; gprsize compensates for the return address pushed by the call
    mova [rsp+gprsize+32*3], m0
    mova [rsp+gprsize+32*2], m1
    punpckhbw            m1, m2, m4
    punpcklbw            m2, m4
    pmaddubsw            m0, m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m5
    paddw                m1, m2
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*0], m1
    ; bias r3 by -2*h (word entries) and tlq by -h so +hq indexing works in the loops
    sub                  r3, hq
    sub                 tlq, hq
    sub                  r3, hq
    ret
1309
1310cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
1311    lea                  r6, [ipred_z1_avx2_table]
1312    tzcnt                wd, wm
1313    movifnidn        angled, anglem
1314    movifnidn            hd, hm
1315    lea                  r7, [dr_intra_derivative]
1316    inc                 tlq
1317    movsxd               wq, [r6+wq*4]
1318    add                  wq, r6
1319    mov                 dxd, angled
1320    and                 dxd, 0x7e
1321    add              angled, 165 ; ~90
1322    movzx               dxd, word [r7+dxq]
1323    xor              angled, 0x4ff ; d = 90 - angle
1324    vpbroadcastd         m3, [pw_512]
1325    vpbroadcastd         m4, [pw_62]
1326    vpbroadcastd         m5, [pw_64]
1327    jmp                  wq
1328.w4:
1329    cmp              angleb, 40
1330    jae .w4_no_upsample
1331    lea                 r3d, [angleq-1024]
1332    sar                 r3d, 7
1333    add                 r3d, hd
1334    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1335    ALLOC_STACK         -32, 8
1336    mova                xm1, [tlq-1]
1337    pshufb              xm0, xm1, [z_upsample1]
1338    pshufb              xm1, [z_upsample2]
1339    vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
1340    add                 dxd, dxd        ; pw_512 (which is already in m3)
1341    pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
1342    pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
1343    pmaddubsw           xm1, xm2
1344    movd                xm7, dxd
1345    mov                 r3d, dxd ; xpos
1346    vpbroadcastw         m7, xm7
1347    paddw               xm1, xm0
1348    movq                xm0, [tlq]
1349    pmulhrsw            xm1, xm3
1350    pslldq               m6, m7, 8
1351    paddw               xm2, xm7, xm7
1352    lea                  r2, [strideq*3]
1353    paddw                m6, m7
1354    packuswb            xm1, xm1
1355    paddw                m6, m2 ; xpos2 xpos3 xpos0 xpos1
1356    punpcklbw           xm0, xm1
1357    psllw                m7, 2
1358    mova              [rsp], xm0
1359.w4_upsample_loop:
1360    lea                 r5d, [r3+dxq]
1361    shr                 r3d, 6 ; base0
1362    vpbroadcastq         m1, [rsp+r3]
1363    lea                 r3d, [r5+dxq]
1364    shr                 r5d, 6 ; base1
1365    vpbroadcastq         m2, [rsp+r5]
1366    lea                 r5d, [r3+dxq]
1367    shr                 r3d, 6 ; base2
1368    movq                xm0, [rsp+r3]
1369    lea                 r3d, [r5+dxq]
1370    shr                 r5d, 6 ; base3
1371    movhps              xm0, [rsp+r5]
1372    vpblendd             m1, m2, 0xc0
1373    pand                 m2, m4, m6 ; frac
1374    vpblendd             m0, m1, 0xf0
1375    psubw                m1, m5, m2 ; 64-frac
1376    psllw                m2, 8
1377    por                  m1, m2     ; 64-frac, frac
1378    pmaddubsw            m0, m1
1379    paddw                m6, m7     ; xpos += dx
1380    pmulhrsw             m0, m3
1381    packuswb             m0, m0
1382    vextracti128        xm1, m0, 1
1383    movd   [dstq+strideq*2], xm0
1384    pextrd [dstq+r2       ], xm0, 1
1385    movd   [dstq+strideq*0], xm1
1386    pextrd [dstq+strideq*1], xm1, 1
1387    lea                dstq, [dstq+strideq*4]
1388    sub                  hd, 4
1389    jg .w4_upsample_loop
1390    RET
1391ALIGN function_align
1392.filter_strength: ; w4/w8/w16
1393    ; The C version uses a lot of branches, but we can do all the comparisons
1394    ; in parallel and use popcnt to get the final filter strength value.
1395%define base r3-z_filter_t0
1396    lea                  r3, [z_filter_t0]
1397    movd                xm0, maxbased
1398    movd                xm2, angled
1399    shr              angled, 8 ; is_sm << 1
1400    vpbroadcastb         m0, xm0
1401    vpbroadcastb         m2, xm2
1402    pcmpeqb              m1, m0, [base+z_filter_wh]
1403    pand                 m1, m2
1404    mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
1405    pcmpgtb              m1, m2
1406    pmovmskb            r5d, m1
1407    ret
1408.w4_no_upsample:
1409    ALLOC_STACK         -16, 11
1410    mov            maxbased, 7
1411    test             angled, 0x400 ; !enable_intra_edge_filter
1412    jnz .w4_main
1413    lea            maxbased, [hq+3]
1414    call .filter_strength
1415    mov            maxbased, 7
1416    test                r5d, r5d
1417    jz .w4_main ; filter_strength == 0
1418    popcnt              r5d, r5d
1419    vpbroadcastd         m7, [base+pb_8]
1420    vbroadcasti128       m2, [tlq-1]
1421    pminub               m1, m7, [base+z_filter_s]
1422    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
1423    pminub               m7, [base+z_filter_s+8]
1424    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
1425    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
1426    pshufb               m0, m2, m1
1427    shufps               m1, m7, q2121
1428    pmaddubsw            m0, m8
1429    pshufb               m1, m2, m1
1430    pmaddubsw            m1, m9
1431    pshufb               m2, m7
1432    pmaddubsw            m2, m10
1433    paddw                m0, m1
1434    paddw                m0, m2
1435    pmulhrsw             m0, m3
1436    mov                 r3d, 9
1437    mov                 tlq, rsp
1438    cmp                  hd, 4
1439    cmovne         maxbased, r3d
1440    vextracti128        xm1, m0, 1
1441    packuswb            xm0, xm1
1442    mova              [tlq], xm0
1443.w4_main:
1444    movd                xm6, dxd
1445    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
1446    vpbroadcastb         m7, [tlq+maxbaseq]
1447    shl            maxbased, 6
1448    vpbroadcastw         m6, xm6
1449    mov                 r3d, dxd ; xpos
1450    movd                xm9, maxbased
1451    vpbroadcastw         m9, xm9
1452    vbroadcasti128       m8, [z1_shuf_w4]
1453    psrlw                m7, 8  ; top[max_base_x]
1454    paddw               m10, m6, m6
1455    psubw                m9, m0 ; max_base_x
1456    vpblendd             m6, m10, 0xcc
1457    mova                xm0, xm10
1458    paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
1459    paddw               m10, m10
1460.w4_loop:
1461    lea                 r5d, [r3+dxq]
1462    shr                 r3d, 6 ; base0
1463    vpbroadcastq         m1, [tlq+r3]
1464    lea                 r3d, [r5+dxq]
1465    shr                 r5d, 6 ; base1
1466    vpbroadcastq         m2, [tlq+r5]
1467    lea                 r5d, [r3+dxq]
1468    shr                 r3d, 6 ; base2
1469    movq                xm0, [tlq+r3]
1470    lea                 r3d, [r5+dxq]
1471    shr                 r5d, 6 ; base3
1472    movhps              xm0, [tlq+r5]
1473    vpblendd             m1, m2, 0xc0
1474    pand                 m2, m4, m6 ; frac
1475    vpblendd             m0, m1, 0xf0
1476    psubw                m1, m5, m2 ; 64-frac
1477    psllw                m2, 8
1478    pshufb               m0, m8
1479    por                  m1, m2     ; 64-frac, frac
1480    pmaddubsw            m0, m1
1481    pcmpgtw              m1, m9, m6 ; base < max_base_x
1482    pmulhrsw             m0, m3
1483    paddw                m6, m10    ; xpos += dx
1484    lea                  r5, [dstq+strideq*2]
1485    vpblendvb            m0, m7, m0, m1
1486    packuswb             m0, m0
1487    vextracti128        xm1, m0, 1
1488    movd   [r5  +strideq*0], xm0
1489    pextrd [r5  +strideq*1], xm0, 1
1490    movd   [dstq+strideq*0], xm1
1491    pextrd [dstq+strideq*1], xm1, 1
1492    sub                  hd, 4
1493    jz .w4_end
1494    lea                dstq, [dstq+strideq*4]
1495    cmp                 r3d, maxbased
1496    jb .w4_loop
1497    packuswb            xm7, xm7
1498    lea                  r6, [strideq*3]
1499.w4_end_loop:
1500    movd   [dstq+strideq*0], xm7
1501    movd   [dstq+strideq*1], xm7
1502    movd   [dstq+strideq*2], xm7
1503    movd   [dstq+r6       ], xm7
1504    lea                dstq, [dstq+strideq*4]
1505    sub                  hd, 4
1506    jg .w4_end_loop
1507.w4_end:
1508    RET
1509ALIGN function_align
1510.w8:
1511    lea                 r3d, [angleq+216]
1512    mov                 r3b, hb
1513    cmp                 r3d, 8
1514    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1515    ALLOC_STACK         -32, 8
1516    movu                xm2, [z_filter_s+6]
1517    mova                xm0, [tlq-1]
1518    movd                xm6, hd
1519    vinserti128          m0, [tlq+7], 1
1520    vpbroadcastb        xm6, xm6
1521    vbroadcasti128       m1, [z_upsample1]
1522    pminub              xm6, xm2
1523    vpbroadcastd         m7, [pb_36_m4]
1524    vinserti128          m2, xm6, 1
1525    add                 dxd, dxd
1526    pshufb               m1, m0, m1
1527    pshufb               m2, m0, m2
1528    movd                xm6, dxd
1529    pmaddubsw            m1, m7
1530    pmaddubsw            m2, m7
1531    vpbroadcastw         m6, xm6
1532    mov                 r3d, dxd
1533    psrldq               m0, 1
1534    lea                  r2, [strideq*3]
1535    paddw                m7, m6, m6
1536    paddw                m1, m2
1537    vpblendd             m6, m7, 0xf0
1538    pmulhrsw             m1, m3
1539    pslldq               m2, m7, 8
1540    paddw                m7, m7
1541    paddw                m6, m2
1542    packuswb             m1, m1
1543    punpcklbw            m0, m1
1544    mova              [rsp], m0
1545.w8_upsample_loop:
1546    lea                 r5d, [r3+dxq]
1547    shr                 r3d, 6 ; base0
1548    movu                xm0, [rsp+r3]
1549    lea                 r3d, [r5+dxq]
1550    shr                 r5d, 6 ; base1
1551    vinserti128          m0, [rsp+r5], 1
1552    lea                 r5d, [r3+dxq]
1553    shr                 r3d, 6 ; base2
1554    pand                 m1, m4, m6
1555    psubw                m2, m5, m1
1556    psllw                m1, 8
1557    por                  m2, m1
1558    punpcklqdq           m1, m2, m2 ; frac0 frac1
1559    pmaddubsw            m0, m1
1560    movu                xm1, [rsp+r3]
1561    lea                 r3d, [r5+dxq]
1562    shr                 r5d, 6 ; base3
1563    vinserti128          m1, [rsp+r5], 1
1564    punpckhqdq           m2, m2 ; frac2 frac3
1565    pmaddubsw            m1, m2
1566    pmulhrsw             m0, m3
1567    paddw                m6, m7
1568    pmulhrsw             m1, m3
1569    packuswb             m0, m1
1570    vextracti128        xm1, m0, 1
1571    movq   [dstq+strideq*0], xm0
1572    movhps [dstq+strideq*2], xm0
1573    movq   [dstq+strideq*1], xm1
1574    movhps [dstq+r2       ], xm1
1575    lea                dstq, [dstq+strideq*4]
1576    sub                  hd, 4
1577    jg .w8_upsample_loop
1578    RET
1579.w8_no_intra_edge_filter:
1580    and            maxbased, 7
1581    or             maxbased, 8 ; imin(h+7, 15)
1582    jmp .w8_main
1583.w8_no_upsample:
1584    ALLOC_STACK         -32, 10
1585    lea            maxbased, [hq+7]
1586    test             angled, 0x400
1587    jnz .w8_no_intra_edge_filter
1588    call .filter_strength
1589    test                r5d, r5d
1590    jz .w8_main ; filter_strength == 0
1591    popcnt              r5d, r5d
1592    movu                xm2, [tlq]
1593    pminub              xm1, xm0, [base+z_filter_s+14]
1594    vinserti128          m2, [tlq-1], 1
1595    vinserti128          m1, [base+z_filter_s+ 0], 1
1596    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
1597    pminub              xm0, [base+z_filter_s+22]
1598    vinserti128          m0, [base+z_filter_s+ 8], 1
1599    pshufb               m6, m2, m1
1600    pmaddubsw            m6, m7
1601    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
1602    movzx               r3d, byte [tlq+15]
1603    shufps               m1, m0, q2121
1604    pshufb               m1, m2, m1
1605    pmaddubsw            m1, m7
1606    paddw                m1, m6
1607    sub                 r5d, 3
1608    jnz .w8_3tap
1609    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
1610    ; which also results in an awkward edge case where out[w*2] is
    ; slightly different from out[max_base_x] when h > w.
    ; 5-tap case: add the outer-tap contribution (z_filter_k+4*8) and patch
    ; the sample one past the filtered edge with the rounded
    ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 value computed in r2b.
    vpbroadcastd         m7, [z_filter_k+4*8]
    movzx               r2d, byte [tlq+14]
    pshufb               m2, m0
    pmaddubsw            m2, m7
    sub                 r2d, r3d
    lea                 r2d, [r2+r3*8+4]
    shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
    mov            [rsp+16], r2b
    paddw                m1, m2
.w8_3tap:
    ; Round the filtered edge, redirect tlq to the stack copy, and replicate
    ; the last edge byte so out-of-range reads below return a fixed value.
    pmulhrsw             m1, m3
    sar                 r5d, 1
    mov                 tlq, rsp
    add                 r5d, 17 ; w*2 + (filter_strength == 3)
    cmp                  hd, 16
    cmovns         maxbased, r5d
    mov            [tlq+r5], r3b
    vextracti128        xm0, m1, 1
    packuswb            xm0, xm1
    mova              [tlq], xm0
.w8_main:
    ; Main w=8 projection: m2 holds per-row x positions in 1/64 units
    ; (row0 in the low lane, row1 advanced by dx in the high lane), m6 = 2*dx
    ; per-iteration step, m9 = maxbase*64 - base_inc for the end-of-edge test,
    ; m7 = replicated edge pixel used once a row runs past maxbase.
    movd                xm2, dxd
    vbroadcasti128       m0, [z_base_inc]
    vpbroadcastw         m2, xm2
    vpbroadcastb         m7, [tlq+maxbaseq]
    shl            maxbased, 6
    movd                xm9, maxbased
    vbroadcasti128       m8, [z_filter_s+2]
    vpbroadcastw         m9, xm9
    psrlw                m7, 8
    psubw                m9, m0
    mov                 r3d, dxd
    paddw                m6, m2, m2
    vpblendd             m2, m6, 0xf0
.w8_loop:
    ; Two rows per iteration: r3/r5 are the integer base offsets (xpos>>6);
    ; the pand/psubw/psllw/por sequence interleaves (64-frac, frac) byte
    ; pairs so pmaddubsw performs the 2-tap linear interpolation.
    lea                 r5d, [r3+dxq]
    shr                 r3d, 6
    pand                 m0, m4, m2
    psubw                m1, m5, m0
    psllw                m0, 8
    por                  m1, m0
    movu                xm0, [tlq+r3]
    lea                 r3d, [r5+dxq]
    shr                 r5d, 6 ; base1
    vinserti128          m0, [tlq+r5], 1
    pshufb               m0, m8
    pmaddubsw            m0, m1
    pcmpgtw              m1, m9, m2    ; still inside the edge?
    paddw                m2, m6        ; xpos += 2*dx
    pmulhrsw             m0, m3
    vpblendvb            m0, m7, m0, m1 ; select edge pixel past maxbase
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    sub                  hd, 2
    jz .w8_end
    lea                dstq, [dstq+strideq*2]
    cmp                 r3d, maxbased
    jb .w8_loop
    packuswb            xm7, xm7
.w8_end_loop:
    ; All remaining rows are past the edge: fill with the replicated pixel.
    movq   [dstq+strideq*0], xm7
    movq   [dstq+strideq*1], xm7
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_end_loop
.w8_end:
    RET
.w16_no_intra_edge_filter:
    and            maxbased, 15
    or             maxbased, 16 ; imin(h+15, 31)
    jmp .w16_main
ALIGN function_align
.w16:
    ; z1 w=16 path. maxbase = h+15 (clamped to 31 above when edge filtering
    ; is disabled). If filter_strength != 0, filter the top edge into a
    ; stack copy before the projection loop.
    ALLOC_STACK         -64, 12
    lea            maxbased, [hq+15]
    test             angled, 0x400
    jnz .w16_no_intra_edge_filter
    call .filter_strength
    test                r5d, r5d
    jz .w16_main ; filter_strength == 0
    popcnt              r5d, r5d
    ; Edge filter: m10/m11 hold the 32 top samples in two ymm registers;
    ; z_filter_s shuffles gather the neighbor taps, z_filter_k holds the
    ; per-strength kernel, and pminub clamps shuffle indices at the edge end.
    vpbroadcastd         m1, [base+pb_12]
    vbroadcasti128       m6, [base+z_filter_s+8]
    vinserti128          m2, m6, [base+z_filter_s], 0
    vinserti128          m6, [base+z_filter_s+16], 1
    mova               xm10, [tlq-1]
    vinserti128         m10, [tlq+3], 1
    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
    vbroadcasti128       m7, [base+z_filter_s+14]
    vinserti128          m8, m7, [base+z_filter_s+6], 0
    vinserti128          m7, [base+z_filter_s+22], 1
    psubw                m0, m1
    movu               xm11, [tlq+12]
    vinserti128         m11, [tlq+16], 1
    pminub               m8, m0
    pminub               m7, m0
    pshufb               m0, m10, m2
    shufps               m2, m6, q2121
    pmaddubsw            m0, m9
    pshufb               m1, m11, m8
    shufps               m8, m7, q2121
    pmaddubsw            m1, m9
    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
    movzx               r3d, byte [tlq+31]
    pshufb               m2, m10, m2
    pmaddubsw            m2, m9
    pshufb               m8, m11, m8
    pmaddubsw            m8, m9
    paddw                m0, m2
    paddw                m1, m8
    sub                 r5d, 3
    jnz .w16_3tap
    ; Strength 3 (5-tap): add outer taps and compute the rounded
    ; (tlq[30] + tlq[31]*7 + 4) >> 3 patch value for the sample past the edge.
    vpbroadcastd         m9, [z_filter_k+4*8]
    movzx               r2d, byte [tlq+30]
    pshufb              m10, m6
    pmaddubsw           m10, m9
    pshufb              m11, m7
    pmaddubsw           m11, m9
    sub                 r2d, r3d
    lea                 r2d, [r2+r3*8+4]
    shr                 r2d, 3
    mov            [rsp+32], r2b
    paddw                m0, m10
    paddw                m1, m11
.w16_3tap:
    ; Round, store the filtered edge on the stack, and replicate the last
    ; byte; r5d = 33 (+1 if 5-tap) is the new maxbase when h >= 32.
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    sar                 r5d, 1
    mov                 tlq, rsp
    add                 r5d, 33
    cmp                  hd, 32
    cmovns         maxbased, r5d
    mov            [tlq+r5], r3b
    packuswb             m0, m1
    vpermq               m0, m0, q3120
    mova              [tlq], m0
.w16_main:
    ; m6 = per-row x positions (row1 lane advanced by dx), m11 = 2*dx step,
    ; m9/m10 = thresholds for the low/high 8 output pixels' end-of-edge test,
    ; m7 = replicated edge pixel.
    movd                xm6, dxd
    vbroadcasti128       m0, [z_base_inc]
    vpbroadcastb         m7, [tlq+maxbaseq]
    shl            maxbased, 6
    vpbroadcastw         m6, xm6
    movd                xm9, maxbased
    vbroadcasti128       m8, [z_filter_s+2]
    vpbroadcastw         m9, xm9
    mov                 r3d, dxd
    psubw                m9, m0
    paddw               m11, m6, m6
    psubw               m10, m9, m3 ; 64*8
    vpblendd             m6, m11, 0xf0
.w16_loop:
    ; Two 16-pixel rows per iteration; (64-frac, frac) byte pairs drive the
    ; 2-tap pmaddubsw interpolation, then out-of-range lanes are replaced
    ; with the edge pixel via the packed pcmpgtw masks.
    lea                 r5d, [r3+dxq]
    shr                 r3d, 6 ; base0
    pand                 m1, m4, m6
    psubw                m2, m5, m1
    psllw                m1, 8
    por                  m2, m1
    movu                xm0, [tlq+r3+0]
    movu                xm1, [tlq+r3+8]
    lea                 r3d, [r5+dxq]
    shr                 r5d, 6 ; base1
    vinserti128          m0, [tlq+r5+0], 1
    vinserti128          m1, [tlq+r5+8], 1
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    pcmpgtw              m1, m9, m6
    pcmpgtw              m2, m10, m6
    packsswb             m1, m2
    paddw                m6, m11
    vpblendvb            m0, m7, m0, m1
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w16_end
    lea                dstq, [dstq+strideq*2]
    cmp                 r3d, maxbased
    jb .w16_loop
.w16_end_loop:
    ; Remaining rows are entirely past the edge: flat fill.
    mova   [dstq+strideq*0], xm7
    mova   [dstq+strideq*1], xm7
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_end_loop
.w16_end:
    RET
ALIGN function_align
.w32:
    ; z1 w=32 path. maxbase = min(h+31, 63). No upsampling at this size;
    ; only the optional edge filter (applied unconditionally with a 3-tap
    ; kernel set when the intra edge filter is enabled).
    ALLOC_STACK         -96, 15
    lea                 r3d, [hq+31]
    mov            maxbased, 63
    cmp                  hd, 32
    cmovs          maxbased, r3d
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    ; Load the 64 top samples into m11-m14 (8 bytes per xmm lane, see the
    ; index comments) and clamp the tail shuffle indices per block height.
    vbroadcasti128       m0, [pb_0to15]
    sub                 r3d, 29 ; h+2
    movu               xm13, [tlq+29]    ; 32-39
    movd                xm1, r3d
    movu               xm14, [tlq+37]    ; 40-47
    sub                 r3d, 8 ; h-6
    vinserti128         m14, [tlq+51], 1 ; 56-63
    vpbroadcastb        xm1, xm1
    mova               xm11, [tlq- 1]    ;  0- 7
    vinserti128         m11, [tlq+13], 1 ; 16-23
    movd                xm2, r3d
    movu               xm12, [tlq+ 5]    ;  8-15
    vinserti128         m12, [tlq+19], 1 ; 24-31
    pminub              xm1, xm0 ; clip 32x8
    mova                 m7, [z_filter_s+0]
    pshufb             xm13, xm1
    vpbroadcastd         m1, [pb_12]
    vpbroadcastb        xm2, xm2
    vinserti128         m13, [tlq+43], 1 ; 48-55
    vinserti128          m8, m7, [z_filter_s+4], 1
    vpblendd             m2, m1, 0xf0
    vinserti128          m7, [z_filter_s+12], 0
    pminub               m2, m0 ; clip 32x16 and 32x(32|64)
    ; 3-tap filter: accumulate the three kernel taps (12*0/12*1/12*2) for
    ; each 16-sample group in m0/m2/m1/m6.
    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
    pshufb              m14, m2
    pshufb               m0, m11, m8
    shufps               m8, m7, q1021
    pmaddubsw            m0, m9
    pshufb               m2, m12, m8
    pmaddubsw            m2, m9
    pshufb               m1, m13, m8
    pmaddubsw            m1, m9
    pshufb               m6, m14, m8
    pmaddubsw            m6, m9
    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
    pshufb              m10, m11, m8
    shufps               m8, m7, q2121
    pmaddubsw           m10, m9
    paddw                m0, m10
    pshufb              m10, m12, m8
    pmaddubsw           m10, m9
    paddw                m2, m10
    pshufb              m10, m13, m8
    pmaddubsw           m10, m9
    paddw                m1, m10
    pshufb              m10, m14, m8
    pmaddubsw           m10, m9
    paddw                m6, m10
    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
    pshufb              m11, m8
    pmaddubsw           m11, m9
    pshufb              m12, m7
    pmaddubsw           m12, m9
    movzx               r3d, byte [tlq+63]
    movzx               r2d, byte [tlq+62]
    paddw                m0, m11
    paddw                m2, m12
    pshufb              m13, m7
    pmaddubsw           m13, m9
    pshufb              m14, m7
    pmaddubsw           m14, m9
    paddw                m1, m13
    paddw                m6, m14
    sub                 r2d, r3d
    lea                 r2d, [r2+r3*8+4] ; edge case for 32x64
    pmulhrsw             m0, m3
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    pmulhrsw             m6, m3
    shr                 r2d, 3
    ; Store the filtered 64-byte edge on the stack; patch [64] with the
    ; rounded tail value and [65] with the last raw byte (maxbase=65 for h=64).
    mov            [rsp+64], r2b
    mov                 tlq, rsp
    mov            [tlq+65], r3b
    mov                 r3d, 65
    cmp                  hd, 64
    cmove          maxbased, r3d
    packuswb             m0, m2
    packuswb             m1, m6
    mova           [tlq+ 0], m0
    mova           [tlq+32], m1
.w32_main:
    ; One 32-pixel row per iteration; m6 = x position (same for all lanes),
    ; m11 = dx step, m9/m10 = end-of-edge thresholds, m7 = edge fill pixel.
    movd                xm6, dxd
    vpbroadcastb         m7, [tlq+maxbaseq]
    shl            maxbased, 6
    vpbroadcastw         m6, xm6
    movd                xm9, maxbased
    vbroadcasti128       m8, [z_filter_s+2]
    vpbroadcastw         m9, xm9
    mov                 r5d, dxd
    psubw                m9, [z_base_inc]
    mova                m11, m6
    psubw               m10, m9, m3 ; 64*8
.w32_loop:
    mov                 r3d, r5d
    shr                 r3d, 6
    ; (64-frac, frac) byte pairs for the 2-tap pmaddubsw interpolation.
    pand                 m1, m4, m6
    psubw                m2, m5, m1
    psllw                m1, 8
    por                  m2, m1
    movu                 m0, [tlq+r3+0]
    movu                 m1, [tlq+r3+8]
    add                 r5d, dxd
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    pcmpgtw              m1, m9, m6
    pcmpgtw              m2, m10, m6
    packsswb             m1, m2
    paddw                m6, m11
    vpblendvb            m0, m7, m0, m1
    mova             [dstq], m0
    dec                  hd
    jz .w32_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w32_loop
    ; Past the edge: handle an odd leftover row, then fill two rows at a time.
    test                 hb, 1
    jz .w32_end_loop
    mova             [dstq], m7
    add                dstq, strideq
    dec                  hd
    jz .w32_end
.w32_end_loop:
    mova   [dstq+strideq*0], m7
    mova   [dstq+strideq*1], m7
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_end_loop
.w32_end:
    RET
ALIGN function_align
.w64:
    ; z1 w=64 path. maxbase = h+63. Edge filtering covers up to 128 top
    ; samples, done in two 64-byte halves with the same 3-tap kernel scheme
    ; as .w32 (taps 12*0/12*1/12*2 accumulated per 16-sample group).
    ALLOC_STACK        -128, 16
    lea            maxbased, [hq+63]
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    ; First half: samples 0-63 in m11-m14.
    mova               xm11, [tlq- 1]    ;  0- 7
    vinserti128         m11, [tlq+13], 1 ; 16-23
    movu               xm12, [tlq+ 5]    ;  8-15
    vinserti128         m12, [tlq+19], 1 ; 24-31
    mova                 m7, [z_filter_s+0]
    vinserti128          m8, m7, [z_filter_s+4], 1
    vinserti128          m7, [z_filter_s+12], 0
    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
    movu               xm13, [tlq+29]    ; 32-39
    vinserti128         m13, [tlq+43], 1 ; 48-55
    movu               xm14, [tlq+37]    ; 40-47
    vinserti128         m14, [tlq+51], 1 ; 56-63
    pshufb               m0, m11, m8
    shufps               m8, m7, q1021
    pmaddubsw            m0, m9
    pshufb               m2, m12, m8
    pmaddubsw            m2, m9
    pshufb               m1, m13, m8
    pmaddubsw            m1, m9
    pshufb               m6, m14, m8
    pmaddubsw            m6, m9
    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
    pshufb              m10, m11, m8
    shufps              m15, m8, m7, q2121
    pmaddubsw           m10, m9
    paddw                m0, m10
    pshufb              m10, m12, m15
    pmaddubsw           m10, m9
    paddw                m2, m10
    pshufb              m10, m13, m15
    pmaddubsw           m10, m9
    paddw                m1, m10
    pshufb              m10, m14, m15
    pmaddubsw           m10, m9
    paddw                m6, m10
    vpbroadcastd        m10, [z_filter_k+4*2+12*2]
    pshufb              m11, m15
    pmaddubsw           m11, m10
    pshufb              m12, m7
    pmaddubsw           m12, m10
    pshufb              m13, m7
    pmaddubsw           m13, m10
    pshufb              m14, m7
    pmaddubsw           m14, m10
    paddw                m0, m11
    paddw                m2, m12
    paddw                m1, m13
    paddw                m6, m14
    ; Second half: samples 64-127 in m11-m14 (loaded before the first half
    ; is rounded/stored to overlap latency).
    movu               xm11, [tlq+ 61]    ;  64- 71
    vinserti128         m11, [tlq+ 75], 1 ;  80- 87
    movu               xm12, [tlq+ 69]    ;  72- 79
    vinserti128         m12, [tlq+ 83], 1 ;  88- 95
    movu               xm13, [tlq+ 93]    ;  96-103
    vinserti128         m13, [tlq+107], 1 ; 112-119
    movu               xm14, [tlq+101]    ; 104-111
    vinserti128         m14, [tlq+115], 1 ; 120-127
    pmulhrsw             m0, m3
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    pmulhrsw             m6, m3
    lea                 r3d, [hq-20]
    mov                 tlq, rsp
    packuswb             m0, m2
    packuswb             m1, m6
    ; Clamp the shuffle indices for the second half so reads past the valid
    ; edge (which depends on h) replicate the last valid sample.
    vpbroadcastd        xm2, [pb_14]
    vbroadcasti128       m6, [pb_0to15]
    mova         [tlq+32*0], m0
    mova         [tlq+32*1], m1
    movd                xm0, r3d
    vpbroadcastd         m1, [pb_12]
    vpbroadcastb         m0, xm0
    paddb                m0, m2
    pminub               m0, m6 ; clip 64x16 and 64x32
    pshufb              m12, m0
    pminub               m1, m6 ; clip 64x64
    pshufb              m14, m1
    pshufb               m0, m11, m7
    pmaddubsw            m0, m10
    pshufb               m2, m12, m7
    pmaddubsw            m2, m10
    pshufb               m1, m13, m7
    pmaddubsw            m1, m10
    pshufb               m6, m14, m7
    pmaddubsw            m6, m10
    pshufb               m7, m11, m15
    pmaddubsw            m7, m9
    pshufb              m10, m12, m15
    pmaddubsw           m10, m9
    paddw                m0, m7
    pshufb               m7, m13, m15
    pmaddubsw            m7, m9
    paddw                m2, m10
    pshufb              m10, m14, m15
    pmaddubsw           m10, m9
    paddw                m1, m7
    paddw                m6, m10
    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
    pshufb              m11, m8
    pmaddubsw           m11, m9
    pshufb              m12, m8
    pmaddubsw           m12, m9
    pshufb              m13, m8
    pmaddubsw           m13, m9
    pshufb              m14, m8
    pmaddubsw           m14, m9
    paddw                m0, m11
    paddw                m2, m12
    paddw                m1, m13
    paddw                m6, m14
    pmulhrsw             m0, m3
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    pmulhrsw             m6, m3
    packuswb             m0, m2
    packuswb             m1, m6
    mova         [tlq+32*2], m0
    mova         [tlq+32*3], m1
.w64_main:
    ; One 64-pixel row (two 32-byte stores) per iteration. Instead of word
    ; thresholds, the end-of-edge mask is built from signed byte positions:
    ; m6 starts at dx-64-maxbase*64, psraw+packsswb+paddb with pb_1to32 /
    ; pb_32 yields a per-byte "past the edge" sign mask for vpblendvb.
    movd               xm12, dxd
    vpbroadcastb         m7, [tlq+maxbaseq]
    lea                 r3d, [dxq-64]
    shl            maxbased, 6
    vpbroadcastw        m12, xm12
    sub                 r3d, maxbased
    vbroadcasti128       m8, [z_filter_s+2]
    movd                xm6, r3d
    mov                 r5d, dxd
    mova                m10, [pb_1to32]
    vpbroadcastd        m11, [pb_32]
    vpbroadcastw         m6, xm6
.w64_loop:
    mov                 r3d, r5d
    shr                 r3d, 6
    movu                 m0, [tlq+r3+ 0]
    movu                 m1, [tlq+r3+ 8]
    ; (64-frac, frac) byte pairs for the 2-tap pmaddubsw interpolation;
    ; m9 is reused for both 32-byte halves of the row.
    pand                 m2, m4, m6
    psubw                m9, m5, m2
    psllw                m2, 8
    por                  m9, m2
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m9
    pmaddubsw            m1, m9
    psraw                m2, m6, 6
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packsswb             m2, m2
    paddb                m2, m10
    packuswb             m0, m1
    vpblendvb            m0, m7, m0, m2
    mova          [dstq+ 0], m0
    ; Second 32 pixels of the row; mask advanced by 32 via pb_32.
    movu                 m0, [tlq+r3+32]
    movu                 m1, [tlq+r3+40]
    add                 r5d, dxd
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m9
    pmaddubsw            m1, m9
    paddb                m2, m11
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    paddw                m6, m12
    packuswb             m0, m1
    vpblendvb            m0, m7, m0, m2
    mova          [dstq+32], m0
    dec                  hd
    jz .w64_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w64_loop
.w64_end_loop:
    ; Remaining rows are entirely past the edge: flat fill.
    mova          [dstq+ 0], m7
    mova          [dstq+32], m7
    add                dstq, strideq
    dec                  hd
    jg .w64_end_loop
.w64_end:
    RET
2131
; Z2 directional intra prediction (8bpc): predicts from both the top and
; left edges. Copies tl[-64..31] to the stack so the left edge can be
; indexed with non-negative offsets, derives dx (from 180-angle) and dy
; (from angle-90) via dr_intra_derivative, negates both, and dispatches on
; log2(width) through ipred_z2_avx2_table.
; m13 = pw_512 (rounding), m14 = pw_62 (frac mask), m15 = pw_64.
cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
    lea                  r9, [ipred_z2_avx2_table]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    lea                 dxq, [dr_intra_derivative-90]
    movsxd               wq, [r9+wq*4]
    movzx               dyd, angleb
    xor              angled, 0x400
    mov                  r8, dxq
    sub                 dxq, dyq
    add                  wq, r9
    add                  r9, z_filter_t0-ipred_z2_avx2_table
    mova                 m2, [tlq-64]
    mova                 m0, [tlq-32]
    mova                 m1, [tlq]
    and                 dyd, ~1
    and                 dxq, ~1
    movzx               dyd, word [r8+dyq]  ; angle - 90
    movzx               dxd, word [dxq+270] ; 180 - angle
    vpbroadcastd        m13, [base+pw_512]
    vpbroadcastd        m14, [base+pw_62]
    vpbroadcastd        m15, [base+pw_64]
    mova           [rsp+ 0], m2
    mova           [rsp+32], m0
    mova           [rsp+64], m1
    neg                 dxd
    neg                 dyd
    jmp                  wq
.w4:
    ; z2 w=4 setup: r2 = x position of column 0 (top edge lives at rsp+65),
    ; r8 = threshold for the top-only fast path, xm5 = per-row y positions
    ; (dy * z2_ymul). Then decide on upsampling/filtering of the top edge.
    vpbroadcastq         m6, [base+z2_base_inc] ; base_inc << 6
    vbroadcasti128      m10, [base+z1_shuf_w4]
    vbroadcasti128      m11, [base+z2_shuf_h4]
    lea                 r2d, [dxq+(65<<6)] ; xpos
    movd                xm5, dyd
    mov                 r8d, (63-4)<<6
    mov                 dyq, -4
    pshuflw             xm5, xm5, q0000
    pmullw              xm5, [base+z2_ymul]
    test             angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea                 r3d, [hq+2]
    add              angled, 1022
    shl                 r3d, 6
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    ; Upsample the top edge, then evaluate left-edge filter strength for the
    ; mirrored angle (180 - angle).
    vpbroadcastd        xm3, [base+pb_4]
    call .upsample_above
    sub              angled, 1075 ; angle - 53
    lea                 r3d, [hq+3]
    xor              angled, 0x7f ; 180 - angle
    call .filter_strength
    jmp .w4_filter_left
ALIGN function_align
.filter_strength:
    ; Shared helper: compute the intra edge filter strength bitmask.
    ; In:  r3d = w+h-derived size selector, angled = (adjusted) angle.
    ; Out: r3d = pmovmskb result; popcount of it gives the strength (0-3).
    ; Also leaves m7 (bcast angle) / m8 (size row mask) / m9 (thresholds)
    ; live for callers that re-test with a mirrored angle.
    movd                xm8, r3d
    mov                 r3d, angled
    movd                xm7, angled
    vpbroadcastb         m8, xm8
    shr                 r3d, 8 ; is_sm << 1
    vpbroadcastb         m7, xm7
    pcmpeqb              m8, [base+z_filter_wh]
    mova                xm9, [r9+r3*8]
    pand                 m0, m8, m7
    pcmpgtb              m0, m9
    pmovmskb            r3d, m0
    ret
ALIGN function_align
.upsample_above: ; w4/w8
    ; Shared helper: 2x upsample the top edge with the (-4, 36, 36, -4)/32
    ; kernel (pb_36_m4), interleave with the original samples, and store at
    ; rsp+64. Doubles dx, advances the x origin, and lowers the top-only
    ; threshold (r8d) to match the doubled resolution.
    ; In: xm1 = top edge samples, xm3 = width-dependent shuffle clamp.
    pshufb              xm2, xm1, [base+z_upsample1-2]
    pminub              xm3, [base+z_filter_s+4]
    vpbroadcastd        xm4, [base+pb_36_m4]
    vbroadcasti128      m10, [base+pb_0to15]
    pshufb              xm3, xm1, xm3
    pmaddubsw           xm2, xm4
    pmaddubsw           xm3, xm4
    lea                 r2d, [r2+dxq+(1<<6)]
    add                 dxd, dxd
    paddw               xm2, xm3
    pmulhrsw            xm2, xm13
    sub                 r8d, 3<<6
    paddw                m6, m6
    packuswb            xm2, xm2
    punpcklbw           xm1, xm2
    mova   [rsp+gprsize+64], xm1
    ret
ALIGN function_align
.upsample_left: ; h4/h8
    ; Shared helper: 2x upsample the left edge (stored below rsp+64) with
    ; the same (-4, 36, 36, -4)/32 kernel, interleave, and store at rsp+48.
    ; Doubles dy and the y positions in xm5 (plus a +64 bias) to match.
    mov                 r3d, hd
    and                 r3d, 4
    movd                xm2, [rsp+gprsize+64]
    movddup             xm0, [rsp+gprsize+56]
    movd                xm1, r3d
    palignr             xm2, xm0, 1
    vpbroadcastb        xm1, xm1
    pshufb              xm2, [base+z_filter_s+18]
    vpbroadcastd        xm3, [base+pb_36_m4]
    pmaxub              xm1, [base+z_upsample1-2] ; clamp taps for h==4
    pshufb              xm1, xm0, xm1
    pmaddubsw           xm2, xm3
    pmaddubsw           xm1, xm3
    paddw               xm5, xm5
    add                 dyq, dyq
    paddw               xm1, xm2
    pmulhrsw            xm1, xm13
    vbroadcasti128      m11, [base+z2_upsample]
    paddw               xm5, xm15
    packuswb            xm1, xm1
    punpcklbw           xm0, xm1
    mova   [rsp+gprsize+48], xm0
    ret
.w4_no_upsample_above:
    ; Filter (rather than upsample) the top edge if strength > 0, writing
    ; the result back to the stacked edge at rsp+65. Pixels at or beyond
    ; max_width keep their unfiltered values (vpblendvb with the
    ; max_width - position byte mask).
    lea                 r3d, [hq+3]
    sub              angled, 1112 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w4_no_filter_above
    popcnt              r3d, r3d
    vpbroadcastd        xm2, [base+pb_4]
    pminub              xm2, [base+z_filter_s]
    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    pshufb              xm3, xm1, xm2 ; 00 01 12 23
    pshufd              xm2, xm2, q0321
    pmaddubsw           xm0, xm3, xm0
    pshufb              xm2, xm1, xm2 ; 12 23 34 44
    pmaddubsw           xm2, xm4
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
    punpckhqdq          xm3, xm3      ; 34 44 44 44
    pmaddubsw           xm3, xm4
    vpbroadcastd        xm4, r6m      ; max_width
    packssdw            xm4, xm4
    paddw               xm0, xm2
    paddw               xm0, xm3
    pmulhrsw            xm0, xm13
    packsswb            xm4, xm4
    psrlq               xm1, 8
    psubb               xm4, [base+pb_1to32]
    packuswb            xm0, xm0
    vpblendvb           xm0, xm1, xm4
    movd           [rsp+65], xm0
.w4_no_filter_above:
    ; Decide between upsampling and filtering the left edge, mirroring the
    ; top-edge angle test.
    lea                 r3d, [hq+2]
    add              angled, 973 ; angle + 883
    shl                 r3d, 6
    test                r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    vpbroadcastd        xm0, [base+pb_90]
    psubb               xm0, xm7 ; 180 - angle
    pand                xm0, xm8 ; reuse from previous filter_strength call
    pcmpgtb             xm0, xm9
    pmovmskb            r3d, xm0
.w4_filter_left:
    ; Filter the left edge (up to 16 samples) in place at rsp+48, clamping
    ; shuffle indices past the valid height and leaving rows at or beyond
    ; max_height unfiltered.
    test                r3d, r3d
    jz .w4_main
    popcnt              r3d, r3d
    mov                 r5d, 10
    cmp                  hd, 16
    movu                xm2, [rsp+49]
    vinserti128          m2, [rsp+43], 1
    cmovs               r5d, hd
    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
    movd                xm0, r5d
    vbroadcasti128       m1, [base+z_filter_s+12]
    vbroadcasti128       m4, [base+z_filter_s+16]
    vinserti128          m3, m1, [z_filter_s+8], 1   ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
    vpblendd             m1, m4, 0x0f                ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
    vinserti128          m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
    vpbroadcastb         m0, xm0
    pmaxub               m0, m3
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
    pshufb               m0, m2, m0
    pmaddubsw            m0, m3
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*1]
    pshufb               m1, m2, m1
    pmaddubsw            m1, m3
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*2]
    pshufb               m2, m4
    pmaddubsw            m2, m3
    vpbroadcastd        xm4, r7m ; max_height
    packssdw            xm4, xm4
    paddw                m1, m0
    paddw                m1, m2
    pmulhrsw             m1, m13
    packsswb            xm4, xm4
    vextracti128        xm0, m1, 1
    psubb               xm4, [base+pb_16to1]
    packuswb            xm0, xm1
    vpblendvb           xm0, [rsp+48], xm4
    mova           [rsp+48], xm0
    jmp .w4_main
.w4_upsample_left:
    call .upsample_left
.w4_main:
    ; z2 w=4 main loop, 4 rows per iteration. m6 holds the four rows' x
    ; positions (note the lane order: xpos2 xpos3 xpos0 xpos1), m7 = 4*dx
    ; step, xm4 = left-edge gather indices (base_y), m5 = interleaved
    ; (64-frac_y, frac_y) pairs, r5 points at left[-7] on the stack.
    movd                xm0, dxd
    mova                m12, [base+z2_y_shuf_h4]
    lea                  r5, [rsp+56]  ; left-7
    vpbroadcastw         m0, xm0
    lea                  r9, [strideq*3]
    psraw               xm1, xm5, 6
    pand                xm5, xm14      ; frac_y
    pxor                xm2, xm2
    paddw                m7, m0, m0
    psubw               xm4, xm2, xm1  ; base_y
    vpblendd             m0, m7, 0xcc
    mova                xm1, xm7
    punpcklwd           xm4, xm2
    paddw                m0, m1        ; xpos2 xpos3 xpos0 xpos1
    psubw               xm1, xm15, xm5 ; 64-frac_y
    psllw               xm5, 8
    paddw                m7, m7
    paddw                m6, m0
    por                 xm5, xm1       ; 64-frac_y, frac_y
    vpbroadcastq         m5, xm5
.w4_loop:
    ; Interpolate 4 rows from the top edge; base_x3 >= 64 means every pixel
    ; of the group comes from the top edge (skip the left-edge work).
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x0
    vpbroadcastq         m1, [rsp+r2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x1
    vpbroadcastq         m2, [rsp+r3]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x2
    movq                xm0, [rsp+r2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x3
    movhps              xm0, [rsp+r3]
    vpblendd             m1, m2, 0xc0
    pand                 m2, m14, m6   ; frac_x
    vpblendd             m0, m1, 0xf0
    psubw                m1, m15, m2   ; 64-frac_x
    psllw                m2, 8
    pshufb               m0, m10
    por                  m1, m2        ; 64-frac_x, frac_x
    pmaddubsw            m0, m1
    cmp                 r3d, 64
    jge .w4_toponly
    ; Gather and interpolate the left-edge samples, then select them for
    ; lanes whose x position is left of the top-left corner (sign of xpos).
    mova                 m1, m7        ; arbitrary negative value
    vpgatherdq           m3, [r5+xm4], m1
    pshufb               m1, m3, m11
    vpermd               m1, m12, m1
    pmaddubsw            m1, m5
    psraw                m2, m6, 15    ; base_x < topleft
    vpblendvb            m0, m1, m2
.w4_toponly:
    pmulhrsw             m0, m13
    paddw                m6, m7        ; xpos += dx
    add                  r5, dyq
    packuswb             m0, m0
    vextracti128        xm1, m0, 1
    ; Rows come out in 2 3 0 1 order (see the xpos lane order above).
    movd   [dstq+strideq*2], xm0
    pextrd [dstq+r9       ], xm0, 1
    movd   [dstq+strideq*0], xm1
    pextrd [dstq+strideq*1], xm1, 1
    sub                  hd, 4
    jz .w4_end
    lea                dstq, [dstq+strideq*4]
    cmp                 r2d, r8d
    jge .w4_loop
.w4_leftonly_loop:
    ; All remaining pixels come from the left edge: gather + interpolate only.
    mova                 m1, m7
    vpgatherdq           m2, [r5+xm4], m1
    add                  r5, dyq
    pshufb               m0, m2, m11
    vpermd               m0, m12, m0
    pmaddubsw            m0, m5
    pmulhrsw             m0, m13
    packuswb             m0, m0
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*2], xm0
    pextrd [dstq+r9       ], xm0, 1
    movd   [dstq+strideq*0], xm1
    pextrd [dstq+strideq*1], xm1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_leftonly_loop
.w4_end:
    RET
.w8:
    ; z2 w=8 setup: same scheme as .w4 with a (63-8)<<6 top-only threshold.
    ; The angle/size upsample test packs h into the low byte of r3d so a
    ; single unsigned compare covers angle range, h > 8, and is_sm at once.
    vbroadcasti128       m6, [base+z2_base_inc] ; base_inc << 6
    movd                xm5, dyd
    vbroadcasti128      m10, [base+z_filter_s+2]
    vbroadcasti128      m11, [base+z2_shuf_h4]
    lea                 r2d, [dxq+(65<<6)] ; xpos
    vpbroadcastw        xm5, xm5
    mov                 r8d, (63-8)<<6
    mov                 dyq, -4
    pmullw              xm5, [base+z2_ymul]
    test             angled, 0x400
    jnz .w8_main
    lea                 r3d, [angleq+126]
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    ; Upsample the top edge (saving its upper half first, since the 2x
    ; result needs 16 source bytes), then test the left edge.
    vpbroadcastd        xm3, [base+pb_8]
    movhps         [rsp+80], xm1
    call .upsample_above
    sub              angled, 53 ; angle - 53
    lea                 r3d, [hq+7]
    xor              angled, 0x7f ; 180 - angle
    call .filter_strength
    jmp .w8_filter_left
.w8_no_upsample_above:
    ; Filter the 8-sample top edge if strength > 0; pixels at or beyond
    ; max_width keep their unfiltered values.
    lea                 r3d, [hq+7]
    sub              angled, 90 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w8_no_filter_above
    popcnt              r3d, r3d
    vpbroadcastd        xm3, [base+pb_8]
    pminub              xm3, [base+z_filter_s+8]
    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    pshufb              xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
    pmaddubsw           xm0, xm2, xm0
    pshufb              xm3, xm1, xm3               ; 34 45 56 67 78 88 88 88
    shufps              xm2, xm3, q2121             ; 12 23 34 45 56 67 78 88
    pmaddubsw           xm2, xm4
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
    pmaddubsw           xm3, xm4
    vpbroadcastd        xm4, r6m ; max_width
    packssdw            xm4, xm4
    paddw               xm0, xm2
    paddw               xm0, xm3
    pmulhrsw            xm0, xm13
    packsswb            xm4, xm4
    psrldq              xm1, 1
    psubb               xm4, [base+pb_1to32]
    packuswb            xm0, xm0
    vpblendvb           xm0, xm1, xm4
    movq           [rsp+65], xm0
2464.w8_no_filter_above:
2465    lea                 r3d, [angleq-51]
2466    mov                 r3b, hb
2467    cmp                 r3d, 8
2468    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
2469    vpbroadcastd         m0, [base+pb_90]
2470    psubb                m0, m7
2471    pand                 m0, m8
2472    pcmpgtb              m0, m9
2473    pmovmskb            r3d, m0
2474.w8_filter_left:
2475    test                r3d, r3d
2476    jz .w8_main
2477    popcnt              r3d, r3d
2478    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
2479    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
2480    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
2481    cmp                  hd, 32
2482    jne .w8_filter_left_h16
2483    movu                xm2, [rsp+27]
2484    vinserti128          m2, [rsp+35], 1
2485    vpbroadcastd        xm0, [base+pb_5]
2486    vbroadcasti128       m3, [base+z_filter_s+ 8]
2487    vbroadcasti128       m1, [base+z_filter_s+12]
2488    vbroadcasti128       m4, [base+z_filter_s+16]
2489    pmaxub               m3, m0
2490    pshufb               m3, m2, m3
2491    pmaddubsw            m3, m7
2492    pshufb               m1, m2, m1
2493    pmaddubsw            m1, m8
2494    pshufb               m2, m4
2495    pmaddubsw            m2, m9
2496    paddw                m3, m1
2497    paddw                m3, m2
2498    pmulhrsw             m3, m13
2499    jmp .w8_filter_left_top16
2500.w8_filter_left_h16:
2501    mov                 r5d, 10
2502    cmp                  hd, 16
2503    cmovs               r5d, hd
2504    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
2505    movd                xm0, r5d
2506    vpbroadcastb         m0, xm0
2507.w8_filter_left_top16:
2508    vbroadcasti128       m1, [base+z_filter_s+12]
2509    vinserti128          m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
2510    vbroadcasti128       m4, [base+z_filter_s+16]
2511    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
2512    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
2513    pmaxub               m0, m2
2514    movu                xm2, [rsp+49]
2515    vinserti128          m2, [rsp+43], 1
2516    pshufb               m0, m2, m0
2517    pmaddubsw            m0, m7
2518    vpbroadcastd         m7, r7m ; max_height
2519    pshufb               m1, m2, m1
2520    pmaddubsw            m1, m8
2521    pshufb               m2, m4
2522    pmaddubsw            m2, m9
2523    packssdw             m7, m7
2524    paddw                m1, m0
2525    packsswb             m7, m7
2526    paddw                m1, m2
2527    pmulhrsw             m1, m13
2528    psubb                m7, [base+pb_32to1]
2529    packuswb             m3, m1
2530    vpermq               m3, m3, q1320
2531    vpblendvb            m3, [rsp+32], m7
2532    mova           [rsp+32], m3
2533    jmp .w8_main
2534.w8_upsample_left:
2535    call .upsample_left
2536.w8_main:
2537    movd                xm3, dxd
2538    lea                  r5, [rsp+56]  ; left-7
2539    pshufd              xm1, xm5, q3120
2540    pand                xm5, xm14
2541    vpbroadcastw         m3, xm3
2542    pxor                xm0, xm0
2543    psubw               xm2, xm15, xm5
2544    psraw               xm1, 6
2545    lea                  r9, [strideq*3]
2546    paddw                m7, m3, m3
2547    psubw               xm9, xm0, xm1  ; base_y
2548    psllw               xm5, 8
2549    punpcklwd           xm8, xm9, xm0  ; base_y 0, 1, 4, 5
2550    vpblendd             m3, m7, 0xf0  ; xpos0 xpos1
2551    por                 xm5, xm2       ; 64-frac_y, frac_y
2552    punpckhwd           xm9, xm0       ; base_y 2, 3, 6, 7
2553    paddw                m6, m3
2554    vinserti128         m12, m5, xm5, 1
2555.w8_loop:
2556    lea                 r3d, [r2+dxq]
2557    shr                 r2d, 6         ; base_x0
2558    movu                xm0, [rsp+r2]
2559    lea                 r2d, [r3+dxq]
2560    shr                 r3d, 6         ; base_x1
2561    vinserti128          m0, [rsp+r3], 1
2562    lea                 r3d, [r2+dxq]
2563    shr                 r2d, 6         ; base_x2
2564    movu                xm1, [rsp+r2]
2565    lea                 r2d, [r3+dxq]
2566    shr                 r3d, 6         ; base_x3
2567    vinserti128          m1, [rsp+r3], 1
2568    pand                 m2, m14, m6
2569    paddsw               m4, m6, m7
2570    psubw                m5, m15, m2
2571    psllw                m2, 8
2572    pshufb               m0, m10
2573    por                  m2, m5
2574    pmaddubsw            m0, m2
2575    pand                 m2, m14, m4
2576    psubw                m5, m15, m2
2577    psllw                m2, 8
2578    pshufb               m1, m10
2579    por                  m2, m5
2580    pmaddubsw            m1, m2
2581    cmp                 r3d, 64
2582    jge .w8_toponly
2583    mova                 m5, m7
2584    vpgatherdq           m3, [r5+xm9], m7
2585    mova                 m7, m5
2586    vpgatherdq           m2, [r5+xm8], m5
2587    pshufb               m3, m11
2588    pshufb               m2, m11
2589    punpckldq            m5, m2, m3    ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
2590    punpckhdq            m2, m3        ; a2 b2 c2 d2 a3 b3 c3 d3   e2 f2 g2 h2 e3 f3 g3 h3
2591    vpermq               m5, m5, q3120 ; y0 y1
2592    vpermq               m2, m2, q3120 ; y2 y3
2593    pmaddubsw            m5, m12
2594    pmaddubsw            m2, m12
2595    psraw                m6, 15        ; base_x < topleft
2596    vpblendvb            m0, m5, m6
2597    psraw                m3, m4, 15
2598    vpblendvb            m1, m2, m3
2599.w8_toponly:
2600    pmulhrsw             m0, m13
2601    pmulhrsw             m1, m13
2602    paddw                m6, m4, m7     ; xpos += dx
2603    add                  r5, dyq
2604    packuswb             m0, m1
2605    vextracti128        xm1, m0, 1
2606    movq   [dstq+strideq*0], xm0
2607    movhps [dstq+strideq*2], xm0
2608    movq   [dstq+strideq*1], xm1
2609    movhps [dstq+r9       ], xm1
2610    sub                  hd, 4
2611    jz .w8_end
2612    lea                dstq, [dstq+strideq*4]
2613    cmp                 r2d, r8d
2614    jge .w8_loop
2615.w8_leftonly_loop:
2616    mova                 m0, m7
2617    vpgatherdq           m5, [r5+xm9], m7
2618    mova                 m7, m0
2619    vpgatherdq           m3, [r5+xm8], m0
2620    add                  r5, dyq
2621    pshufb               m2, m5, m11
2622    pshufb               m1, m3, m11
2623    punpckldq            m0, m1, m2
2624    punpckhdq            m1, m2
2625    vpermq               m0, m0, q3120
2626    vpermq               m1, m1, q3120
2627    pmaddubsw            m0, m12
2628    pmaddubsw            m1, m12
2629    pmulhrsw             m0, m13
2630    pmulhrsw             m1, m13
2631    packuswb             m0, m1
2632    vextracti128        xm1, m0, 1
2633    movq   [dstq+strideq*0], xm0
2634    movhps [dstq+strideq*2], xm0
2635    movq   [dstq+strideq*1], xm1
2636    movhps [dstq+r9       ], xm1
2637    lea                dstq, [dstq+strideq*4]
2638    sub                  hd, 4
2639    jg .w8_leftonly_loop
2640.w8_end:
2641    RET
; ---- ipred_z2, w=16.  Also serves as the main loop for w=32/w64, which jump
;      to .w16_main with r8d = (extra-16-wide-column count << 8) | h
;      (see .w16_end, which decrements the high byte and reloads h from r8b).
2642.w16:
2643    mov                 r8d, hd
2644    test             angled, 0x400
2645    jnz .w16_main
; Optional 3-term FIR smoothing of the 16-byte top edge (tap set chosen by
; the popcnt of the .filter_strength mask), clamped at max_width.
2645    lea                 r3d, [hq+15]
2647    sub              angled, 90
2648    call .filter_strength
2649    test                r3d, r3d
2650    jz .w16_no_filter_above
2651    popcnt              r3d, r3d
2652    vbroadcasti128       m6, [tlq+1]
2653    mova                xm2, [base+z_filter_s]
2654    vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
2655    movu                xm3, [base+z_filter_s+8]
2656    vinserti128          m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab   ab bc cd de ef ff ff ff
2657    vpblendd             m1, m6, 0xf0
2658    vpbroadcastd         m0, [base+z_filter_k-4+r3*4+12*0]
2659    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
2660    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*2]
2661    pshufb               m2, m1, m2
2662    pshufb               m1, m3
2663    pmaddubsw            m0, m2, m0
2664    shufps               m2, m1, q2121                ; 12 23 34 45 56 67 78 89   89 9a ab bc cd de ef ff
2665    pmaddubsw            m2, m4
2666    pmaddubsw            m1, m5
2667    vpbroadcastd        xm4, r6m ; max_width
2668    packssdw            xm4, xm4
2669    paddw                m0, m2
2670    paddw                m0, m1
2671    pmulhrsw             m0, m13
2672    packsswb            xm4, xm4
2673    vextracti128        xm2, m0, 1
2674    psubb               xm4, [base+pb_1to32]
2675    packuswb            xm0, xm2
2676    vpblendvb           xm0, xm6, xm4
2677    movu           [rsp+65], xm0
2678.w16_no_filter_above:
2679    vpbroadcastd         m0, [base+pb_90]
2680    psubb                m0, m7
2681    pand                 m0, m8
2682    pcmpgtb              m0, m9
2683    pmovmskb            r3d, m0
2684    test                r3d, r3d
2685    jz .w16_main
2686    popcnt              r3d, r3d
2687    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
2688    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
2689    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
; Left-edge filter, shared entry point: .w32 jumps here with the fixed
; kernel already loaded into m7-m9.  h decides how much edge to process.
2690.w16_filter_left:
2691    vpbroadcastd         m6, r7m ; max_height
2692    packssdw             m6, m6
2693    packsswb             m6, m6
2694    cmp                  hd, 32
2695    jl .w16_filter_left_h16
2696    vpbroadcastd        xm0, [base+pb_5]
2697    vbroadcasti128      m10, [base+z_filter_s+ 8]
2698    vbroadcasti128      m11, [base+z_filter_s+12]
2699    vbroadcasti128      m12, [base+z_filter_s+16]
2700    je .w16_filter_left_h32
; h == 64: filter the farthest 64 bytes of the left edge here.
2701    movu                 m3, [tlq-69]
2702    movu                 m5, [tlq-61]
2703    pmaxub               m1, m10, m0
2704    pshufb               m1, m3, m1
2705    pmaddubsw            m1, m7
2706    pshufb               m2, m3, m11
2707    pmaddubsw            m2, m8
2708    pshufb               m3, m12
2709    pmaddubsw            m3, m9
2710    paddw                m1, m2
2711    pshufb               m2, m5, m10
2712    pmaddubsw            m2, m7
2713    pshufb               m4, m5, m11
2714    pmaddubsw            m4, m8
2715    pshufb               m5, m12
2716    pmaddubsw            m5, m9
2717    paddw                m1, m3
2718    vpbroadcastd         m3, [base+pb_32]
2719    paddb                m3, [base+pb_32to1]
2720    paddw                m2, m4
2721    paddw                m2, m5
2722    pmulhrsw             m1, m13
2723    pmulhrsw             m2, m13
2724    psubb                m3, m6, m3
2725    packuswb             m1, m2
2726    vpblendvb            m1, [tlq-64], m3
2727    mova              [rsp], m1
2728    jmp .w16_filter_left_top32
2729.w16_filter_left_h32:
2730    pmaxub              m10, m0
2731.w16_filter_left_top32:
2732    movu                xm2, [tlq-37]
2733    vinserti128          m2, [tlq-29], 1
2734    pshufb               m3, m2, m10
2735    pshufb               m1, m2, m11
2736    pshufb               m2, m12
2737    pmaddubsw            m3, m7
2738    pmaddubsw            m1, m8
2739    pmaddubsw            m2, m9
2740    paddw                m3, m1
2741    paddw                m3, m2
2742    pmulhrsw             m3, m13
2743    jmp .w16_filter_left_top16
2744.w16_filter_left_h16:
2745    mov                 r5d, 10
2746    cmp                  hd, 16
2747    cmovs               r5d, hd
2748    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
2749    movd                xm0, r5d
2750    vpbroadcastb         m0, xm0
2751.w16_filter_left_top16:
2752    movu                xm2, [tlq-15]
2753    vinserti128          m2, [tlq-21], 1
2754    vbroadcasti128       m1, [base+z_filter_s+12]
2755    vbroadcasti128       m4, [base+z_filter_s+16]
2756    vinserti128          m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   34 45 56 67 78 89 9a ab
2757    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
2758    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
2759    pmaxub               m0, m5
2760    pshufb               m0, m2, m0
2761    pmaddubsw            m0, m7
2762    pshufb               m1, m2, m1
2763    pmaddubsw            m1, m8
2764    pshufb               m2, m4
2765    pmaddubsw            m2, m9
; m6 still holds the broadcast max_height byte; clamp like the top edge.
2766    psubb                m6, [base+pb_32to1]
2767    paddw                m1, m0
2768    paddw                m1, m2
2769    pmulhrsw             m1, m13
2770    packuswb             m3, m1
2771    vpermq               m3, m3, q1320
2772    vpblendvb            m3, [tlq-32], m6
2773    mova           [rsp+32], m3
; Main loop; per-16-column state is spilled at rsp+156 (16*dy), rsp+160
; (base_y accumulator) and rsp+192 (xpos) so .w16_end can restart it.
2774.w16_main:
2775    movd                xm1, dyd
2776    vbroadcasti128      m10, [base+z_filter_s+2]
2777    movd                xm7, dxd
2778    vbroadcasti128      m11, [base+z2_shuf_h2]
2779    vpbroadcastw         m1, xm1
2780    vpbroadcastw         m7, xm7
2781    mov                  r7, dstq
2782    pmullw               m0, m1, [base+z2_ymul]
2783    psllw               xm1, 4
2784    paddw                m6, m7, [base+z2_base_inc]
2785    lea                 r9d, [dxq+(65<<6)] ; xpos
2786    movd          [rsp+156], xm1
2787.w16_loop0:
2788    mov                 r2d, r9d
2789    mova          [rsp+160], m0
2790    lea                  r5, [rsp+60] ; left-3
2791    mova          [rsp+192], m6
2792    pxor                 m1, m1
2793    psraw                m2, m0, 6
2794    pand                 m0, m14
2795    psubw                m9, m1, m2   ; base_y
2796    psubw               m12, m15, m0
2797    punpcklwd            m8, m9, m1   ; base_y  0,  1,  2,  3,     8,  9, 10, 11
2798    psllw                m0, 8
2799    punpckhwd            m9, m1       ; base_y  4,  5,  6,  7,    12, 13, 14, 15
2800    por                 m12, m0       ; 64-frac_y, frac_y
; Two rows per iteration: top-edge interpolation, then left-edge gather and
; per-lane select on the sign of xpos (same scheme as the w8 loop).
2801.w16_loop:
2802    lea                 r3d, [r2+dxq]
2803    shr                 r2d, 6        ; base_x0
2804    movu                xm0, [rsp+r2]
2805    vinserti128          m0, [rsp+r2+8], 1
2806    lea                 r2d, [r3+dxq]
2807    shr                 r3d, 6        ; base_x1
2808    movu                xm1, [rsp+r3]
2809    vinserti128          m1, [rsp+r3+8], 1
2810    pand                 m2, m14, m6
2811    paddsw               m5, m6, m7
2812    psubw                m3, m15, m2
2813    psllw                m2, 8
2814    pshufb               m0, m10
2815    por                  m2, m3
2816    pmaddubsw            m0, m2
2817    pand                 m2, m14, m5
2818    psubw                m3, m15, m2
2819    psllw                m2, 8
2820    pshufb               m1, m10
2821    por                  m2, m3
2822    pmaddubsw            m1, m2
2823    cmp                 r3d, 64
2824    jge .w16_toponly
2825    punpckhwd            m2, m5, m5   ; mask out unnecessary loads
2826    vpgatherdd           m4, [r5+m9], m2
2827    punpcklwd            m2, m5, m5
2828    vpgatherdd           m3, [r5+m8], m2
2829    pshufb               m4, m11      ; e0 f0 g0 h0 e1 f1 g1 h1   m0 n0 o0 p0 m1 n1 o1 p1
2830    pshufb               m3, m11      ; a0 b0 c0 d0 a1 b1 c1 d1   i0 j0 k0 l0 i1 j1 k1 l1
2831    punpcklqdq           m2, m3, m4   ; y0
2832    punpckhqdq           m3, m4       ; y1
2833    pmaddubsw            m2, m12
2834    pmaddubsw            m3, m12
2835    psraw                m6, 15       ; base_x < topleft
2836    vpblendvb            m0, m2, m6
2837    psraw                m6, m5, 15
2838    vpblendvb            m1, m3, m6
2839.w16_toponly:
2840    pmulhrsw             m0, m13
2841    pmulhrsw             m1, m13
2842    paddw                m6, m5, m7   ; xpos += dx
2843    sub                  r5, 2
2844    packuswb             m0, m1
2845    vpermq               m0, m0, q3120
2846    mova         [dstq+strideq*0], xm0
2847    vextracti128 [dstq+strideq*1], m0, 1
2848    sub                  hd, 2
2849    jz .w16_end
2850    lea                dstq, [dstq+strideq*2]
2851    cmp                 r2d, (63-16)<<6
2852    jge .w16_loop
; Left-only tail, two rows per iteration (vpgatherdd mask save/restore as
; in the w8 path).
2853.w16_leftonly_loop:
2854    mova                 m0, m7
2855    vpgatherdd           m4, [r5+m9], m7
2856    mova                 m7, m0
2857    vpgatherdd           m3, [r5+m8], m0
2858    sub                  r5, 2
2859    pshufb               m2, m4, m11
2860    pshufb               m1, m3, m11
2861    punpcklqdq           m0, m1, m2
2862    punpckhqdq           m1, m2
2863    pmaddubsw            m0, m12
2864    pmaddubsw            m1, m12
2865    pmulhrsw             m0, m13
2866    pmulhrsw             m1, m13
2867    packuswb             m0, m1
2868    vpermq               m0, m0, q3120
2869    mova         [dstq+strideq*0], xm0
2870    vextracti128 [dstq+strideq*1], m0, 1
2871    lea                dstq, [dstq+strideq*2]
2872    sub                  hd, 2
2873    jg .w16_leftonly_loop
; Advance to the next 16-wide column (w32/w64): restore per-column state
; and bump base_y by 16*dy and base_x by 16*64.
2874.w16_end:
2875    sub                 r8d, 1<<8
2876    jl .w16_ret
2877    vpbroadcastd         m0, [rsp+156]
2878    paddw                m0, [rsp+160] ; base_y += 16*dy
2879    paddw                m6, m13, [rsp+192]
2880    add                  r7, 16
2881    add                 r9d, 16<<6
2882    movzx                hd, r8b
2883    mov                dstq, r7
2884    paddw                m6, m13 ; base_x += 16*64
2885    jmp .w16_loop0
2886.w16_ret:
2887    RET
; ---- ipred_z2, w=32: stash top[32..63] on the stack, filter the top edge
;      with a fixed kernel (the z_filter_k tap set at +4*2, i.e. the one
;      otherwise selected by strength 3 — TODO confirm against the table),
;      then reuse the w16 left-edge filter and main loop (r8d high byte = 1
;      extra 16-wide column).
2888.w32:
2889    mova                 m2, [tlq+32]
2890    lea                 r8d, [hq+(1<<8)]
2891    mova           [rsp+96], m2
2892    test             angled, 0x400
2893    jnz .w16_main
2894    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
2895    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
2896    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
2897    mova                xm5, [base+z_filter_s]
2898    vinserti128          m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67   45 56 67 78 89 9a ab bc
2899    vinserti128          m1, [tlq+11], 1
2900    movu                xm6, [base+z_filter_s+12]
2901    vinserti128          m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd   ab bc cd de ef ff ff ff
2902    movu                xm3, [tlq+ 6]
2903    vinserti128          m3, [tlq+17], 1
2904    vpbroadcastd        m10, r6m ; max_width
2905    packssdw            m10, m10
2906    packsswb            m10, m10
; Shared with .w64: filters the first 32 top-edge bytes held in m1/m3 using
; the shuffle pairs in m5/m6, clamps at max_width (m10), stores at rsp+65.
2907.w32_filter_above:
2908    pshufb               m0, m1, m5
2909    shufps               m4, m5, m6, q1021           ; 12 23 34 45 56 67 78 89   67 78 89 9a ab bc cd de
2910    pmaddubsw            m0, m7
2911    pshufb               m2, m1, m4
2912    shufps               m5, m6, q2132               ; 34 45 56 67 78 89 9a ab   89 9a ab bc cd de ef ff
2913    pmaddubsw            m2, m8
2914    pshufb               m1, m5
2915    pmaddubsw            m1, m9
2916    paddw                m0, m2
2917    paddw                m0, m1
2918    pshufb               m1, m3, m4
2919    pmaddubsw            m1, m7
2920    pshufb               m2, m3, m5
2921    pmaddubsw            m2, m8
2922    pshufb               m3, m6
2923    pmaddubsw            m3, m9
2924    paddw                m1, m2
2925    paddw                m1, m3
2926    pmulhrsw             m0, m13
2927    pmulhrsw             m1, m13
2928    psubb               m10, [base+pb_1to32]
2929    packuswb             m0, m1
2930    vpblendvb            m0, [tlq+1], m10
2931    movu           [rsp+65], m0
2932    jmp .w16_filter_left
; ---- ipred_z2, w=64: stash top[32..64] on the stack, filter top[33..64]
;      here, then load the first 32 bytes into m1/m3 and fall into
;      .w32_filter_above (r8d high byte = 3 extra 16-wide columns).
2933.w64:
2934    mova                 m2, [tlq+32]
2935    mov                 r3d, [tlq+64]
2936    lea                 r8d, [hq+(3<<8)]
2937    mova          [rsp+ 96], m2
2938    mov           [rsp+128], r3d
2939    test             angled, 0x400
2940    jnz .w16_main
2941    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
2942    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
2943    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
2944    movu                xm6, [base+z_filter_s+ 4]
2945    vinserti128          m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89   45 56 67 78 89 9a ab bc
2946    movu                xm3, [tlq+30]
2947    vinserti128          m3, [tlq+43], 1
2948    movu                xm5, [base+z_filter_s+16]
2949    vinserti128          m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef   ab bc cd de ef ff ff ff
2950    pshufb               m0, m3, m6
2951    shufps               m4, m6, m5, q1021           ; 34 45 56 67 78 89 9a ab   67 78 89 9a ab bc cd de
2952    pmaddubsw            m0, m7
2953    pshufb               m2, m3, m4
2954    shufps               m6, m5, q2132               ; 56 67 78 89 9a ab bc cd   89 9a ab bc cd de ef ff
2955    pmaddubsw            m2, m8
2956    pshufb               m3, m6
2957    pmaddubsw            m3, m9
2958    paddw                m0, m2
2959    paddw                m0, m3
2960    movu                xm2, [tlq+36]
2961    vinserti128          m2, [tlq+49], 1
2962    vpbroadcastd        m10, r6m ; max_width
2963    pshufb               m4, m2, m4
2964    pmaddubsw            m4, m7
2965    pshufb               m3, m2, m6
2966    pmaddubsw            m3, m8
2967    pshufb               m2, m5
2968    pmaddubsw            m2, m9
2969    packssdw            m10, m10
2970    paddw                m3, m4
2971    paddw                m2, m3
2972    vpbroadcastd         m3, [base+pb_32]
2973    pmulhrsw             m0, m13
2974    pmulhrsw             m2, m13
2975    packsswb            m10, m10
2976    mova                xm5, [base+z_filter_s]
2977    vinserti128          m5, [base+z_filter_s+6], 1
; max_width clamp for the upper half of the edge: offset the 1..32 ramp by
; 32 since these are bytes 33..64.
2978    psubb                m3, m10, m3
2979    psubb                m3, [base+pb_1to32]
2980    vinserti128          m1, [tlq+13], 1
2981    packuswb             m0, m2
2982    vpblendvb            m0, [tlq+33], m3
2983    movu                xm3, [tlq+ 6]
2984    vinserti128          m3, [tlq+19], 1
2985    movu           [rsp+97], m0
2986    jmp .w32_filter_above
2987
; ---- ipred_z3: directional prediction from the left edge only.
; Entry: looks up dy in dr_intra_derivative (indexed via the negated
; angle-180 delta, with `or dyq, ~0x7e` folding it into a small negative
; word offset from r7), flips the edge-filter flag into bit 10 of angled,
; and dispatches on log2(h) through ipred_z3_avx2_table.
; m3/m4/m5 = pw_512 / pw_62 / pw_64 broadcast constants used by all sizes.
2988cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
2989    lea                  r6, [ipred_z3_avx2_table]
2990    tzcnt                hd, hm
2991    movifnidn        angled, anglem
2992    lea                  r7, [dr_intra_derivative+45*2-1]
2993    dec                 tlq
2994    movsxd               hq, [r6+hq*4]
2995    sub              angled, 180
2996    add                  hq, r6
2997    mov                 dyd, angled
2998    neg                 dyd
2999    xor              angled, 0x400
3000    or                  dyq, ~0x7e
3001    movzx               dyd, word [r7+dyq]
3002    vpbroadcastd         m3, [pw_512]
3003    vpbroadcastd         m4, [pw_62]
3004    vpbroadcastd         m5, [pw_64]
3005    mov              org_wd, wd
3006    jmp                  hq
; ---- z3 h=4, upsampled path: for sharp angles (< 40) with small w and edge
;      filtering enabled, the left edge is 2x upsampled with the (-4,36,36,-4)
;      kernel (pb_36_m4) and dy is doubled to compensate.  Output columns are
;      produced 4 at a time and transposed via z_transpose4.
3007.h4:
3008    lea                  r7, [strideq*3]
3009    cmp              angleb, 40
3010    jae .h4_no_upsample
3011    lea                 r4d, [angleq-1024]
3012    sar                 r4d, 7
3013    add                 r4d, wd
3014    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
3015    ALLOC_STACK         -32, 9
3016    movu                xm8, [tlq-7]
3017    pshufb              xm0, xm8, [z_upsample1-4]
3018    vpbroadcastb        xm2, xm8
3019    pshufb              xm1, xm8, [z_filter_s+2]
3020    mova           [rsp+16], xm2 ; top[max_base_y]
3021    vpbroadcastd        xm2, [pb_36_m4]
; dy *= 2: positions refer to the upsampled (doubled) edge.
3022    add                 dyd, dyd
3023    pmaddubsw           xm0, xm2
3024    pmaddubsw           xm1, xm2
3025    movd                xm7, dyd
3026    mov                 r2d, dyd
3027    vpbroadcastw         m7, xm7
3028    paddw               xm1, xm0
3029    pmulhrsw            xm1, xm3
; Build the staggered per-lane ypos accumulator in m6; m7 becomes 4*dy.
3030    pslldq               m6, m7, 8
3031    paddw               xm2, xm7, xm7
3032    paddw                m6, m7
3033    packuswb            xm1, xm1
3034    paddw                m6, m2
; Interleave interpolated and original samples -> upsampled edge on stack.
3035    punpcklbw           xm1, xm8
3036    mova                xm8, [z_transpose4]
3037    psllw                m7, 2
3038    pshufb              xm1, [pb_15to0]
3039    mova              [rsp], xm1
3040.h4_upsample_loop:
3041    lea                 r4d, [r2+dyq]
3042    shr                 r2d, 6
3043    vpbroadcastq         m1, [rsp+r2]
3044    lea                 r2d, [r4+dyq]
3045    shr                 r4d, 6
3046    vpbroadcastq         m2, [rsp+r4]
3047    lea                 r4d, [r2+dyq]
3048    shr                 r2d, 6
3049    movq                xm0, [rsp+r2]
3050    lea                 r2d, [r4+dyq]
3051    shr                 r4d, 6
3052    movhps              xm0, [rsp+r4]
3053    vpblendd             m1, m2, 0xc0
3054    pand                 m2, m4, m6
3055    vpblendd             m0, m1, 0xf0
3056    psubw                m1, m5, m2
3057    psllw                m2, 8
3058    por                  m1, m2
3059    pmaddubsw            m0, m1
3060    paddw                m6, m7
3061    pmulhrsw             m0, m3
3062    vextracti128        xm1, m0, 1
3063    packuswb            xm1, xm0
; Transpose the 4x4 result so columns become destination rows.
3064    pshufb              xm1, xm8
3065    movd   [dstq+strideq*0], xm1
3066    pextrd [dstq+strideq*1], xm1, 1
3067    pextrd [dstq+strideq*2], xm1, 2
3068    pextrd [dstq+r7       ], xm1, 3
3069    add                dstq, 4
3070    sub                  wd, 4
3071    jg .h4_upsample_loop
3072    RET
3073ALIGN function_align
; Shared z3 helper (h4/h8/h16 callers): computes the edge-filter-strength
; mask from maxbased (= block-size measure set by the caller) and angled
; (with is_sm folded into the high bits; see the `shr angled, 8` below),
; by comparing against the z_filter_wh / z_filter_t0 tables.
; Returns the pmovmskb byte mask in r5d; callers popcnt it to get the
; strength index into z_filter_k.  Clobbers r4, angled, m0-m2.
3074.filter_strength: ; h4/h8/h16
3075%define base r4-z_filter_t0
3076    lea                  r4, [z_filter_t0]
3077    movd                xm0, maxbased
3078    movd                xm2, angled
3079    shr              angled, 8 ; is_sm << 1
3080    vpbroadcastb         m0, xm0
3081    vpbroadcastb         m2, xm2
3082    pcmpeqb              m1, m0, [base+z_filter_wh]
3083    pand                 m1, m2
3084    mova                xm2, [r4+angleq*8]
3085    pcmpgtb              m1, m2
3086    pmovmskb            r5d, m1
3087    ret
; ---- z3 h=4, non-upsampled: optionally FIR-filter the left edge (strength
;      from .filter_strength), then interpolate down the edge with dy,
;      clamping positions at max_base_y and transposing 4x4 tiles out.
3088.h4_no_upsample:
3089    ALLOC_STACK         -16, 12
3090    mov            maxbased, 7
3091    test             angled, 0x400 ; !enable_intra_edge_filter
3092    jnz .h4_main
3093    lea            maxbased, [wq+3]
3094    call .filter_strength
3095    mov            maxbased, 7
3096    test                r5d, r5d
3097    jz .h4_main ; filter_strength == 0
3098    popcnt              r5d, r5d
3099    vpbroadcastd         m7, [base+pb_7]
3100    vbroadcasti128       m2, [tlq-14]
3101    pmaxub               m1, m7, [base+z_filter_s-4]
3102    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
3103    pmaxub               m7, [base+z_filter_s+4]
3104    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
3105    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
3106    pshufb               m0, m2, m1
3107    shufps               m1, m7, q2121
3108    pmaddubsw            m0, m8
3109    pshufb               m1, m2, m1
3110    pmaddubsw            m1, m9
3111    pshufb               m2, m7
3112    pmaddubsw            m2, m10
3113    paddw                m0, m1
3114    paddw                m0, m2
3115    pmulhrsw             m0, m3
; w != 4 extends the usable edge to 9 samples (maxbase 9 instead of 7).
3116    mov                 r4d, 9
3117    lea                 tlq, [rsp+15]
3118    cmp                  wd, 4
3119    cmovne         maxbased, r4d
3120    vextracti128        xm1, m0, 1
3121    packuswb            xm0, xm1
3122    mova              [rsp], xm0
; Main loop setup: m6 = staggered ypos accumulators (ypos2 ypos3 ypos0
; ypos1), m10 = per-iteration dy step, m9 = max_base_y comparand, m7 =
; replicated edge byte used once positions run past max_base.
3123.h4_main:
3124    movd                xm6, dyd
3125    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
3126    mov                  r4, tlq
3127    sub                 tlq, 4
3128    neg                 dyq
3129    vpbroadcastw         m6, xm6
3130    sub                  r4, maxbaseq
3131    shl            maxbased, 6
3132    vpbroadcastb         m7, [r4]
3133    lea                  r4, [dyq+63] ; ypos
3134    movd                xm9, maxbased
3135    not            maxbased
3136    vbroadcasti128       m8, [z3_shuf_w4]
3137    add            maxbased, 64
3138    vpbroadcastw         m9, xm9
3139    psrlw                m7, 8  ; top[max_base_y]
3140    paddw               m10, m6, m6
3141    psubw                m9, m0 ; max_base_y
3142    vpblendd             m6, m10, 0xcc
3143    mova                xm0, xm10
3144    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
3145    paddw               m10, m10
3146    mova               xm11, [z_transpose4]
3147.h4_loop:
3148    lea                  r5, [r4+dyq]
3149    sar                  r4, 6 ; base0
3150    vpbroadcastq         m1, [tlq+r4]
3151    lea                  r4, [r5+dyq]
3152    sar                  r5, 6 ; base1
3153    vpbroadcastq         m2, [tlq+r5]
3154    lea                  r5, [r4+dyq]
3155    sar                  r4, 6 ; base2
3156    movq                xm0, [tlq+r4]
3157    lea                  r4, [r5+dyq]
3158    sar                  r5, 6 ; base3
3159    movhps              xm0, [tlq+r5]
3160    vpblendd             m1, m2, 0xc0
3161    pand                 m2, m4, m6 ; frac
3162    vpblendd             m0, m1, 0xf0
3163    psubw                m1, m5, m2 ; 64-frac
3164    psllw                m2, 8
3165    pshufb               m0, m8
3166    por                  m1, m2     ; 64-frac, frac
3167    pmaddubsw            m0, m1
3168    pcmpgtw              m1, m9, m6 ; base < max_base_y
3169    pmulhrsw             m0, m3
3170    paddw                m6, m10    ; ypos += dy
; Lanes past max_base_y take the replicated last edge sample (m7).
3171    vpblendvb            m0, m7, m0, m1
3172    vextracti128        xm1, m0, 1
3173    packuswb            xm1, xm0
3174    pshufb              xm1, xm11   ; transpose
3175    movd   [dstq+strideq*0], xm1
3176    pextrd [dstq+strideq*1], xm1, 1
3177    pextrd [dstq+strideq*2], xm1, 2
3178    pextrd [dstq+r7       ], xm1, 3
3179    sub                  wd, 4
3180    jz .h4_end
3181    add                dstq, 4
3182    cmp                 r4d, maxbased
3183    jg .h4_loop
; All remaining columns are past max_base: fill with the replicated sample.
3184    packuswb            xm7, xm7
3185.h4_end_loop:
3186    movd   [dstq+strideq*0], xm7
3187    movd   [dstq+strideq*1], xm7
3188    movd   [dstq+strideq*2], xm7
3189    movd   [dstq+r7       ], xm7
3190    add                dstq, 4
3191    sub                  wd, 4
3192    jg .h4_end_loop
3193.h4_end:
3194    RET
3195ALIGN function_align
; ---- z3 h=8, upsampled path: same idea as .h4's upsample (2x edge with the
;      pb_36_m4 kernel, dy doubled) but producing 8-tall columns; the final
;      store interleaves/transposes via punpcklbw/punpckhbw.
3196.h8:
3197    lea                 r4d, [angleq+216]
3198    mov                 r4b, wb
3199    cmp                 r4d, 8
3200    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
3201    ALLOC_STACK         -32, 8
; r4d & 4 distinguishes the 4-wide case (clipped shuffle) from 8-wide.
3202    and                 r4d, 4
3203    mova                xm0, [tlq-15]
3204    vinserti128          m0, [tlq- 9], 1
3205    movd                xm1, r4d
3206    movu                xm2, [z_filter_s+2]
3207    vinserti128          m2, [z_filter_s+6], 1
3208    vpbroadcastb        xm1, xm1 ; w & 4
3209    vpbroadcastd         m7, [pb_36_m4]
3210    pmaxub              xm1, [z_upsample1-4] ; clip 4x8
3211    vinserti128          m1, [z_upsample1], 1
3212    add                 dyd, dyd
3213    pshufb               m1, m0, m1
3214    pshufb               m2, m0, m2
3215    vinserti128          m0, [tlq-7], 1
3216    movd                xm6, dyd
3217    pmaddubsw            m1, m7
3218    pmaddubsw            m2, m7
3219    vpbroadcastw         m6, xm6
3220    mov                 r2d, dyd
3221    lea                  r5, [strideq*3]
3222    paddw                m7, m6, m6
3223    paddw                m1, m2
3224    vpblendd             m6, m7, 0xf0
3225    pmulhrsw             m1, m3
3226    pslldq               m2, m7, 8
3227    paddw                m7, m7
3228    paddw                m6, m2
3229    vbroadcasti128       m2, [pb_15to0]
3230    packuswb             m1, m1
; Interleave interpolated and original bytes -> 32-byte upsampled edge.
3231    punpcklbw            m1, m0
3232    pshufb               m1, m2
3233    vextracti128   [rsp+ 0], m1, 1
3234    mova           [rsp+16], xm1
3235.h8_upsample_loop:
3236    lea                 r4d, [r2+dyq]
3237    shr                 r2d, 6 ; base0
3238    movu                xm0, [rsp+r2]
3239    lea                 r2d, [r4+dyq]
3240    shr                 r4d, 6 ; base1
3241    vinserti128          m0, [rsp+r4], 1
3242    lea                 r4d, [r2+dyq]
3243    shr                 r2d, 6 ; base2
3244    pand                 m1, m4, m6
3245    psubw                m2, m5, m1
3246    psllw                m1, 8
3247    por                  m2, m1
3248    punpcklqdq           m1, m2, m2 ; frac0 frac1
3249    pmaddubsw            m0, m1
3250    movu                xm1, [rsp+r2]
3251    lea                 r2d, [r4+dyq]
3252    shr                 r4d, 6 ; base3
3253    vinserti128          m1, [rsp+r4], 1
3254    punpckhqdq           m2, m2 ; frac2 frac3
3255    pmaddubsw            m1, m2
3256    pmulhrsw             m0, m3
3257    paddw                m6, m7
3258    pmulhrsw             m1, m3
3259    lea                  r4, [dstq+strideq*4]
; Merge the two row pairs byte-wise, then transpose 4x8 via unpacks.
3260    psllw                m1, 8
3261    por                  m0, m1
3262    vextracti128        xm1, m0, 1
3263    punpcklbw           xm2, xm0, xm1
3264    punpckhbw           xm0, xm1
3265    movd   [dstq+strideq*0], xm2
3266    pextrd [dstq+strideq*1], xm2, 1
3267    pextrd [dstq+strideq*2], xm2, 2
3268    pextrd [dstq+r5       ], xm2, 3
3269    movd   [r4  +strideq*0], xm0
3270    pextrd [r4  +strideq*1], xm0, 1
3271    pextrd [r4  +strideq*2], xm0, 2
3272    pextrd [r4  +r5       ], xm0, 3
3273    add                dstq, 4
3274    sub                  wd, 4
3275    jg .h8_upsample_loop
3276    RET
3277.h8_no_intra_edge_filter:
3278    and            maxbased, 7
3279    or             maxbased, 8 ; imin(w+7, 15)
3280    jmp .h8_main
3281.h8_no_upsample:
3282    ALLOC_STACK         -32, 10
3283    lea            maxbased, [wq+7]
3284    test             angled, 0x400
3285    jnz .h8_no_intra_edge_filter
3286    call .filter_strength
3287    test                r5d, r5d
3288    jz .h8_main ; filter_strength == 0
3289    popcnt              r5d, r5d
3290    vpbroadcastd        xm6, [base+pb_15]
3291    pcmpeqb             xm1, xm1
3292    psubusb             xm6, xm0
3293    psubb               xm6, xm1 ; w == 4 ? 5 : 1
3294    movu                xm2, [tlq-16]
3295    pmaxub              xm1, xm6, [base+z_filter_s]
3296    vinserti128          m2, [tlq-14], 1
3297    vinserti128          m1, [base+z_filter_s+12], 1
3298    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
3299    pmaxub              xm6, [base+z_filter_s+ 8]
3300    vinserti128          m6, [base+z_filter_s+20], 1
3301    pshufb               m0, m2, m1
3302    pmaddubsw            m0, m7
3303    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
3304    movzx               r4d, byte [tlq-15]
3305    shufps               m1, m6, q2121
3306    pshufb               m1, m2, m1
3307    pmaddubsw            m1, m7
3308    paddw                m0, m1
3309    sub                 r5d, 3
3310    jnz .h8_3tap
3311    vpbroadcastd         m7, [z_filter_k+4*8]
3312    movzx               r2d, byte [tlq-14]
3313    pshufb               m2, m6
3314    pmaddubsw            m2, m7
3315    sub                 r2d, r4d
3316    lea                 r2d, [r2+r4*8+4]
3317    shr                 r2d, 3
3318    mov            [rsp+15], r2b
3319    paddw                m0, m2
3320.h8_3tap:
3321    pmulhrsw             m0, m3
3322    sar                 r5d, 1
3323    lea                 tlq, [rsp+31]
3324    add                 r5d, 17
3325    cmp                  wd, 16
3326    cmovns         maxbased, r5d
3327    neg                  r5
3328    mov            [tlq+r5], r4b
3329    vextracti128        xm1, m0, 1
3330    packuswb            xm0, xm1
3331    mova           [tlq-15], xm0
3332.h8_main:
3333    movd                xm2, dyd
3334    vbroadcasti128       m0, [z_base_inc]
3335    mov                  r4, tlq
3336    sub                 tlq, 8
3337    neg                 dyq
3338    vpbroadcastw         m2, xm2
3339    sub                  r4, maxbaseq
3340    shl            maxbased, 6
3341    vpbroadcastb         m7, [r4]
3342    lea                  r4, [dyq+63]
3343    movd                xm9, maxbased
3344    not            maxbased
3345    vbroadcasti128       m8, [z3_shuf]
3346    add            maxbased, 64
3347    vpbroadcastw         m9, xm9
3348    psrlw                m7, 8
3349    psubw                m9, m0
3350    paddw                m6, m2, m2
3351    vpblendd             m2, m6, 0x0f
3352.h8_loop:
3353    lea                  r5, [r4+dyq]
3354    sar                  r4, 6
3355    pand                 m0, m4, m2
3356    psubw                m1, m5, m0
3357    psllw                m0, 8
3358    por                  m1, m0
3359    vbroadcasti128       m0, [tlq+r4]
3360    lea                  r4, [r5+dyq]
3361    sar                  r5, 6
3362    vinserti128          m0, [tlq+r5], 0
3363    sub                 rsp, 8*2
3364    pshufb               m0, m8
3365    pmaddubsw            m0, m1
3366    pcmpgtw              m1, m9, m2
3367    paddw                m2, m6
3368    pmulhrsw             m0, m3
3369    vpblendvb            m0, m7, m0, m1
3370    vextracti128        xm1, m0, 1
3371    psllw               xm0, 8
3372    por                 xm0, xm1 ; interleave rows (partial transpose)
3373    mova              [rsp], xm0
3374    sub                  wd, 2
3375    jz .h8_transpose
3376    cmp                 r4d, maxbased
3377    jg .h8_loop
3378    packuswb            xm0, xm7, xm7
3379.h8_end_loop:
3380    sub                 rsp, 8*2
3381    mova              [rsp], xm0
3382    sub                  wd, 2
3383    jg .h8_end_loop
3384.h8_transpose:
3385    mova                xm2, [rsp+16*1]
3386    sub              org_wd, 8
3387    lea                  r2, [strideq*3]
3388    lea                  r6, [dstq+org_wq]
3389    cmovns             dstq, r6
3390    punpcklwd           xm1, xm2, xm0
3391    punpckhwd           xm2, xm0
3392    lea                  r6, [dstq+strideq*4]
3393    jge .h8_w8
3394    add                 rsp, 16*2
3395    movd   [dstq+strideq*0], xm1
3396    pextrd [dstq+strideq*1], xm1, 1
3397    pextrd [dstq+strideq*2], xm1, 2
3398    pextrd [dstq+r2       ], xm1, 3
3399    movd   [r6  +strideq*0], xm2
3400    pextrd [r6  +strideq*1], xm2, 1
3401    pextrd [r6  +strideq*2], xm2, 2
3402    pextrd [r6  +r2       ], xm2, 3
3403    jmp .h8_end
3404.h8_w8_loop:
3405    mova                xm0, [rsp+16*0]
3406    mova                xm2, [rsp+16*1]
3407    punpcklwd           xm1, xm2, xm0
3408    punpckhwd           xm2, xm0
3409.h8_w8: ; w8/w16/w32
3410    mova                xm0, [rsp+16*2]
3411    mova                xm4, [rsp+16*3]
3412    add                 rsp, 16*4
3413    punpcklwd           xm3, xm4, xm0
3414    punpckhwd           xm4, xm0
3415    punpckldq           xm0, xm3, xm1
3416    punpckhdq           xm3, xm1
3417    punpckldq           xm1, xm4, xm2
3418    punpckhdq           xm4, xm2
3419    movq   [dstq+strideq*0], xm0
3420    movhps [dstq+strideq*1], xm0
3421    movq   [dstq+strideq*2], xm3
3422    movhps [dstq+r2       ], xm3
3423    movq   [r6  +strideq*0], xm1
3424    movhps [r6  +strideq*1], xm1
3425    movq   [r6  +strideq*2], xm4
3426    movhps [r6  +r2       ], xm4
3427    sub                dstq, 8
3428    sub                  r6, 8
3429    sub              org_wd, 8
3430    jge .h8_w8_loop
3431.h8_end:
3432    RET
; Z3 prediction, 16-pixel case. Same structure as .h8: optional edge
; filtering into a stack copy, a main fractional-interpolation loop that
; pushes column-major results onto the stack, then a byte-granularity
; transpose into dst. NOTE(review): register roles inferred from the
; visible code; the dispatch and .filter_strength helper are outside
; this chunk.
.h16_no_intra_edge_filter:
    and            maxbased, 15
    or             maxbased, 16 ; imin(w+15, 31)
    jmp .h16_main
ALIGN function_align
.h16:
    ALLOC_STACK         -64, 12
    lea            maxbased, [wq+15]
    test             angled, 0x400
    jnz .h16_no_intra_edge_filter
    call .filter_strength
    test                r5d, r5d
    jz .h16_main ; filter_strength == 0
    ; Filter 32 edge bytes in two 16-byte halves (m10 = near half,
    ; m11 = far half) with shuffle masks clamped via pmaxub so reads
    ; never pass the valid edge length.
    popcnt              r5d, r5d
    vpbroadcastd        m11, [base+pb_27]
    vpbroadcastd         m1, [base+pb_1]
    vbroadcasti128       m6, [base+z_filter_s+12]
    vinserti128          m2, m6, [base+z_filter_s+4], 0
    vinserti128          m6, [base+z_filter_s+20], 1
    movu               xm10, [tlq-18]
    vinserti128         m10, [tlq-14], 1
    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
    vbroadcasti128       m7, [base+z_filter_s+8]
    vinserti128          m8, m7, [base+z_filter_s+0], 0
    vinserti128          m7, [base+z_filter_s+16], 1
    psubusb             m11, m0
    por                  m1, m11
    movu               xm11, [tlq-32]
    vinserti128         m11, [tlq-28], 1
    pmaxub               m8, m1
    pmaxub               m7, m1
    pshufb               m0, m10, m2
    shufps               m2, m6, q2121
    pmaddubsw            m0, m9
    pshufb               m1, m11, m8
    shufps               m8, m7, q2121
    pmaddubsw            m1, m9
    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
    movzx               r4d, byte [tlq-31]
    pshufb               m2, m10, m2
    pmaddubsw            m2, m9
    pshufb               m8, m11, m8
    pmaddubsw            m8, m9
    paddw                m0, m2
    paddw                m1, m8
    ; r5d == 3 selected the 5-tap kernel; add the third tap pair and
    ; patch the outermost edge byte with a rounded 3-tap separately.
    sub                 r5d, 3
    jnz .h16_3tap
    vpbroadcastd         m9, [z_filter_k+4*8]
    movzx               r2d, byte [tlq-30]
    pshufb              m10, m6
    pmaddubsw           m10, m9
    pshufb              m11, m7
    pmaddubsw           m11, m9
    sub                 r2d, r4d
    lea                 r2d, [r2+r4*8+4]
    shr                 r2d, 3
    mov            [rsp+31], r2b
    paddw                m0, m10
    paddw                m1, m11
.h16_3tap:
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    sar                 r5d, 1
    lea                 tlq, [rsp+63]
    add                 r5d, 33
    cmp                  wd, 32
    cmovns         maxbased, r5d
    neg                  r5
    mov            [tlq+r5], r4b
    packuswb             m0, m1
    vpermq               m0, m0, q2031
    mova           [tlq-31], m0
; Main loop setup: m9/m10 are thresholds (maxbase<<6 minus base
; increments, staggered by 64*8 = m3) for the two 16-byte halves;
; positions past the edge are replaced by the clamp pixel in m7.
.h16_main:
    movd                xm6, dyd
    vbroadcasti128       m0, [z_base_inc]
    mov                  r4, tlq
    sub                 tlq, 8
    neg                 dyq
    vpbroadcastw         m6, xm6
    sub                  r4, maxbaseq
    shl            maxbased, 6
    vpbroadcastb         m7, [r4]
    lea                  r4, [dyq+63]
    movd                xm9, maxbased
    not            maxbased
    vbroadcasti128       m8, [z3_shuf]
    add            maxbased, 64
    vpbroadcastw         m9, xm9
    psubw                m9, m0
    paddw               m11, m6, m6
    psubw               m10, m9, m3 ; 64*8
    vpblendd             m6, m11, 0xf0
; Two 16-pixel columns per iteration, pushed column-major onto the stack.
.h16_loop:
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    pand                 m1, m4, m6
    psubw                m2, m5, m1
    psllw                m1, 8
    por                  m2, m1
    movu                xm0, [tlq+r4-0]
    movu                xm1, [tlq+r4-8]
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    vinserti128          m0, [tlq+r5-0], 1
    vinserti128          m1, [tlq+r5-8], 1
    sub                 rsp, 32
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    pcmpgtw              m1, m9, m6
    pcmpgtw              m2, m10, m6
    packsswb             m1, m2
    paddw                m6, m11
    vpblendvb            m0, m7, m0, m1
    vpermq               m0, m0, q3120
    mova              [rsp], m0
    sub                  wd, 2
    jz .h16_transpose
    cmp                 r4d, maxbased
    jg .h16_loop
    mova                 m0, m7
; Past the edge: remaining columns are all the clamp pixel.
.h16_end_loop:
    sub                 rsp, 32
    mova              [rsp], m7
    sub                  wd, 2
    jg .h16_end_loop
; Transpose the column-major stack buffer into dst, right-to-left in
; 8-column tiles (dstq starts at the rightmost tile).
.h16_transpose:
    mova                 m2, [rsp+32*1]
    sub              org_wd, 8
    lea                  r2, [strideq*3]
    lea                  r6, [dstq+org_wq]
    cmovns             dstq, r6
    punpcklbw            m1, m2, m0
    punpckhbw            m2, m0
    lea                  r3, [strideq*5]
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    lea                  r4, [strideq+r2*2] ; stride*7
    jge .h16_w8
    ; w == 4: only dword stores are needed.
    add                 rsp, 32*2
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    vextracti128        xm0, m0, 1
    movd   [dstq+strideq*4], xm1
    pextrd [dstq+r3       ], xm1, 1
    pextrd [dstq+r2*2     ], xm1, 2
    pextrd [dstq+r4       ], xm1, 3
    lea                dstq, [dstq+strideq*8]
    vextracti128        xm1, m1, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    movd   [dstq+strideq*4], xm1
    pextrd [dstq+r3       ], xm1, 1
    pextrd [dstq+r2*2     ], xm1, 2
    pextrd [dstq+r4       ], xm1, 3
    jmp .h16_end
.h16_w8_loop:
    mova                 m0, [rsp+32*0]
    mova                 m2, [rsp+32*1]
    punpcklbw            m1, m2, m0
    punpckhbw            m2, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
.h16_w8:
    mova                 m2, [rsp+32*2]
    mova                 m4, [rsp+32*3]
    lea                  r6, [dstq+strideq*8]
    add                 rsp, 32*4
    punpcklbw            m3, m4, m2
    punpckhbw            m4, m2
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    punpckldq            m4, m2, m0
    punpckhdq            m2, m0
    punpckldq            m0, m3, m1
    punpckhdq            m3, m1
    movq   [dstq+strideq*0], xm4
    movhps [dstq+strideq*1], xm4
    vextracti128        xm4, m4, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+r2       ], xm2
    vextracti128        xm2, m2, 1
    movq   [dstq+strideq*4], xm0
    movhps [dstq+r3       ], xm0
    vextracti128        xm0, m0, 1
    movq   [dstq+r2*2     ], xm3
    movhps [dstq+r4       ], xm3
    vextracti128        xm3, m3, 1
    movq     [r6+strideq*0], xm4
    movhps   [r6+strideq*1], xm4
    movq     [r6+strideq*2], xm2
    movhps   [r6+r2       ], xm2
    movq     [r6+strideq*4], xm0
    movhps   [r6+r3       ], xm0
    movq     [r6+r2*2     ], xm3
    movhps   [r6+r4       ], xm3
    sub                dstq, 8
    sub              org_wd, 8
    jge .h16_w8_loop
.h16_end:
    RET
; Z3 prediction, 32-pixel case. Edge filtering is unconditional here
; (only skipped when intra edge filtering is disabled via angle bit
; 0x400); the 64 edge bytes are filtered in four 16-byte groups
; (m11..m14), then the main loop interpolates one 32-pixel column per
; iteration and the transpose writes 8x16 tiles. NOTE(review): register
; roles inferred from the visible code; dispatch is outside this chunk.
ALIGN function_align
.h32:
    ALLOC_STACK         -96, 15
    lea            maxbased, [wq+31]
    and            maxbased, 31
    or             maxbased, 32 ; imin(w+31, 63)
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h32_main
    ; Load the four 16-byte edge groups; r5d/r4d become per-group clip
    ; limits (clamped shuffle indices) for short edges.
    vbroadcasti128       m0, [pb_0to15]
    mov                 r4d, 21
    mov                 r5d, 3
    movu               xm11, [tlq-66]    ; 56-63
    vinserti128         m11, [tlq-52], 1 ; 40-47
    sub                 r4d, wd ; 21-w
    cmovns              r5d, r4d
    movu               xm12, [tlq-58]    ; 48-55
    vinserti128         m12, [tlq-44], 1 ; 32-39
    sub                 r4d, 8 ; 13-w
    movd                xm1, r5d
    movu               xm13, [tlq-34]    ; 24-31
    vinserti128         m13, [tlq-20], 1 ;  8-15
    movd                xm2, r4d
    vpbroadcastb         m1, xm1
    movu               xm14, [tlq-28]    ; 16-23
    vinserti128         m14, [tlq-14], 1 ;  0- 7
    vpbroadcastb         m2, xm2
    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
    ; 3-tap smoothing: accumulate the three z_filter_k tap pairs into
    ; m0/m2 (far half) and m1/m6 (near half).
    movu                 m7, [z_filter_s+4]
    pshufb              m11, m1
    vinserti128          m8, m7, [z_filter_s+8], 1
    vinserti128          m7, [z_filter_s+16], 0
    pmaxsb               m2, m0 ; clip 8x32
    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
    pshufb              m12, m2
    pshufb               m0, m11, m8
    pmaddubsw            m0, m9
    pshufb               m2, m12, m8
    pmaddubsw            m2, m9
    pshufb               m1, m13, m8
    pmaddubsw            m1, m9
    shufps               m8, m7, q1021
    pshufb               m6, m14, m8
    pmaddubsw            m6, m9
    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
    pshufb              m10, m11, m8
    pmaddubsw           m10, m9
    paddw                m0, m10
    pshufb              m10, m12, m8
    pmaddubsw           m10, m9
    paddw                m2, m10
    pshufb              m10, m13, m8
    pmaddubsw           m10, m9
    shufps               m8, m7, q2121
    paddw                m1, m10
    pshufb              m10, m14, m8
    pmaddubsw           m10, m9
    paddw                m6, m10
    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
    pshufb              m11, m8
    pmaddubsw           m11, m9
    pshufb              m12, m8
    pmaddubsw           m12, m9
    movzx               r4d, byte [tlq-63]
    movzx               r2d, byte [tlq-62]
    paddw                m0, m11
    paddw                m2, m12
    pshufb              m13, m8
    pmaddubsw           m13, m9
    pshufb              m14, m7
    pmaddubsw           m14, m9
    paddw                m1, m13
    paddw                m6, m14
    ; Outermost edge byte gets a scalar rounded 3-tap; stored into the
    ; stack copy of the edge below.
    sub                 r2d, r4d
    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
    pmulhrsw             m0, m3
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    pmulhrsw             m6, m3
    shr                 r2d, 3
    mov            [rsp+31], r2b
    lea                 tlq, [rsp+95]
    mov            [tlq-65], r4b
    mov                 r4d, 65
    cmp                  wd, 64
    cmove          maxbased, r4d
    packuswb             m0, m2
    packuswb             m1, m6
    mova           [tlq-63], m0
    mova           [tlq-31], m1
; Main loop setup: as in .h16, m9/m10 are past-the-edge thresholds for
; the two 16-byte halves; m7 is the broadcast clamp pixel.
.h32_main:
    movd                xm6, dyd
    mov                  r4, tlq
    sub                 tlq, 8
    neg                 dyq
    vpbroadcastw         m6, xm6
    sub                  r4, maxbaseq
    shl            maxbased, 6
    vpbroadcastb         m7, [r4]
    lea                  r4, [dyq+63]
    movd                xm9, maxbased
    not            maxbased
    vbroadcasti128       m8, [z3_shuf]
    add            maxbased, 64
    vpbroadcastw         m9, xm9
    psubw                m9, [z_base_inc]
    mova                m11, m6
    psubw               m10, m9, m3 ; 64*8
; One 32-pixel column per iteration, pushed column-major onto the stack.
.h32_loop:
    mov                  r5, r4
    sar                  r5, 6
    pand                 m1, m4, m6
    psubw                m2, m5, m1
    psllw                m1, 8
    por                  m2, m1
    movu                xm0, [tlq+r5- 0]
    vinserti128          m0, [tlq+r5-16], 1
    movu                xm1, [tlq+r5- 8]
    vinserti128          m1, [tlq+r5-24], 1
    sub                 rsp, 32
    add                  r4, dyq
    pshufb               m0, m8
    pshufb               m1, m8
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    pcmpgtw              m1, m9, m6
    pcmpgtw              m2, m10, m6
    packsswb             m1, m2
    paddw                m6, m11
    vpblendvb            m0, m7, m0, m1
    mova              [rsp], m0
    dec                  wd
    jz .h32_transpose
    cmp                 r4d, maxbased
    jg .h32_loop
; Past the edge: remaining columns are all the clamp pixel.
.h32_end_loop:
    sub                 rsp, 32
    mova              [rsp], m7
    dec                  wd
    jg .h32_end_loop
; Full 8x32 byte transpose of the column-major stack buffer per tile,
; written right-to-left (dstq starts at the rightmost 8-column tile).
.h32_transpose:
    lea                dstq, [dstq+org_wq-8]
    lea                  r2, [strideq*3]
    lea                  r3, [strideq*5]
    lea                  r4, [strideq+r2*2] ; stride*7
.h32_w8_loop:
    mova                 m7, [rsp+32*0]
    mova                 m6, [rsp+32*1]
    mova                 m5, [rsp+32*2]
    mova                 m4, [rsp+32*3]
    mova                 m3, [rsp+32*4]
    mova                 m2, [rsp+32*5]
    mova                 m1, [rsp+32*6]
    mova                 m0, [rsp+32*7]
    lea                  r6, [dstq+strideq*8]
    add                 rsp, 32*8
    punpcklbw            m8, m0, m1
    punpckhbw            m0, m1
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    punpcklbw            m3, m4, m5
    punpckhbw            m4, m5
    punpcklbw            m5, m6, m7
    punpckhbw            m6, m7
    punpcklwd            m7, m8, m1
    punpckhwd            m8, m1
    punpcklwd            m1, m0, m2
    punpckhwd            m0, m2
    punpcklwd            m2, m3, m5
    punpckhwd            m3, m5
    punpcklwd            m5, m4, m6
    punpckhwd            m4, m6
    punpckldq            m6, m7, m2
    punpckhdq            m7, m2
    punpckldq            m2, m8, m3
    punpckhdq            m8, m3
    punpckldq            m3, m1, m5
    punpckhdq            m1, m5
    punpckldq            m5, m0, m4
    punpckhdq            m0, m4
    movq   [dstq+strideq*0], xm6
    movhps [dstq+strideq*1], xm6
    vextracti128        xm6, m6, 1
    movq   [dstq+strideq*2], xm7
    movhps [dstq+r2       ], xm7
    vextracti128        xm7, m7, 1
    movq   [dstq+strideq*4], xm2
    movhps [dstq+r3       ], xm2
    vextracti128        xm2, m2, 1
    movq   [dstq+r2*2     ], xm8
    movhps [dstq+r4       ], xm8
    vextracti128        xm8, m8, 1
    movq     [r6+strideq*0], xm3
    movhps   [r6+strideq*1], xm3
    vextracti128        xm3, m3, 1
    movq     [r6+strideq*2], xm1
    movhps   [r6+r2       ], xm1
    vextracti128        xm1, m1, 1
    movq     [r6+strideq*4], xm5
    movhps   [r6+r3       ], xm5
    vextracti128        xm5, m5, 1
    movq     [r6+r2*2     ], xm0
    movhps   [r6+r4       ], xm0
    lea                  r6, [r6+strideq*8]
    vextracti128        xm0, m0, 1
    movq     [r6+strideq*0], xm6
    movhps   [r6+strideq*1], xm6
    movq     [r6+strideq*2], xm7
    movhps   [r6+r2       ], xm7
    movq     [r6+strideq*4], xm2
    movhps   [r6+r3       ], xm2
    movq     [r6+r2*2     ], xm8
    movhps   [r6+r4       ], xm8
    lea                  r6, [r6+strideq*8]
    movq     [r6+strideq*0], xm3
    movhps   [r6+strideq*1], xm3
    movq     [r6+strideq*2], xm1
    movhps   [r6+r2       ], xm1
    movq     [r6+strideq*4], xm5
    movhps   [r6+r3       ], xm5
    movq     [r6+r2*2     ], xm0
    movhps   [r6+r4       ], xm0
    sub                dstq, 8
    sub              org_wd, 8
    jg .h32_w8_loop
    RET
3870ALIGN function_align
3871.h64:
3872    ALLOC_STACK        -128, 16
3873    lea            maxbased, [wq+63]
3874    test             angled, 0x400 ; !enable_intra_edge_filter
3875    jnz .h64_main
3876    mov                 r4d, 21
3877    vpbroadcastb       xm11, [tlq-127]
3878    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
3879    sub                 r4d, wd ; 21-w
3880    mov                 r5d, 3
3881    vinserti128         m11, [tlq-116], 1    ; 104-111
3882    movu                 m7, [z_filter_s+4]
3883    cmp                  wd, 32
3884    cmove               r4d, r5d
3885    vinserti128          m8, m7, [z_filter_s+8], 1
3886    vbroadcasti128       m6, [pb_0to15]
3887    movd                xm1, r4d
3888    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
3889    movu               xm12, [tlq-122]       ; 112-119
3890    vinserti128         m12, [tlq-108], 1    ;  96-103
3891    vpbroadcastb         m1, xm1
3892    movu               xm13, [tlq- 98]       ;  88- 95
3893    vinserti128         m13, [tlq- 84], 1    ;  72- 79
3894    movu               xm14, [tlq- 90]       ;  80- 87
3895    vinserti128         m14, [tlq- 76], 1    ;  64- 71
3896    vinserti128          m7, [z_filter_s+16], 0
3897    pshufb               m0, m11, m8
3898    pmaddubsw            m0, m9
3899    pshufb               m2, m12, m8
3900    pmaddubsw            m2, m9
3901    pmaxsb               m1, m6 ; clip (16|32)x64
3902    pshufb              m13, m1
3903    pshufb               m1, m13, m8
3904    pmaddubsw            m1, m9
3905    pshufb               m6, m14, m8
3906    pmaddubsw            m6, m9
3907    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
3908    shufps              m15, m8, m7, q1021
3909    pshufb              m10, m11, m15
3910    pmaddubsw           m10, m9
3911    paddw                m0, m10
3912    pshufb              m10, m12, m15
3913    pmaddubsw           m10, m9
3914    paddw                m2, m10
3915    pshufb              m10, m13, m15
3916    pmaddubsw           m10, m9
3917    paddw                m1, m10
3918    pshufb              m10, m14, m15
3919    pmaddubsw           m10, m9
3920    paddw                m6, m10
3921    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
3922    shufps              m10, m8, m7, q2132
3923    pshufb              m11, m10
3924    pmaddubsw           m11, m9
3925    pshufb              m12, m10
3926    pmaddubsw           m12, m9
3927    pshufb              m13, m10
3928    pmaddubsw           m13, m9
3929    pshufb              m14, m10
3930    pmaddubsw           m14, m9
3931    paddw                m0, m11
3932    paddw                m2, m12
3933    paddw                m1, m13
3934    paddw                m6, m14
3935    movu               xm11, [tlq-66]    ; 56-63
3936    vinserti128         m11, [tlq-52], 1 ; 40-47
3937    movu               xm12, [tlq-58]    ; 48-55
3938    vinserti128         m12, [tlq-44], 1 ; 32-39
3939    movu               xm13, [tlq-34]    ; 24-31
3940    vinserti128         m13, [tlq-20], 1 ;  8-15
3941    movu               xm14, [tlq-28]    ; 16-23
3942    vinserti128         m14, [tlq-14], 1 ;  0- 7
3943    pmulhrsw             m0, m3
3944    pmulhrsw             m2, m3
3945    pmulhrsw             m1, m3
3946    pmulhrsw             m6, m3
3947    lea                 tlq, [rsp+127]
3948    packuswb             m0, m2
3949    packuswb             m1, m6
3950    mova          [tlq-127], m0
3951    mova          [tlq- 95], m1
3952    pshufb               m0, m11, m10
3953    pmaddubsw            m0, m9
3954    pshufb               m2, m12, m10
3955    pmaddubsw            m2, m9
3956    pshufb               m1, m13, m10
3957    pmaddubsw            m1, m9
3958    pshufb               m6, m14, m7
3959    pmaddubsw            m6, m9
3960    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
3961    pshufb               m7, m11, m15
3962    pmaddubsw            m7, m9
3963    paddw                m0, m7
3964    pshufb               m7, m12, m15
3965    pmaddubsw            m7, m9
3966    paddw                m2, m7
3967    pshufb               m7, m13, m15
3968    pmaddubsw            m7, m9
3969    paddw                m1, m7
3970    pshufb               m7, m14, m10
3971    pmaddubsw            m7, m9
3972    paddw                m6, m7
3973    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
3974    pshufb              m11, m8
3975    pmaddubsw           m11, m9
3976    pshufb              m12, m8
3977    pmaddubsw           m12, m9
3978    pshufb              m13, m8
3979    pmaddubsw           m13, m9
3980    pshufb              m14, m15
3981    pmaddubsw           m14, m9
3982    paddw                m0, m11
3983    paddw                m2, m12
3984    paddw                m1, m13
3985    paddw                m6, m14
3986    pmulhrsw             m0, m3
3987    pmulhrsw             m2, m3
3988    pmulhrsw             m1, m3
3989    pmulhrsw             m6, m3
3990    packuswb             m0, m2
3991    packuswb             m1, m6
3992    mova           [tlq-63], m0
3993    mova           [tlq-31], m1
3994.h64_main:
3995    movd               xm12, dyd
3996    neg            maxbaseq
3997    vbroadcasti128       m8, [z3_shuf]
3998    vpbroadcastb         m7, [tlq+maxbaseq]
3999    shl            maxbased, 6
4000    vpbroadcastw        m12, xm12
4001    lea                 r5d, [dyq+maxbaseq-64]
4002    neg                 dyq
4003    or             maxbased, 63
4004    lea                  r4, [dyq+63]
4005    movd                xm6, r5d
4006    mova               xm10, [pb_1to32+16]
4007    vinserti128         m10, [pb_1to32], 1
4008    vpbroadcastd        m11, [pb_32]
4009    vpbroadcastw         m6, xm6
4010.h64_loop:
4011    mov                  r5, r4
4012    sar                  r5, 6
4013    movu                 m0, [tlq+r5-24]
4014    movu                 m1, [tlq+r5-32]
4015    pand                 m2, m4, m6
4016    psubw                m9, m5, m2
4017    psllw                m2, 8
4018    por                  m9, m2
4019    pshufb               m0, m8
4020    pshufb               m1, m8
4021    pmaddubsw            m0, m9
4022    pmaddubsw            m1, m9
4023    psraw                m2, m6, 6
4024    sub                 rsp, 64
4025    pmulhrsw             m0, m3
4026    pmulhrsw             m1, m3
4027    packsswb             m2, m2
4028    paddb                m2, m10
4029    packuswb             m0, m1
4030    vpblendvb            m0, m7, m0, m2
4031    mova           [rsp+32], m0
4032    movu                 m0, [tlq+r5-56]
4033    movu                 m1, [tlq+r5-64]
4034    add                  r4, dyq
4035    pshufb               m0, m8
4036    pshufb               m1, m8
4037    pmaddubsw            m0, m9
4038    pmaddubsw            m1, m9
4039    paddb                m2, m11
4040    pmulhrsw             m0, m3
4041    pmulhrsw             m1, m3
4042    paddw                m6, m12
4043    packuswb             m0, m1
4044    vpblendvb            m0, m7, m0, m2
4045    mova              [rsp], m0
4046    dec                  wd
4047    jz .h64_transpose
4048    cmp                 r4d, maxbased
4049    jg .h64_loop
4050.h64_end_loop:
4051    sub                 rsp, 64
4052    mova           [rsp+32], m7
4053    mova           [rsp+ 0], m7
4054    dec                  wd
4055    jg .h64_end_loop
4056.h64_transpose:
4057    lea                  r2, [strideq*3]
4058    lea                  r3, [strideq*5]
4059    imul                 r5, strideq, -8
4060    lea                dstq, [dstq+org_wq-16]
4061    lea                  r4, [strideq+r2*2] ; stride*7
4062.h64_transpose_loop0:
4063    lea                  r6, [rsp+16*3]
4064.h64_transpose_loop:
4065    mova                xm0, [r6+64*15]
4066    vinserti128          m0, [r6+64* 7], 1
4067    mova                xm1, [r6+64*14]
4068    vinserti128          m1, [r6+64* 6], 1
4069    mova                xm2, [r6+64*13]
4070    vinserti128          m2, [r6+64* 5], 1
4071    mova                xm3, [r6+64*12]
4072    vinserti128          m3, [r6+64* 4], 1
4073    mova                xm4, [r6+64*11]
4074    vinserti128          m4, [r6+64* 3], 1
4075    mova                xm5, [r6+64*10]
4076    vinserti128          m5, [r6+64* 2], 1
4077    mova                xm6, [r6+64* 9]
4078    vinserti128          m6, [r6+64* 1], 1
4079    mova                xm7, [r6+64* 8]
4080    vinserti128          m7, [r6+64* 0], 1
4081    sub                  r6, 16
4082    punpcklbw            m8, m0, m1
4083    punpckhbw            m0, m1
4084    punpcklbw            m1, m2, m3
4085    punpckhbw            m2, m3
4086    punpcklbw            m3, m4, m5
4087    punpckhbw            m4, m5
4088    punpcklbw            m5, m6, m7
4089    punpckhbw            m6, m7
4090    punpcklwd            m7, m8, m1
4091    punpckhwd            m8, m1
4092    punpcklwd            m1, m0, m2
4093    punpckhwd            m0, m2
4094    punpcklwd            m2, m3, m5
4095    punpckhwd            m3, m5
4096    punpcklwd            m5, m4, m6
4097    punpckhwd            m4, m6
4098    punpckldq            m6, m7, m2
4099    punpckhdq            m7, m2
4100    punpckldq            m2, m8, m3
4101    punpckhdq            m8, m3
4102    punpckldq            m3, m1, m5
4103    punpckhdq            m1, m5
4104    punpckldq            m5, m0, m4
4105    punpckhdq            m0, m4
4106    vpermq               m6, m6, q3120
4107    vpermq               m7, m7, q3120
4108    vpermq               m2, m2, q3120
4109    vpermq               m8, m8, q3120
4110    vpermq               m3, m3, q3120
4111    vpermq               m1, m1, q3120
4112    vpermq               m5, m5, q3120
4113    vpermq               m0, m0, q3120
4114    mova         [dstq+strideq*0], xm6
4115    vextracti128 [dstq+strideq*1], m6, 1
4116    mova         [dstq+strideq*2], xm7
4117    vextracti128 [dstq+r2       ], m7, 1
4118    mova         [dstq+strideq*4], xm2
4119    vextracti128 [dstq+r3       ], m2, 1
4120    mova         [dstq+r2*2     ], xm8
4121    vextracti128 [dstq+r4       ], m8, 1
4122    sub               dstq, r5
4123    mova         [dstq+strideq*0], xm3
4124    vextracti128 [dstq+strideq*1], m3, 1
4125    mova         [dstq+strideq*2], xm1
4126    vextracti128 [dstq+r2       ], m1, 1
4127    mova         [dstq+strideq*4], xm5
4128    vextracti128 [dstq+r3       ], m5, 1
4129    mova         [dstq+r2*2     ], xm0
4130    vextracti128 [dstq+r4       ], m0, 1
4131    sub                dstq, r5
4132    cmp                  r6, rsp
4133    jae .h64_transpose_loop
4134    add                 rsp, 64*16
4135    lea                dstq, [dstq+r5*8-16]
4136    sub              org_wd, 16
4137    jg .h64_transpose_loop0
4138.h64_end:
4139    RET
4140
4141%macro FILTER_XMM 4 ; dst, src, tmp, shuf
; Apply the filter-intra taps to one 4x2 block (XMM width).
; Register contract (set up by ipred_filter_8bpc):
;   xm1      = pw_8 rounding constant (added before the >>4)
;   xm2-xm5  = filter taps for pixel pairs (p0 p1), (p2 p3), (p4 p5), (p6 __)
; %2 is shuffled into p0..p6 order via %4 (a register number, or a memory
; operand when %4 is not numeric), each pixel pair is multiply-accumulated
; with its tap pair via pmaddubsw, and the rounded >>4 result is packed
; back to unsigned bytes in xm%1. xm%2 and xm%3 are clobbered.
4142%ifnum %4
4143    pshufb             xm%2, xm%4
4144%else
4145    pshufb             xm%2, %4
4146%endif
4147    pshufd             xm%1, xm%2, q0000 ; p0 p1
4148    pmaddubsw          xm%1, xm2
4149    pshufd             xm%3, xm%2, q1111 ; p2 p3
4150    pmaddubsw          xm%3, xm3
4151    paddw              xm%1, xm1         ; + 8 (rounding for the >>4 below)
4152    paddw              xm%1, xm%3
4153    pshufd             xm%3, xm%2, q2222 ; p4 p5
4154    pmaddubsw          xm%3, xm4
4155    paddw              xm%1, xm%3
4156    pshufd             xm%3, xm%2, q3333 ; p6 __
4157    pmaddubsw          xm%3, xm5
4158    paddw              xm%1, xm%3
4159    psraw              xm%1, 4
4160    packuswb           xm%1, xm%1
4161%endmacro
4162
4163%macro FILTER_YMM 4 ; dst, src, tmp, shuf
; YMM variant of FILTER_XMM: filters two 4x2 blocks at once, one per
; 128-bit lane. Same register contract (m1 = pw_8, m2-m5 = taps); the
; shuffle operand %4 is always a register number here. The final
; vperm2i128 swaps the lanes so packuswb interleaves both lane results
; into each half of m%1. m%2 and m%3 are clobbered.
4164    pshufb              m%2, m%4
4165    pshufd              m%1, m%2, q0000
4166    pmaddubsw           m%1, m2
4167    pshufd              m%3, m%2, q1111
4168    pmaddubsw           m%3, m3
4169    paddw               m%1, m1
4170    paddw               m%1, m%3
4171    pshufd              m%3, m%2, q2222
4172    pmaddubsw           m%3, m4
4173    paddw               m%1, m%3
4174    pshufd              m%3, m%2, q3333
4175    pmaddubsw           m%3, m5
4176    paddw               m%1, m%3
4177    psraw               m%1, 4
4178    vperm2i128          m%3, m%1, m%1, 0x01 ; swap lanes before the pack
4179    packuswb            m%1, m%3
4180%endmacro
4181
4182; The ipred_filter SIMD processes 4x2 blocks in the following order which
4183; increases parallelism compared to doing things row by row. One redundant
4184; block is calculated for w8 and w16, two for w32.
4185;     w4     w8       w16             w32
4186;     1     1 2     1 2 3 5     1 2 3 5 b c d f
4187;     2     2 3     2 4 5 7     2 4 5 7 c e f h
4188;     3     3 4     4 6 7 9     4 6 7 9 e g h j
4189; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
4190;           5       8           8       i
4191
4192cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
; FILTER_PRED intra predictor. Dispatches on log2(w)-2 through
; ipred_filter_avx2_table; filterd is scaled by 64 (4 x 16-byte tap rows
; per filter set) to index filter_intra_taps. m1 = pw_8 rounding constant
; and m2-m5 = the four tap-pair rows, as required by FILTER_XMM/FILTER_YMM.
; Each 4x2 output block feeds the prediction of its neighbors (see the
; ordering diagram above), so outputs are recycled as inputs via register
; shuffles/blends and, for w32, re-read from dst.
4193%define base r6-ipred_filter_avx2_table
4194    lea                  r6, [filter_intra_taps]
4195    tzcnt                wd, wm
4196%ifidn filterd, filterm
4197    movzx           filterd, filterb
4198%else
4199    movzx           filterd, byte filterm
4200%endif
4201    shl             filterd, 6               ; 64 bytes of taps per filter
4202    WIN64_SPILL_XMM       9, 15
4203    add             filterq, r6
4204    lea                  r6, [ipred_filter_avx2_table]
4205    movq                xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
4206    movsxd               wq, [r6+wq*4]
4207    vpbroadcastd         m1, [base+pw_8]
4208    vbroadcasti128       m2, [filterq+16*0]
4209    vbroadcasti128       m3, [filterq+16*1]
4210    vbroadcasti128       m4, [filterq+16*2]
4211    vbroadcasti128       m5, [filterq+16*3]
4212    add                  wq, r6
4213    mov                  hd, hm
4214    jmp                  wq
4215.w4:
4216    mova                xm8, [base+filter_shuf2]
4217    sub                 tlq, 3
4218    sub                 tlq, hq                  ; tlq+hq walks up the left edge
4219    jmp .w4_loop_start
4220.w4_loop:
4221    pinsrd              xm0, xm6, [tlq+hq], 0    ; previous output + next left pixel
4222    lea                dstq, [dstq+strideq*2]
4223.w4_loop_start:
4224    FILTER_XMM            6, 0, 7, 8
4225    movd   [dstq+strideq*0], xm6
4226    pextrd [dstq+strideq*1], xm6, 1
4227    sub                  hd, 2
4228    jg .w4_loop
4229    RET
4230ALIGN function_align
4231.w8:
4232    WIN64_PUSH_XMM       10
4233    mova                 m8, [base+filter_shuf1]
4234    FILTER_XMM            7, 0, 6, [base+filter_shuf2] ; block 1 (top-left)
4235    vpbroadcastd         m0, [tlq+4]
4236    vpbroadcastd         m6, [tlq+5]
4237    sub                 tlq, 4
4238    sub                 tlq, hq
4239    vpbroadcastq         m7, xm7
4240    vpblendd             m7, m6, 0x20
4241.w8_loop:
4242    vpbroadcastd        xm6, [tlq+hq]            ; new left-edge pixel
4243    palignr              m6, m0, 12
4244    vpblendd             m0, m6, m7, 0xeb     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4245                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4246    mova                xm6, xm7
4247    call .main                                ; filter both lanes at once
4248    vpblendd            xm6, xm7, 0x0c
4249    pshufd              xm6, xm6, q3120
4250    movq   [dstq+strideq*0], xm6
4251    movhps [dstq+strideq*1], xm6
4252    lea                dstq, [dstq+strideq*2]
4253    sub                  hd, 2
4254    jg .w8_loop
4255    RET
4256ALIGN function_align
4257.w16:
4258    sub                  hd, 2
4259    call .w16_main
4260%if WIN64
4261    jmp .end
4262%else
4263    RET
4264%endif
4265.w16_main:
4266    ; The spills are into the callers stack frame
4267    %assign stack_size stack_size + gprsize
4268    WIN64_PUSH_XMM       15, 9
4269    %assign stack_size stack_size - gprsize
4270    FILTER_XMM           12, 0, 7, [base+filter_shuf2] ; a0 (top-left block)
4271    vpbroadcastd         m0, [tlq+5]
4272    vpblendd             m0, [tlq-12], 0x14
4273    mova                 m8, [base+filter_shuf1]
4274    vpbroadcastq         m7, xm12
4275    vpblendd             m0, m7, 0xc2         ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4276                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4277    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
4278    movlps              xm9, xm7, [tlq+5]     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4279    vinserti128         m14, m8, [base+filter_shuf3], 0
4280    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
4281    FILTER_XMM            6, 9, 10, 14
4282    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
4283    vpbroadcastd         m9, [tlq+13]
4284    vpbroadcastd        m10, [tlq+12]
4285    psrld               m11, m8, 4            ; vpermd index for the transpose blend
4286    vpblendd             m6, m9, 0x20         ; top
4287    sub                 tlq, 6
4288    sub                 tlq, hq
4289.w16_loop:
4290    vpbroadcastd        xm9, [tlq+hq]
4291    palignr              m9, m0, 12
4292    vpblendd             m0, m9, m7, 0xe2     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4293                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4294    mova               xm13, xm7
4295    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
4296    vpblendd             m9, m12, m10, 0xf0
4297    vpblendd            m12, m6, 0xc0
4298    pshufd               m9, m9, q3333
4299    vpblendd             m9, m6, 0xee
4300    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4301                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4302    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
4303    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
4304    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
4305    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
4306    mova         [dstq+strideq*0], xm9
4307    vextracti128 [dstq+strideq*1], m9, 1
4308    lea                dstq, [dstq+strideq*2]
4309    sub                  hd, 2
4310    jg .w16_loop
4311    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
4312    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4313    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
4314    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
4315    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
4316    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
4317    mova   [dstq+strideq*0], xm0
4318    mova   [dstq+strideq*1], xm6
4319    ret
4320ALIGN function_align
4321.w32:
; w32 = two w16 passes; the right half re-reads its left-edge pixels from
; the already-written left-half output in dst (r3/r4 hold dst offsets).
4322    sub                  hd, 2
4323    lea                  r3, [dstq+16]
4324    lea                 r5d, [hq-2]
4325    call .w16_main
4326    add                 tlq, r5
4327    mov                dstq, r3
4328    lea                  r3, [strideq-4]
4329    lea                  r4, [r3+strideq*2]
4330    movq                xm0, [tlq+21]
4331    pinsrd              xm0, [dstq-4], 2
4332    pinsrd              xm0, [dstq+r3*1], 3
4333    FILTER_XMM           12, 0, 7, 14         ; a0 b0 a0 b0
4334    movq                xm7, [dstq+r3*2]
4335    pinsrd              xm7, [dstq+r4], 2
4336    palignr             xm7, xm0, 12          ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
4337    vpbroadcastd         m0, [tlq+28]
4338    vpbroadcastd         m9, [tlq+29]
4339    vbroadcasti128       m8, [base+filter_shuf1+16]
4340    vpblendd             m0, m9, 0x20
4341    vpblendd             m0, m7, 0x0f
4342    vpbroadcastq         m7, xm12
4343    vpblendd             m0, m7, 0xc2         ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4344    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
4345    add                  r3, 2
4346    lea                  r4, [r4+strideq*2]
4347    movlps              xm9, xm7, [tlq+29]    ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4348    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
4349    FILTER_XMM            6, 9, 10, 14
4350    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
4351    vpbroadcastd         m9, [tlq+37]
4352    vpbroadcastd        m10, [tlq+36]
4353    vpblendd             m6, m9, 0x20         ; top
4354.w32_loop:
4355    movq                xm9, [dstq+r3*4]      ; left edge comes from left-half output
4356    pinsrd              xm9, [dstq+r4], 2
4357.w32_loop_last:
4358    palignr              m9, m0, 12
4359    vpblendd             m0, m9, m7, 0xe2     ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4360    mova               xm13, xm7              ; c0 d0
4361    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
4362    vpblendd             m9, m12, m10, 0xf0
4363    vpblendd            m12, m6, 0xc0
4364    pshufd               m9, m9, q3333
4365    vpblendd             m9, m6, 0xee
4366    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4367                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4368    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
4369    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
4370    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
4371    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
4372    mova         [dstq+strideq*0], xm9
4373    vextracti128 [dstq+strideq*1], m9, 1
4374    lea                dstq, [dstq+strideq*2]
4375    sub                 r5d, 2
4376    jg .w32_loop
4377    jz .w32_loop_last
4378    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
4379    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4380    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
4381    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
4382    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
4383    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
4384    mova   [dstq+strideq*0], xm0
4385    mova   [dstq+strideq*1], xm6
4386.end:
4387    RET
4388ALIGN function_align
4389.main:
; Shared two-lane filter step: input pixels in m0, result in m7; m9 clobbered.
4390    FILTER_YMM            7, 0, 9, 8
4391    ret
4392
4393%if WIN64
4394DECLARE_REG_TMP 5
4395%else
4396DECLARE_REG_TMP 7
4397%endif
4398
4399%macro IPRED_CFL 1 ; ac in, unpacked pixels out
; pred = dc + ((alpha * ac + 32) >> 6), with the multiply done on absolute
; values so rounding is symmetric around zero. Register contract:
;   m0 = dc (words), m1 = alpha (words), m2 = |alpha| << 9
; pmulhrsw(|ac|, |alpha|<<9) = (|ac|*|alpha|*2^9*2 + 2^14) >> 15
;                            = (|ac|*|alpha| + 32) >> 6,
; then psignw reapplies the combined sign of ac and alpha. m3 clobbered.
4400    psignw               m3, m%1, m1
4401    pabsw               m%1, m%1
4402    pmulhrsw            m%1, m2
4403    psignw              m%1, m3
4404    paddw               m%1, m0
4405%endmacro
4406
4407cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; CfL prediction with DC taken from the top edge only.
; m2 = all-ones (-1): pmaddubsw by -1 yields negated byte-pair sums; the
; shared reduction code (.h4/.h8/... in ipred_cfl_left_8bpc, entered via
; the left table indexed by log2(w)) multiplies by -1 again with pmaddwd,
; so the final sum is positive. xm3 = 0x8000 >> log2(w), so the reduction's
; pmulhrsw is a rounded division by w. wq then indexes the splat table
; (store loops shared with the other cfl entry points).
4408    lea                  t0, [ipred_cfl_left_avx2_table]
4409    tzcnt                wd, wm
4410    inc                 tlq
4411    movu                 m0, [tlq]
4412    movifnidn            hd, hm
4413    mov                 r6d, 0x8000
4414    shrx                r6d, r6d, wd
4415    movd                xm3, r6d
4416    movsxd               r6, [t0+wq*4]
4417    pcmpeqd              m2, m2
4418    pmaddubsw            m0, m2
4419    add                  r6, t0
4420    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
4421    movsxd               wq, [t0+wq*4]
4422    add                  wq, t0
4423    movifnidn           acq, acmp
4424    jmp                  r6
4425
4426cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; CfL prediction with DC taken from the left edge only.
; m2 = all-ones (-1): pmaddubsw by -1 gives negated byte-pair sums, and the
; pmaddwd by -1 in .h4 negates again, producing the positive edge sum.
; xm3 = 0x8000 >> log2(h): pmulhrsw by it is a rounded division by h.
; r6 dispatches into the .hN reduction for this height; wq holds the splat
; (store-loop) entry for this width. The .hN labels fall through, each one
; halving the number of live partial sums.
4427    mov                  hd, hm ; zero upper half
4428    tzcnt               r6d, hd
4429    sub                 tlq, hq
4430    tzcnt                wd, wm
4431    movu                 m0, [tlq]
4432    mov                 t0d, 0x8000
4433    shrx                t0d, t0d, r6d
4434    movd                xm3, t0d
4435    lea                  t0, [ipred_cfl_left_avx2_table]
4436    movsxd               r6, [t0+r6*4]
4437    pcmpeqd              m2, m2
4438    pmaddubsw            m0, m2
4439    add                  r6, t0
4440    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
4441    movsxd               wq, [t0+wq*4]
4442    add                  wq, t0
4443    movifnidn           acq, acmp
4444    jmp                  r6
4445.h32:
4446    vextracti128        xm1, m0, 1
4447    paddw               xm0, xm1
4448.h16:
4449    punpckhqdq          xm1, xm0, xm0
4450    paddw               xm0, xm1
4451.h8:
4452    psrlq               xm1, xm0, 32
4453    paddw               xm0, xm1
4454.h4:
4455    pmaddwd             xm0, xm2              ; x(-1): sums become positive
4456    pmulhrsw            xm0, xm3              ; rounded /h (or /w from cfl_top)
4457    vpbroadcastw         m0, xm0
4458    jmp                  wq
4459
4460cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; Full DC+CfL prediction using both edges.
; .hN sums the left edge, .wN adds the top edge; both use pmaddubsw/pmaddwd
; with m3 = -1 (double negation yields positive sums, same trick as
; ipred_cfl_top/left above). xm4 = (w+h)/2 rounding bias (applied as a
; subtraction of -bias via psubw with the negated sums), xm5 = log2-ish
; shift = tzcnt(w+h). When w != h the pixel count is 3*2^k or 5*2^k, so a
; pmulhuw fixup multiply finishes the division:
;   0x5556 ~= 65536/3, 0x3334 ~= 65536/5.
; .sN are the store loops (also jumped to by the other cfl entry points):
; they apply alpha to the AC buffer via IPRED_CFL and write packed bytes.
4461    movifnidn            hd, hm
4462    movifnidn            wd, wm
4463    tzcnt               r6d, hd
4464    lea                 t0d, [wq+hq]
4465    movd                xm4, t0d
4466    tzcnt               t0d, t0d
4467    movd                xm5, t0d
4468    lea                  t0, [ipred_cfl_avx2_table]
4469    tzcnt                wd, wd
4470    movsxd               r6, [t0+r6*4]
4471    movsxd               wq, [t0+wq*4+4*4]
4472    pcmpeqd              m3, m3
4473    psrlw               xm4, 1                ; (w+h)/2 rounding bias
4474    add                  r6, t0
4475    add                  wq, t0
4476    movifnidn           acq, acmp
4477    jmp                  r6
4478.h4:
4479    movd                xm0, [tlq-4]
4480    pmaddubsw           xm0, xm3
4481    jmp                  wq
4482.w4:
4483    movd                xm1, [tlq+1]
4484    pmaddubsw           xm1, xm3
4485    psubw               xm0, xm4
4486    paddw               xm0, xm1
4487    pmaddwd             xm0, xm3
4488    cmp                  hd, 4
4489    jg .w4_mul
4490    psrlw               xm0, 3                ; 4x4: plain /8
4491    jmp .w4_end
4492.w4_mul:
; 0x55563334 packs both fixup factors; shrx by h*2 (mod 32) selects
; 0x5556 (/3) for h=8 or 0x3334 (/5) for h=16 in the low word.
4493    punpckhqdq          xm1, xm0, xm0
4494    lea                 r2d, [hq*2]
4495    mov                 r6d, 0x55563334
4496    paddw               xm0, xm1
4497    shrx                r6d, r6d, r2d
4498    psrlq               xm1, xm0, 32
4499    paddw               xm0, xm1
4500    movd                xm1, r6d
4501    psrlw               xm0, 2
4502    pmulhuw             xm0, xm1
4503.w4_end:
4504    vpbroadcastw         m0, xm0
4505.s4:
4506    vpbroadcastw         m1, alpham
4507    lea                  r6, [strideq*3]
4508    pabsw                m2, m1
4509    psllw                m2, 9                 ; |alpha|<<9 for IPRED_CFL
4510.s4_loop:
4511    mova                 m4, [acq]
4512    IPRED_CFL             4
4513    packuswb             m4, m4
4514    vextracti128        xm5, m4, 1
4515    movd   [dstq+strideq*0], xm4
4516    pextrd [dstq+strideq*1], xm4, 1
4517    movd   [dstq+strideq*2], xm5
4518    pextrd [dstq+r6       ], xm5, 1
4519    lea                dstq, [dstq+strideq*4]
4520    add                 acq, 32
4521    sub                  hd, 4
4522    jg .s4_loop
4523    RET
4524ALIGN function_align
4525.h8:
4526    movq                xm0, [tlq-8]
4527    pmaddubsw           xm0, xm3
4528    jmp                  wq
4529.w8:
4530    movq                xm1, [tlq+1]
4531    vextracti128        xm2, m0, 1
4532    pmaddubsw           xm1, xm3
4533    psubw               xm0, xm4
4534    paddw               xm0, xm2
4535    punpckhqdq          xm2, xm0, xm0
4536    paddw               xm0, xm2
4537    paddw               xm0, xm1
4538    psrlq               xm1, xm0, 32
4539    paddw               xm0, xm1
4540    pmaddwd             xm0, xm3
4541    psrlw               xm0, xm5
4542    cmp                  hd, 8
4543    je .w8_end
4544    mov                 r6d, 0x5556           ; /3 fixup (h=4 or 16)
4545    mov                 r2d, 0x3334           ; /5 fixup (h=32)
4546    cmp                  hd, 32
4547    cmove               r6d, r2d
4548    movd                xm1, r6d
4549    pmulhuw             xm0, xm1
4550.w8_end:
4551    vpbroadcastw         m0, xm0
4552.s8:
4553    vpbroadcastw         m1, alpham
4554    lea                  r6, [strideq*3]
4555    pabsw                m2, m1
4556    psllw                m2, 9
4557.s8_loop:
4558    mova                 m4, [acq]
4559    mova                 m5, [acq+32]
4560    IPRED_CFL             4
4561    IPRED_CFL             5
4562    packuswb             m4, m5
4563    vextracti128        xm5, m4, 1
4564    movq   [dstq+strideq*0], xm4
4565    movq   [dstq+strideq*1], xm5
4566    movhps [dstq+strideq*2], xm4
4567    movhps [dstq+r6       ], xm5
4568    lea                dstq, [dstq+strideq*4]
4569    add                 acq, 64
4570    sub                  hd, 4
4571    jg .s8_loop
4572    RET
4573ALIGN function_align
4574.h16:
4575    mova                xm0, [tlq-16]
4576    pmaddubsw           xm0, xm3
4577    jmp                  wq
4578.w16:
4579    movu                xm1, [tlq+1]
4580    vextracti128        xm2, m0, 1
4581    pmaddubsw           xm1, xm3
4582    psubw               xm0, xm4
4583    paddw               xm0, xm2
4584    paddw               xm0, xm1
4585    punpckhqdq          xm1, xm0, xm0
4586    paddw               xm0, xm1
4587    psrlq               xm1, xm0, 32
4588    paddw               xm0, xm1
4589    pmaddwd             xm0, xm3
4590    psrlw               xm0, xm5
4591    cmp                  hd, 16
4592    je .w16_end
4593    mov                 r6d, 0x5556           ; /3 fixup (h=8 or 32)
4594    mov                 r2d, 0x3334           ; /5 fixup (h=4 or 64)
4595    test                 hb, 8|32
4596    cmovz               r6d, r2d
4597    movd                xm1, r6d
4598    pmulhuw             xm0, xm1
4599.w16_end:
4600    vpbroadcastw         m0, xm0
4601.s16:
4602    vpbroadcastw         m1, alpham
4603    pabsw                m2, m1
4604    psllw                m2, 9
4605.s16_loop:
4606    mova                 m4, [acq]
4607    mova                 m5, [acq+32]
4608    IPRED_CFL             4
4609    IPRED_CFL             5
4610    packuswb             m4, m5
4611    vpermq               m4, m4, q3120
4612    mova         [dstq+strideq*0], xm4
4613    vextracti128 [dstq+strideq*1], m4, 1
4614    lea                dstq, [dstq+strideq*2]
4615    add                 acq, 64
4616    sub                  hd, 2
4617    jg .s16_loop
4618    RET
4619ALIGN function_align
4620.h32:
4621    mova                 m0, [tlq-32]
4622    pmaddubsw            m0, m3
4623    jmp                  wq
4624.w32:
4625    movu                 m1, [tlq+1]
4626    pmaddubsw            m1, m3
4627    paddw                m0, m1
4628    vextracti128        xm1, m0, 1
4629    psubw               xm0, xm4
4630    paddw               xm0, xm1
4631    punpckhqdq          xm1, xm0, xm0
4632    paddw               xm0, xm1
4633    psrlq               xm1, xm0, 32
4634    paddw               xm0, xm1
4635    pmaddwd             xm0, xm3
4636    psrlw               xm0, xm5
4637    cmp                  hd, 32
4638    je .w32_end
; 0x33345556 packs both fixups; shrx by h*2 (mod 32) selects 0x3334 (/5)
; for h=8, 0x5556 (/3) for h=16/64.
4639    lea                 r2d, [hq*2]
4640    mov                 r6d, 0x33345556
4641    shrx                r6d, r6d, r2d
4642    movd                xm1, r6d
4643    pmulhuw             xm0, xm1
4644.w32_end:
4645    vpbroadcastw         m0, xm0
4646.s32:
4647    vpbroadcastw         m1, alpham
4648    pabsw                m2, m1
4649    psllw                m2, 9
4650.s32_loop:
4651    mova                 m4, [acq]
4652    mova                 m5, [acq+32]
4653    IPRED_CFL             4
4654    IPRED_CFL             5
4655    packuswb             m4, m5
4656    vpermq               m4, m4, q3120
4657    mova             [dstq], m4
4658    add                dstq, strideq
4659    add                 acq, 64
4660    dec                  hd
4661    jg .s32_loop
4662    RET
4663
4664cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; CfL prediction with a fixed DC of 128 (no edge pixels available):
; m0 is splatted with pw_128 and control jumps straight into the shared
; .sN store loops of ipred_cfl_8bpc via the splat table, indexed by log2(w).
4665    lea                  t0, [ipred_cfl_splat_avx2_table]
4666    tzcnt                wd, wm
4667    movifnidn            hd, hm
4668    movsxd               wq, [t0+wq*4]
4669    vpbroadcastd         m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
4670    add                  wq, t0
4671    movifnidn           acq, acmp
4672    jmp                  wq
4673
4674cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
; Computes the CfL AC buffer for 4:2:0 chroma subsampling.
; Each output word is the 2x2 luma box sum with each pixel weighted by 2
; (pb_2 pmaddubsw + paddw of two rows), i.e. the sum scaled x2; presumably
; this matches the 3-fractional-bit AC format used by IPRED_CFL above.
; wpad/hpad rows and columns beyond the visible block are filled by
; replicating the last valid samples. m4 accumulates the running sum of all
; AC words; .calc_avg then subtracts the rounded average so the final
; buffer is zero-mean. hpad is supplied in units of 4 rows (shl 2).
4675    movifnidn         hpadd, hpadm
4676    movifnidn            wd, wm
4677    mov                  hd, hm
4678    mov                 szd, wd
4679    mov             ac_bakq, acq
4680    imul                szd, hd               ; sz = w*h AC samples
4681    shl               hpadd, 2
4682    sub                  hd, hpadd
4683    vpbroadcastd         m2, [pb_2]
4684    pxor                 m4, m4
4685    cmp                  wd, 8
4686    jg .w16
4687    je .w8
4688    ; fall-through
4689
4690    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
4691.w4:
4692    lea            stride3q, [strideq*3]
4693.w4_loop:
; 4 luma rows -> 2 AC rows of 4 words each (16 bytes).
4694    movq                xm0, [yq]
4695    movq                xm1, [yq+strideq]
4696    movhps              xm0, [yq+strideq*2]
4697    movhps              xm1, [yq+stride3q]
4698    pmaddubsw           xm0, xm2
4699    pmaddubsw           xm1, xm2
4700    paddw               xm0, xm1
4701    mova              [acq], xm0
4702    paddw               xm4, xm0
4703    lea                  yq, [yq+strideq*4]
4704    add                 acq, 16
4705    sub                  hd, 2
4706    jg .w4_loop
4707    test              hpadd, hpadd
4708    jz .calc_avg
4709    vpermq               m0, m0, q1111         ; replicate last AC row
4710.w4_hpad_loop:
4711    mova              [acq], m0
4712    paddw                m4, m0
4713    add                 acq, 32
4714    sub               hpadd, 4
4715    jg .w4_hpad_loop
4716    jmp .calc_avg
4717
4718.w8:
4719    lea            stride3q, [strideq*3]
4720    test              wpadd, wpadd
4721    jnz .w8_wpad
4722.w8_loop:
; 4 luma rows -> 2 AC rows of 8 words each (one ymm).
4723    mova                xm0, [yq]
4724    mova                xm1, [yq+strideq]
4725    vinserti128          m0, [yq+strideq*2], 1
4726    vinserti128          m1, [yq+stride3q], 1
4727    pmaddubsw            m0, m2
4728    pmaddubsw            m1, m2
4729    paddw                m0, m1
4730    mova              [acq], m0
4731    paddw                m4, m0
4732    lea                  yq, [yq+strideq*4]
4733    add                 acq, 32
4734    sub                  hd, 2
4735    jg .w8_loop
4736    test              hpadd, hpadd
4737    jz .calc_avg
4738    jmp .w8_hpad
4739.w8_wpad:
4740    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
4741.w8_wpad_loop:
; Only the left 4 luma pairs are valid; the shuffle replicates the last
; valid AC sample across the padded right half.
4742    movq                xm0, [yq]
4743    movq                xm1, [yq+strideq]
4744    vinserti128          m0, [yq+strideq*2], 1
4745    vinserti128          m1, [yq+stride3q], 1
4746    pmaddubsw            m0, m2
4747    pmaddubsw            m1, m2
4748    paddw                m0, m1
4749    pshufb               m0, m3
4750    mova              [acq], m0
4751    paddw                m4, m0
4752    lea                  yq, [yq+strideq*4]
4753    add                 acq, 32
4754    sub                  hd, 2
4755    jg .w8_wpad_loop
4756    test              hpadd, hpadd
4757    jz .calc_avg
4758.w8_hpad:
4759    vpermq               m0, m0, q3232         ; replicate last AC row
4760.w8_hpad_loop:
4761    mova              [acq], m0
4762    paddw                m4, m0
4763    add                 acq, 32
4764    sub               hpadd, 2
4765    jg .w8_hpad_loop
4766    jmp .calc_avg
4767
4768.w16:
4769    test              wpadd, wpadd
4770    jnz .w16_wpad
4771.w16_loop:
; 2 luma rows -> 1 AC row of 16 words.
4772    mova                 m0, [yq]
4773    mova                 m1, [yq+strideq]
4774    pmaddubsw            m0, m2
4775    pmaddubsw            m1, m2
4776    paddw                m0, m1
4777    mova              [acq], m0
4778    paddw                m4, m0
4779    lea                  yq, [yq+strideq*2]
4780    add                 acq, 32
4781    dec                  hd
4782    jg .w16_loop
4783    test              hpadd, hpadd
4784    jz .calc_avg
4785    jmp .w16_hpad_loop
4786.w16_wpad:
; Dispatch on wpad (1..3) via a jump table; m3 holds the matching
; replication shuffle for the padded columns.
4787    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
4788    lea               iptrq, [ipred_cfl_ac_420_avx2_table]
4789    shl               wpadd, 2
4790    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
4791                              ipred_cfl_ac_420_avx2_table+wpadq*8-32]
4792    movsxd            wpadq, [iptrq+wpadq+4]
4793    add               iptrq, wpadq
4794    jmp iptrq
4795.w16_pad3:
4796    vpbroadcastq         m0, [yq]              ; only 8 luma columns valid
4797    vpbroadcastq         m1, [yq+strideq]
4798    jmp .w16_wpad_end
4799.w16_pad2:
4800    vbroadcasti128       m0, [yq]              ; only 16 luma columns valid
4801    vbroadcasti128       m1, [yq+strideq]
4802    jmp .w16_wpad_end
4803.w16_pad1:
4804    mova                 m0, [yq]              ; 24 columns valid; shuffle fixes rest
4805    mova                 m1, [yq+strideq]
4806    ; fall-through
4807.w16_wpad_end:
4808    pmaddubsw            m0, m2
4809    pmaddubsw            m1, m2
4810    paddw                m0, m1
4811    pshufb               m0, m3
4812    mova              [acq], m0
4813    paddw                m4, m0
4814    lea                  yq, [yq+strideq*2]
4815    add                 acq, 32
4816    dec                  hd
4817    jz .w16_wpad_done
4818    jmp iptrq
4819.w16_wpad_done:
4820    test              hpadd, hpadd
4821    jz .calc_avg
4822.w16_hpad_loop:
4823    mova              [acq], m0
4824    paddw                m4, m0
4825    add                 acq, 32
4826    dec               hpadd
4827    jg .w16_hpad_loop
4828    ; fall-through
4829
4830.calc_avg:
; avg = (total + sz/2) >> log2(sz); subtract it from every AC word.
4831    vpbroadcastd         m2, [pw_1]
4832    pmaddwd              m0, m4, m2            ; widen word sums to dwords
4833    vextracti128        xm1, m0, 1
4834    tzcnt               r1d, szd
4835    paddd               xm0, xm1
4836    movd                xm2, r1d
4837    movd                xm3, szd
4838    punpckhqdq          xm1, xm0, xm0
4839    paddd               xm0, xm1
4840    psrad               xm3, 1                 ; sz/2 rounding bias
4841    psrlq               xm1, xm0, 32
4842    paddd               xm0, xm3
4843    paddd               xm0, xm1
4844    psrad               xm0, xm2
4845    vpbroadcastw         m0, xm0
4846.sub_loop:
4847    mova                 m1, [ac_bakq]
4848    psubw                m1, m0
4849    mova          [ac_bakq], m1
4850    add             ac_bakq, 32
4851    sub                 szd, 16
4852    jg .sub_loop
4853    RET
4854
4855cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
4856    movifnidn         hpadd, hpadm
4857    movifnidn            wd, wm
4858    mov                  hd, hm
4859    mov                 szd, wd
4860    mov             ac_bakq, acq
4861    imul                szd, hd
4862    shl               hpadd, 2
4863    sub                  hd, hpadd
4864    vpbroadcastd         m2, [pb_4]
4865    pxor                 m4, m4
4866    pxor                 m5, m5
4867    cmp                  wd, 8
4868    jg .w16
4869    je .w8
4870    ; fall-through
4871
4872    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
4873.w4:
4874    lea            stride3q, [strideq*3]
4875.w4_loop:
4876    movq                xm1, [yq]
4877    movhps              xm1, [yq+strideq]
4878    movq                xm0, [yq+strideq*2]
4879    movhps              xm0, [yq+stride3q]
4880    pmaddubsw           xm0, xm2
4881    pmaddubsw           xm1, xm2
4882    mova              [acq], xm1
4883    mova           [acq+16], xm0
4884    paddw               xm4, xm0
4885    paddw               xm5, xm1
4886    lea                  yq, [yq+strideq*4]
4887    add                 acq, 32
4888    sub                  hd, 4
4889    jg .w4_loop
4890    test              hpadd, hpadd
4891    jz .calc_avg
4892    vpermq               m0, m0, q1111
4893.w4_hpad_loop:
4894    mova              [acq], m0
4895    paddw                m4, m0
4896    add                 acq, 32
4897    sub               hpadd, 4
4898    jg .w4_hpad_loop
4899    jmp .calc_avg
4900
4901.w8:
4902    lea            stride3q, [strideq*3]
4903    test              wpadd, wpadd
4904    jnz .w8_wpad
4905.w8_loop:
4906    mova                xm1, [yq]
4907    vinserti128          m1, [yq+strideq], 1
4908    mova                xm0, [yq+strideq*2]
4909    vinserti128          m0, [yq+stride3q], 1
4910    pmaddubsw            m0, m2
4911    pmaddubsw            m1, m2
4912    mova              [acq], m1
4913    mova           [acq+32], m0
4914    paddw                m4, m0
4915    paddw                m5, m1
4916    lea                  yq, [yq+strideq*4]
4917    add                 acq, 64
4918    sub                  hd, 4
4919    jg .w8_loop
4920    test              hpadd, hpadd
4921    jz .calc_avg
4922    jmp .w8_hpad
4923.w8_wpad:
4924    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
4925.w8_wpad_loop:
4926    movq                xm1, [yq]
4927    vinserti128          m1, [yq+strideq], 1
4928    movq                xm0, [yq+strideq*2]
4929    vinserti128          m0, [yq+stride3q], 1
4930    pmaddubsw            m0, m2
4931    pmaddubsw            m1, m2
4932    pshufb               m0, m3
4933    pshufb               m1, m3
4934    mova              [acq], m1
4935    mova           [acq+32], m0
4936    paddw                m4, m0
4937    paddw                m5, m1
4938    lea                  yq, [yq+strideq*4]
4939    add                 acq, 64
4940    sub                  hd, 4
4941    jg .w8_wpad_loop
4942    test              hpadd, hpadd
4943    jz .calc_avg
4944.w8_hpad:
4945    vpermq               m0, m0, q3232
4946.w8_hpad_loop:
4947    mova              [acq], m0
4948    paddw                m4, m0
4949    add                 acq, 32
4950    sub               hpadd, 2
4951    jg .w8_hpad_loop
4952    jmp .calc_avg
4953
4954.w16:
4955    test              wpadd, wpadd
4956    jnz .w16_wpad
4957.w16_loop:
4958    mova                 m1, [yq]
4959    mova                 m0, [yq+strideq]
4960    pmaddubsw            m0, m2
4961    pmaddubsw            m1, m2
4962    mova              [acq], m1
4963    mova           [acq+32], m0
4964    paddw                m4, m0
4965    paddw                m5, m1
4966    lea                  yq, [yq+strideq*2]
4967    add                 acq, 64
4968    sub                  hd, 2
4969    jg .w16_loop
4970    test              hpadd, hpadd
4971    jz .calc_avg
4972    jmp .w16_hpad_loop
4973.w16_wpad:
4974    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
4975    lea               iptrq, [ipred_cfl_ac_422_avx2_table]
4976    shl               wpadd, 2
4977    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
4978                              ipred_cfl_ac_422_avx2_table+wpadq*8-32]
4979    movsxd            wpadq, [iptrq+wpadq+4]
4980    add               iptrq, wpadq
4981    jmp iptrq
4982.w16_pad3:
4983    vpbroadcastq         m1, [yq]
4984    vpbroadcastq         m0, [yq+strideq]
4985    jmp .w16_wpad_end
4986.w16_pad2:
4987    vbroadcasti128       m1, [yq]
4988    vbroadcasti128       m0, [yq+strideq]
4989    jmp .w16_wpad_end
4990.w16_pad1:
4991    mova                 m1, [yq]
4992    mova                 m0, [yq+strideq]
4993    ; fall-through
4994.w16_wpad_end:
4995    pmaddubsw            m0, m2
4996    pmaddubsw            m1, m2
4997    pshufb               m0, m3
4998    pshufb               m1, m3
4999    mova              [acq], m1
5000    mova           [acq+32], m0
5001    paddw                m4, m0
5002    paddw                m5, m1
5003    lea                  yq, [yq+strideq*2]
5004    add                 acq, 64
5005    sub                  hd, 2
5006    jz .w16_wpad_done
5007    jmp iptrq
5008.w16_wpad_done:
5009    test              hpadd, hpadd
5010    jz .calc_avg
5011.w16_hpad_loop:
5012    mova              [acq], m0
5013    mova           [acq+32], m0
5014    paddw                m4, m0
5015    paddw                m5, m0
5016    add                 acq, 64
5017    sub               hpadd, 2
5018    jg .w16_hpad_loop
5019    ; fall-through
5020
5021.calc_avg:
5022    vpbroadcastd         m2, [pw_1]
5023    pmaddwd              m5, m5, m2
5024    pmaddwd              m0, m4, m2
5025    paddd                m0, m5
5026    vextracti128        xm1, m0, 1
5027    tzcnt               r1d, szd
5028    paddd               xm0, xm1
5029    movd                xm2, r1d
5030    movd                xm3, szd
5031    punpckhqdq          xm1, xm0, xm0
5032    paddd               xm0, xm1
5033    psrad               xm3, 1
5034    psrlq               xm1, xm0, 32
5035    paddd               xm0, xm3
5036    paddd               xm0, xm1
5037    psrad               xm0, xm2
5038    vpbroadcastw         m0, xm0
5039.sub_loop:
5040    mova                 m1, [ac_bakq]
5041    psubw                m1, m0
5042    mova          [ac_bakq], m1
5043    add             ac_bakq, 32
5044    sub                 szd, 16
5045    jg .sub_loop
5046    RET
5047
;-----------------------------------------------------------------------------
; ipred_cfl_ac_444 (8bpc): fill the chroma-from-luma AC buffer from 4:4:4
; luma.  Each source byte is zero-extended and scaled by 8 (psllw 3) into
; ac[] as int16; afterwards the rounded average of the whole buffer is
; subtracted so the output is zero-mean.  wpad/hpad request replication of
; the last valid column/row into the padded right/bottom area.
; In:  ac = int16 output buffer, y = luma plane, stride = luma stride,
;      wpad/hpad = right/bottom padding, w/h = block size, sz = w*h scratch
cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn         hpadd, hpadm
    movifnidn            wd, wm
    mov                  hd, hm
    mov                 szd, wd
    imul                szd, hd                ; sz = w*h = number of output words
    shl               hpadd, 2                 ; hpad is passed in units of 4 rows
    sub                  hd, hpadd             ; h = rows actually loaded from y
    pxor                 m4, m4                ; m4 = running sum (words or dwords)
    vpbroadcastd         m5, [pw_1]            ; pw_1 for pmaddwd word->dword folds
    tzcnt               r8d, wd                ; log2(w) indexes the jump table
    lea                  r5, [ipred_cfl_ac_444_avx2_table]
    movsxd               r8, [r5+r8*4+12]      ; fetch per-width entry point
    add                  r5, r8

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
    mov             ac_bakq, acq               ; keep buffer start for .sub_loop
    jmp                  r5

.w4:
    lea            stride3q, [strideq*3]
    pxor                xm2, xm2               ; zero reg for byte->word unpack
.w4_loop:
    movd                xm1, [yq]              ; rows 0+1 -> xm1, rows 2+3 -> xm0
    movd                xm0, [yq+strideq*2]
    pinsrd              xm1, [yq+strideq], 1
    pinsrd              xm0, [yq+stride3q], 1
    punpcklbw           xm1, xm2               ; zero-extend u8 -> u16
    punpcklbw           xm0, xm2
    psllw               xm1, 3                 ; scale samples by 8
    psllw               xm0, 3
    mova              [acq], xm1
    mova           [acq+16], xm0
    paddw               xm1, xm0
    paddw               xm4, xm1               ; accumulate 16-bit sums
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_mul
    pshufd              xm0, xm0, q3232        ; replicate last row into both halves
    paddw               xm1, xm0, xm0          ; xm1 = sum of 4 replicated rows/pass
.w4_hpad_loop:
    mova              [acq], xm0               ; write 4 replicated rows per pass
    mova           [acq+16], xm0
    paddw               xm4, xm1
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg_mul

.w8:
    lea            stride3q, [strideq*3]
    pxor                 m2, m2                ; zero reg for byte->word unpack
.w8_loop:
    movq                xm1, [yq]              ; rows 0/1 in m1, rows 2/3 in m0
    movq                xm0, [yq+strideq*2]
    vinserti128          m1, [yq+strideq], 1
    vinserti128          m0, [yq+stride3q], 1
    punpcklbw            m1, m2                ; zero-extend u8 -> u16
    punpcklbw            m0, m2
    psllw                m1, 3                 ; scale samples by 8
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    paddw                m4, m1                ; accumulate 16-bit sums
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_mul
    vpermq               m0, m0, q3232         ; replicate last row into both lanes
    paddw                m1, m0, m0            ; m1 = sum of 4 replicated rows/pass
.w8_hpad_loop:
    mova              [acq], m0                ; write 4 replicated rows per pass
    mova           [acq+32], m0
    paddw                m4, m1
    add                 acq, 64
    sub               hpadd, 4
    jg .w8_hpad_loop
    jmp .calc_avg_mul

.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmovzxbw             m1, [yq]              ; 16 pixels zero-extended to words
    pmovzxbw             m0, [yq+strideq]
    psllw                m1, 3                 ; scale samples by 8
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    pmaddwd              m1, m5                ; fold to dwords each pass to avoid
    paddd                m4, m1                ; overflowing 16-bit accumulators
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad
.w16_wpad:
    mova                 m3, [cfl_ac_444_w16_pad1_shuffle]
.w16_wpad_loop:
    vpbroadcastq         m1, [yq]              ; only 8 valid pixels per row; the
    vpbroadcastq         m0, [yq+strideq]      ; shuffle zero-extends them to words
    pshufb               m1, m3                ; and fills the padded half by
    pshufb               m0, m3                ; replication (see shuffle table)
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    pmaddwd              m1, m5                ; fold to dwords (as in .w16_loop)
    paddd                m4, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_wpad_loop
    test              hpadd, hpadd
    jz .calc_avg
.w16_hpad:
    paddw                m1, m0, m0            ; m1 = dword sum of 2 replicated rows
    pmaddwd              m1, m5
.w16_hpad_loop:
    mova              [acq], m0                ; write 2 replicated rows per pass
    mova           [acq+32], m0
    paddd                m4, m1
    add                 acq, 64
    sub               hpadd, 2
    jg .w16_hpad_loop
    jmp .calc_avg

.w32:
    test              wpadd, wpadd
    jnz .w32_wpad
.w32_loop:
    pmovzxbw             m1, [yq]              ; one 32-pixel row per pass,
    pmovzxbw             m0, [yq+16]           ; processed as two 16-pixel halves
    psllw                m1, 3                 ; scale samples by 8
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m2, m1, m0
    pmaddwd              m2, m5                ; fold to dwords each pass
    paddd                m4, m2
    add                  yq, strideq
    add                 acq, 64
    dec                  hd
    jg .w32_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w32_hpad_loop
.w32_wpad:
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea               iptrq, [ipred_cfl_ac_444_avx2_table]
    add               wpadd, wpadd             ; index entry table by wpad
    mova                 m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
    movsxd            wpadq, [iptrq+wpadq+4]   ; fetch per-wpad entry point
    add               iptrq, wpadq
    jmp iptrq                                  ; loops re-enter via jmp iptrq below
.w32_pad3:                                     ; 8 valid pixels, 24 replicated
    vpbroadcastq         m1, [yq]
    pshufb               m1, m3                ; expand + replicate last pixel
    vpermq               m0, m1, q3232         ; m0 = replicated half only
    jmp .w32_wpad_end
.w32_pad2:                                     ; 16 valid pixels, 16 replicated
    pmovzxbw             m1, [yq]
    pshufhw              m0, m1, q3333         ; broadcast last valid pixel (word)
    vpermq               m0, m0, q3333         ; ...across all of m0
    jmp .w32_wpad_end
.w32_pad1:                                     ; 24 valid pixels, 8 replicated
    pmovzxbw             m1, [yq]
    vpbroadcastq         m0, [yq+16]           ; pixels 16-23 + replication
    pshufb               m0, m3
    ; fall-through
.w32_wpad_end:
    psllw                m1, 3                 ; scale samples by 8
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m2, m1, m0
    pmaddwd              m2, m5                ; fold to dwords each pass
    paddd                m4, m2
    add                  yq, strideq
    add                 acq, 64
    dec                  hd
    jz .w32_wpad_done
    jmp iptrq                                  ; next row via the same pad handler
.w32_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg
.w32_hpad_loop:
    mova              [acq], m1                ; replicate last (padded) row
    mova           [acq+32], m0
    paddd                m4, m2
    add                 acq, 64
    dec               hpadd
    jg .w32_hpad_loop
    jmp .calc_avg

.calc_avg_mul:
    pmaddwd              m4, m5                ; widen 16-bit sums to 32-bit
.calc_avg:
    ; Horizontal-reduce m4 to a single dword, compute the rounded average
    ; avg = (sum + sz/2) >> log2(sz), then subtract it from every entry.
    vextracti128        xm1, m4, 1
    tzcnt               r1d, szd               ; log2(sz); block dims are powers of 2
    paddd               xm0, xm4, xm1          ; fold 256 -> 128 bits
    movd                xm2, r1d
    movd                xm3, szd
    punpckhqdq          xm1, xm0, xm0
    paddd               xm0, xm1               ; fold 128 -> 64 bits
    psrad               xm3, 1                 ; sz/2 = rounding bias
    psrlq               xm1, xm0, 32
    paddd               xm0, xm3
    paddd               xm0, xm1               ; low dword = sum + sz/2
    psrad               xm0, xm2               ; >> log2(sz) -> average
    vpbroadcastw         m0, xm0
.sub_loop:                                     ; make the buffer zero-mean
    mova                 m1, [ac_bakq]
    psubw                m1, m0
    mova          [ac_bakq], m1
    add             ac_bakq, 32
    sub                 szd, 16                ; 16 words per 32-byte store
    jg .sub_loop
    RET
5277
;-----------------------------------------------------------------------------
; pal_pred (8bpc): palette prediction.  idx[] packs two 4-bit palette
; indices per byte (low nibble = even pixel, high nibble = odd pixel).
; The 8-byte palette is broadcast into every qword of m4 and looked up
; with pshufb; valid indices are 0-7, so the stray bits left in each
; index byte after the word shift never set bit 7 and, with the palette
; duplicated across the register, are harmless to the lookup.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vpbroadcastq         m4, [palq]            ; 8-entry palette in every qword
    lea                  r2, [pal_pred_avx2_table]
    tzcnt                wd, wm                ; log2(w) indexes the jump table
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4:
    movq                xm0, [idxq]            ; 8 bytes = 16 indices = 4x4 block
    add                idxq, 8
    psrlw               xm1, xm0, 4            ; high nibbles (odd pixels)
    punpcklbw           xm0, xm1               ; interleave -> indices in pixel order
    pshufb              xm0, xm4, xm0          ; palette lookup
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    movu                xm2, [idxq]            ; 16 bytes = 32 indices = 8x4 block
    add                idxq, 16
    pshufb              xm1, xm4, xm2          ; even pixels (low nibbles)
    psrlw               xm2, 4
    pshufb              xm2, xm4, xm2          ; odd pixels (high nibbles)
    punpcklbw           xm0, xm1, xm2          ; re-interleave into pixel order
    punpckhbw           xm1, xm2
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                 m2, [idxq]            ; 32 bytes = 64 indices = 16x4 block
    add                idxq, 32
    pshufb               m1, m4, m2            ; even pixels
    psrlw                m2, 4
    pshufb               m2, m4, m2            ; odd pixels
    punpcklbw            m0, m1, m2            ; re-interleave into pixel order
    punpckhbw            m1, m2
    mova         [dstq+strideq*0], xm0
    mova         [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+r2       ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    vpermq               m2, [idxq], q3120     ; pre-swap middle qwords so the
    add                idxq, 32                ; in-lane unpack yields pixel order
    pshufb               m1, m4, m2            ; even pixels
    psrlw                m2, 4
    pshufb               m2, m4, m2            ; odd pixels
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0                ; one 32-pixel row per register
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET
.w64:
    vpermq               m2, [idxq], q3120     ; same lane fix-up as .w32
    add                idxq, 32
    pshufb               m1, m4, m2            ; even pixels
    psrlw                m2, 4
    pshufb               m2, m4, m2            ; odd pixels
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+32*0], m0                ; both halves of one 64-pixel row
    mova        [dstq+32*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET
5361
5362%endif
5363