xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2020, VideoLAN and dav1d authors
2; Copyright © 2020, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 64
32
; Expands each argument w into the signed byte pair (w-128, 127-w);
; these pairs are consumed by pmaddubsw in the smooth predictors below.
33%macro SMOOTH_WEIGHT_TABLE 1-*
34    %rep %0
35        db %1-128, 127-%1
36        %rotate 1
37    %endrep
38%endmacro
39
; AV1 smooth-prediction weights, concatenated groups of 2, 4, 8, 16, 32
; and 64 entries (leading 0, 0 is padding), each stored as (w-128, 127-w).
40smooth_weights: SMOOTH_WEIGHT_TABLE         \
41      0,   0, 255, 128, 255, 149,  85,  64, \
42    255, 197, 146, 105,  73,  50,  37,  32, \
43    255, 225, 196, 170, 145, 123, 102,  84, \
44     68,  54,  43,  33,  26,  20,  17,  16, \
45    255, 240, 225, 210, 196, 182, 169, 157, \
46    145, 133, 122, 111, 101,  92,  83,  74, \
47     66,  59,  52,  45,  39,  34,  29,  25, \
48     21,  17,  14,  12,  10,   9,   8,   8, \
49    255, 248, 240, 233, 225, 218, 210, 203, \
50    196, 189, 182, 176, 169, 163, 156, 150, \
51    144, 138, 133, 127, 121, 116, 111, 106, \
52    101,  96,  91,  86,  82,  77,  73,  69, \
53     65,  61,  57,  54,  50,  47,  44,  41, \
54     38,  35,  32,  29,  27,  25,  22,  20, \
55     18,  16,  15,  13,  12,  10,   9,   8, \
56      7,   6,   6,   5,   5,   4,   4,   4
57
58; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
59filter_taps:  db 10,  0,  0,  0,  2, 10,  0,  0,  1,  1, 10,  0,  1,  1,  2, 10
60              db  6,  0,  0,  0,  2,  6,  0,  0,  2,  2,  6,  0,  1,  2,  2,  6
61              db  0, 12, -6,  0,  0,  9, -5,  0,  0,  7, -3,  0,  0,  5, -3,  0
62              db 12,  2, -4,  0,  9,  2, -3,  0,  7,  2, -3,  0,  5,  3, -3,  0
63              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
64              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
65              db  0, 10,-10,  0,  0,  6, -6,  0,  0,  4, -4,  0,  0,  2, -2,  0
66              db 10,  0,-10,  0,  6,  0, -6,  0,  4,  0, -4,  0,  2,  0, -2,  0
67              db  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8
68              db  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4
69              db  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0
70              db 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0
71              db  8,  0,  0,  0,  3,  8,  0,  0,  2,  3,  8,  0,  1,  2,  3,  8
72              db  4,  0,  0,  0,  3,  4,  0,  0,  2,  3,  4,  0,  2,  2,  3,  4
73              db  0, 10, -2,  0,  0,  6, -1,  0,  0,  4, -1,  0,  0,  2,  0,  0
74              db 10,  3, -1,  0,  6,  4, -1,  0,  4,  4, -1,  0,  3,  3, -1,  0
75              db 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14
76              db 12,  0,  0,  0,  1, 12,  0,  0,  0,  0, 12,  0,  0,  0,  1, 12
77              db  0, 14,-12,  0,  0, 12,-10,  0,  0, 11, -9,  0,  0, 10, -8,  0
78              db 14,  0,-10,  0, 12,  0, -9,  0, 11,  1, -8,  0,  9,  1, -7,  0
; permutation/termination indices for the filter predictor (byte indices;
; values >= 128 presumably select from the second vpermb source -- verify)
79filter_perm:  db  0,  1,  2,  3, 24, 25, 26, 27,  4,  5,  6,  7, 28, 29, 30, 31
80              db 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,131
81              db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
82              db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
83filter_end:   dd  2,  3, 16, 17, -1, -1, 20, 21,  0,  6, 24, 30,  1,  7, 25, 31
84smooth_shuf:  db  7,  7,  7,  7,  0,  1,  0,  1,  3,  3,  3,  3,  8,  9,  8,  9
85              db  5,  5,  5,  5,  4,  5,  4,  5,  1,  1,  1,  1, 12, 13, 12, 13
86              db  6,  6,  6,  6,  2,  3,  2,  3,  2,  2,  2,  2, 10, 11, 10, 11
87              db  4,  4,  4,  4,  6,  7,  6,  7,  0,  0,  0,  0, 14, 15, 14, 15
; vpermb indices selecting every odd byte (the high byte of each word),
; used to narrow word results back down to pixels
88smooth_endA:  db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
89              db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
90              db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
91              db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; as smooth_endA, but interleaving the two vpermt2b sources per 64 bytes
92smooth_endB:  db  1,  3,  5,  7,  9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
93              db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
94              db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
95              db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
; pshufb control replicating bytes 7..0 four times each (rows of ipred_h)
96ipred_h_shuf: db  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4
97              db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
98pal_unpack:   db  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
99pal_perm:     db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
; descending byte indices 63..0: full-vector byte reversal permutation
100pb_63to0:     db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
101              db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
102              db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
103              db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
; interleaved (64-frac, frac) byte pairs, frac in steps of 2; each pair
; sums to 64, i.e. 1/64th-step interpolation weights for pmaddubsw
104z_frac_table: db 64,  0, 62,  2, 60,  4, 58,  6, 56,  8, 54, 10, 52, 12, 50, 14
105              db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
106              db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
107              db 16, 48, 14, 50, 12, 52, 10, 54,  8, 56,  6, 58,  4, 60,  2, 62
; neighbouring-pair shuffles for the z edge filters (s1..s5)
108z_filter_s1:  db -1, -1, -1,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6
109              db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
110              db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
111              db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
112z_filter_s5:  db 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
113              db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
114              db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
115              db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
116z_filter_s3:  db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
117z_filter_s2:  db  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
118z_filter_s4:  db  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8
; also aliased as pb_17 (+0), pb_33 (+4) and pb_9 (+8) via %defines below
119z_xpos_bc:    db 17, 17, 17, 17, 33, 33, 33, 33,  9,  9,  9,  9,  9,  9,  9,  9
120z_filter4_s1: db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
121              db  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8
122z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
123z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
124z_xpos_off2a: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
125              db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
126              db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
127              db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
128z_xpos_off2b: db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
129              db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
130              db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
131              db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
132z_xpos_mul:   dw  4,  4,  4,  4,  8,  8,  4,  4, 12, 12,  8,  8, 16, 16,  8,  8
133              dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
134z_ypos_off1:  db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
135              db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
136              db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
137              db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
138z_ypos_off2:  db 64, 65, 64, 65,  0,  0,  0,  0, 64, 65, 64, 65,  0,  0,  0,  0
139              db 65, 66, 65, 66,  1,  1,  1,  1, 65, 66, 65, 66,  1,  1,  1,  1
140              db 66, 67, 66, 67,  2,  2,  2,  2, 66, 67, 66, 67,  2,  2,  2,  2
141              db 67, 68, 67, 68,  3,  3,  3,  3, 67, 68, 67, 68,  3,  3,  3,  3
142z_ypos_off3:  db  1,  2,  1,  2,  1,  1,  1,  1,  3,  4,  3,  4,  1,  1,  1,  1
143              db  5,  6,  5,  6,  3,  3,  3,  3,  7,  8,  7,  8,  3,  3,  3,  3
144              db  9, 10,  9, 10,  5,  5,  5,  5, 11, 12, 11, 12,  5,  5,  5,  5
145              db 13, 14, 13, 14,  7,  7,  7,  7, 15, 16, 15, 16,  7,  7,  7,  7
146z_ypos_mul1a: dw  1,  2,  3,  4,  5,  6,  7,  8, 17, 18, 19, 20, 21, 22, 23, 24
147              dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
148z_ypos_mul1b: dw  9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
149              dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
; mul1a/mul1b scaled by 512 (pre-shifted fixed-point variants)
150z_ypos_mul2a: dw  1*512,  2*512,  3*512,  4*512,  5*512,  6*512,  7*512,  8*512
151              dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
152              dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
153              dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
154z_ypos_mul2b: dw  9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
155              dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
156              dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
157              dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
158z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
159z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
160z3_upsample:  db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
161              db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8
162z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
163              db 39, 39, 47, 47, 47, 79, 79, 79
164z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
165              db 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16,  0
166              db  0, 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16
167
; small constants, 2/4-byte patterns picked up with vpbroadcastd
168pb_8_56_0_0:  db  8, 56,  0,  0
169pb_m4_36:     times 2 db -4, 36
170pb_127_m127:  times 2 db 127, -127
171pb_8:         times 4 db 8
172pb_15:        times 4 db 15
173pb_16:        times 4 db 16
174pb_31:        times 4 db 31
175pb_63:        times 4 db 63
176pb_90:        times 4 db 90
177pb_128:       times 4 db 128
178pw_128:       times 2 dw 128
179pw_255:       times 2 dw 255
180pw_512:       times 2 dw 512
181
; Broadcast constants aliased onto matching byte runs inside the tables
; above to save rodata: e.g. ipred_h_shuf+24 is four 1 bytes, z_xpos_bc
; holds 17/33/9 runs, and filter_taps+128 starts with dword value 8.
182%define pb_1  (ipred_h_shuf+24)
183%define pb_2  (ipred_h_shuf+20)
184%define pb_3  (ipred_h_shuf+16)
185%define pb_4  (smooth_shuf +48)
186%define pb_7  (ipred_h_shuf+ 0)
187%define pb_9  (z_xpos_bc   + 8)
188%define pb_17 (z_xpos_bc   + 0)
189%define pb_33 (z_xpos_bc   + 4)
190%define pd_8  (filter_taps+128)
191
; Builds a dispatch table of 32-bit label offsets for function %1, suffix
; %2, labels .%3... The exported table symbol is biased back by 2 entries
; (-2*4) so it can be indexed directly with tzcnt(size), whose minimum
; value is 2 (smallest block dimension is 4).
192%macro JMP_TABLE 3-*
193    %xdefine %1_%2_table (%%table - 2*4)
194    %xdefine %%base mangle(private_prefix %+ _%1_%2)
195    %%table:
196    %rep %0 - 2
197        dd %%base %+ .%3 - (%%table - 2*4)   ; offset relative to the biased base
198        %rotate 1
199    %endrep
200%endmacro
201
; The dc table stores 5 .h* and 5 .w* entries followed by the 5 shared
; .s* splat-store entries; alias a name for direct .s* dispatch.
202%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
203
; Per-size dispatch tables, indexed by tzcnt(w) or tzcnt(h) (note the
; -2*4 bias baked into the table symbol by JMP_TABLE).
204JMP_TABLE ipred_h_8bpc,          avx512icl, w4, w8, w16, w32, w64
205JMP_TABLE ipred_paeth_8bpc,      avx512icl, w4, w8, w16, w32, w64
206JMP_TABLE ipred_smooth_8bpc,     avx512icl, w4, w8, w16, w32, w64
207JMP_TABLE ipred_smooth_v_8bpc,   avx512icl, w4, w8, w16, w32, w64
208JMP_TABLE ipred_smooth_h_8bpc,   avx512icl, w4, w8, w16, w32, w64
209JMP_TABLE ipred_z1_8bpc,         avx512icl, w4, w8, w16, w32, w64
210JMP_TABLE ipred_z2_8bpc,         avx512icl, w4, w8, w16, w32, w64
211JMP_TABLE ipred_z3_8bpc,         avx512icl, w4, w8, w16, w32, w64
; the -10*4 on the .s* entries compensates the +10*4 in the splat alias
212JMP_TABLE ipred_dc_8bpc,         avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
213                                       s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
214JMP_TABLE ipred_dc_left_8bpc,    avx512icl, h4, h8, h16, h32, h64
215
216cextern dr_intra_derivative
217cextern pb_0to63
218
219SECTION .text
220
221INIT_ZMM avx512icl
; dc_top: fill dst with the average of the w pixels above the block.
; Reuses dc_left's per-length reduction tails (.h4-.h64) -- valid because
; the run length here is w -- and dc's splat store loops (.s*).
222cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
223    lea                  r5, [ipred_dc_left_8bpc_avx512icl_table]
224    movd                xm0, wm
225    tzcnt                wd, wm                ; wd = log2(w)
226    inc                 tlq                    ; top row starts at tl+1
227    movifnidn            hd, hm
228    movu                ym1, [tlq]             ; up to 32 top pixels (.h64 adds the rest)
229    movd               xmm3, wd                ; shift count for the final average
230    movsxd               r6, [r5+wq*4]         ; dc_left reduction tail for length w
231    vpbroadcastd        ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
232    psrld               xm0, 1                 ; dword0 = w/2 = rounding bias
233    vpdpbusd            ym0, ym1, ym2          ; per-dword byte sums; the chosen tail
                                                 ; only folds the dwords covering w pixels
234    add                  r6, r5
235    add                  r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
236    movsxd               wq, [r5+wq*4]         ; splat store loop for this width
237    add                  wq, r5
238    jmp                  r6
239
; dc_left: fill dst with the average of the h pixels left of the block.
; The .h4-.h64 reduction tails below are shared entry points (dc_top
; jumps into them with its own bias/shift already set up).
240cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
241    lea                  r5, [ipred_dc_left_8bpc_avx512icl_table]
242    mov                  hd, hm
243    tzcnt               r6d, hd                ; r6d = log2(h)
244    sub                 tlq, hq                ; left pixels occupy [tlq..tlq+h-1]
245    tzcnt                wd, wm
246    movd                xm0, hm
247    movu                ym1, [tlq]             ; up to 32 left pixels
248    movd               xmm3, r6d               ; shift count for the final average
249    movsxd               r6, [r5+r6*4]
250    vpbroadcastd        ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
251    psrld               xm0, 1                 ; dword0 = h/2 = rounding bias
252    vpdpbusd            ym0, ym1, ym2          ; per-dword byte sums
253    add                  r6, r5
254    add                  r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
255    movsxd               wq, [r5+wq*4]         ; splat store loop for this width
256    add                  wq, r5
257    jmp                  r6
258.h64:
259    movu                ym1, [tlq+32] ; unaligned when jumping here from dc_top
260    vpdpbusd            ym0, ym1, ym2          ; accumulate pixels 32-63
261.h32:
262    vextracti32x4       xm1, ym0, 1
263    paddd               xm0, xm1               ; fold sums of pixels 16-31
264.h16:
265    punpckhqdq          xm1, xm0, xm0
266    paddd               xm0, xm1               ; fold sums of pixels 8-15
267.h8:
268    psrlq               xm1, xm0, 32
269    paddd               xm0, xm1               ; dword0 = bias + sum of all pixels
270.h4:
271    vpsrlvd             xm0, xmm3              ; (bias + sum) >> log2(len) = average
272    lea            stride3q, [strideq*3]
273    vpbroadcastb         m0, xm0               ; splat the dc value across the register
274    jmp                  wq                     ; -> .s4/.s8/... store loop in ipred_dc
275
; dc: fill dst with the average of the w top and h left pixels.
; The sum is divided by w+h: for square blocks a plain shift; for
; rectangular blocks (w+h = 3*2^k or 5*2^k) a shift by tzcnt(w+h)
; followed by pmulhuw with ~2^16/3 (0x5556) or ~2^16/5 (0x3334).
276cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
277    movifnidn            hd, hm
278    movifnidn            wd, wm
279    tzcnt               r6d, hd
280    lea                 r5d, [wq+hq]
281    movd                xm0, r5d
282    tzcnt               r5d, r5d
283    movd               xmm4, r5d               ; xmm4 = tzcnt(w+h): shift part of divide
284    lea                  r5, [ipred_dc_8bpc_avx512icl_table]
285    tzcnt                wd, wd
286    movsxd               r6, [r5+r6*4]         ; .h<h>: accumulate the left pixels
287    movsxd               wq, [r5+wq*4+5*4]     ; .w<w>: accumulate top pixels + divide
288    vpbroadcastd        ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
289    psrld               xm0, 1                 ; rounding bias (w+h)/2
290    add                  r6, r5
291    add                  wq, r5
292    lea            stride3q, [strideq*3]
293    jmp                  r6
294.h4:
295    movd               xmm1, [tlq-4]
296    vpdpbusd            xm0, xmm1, xm3         ; += sum of 4 left pixels
297    jmp                  wq
298.w4:
299    movd               xmm1, [tlq+1]
300    vpdpbusd            xm0, xmm1, xm3         ; += sum of 4 top pixels
301    cmp                  hd, 4
302    jg .w4_mul
303    psrlw              xmm0, xm0, 3            ; 4x4: plain /8
304    jmp .w4_end
305.w4_mul:
    ; w+h = 12 (h=8) or 20 (h=16): shift by 2, then multiply-high by
    ; ~2^16/3 or ~2^16/5 (shrx shift count is taken mod 32, so h=16
    ; selects the low word 0x3334)
306    punpckhqdq         xmm1, xm0, xm0
307    lea                 r2d, [hq*2]
308    mov                 r6d, 0x55563334
309    paddd              xmm1, xm0
310    shrx                r6d, r6d, r2d
311    psrlq              xmm0, xmm1, 32
312    paddd              xmm0, xmm1              ; dword0 = bias + full sum
313    movd               xmm1, r6d
314    psrld              xmm0, 2
315    pmulhuw            xmm0, xmm1
316.w4_end:
317    vpbroadcastb        xm0, xmm0
318.s4:
319    movd   [dstq+strideq*0], xm0
320    movd   [dstq+strideq*1], xm0
321    movd   [dstq+strideq*2], xm0
322    movd   [dstq+stride3q ], xm0
323    lea                dstq, [dstq+strideq*4]
324    sub                  hd, 4
325    jg .s4
326    RET
327.h8:
328    movq               xmm1, [tlq-8]
329    vpdpbusd            xm0, xmm1, xm3         ; += sum of 8 left pixels
330    jmp                  wq
331.w8:
332    movq               xmm1, [tlq+1]
333    vextracti32x4       xm2, ym0, 1            ; upper half of the left sums (h=32)
334    vpdpbusd            xm0, xmm1, xm3         ; += sum of 8 top pixels
335    paddd              xmm2, xm2, xm0
336    punpckhqdq         xmm0, xmm2, xmm2
337    paddd              xmm0, xmm2
338    psrlq              xmm1, xmm0, 32
339    paddd              xmm0, xmm1              ; dword0 = bias + full sum
340    vpsrlvd            xmm0, xmm4              ; >> tzcnt(w+h)
341    cmp                  hd, 8
342    je .w8_end                                 ; square: the shift was the whole divide
343    mov                 r6d, 0x5556             ; /3: w+h = 12 or 24 (h = 4 or 16)
344    mov                 r2d, 0x3334             ; /5: w+h = 40 (h = 32)
345    cmp                  hd, 32
346    cmove               r6d, r2d
347    movd               xmm1, r6d
348    pmulhuw            xmm0, xmm1
349.w8_end:
350    vpbroadcastb        xm0, xmm0
351.s8:
352    movq   [dstq+strideq*0], xm0
353    movq   [dstq+strideq*1], xm0
354    movq   [dstq+strideq*2], xm0
355    movq   [dstq+stride3q ], xm0
356    lea                dstq, [dstq+strideq*4]
357    sub                  hd, 4
358    jg .s8
359    RET
360.h16:
361    mova               xmm1, [tlq-16]
362    vpdpbusd            xm0, xmm1, xm3         ; += sum of 16 left pixels
363    jmp                  wq
364.w16:
365    movu               xmm1, [tlq+1]
366    vextracti32x4       xm2, ym0, 1
367    vpdpbusd            xm0, xmm1, xm3         ; += sum of 16 top pixels
368    paddd              xmm2, xm2, xm0
369    punpckhqdq         xmm0, xmm2, xmm2
370    paddd              xmm0, xmm2
371    psrlq              xmm1, xmm0, 32
372    paddd              xmm0, xmm1              ; dword0 = bias + full sum
373    vpsrlvd            xmm0, xmm4              ; >> tzcnt(w+h)
374    cmp                  hd, 16
375    je .w16_end
376    mov                 r6d, 0x5556             ; /3: w+h = 24 or 48 (h = 8 or 32)
377    mov                 r2d, 0x3334             ; /5: w+h = 20 or 80 (h = 4 or 64)
378    test                 hb, 8|32
379    cmovz               r6d, r2d
380    movd               xmm1, r6d
381    pmulhuw            xmm0, xmm1
382.w16_end:
383    vpbroadcastb        xm0, xmm0
384.s16:
385    mova   [dstq+strideq*0], xm0
386    mova   [dstq+strideq*1], xm0
387    mova   [dstq+strideq*2], xm0
388    mova   [dstq+stride3q ], xm0
389    lea                dstq, [dstq+strideq*4]
390    sub                  hd, 4
391    jg .s16
392    RET
393.h32:
394    mova                ym1, [tlq-32]
395    vpdpbusd            ym0, ym1, ym3          ; += sum of 32 left pixels
396    jmp                  wq
397.w32:
398    movu                ym1, [tlq+1]
399    vpdpbusd            ym0, ym1, ym3          ; += sum of 32 top pixels
400    vextracti32x4       xm1, ym0, 1
401    paddd              xmm1, xm1, xm0
402    punpckhqdq         xmm0, xmm1, xmm1
403    paddd              xmm0, xmm1
404    psrlq              xmm1, xmm0, 32
405    paddd              xmm0, xmm1              ; dword0 = bias + full sum
406    vpsrlvd            xmm0, xmm4              ; >> tzcnt(w+h)
407    cmp                  hd, 32
408    je .w32_end
409    lea                 r2d, [hq*2]
    ; shrx count mod 32: h=8 -> 0x3334 (/5, w+h=40); h=16 or 64 -> 0x5556
    ; (/3, w+h=48 or 96)
410    mov                 r6d, 0x33345556
411    shrx                r6d, r6d, r2d
412    movd               xmm1, r6d
413    pmulhuw            xmm0, xmm1
414.w32_end:
415    vpbroadcastb        ym0, xmm0
416.s32:
417    mova   [dstq+strideq*0], ym0
418    mova   [dstq+strideq*1], ym0
419    mova   [dstq+strideq*2], ym0
420    mova   [dstq+stride3q ], ym0
421    lea                dstq, [dstq+strideq*4]
422    sub                  hd, 4
423    jg .s32
424    RET
425.h64:
426    mova                ym1, [tlq-64]
427    mova                ym2, [tlq-32]
428    vpdpbusd            ym0, ym1, ym3          ; += sum of 64 left pixels
429    vpdpbusd            ym0, ym2, ym3
430    jmp                  wq
431.w64:
432    movu                ym1, [tlq+ 1]
433    movu                ym2, [tlq+33]
434    vpdpbusd            ym0, ym1, ym3          ; += sum of 64 top pixels
435    vpdpbusd            ym0, ym2, ym3
436    vextracti32x4       xm1, ym0, 1
437    paddd              xmm1, xm1, xm0
438    punpckhqdq         xmm0, xmm1, xmm1
439    paddd              xmm0, xmm1
440    psrlq              xmm1, xmm0, 32
441    paddd              xmm0, xmm1              ; dword0 = bias + full sum
442    vpsrlvd            xmm0, xmm4              ; >> tzcnt(w+h)
443    cmp                  hd, 64
444    je .w64_end
445    mov                 r6d, 0x33345556
    ; shrx count mod 32: h=16 -> 0x3334 (/5, w+h=80); h=32 -> 0x5556 (/3, w+h=96)
446    shrx                r6d, r6d, hd
447    movd               xmm1, r6d
448    pmulhuw            xmm0, xmm1
449.w64_end:
450    vpbroadcastb         m0, xmm0
451.s64:
452    mova   [dstq+strideq*0], m0
453    mova   [dstq+strideq*1], m0
454    mova   [dstq+strideq*2], m0
455    mova   [dstq+stride3q ], m0
456    lea                dstq, [dstq+strideq*4]
457    sub                  hd, 4
458    jg .s64
459    RET
460
; dc_128: fill dst with the constant 128 (no reference pixels available).
461cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
462    lea                  r5, [ipred_dc_splat_8bpc_avx512icl_table]
463    tzcnt                wd, wm
464    movifnidn            hd, hm
465    movsxd               wq, [r5+wq*4]
466    vpbroadcastd         m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
467    add                  wq, r5
468    lea            stride3q, [strideq*3]
469    jmp                  wq                     ; -> dc's .s* store loop
470
; v: copy the row of pixels above the block to every output row.
471cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
472    lea                  r5, [ipred_dc_splat_8bpc_avx512icl_table]
473    tzcnt                wd, wm
474    movu                 m0, [tlq+1]           ; up to 64 top pixels
475    movifnidn            hd, hm
476    movsxd               wq, [r5+wq*4]
477    add                  wq, r5
478    lea            stride3q, [strideq*3]
479    jmp                  wq                     ; -> dc's .s* loop, repeating the row
480
; h: each output row is filled with its left-edge pixel. Left pixels are
; read 4 at a time from [tlq+hq-4] (stored in reverse: byte 3 is the
; topmost of the group) and distributed to rows with pshufb controls.
481cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
482%define base r6-ipred_h_8bpc_avx512icl_table
483    lea                  r6, [ipred_h_8bpc_avx512icl_table]
484    tzcnt                wd, wm
485    mov                  hd, hm
486    movsxd               wq, [r6+wq*4]
487    lea            stride3q, [strideq*3]
488    sub                 tlq, hq                ; left pixels at [tlq..tlq+h-1]
489    add                  wq, r6
490    jmp                  wq
491.w4:
492    mova               xmm1, [base+ipred_h_shuf+16]
493.w4_loop:
494    movd               xmm0, [tlq+hq-4]       ; 4 left pixels -> 4 rows
495    pshufb             xmm0, xmm1
496    movd   [dstq+strideq*0], xmm0
497    pextrd [dstq+strideq*1], xmm0, 1
498    pextrd [dstq+strideq*2], xmm0, 2
499    pextrd [dstq+stride3q ], xmm0, 3
500    lea                dstq, [dstq+strideq*4]
501    sub                  hd, 4
502    jg .w4_loop
503    RET
504.w8:
505    movsldup           xmm2, [base+ipred_h_shuf+16]
506    movshdup           xmm3, [base+ipred_h_shuf+16]
507.w8_loop:
508    movd               xmm1, [tlq+hq-4]
509    pshufb             xmm0, xmm1, xmm2       ; rows 0/2
510    pshufb             xmm1, xmm3             ; rows 1/3
511    movq   [dstq+strideq*0], xmm0
512    movq   [dstq+strideq*1], xmm1
513    movhps [dstq+strideq*2], xmm0
514    movhps [dstq+stride3q ], xmm1
515    lea                dstq, [dstq+strideq*4]
516    sub                  hd, 4
517    jg .w8_loop
518    RET
519.w16:
520    movsldup             m1, [base+smooth_shuf]
521.w16_loop:
522    vpbroadcastd         m0, [tlq+hq-4]
523    pshufb               m0, m1
524    mova          [dstq+strideq*0], xm0
525    vextracti32x4 [dstq+strideq*1], m0, 2
526    vextracti32x4 [dstq+strideq*2], ym0, 1
527    vextracti32x4 [dstq+stride3q ], m0, 3
528    lea                dstq, [dstq+strideq*4]
529    sub                  hd, 4
530    jg .w16_loop ; fixed: was "jg .w16", which re-ran the movsldup setup
                   ; (reloading the invariant m1 shuffle) on every iteration
531    RET
532.w32:
533    vpbroadcastd        ym3, [base+pb_1]
534    vpord                m2, m3, [base+pb_2] {1to16} ; m2: bytes 3 | 2, m3: 1 | 0
535.w32_loop:
536    vpbroadcastd         m1, [tlq+hq-4]
537    pshufb               m0, m1, m2            ; rows 0/1
538    pshufb               m1, m3                ; rows 2/3
539    mova          [dstq+strideq*0], ym0
540    vextracti32x8 [dstq+strideq*1], m0, 1
541    mova          [dstq+strideq*2], ym1
542    vextracti32x8 [dstq+stride3q ], m1, 1
543    lea                dstq, [dstq+strideq*4]
544    sub                  hd, 4
545    jg .w32_loop
546    RET
547.w64:
548    vpbroadcastd         m4, [base+pb_3]       ; byte 3 = topmost of the group
549    vpbroadcastd         m5, [base+pb_2]
550    vpbroadcastd         m6, [base+pb_1]
551    pxor                 m7, m7                ; byte 0 = bottom of the group
552.w64_loop:
553    vpbroadcastd         m3, [tlq+hq-4]
554    pshufb               m0, m3, m4
555    pshufb               m1, m3, m5
556    pshufb               m2, m3, m6
557    pshufb               m3, m7
558    mova   [dstq+strideq*0], m0
559    mova   [dstq+strideq*1], m1
560    mova   [dstq+strideq*2], m2
561    mova   [dstq+stride3q ], m3
562    lea                dstq, [dstq+strideq*4]
563    sub                  hd, 4
564    jg .w64_loop
565    RET
566
; Paeth predictor for one register of pixels: select left, top or
; topleft, whichever is closest to base = left + top - topleft.
; In:  m4 = left, m5 = topleft, m6 = top,
;      m7 = |top - topleft| (= |base - left| = ldiff), m8 = pb_1.
; Out: m0 = predicted pixels.  Clobbers m1-m4 and k1.
567%macro PAETH 0
568    psubusb              m1, m5, m4
569    psubusb              m0, m4, m5
570    por                  m1, m0           ; tdiff = |topleft - left| = |base - top|
571    pavgb                m2, m6, m4       ; (top + left + 1) >> 1
572    vpcmpub              k1, m1, m7, 1    ; tdiff < ldiff
573    vpblendmb        m0{k1}, m4, m6       ; pick top where tdiff < ldiff, else left
574    vpternlogd           m4, m6, m8, 0x28 ; (m4 ^ m6) & m8: pavgb rounding correction
575    psubusb              m3, m5, m2
576    psubusb              m2, m5
577    psubusb              m2, m5
578    por                  m2, m3
579    pminub               m1, m7           ; min(tdiff, ldiff)
580    paddusb              m2, m2
581    por                  m2, m4           ; min(tldiff, 255)
582    vpcmpub              k1, m2, m1, 1    ; tldiff < ldiff && tldiff < tdiff
583    vmovdqu8         m0{k1}, m5           ; ... in which case use topleft
584%endmacro
585
; paeth: per-width dispatch; m5 (topleft), m6 (top) and m7 (ldiff) are
; precomputed once per block, left pixels are reloaded per row group and
; fed to the PAETH macro above.
586cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
587    lea                  r6, [ipred_paeth_8bpc_avx512icl_table]
588    tzcnt                wd, wm
589    vpbroadcastb         m5, [tlq] ; topleft
590    mov                  hd, hm
591    movsxd               wq, [r6+wq*4]
592    vpbroadcastd         m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
593    lea                topq, [tlq+1]
594    sub                 tlq, hq                ; left pixels at [tlq..tlq+h-1]
595    add                  wq, r6
596    lea            stride3q, [strideq*3]
597    jmp                  wq
598INIT_YMM avx512icl                             ; .w4 runs on 256-bit vectors
599.w4:
600    vpbroadcastd         m6, [topq]
601    mova                 m9, [ipred_h_shuf]
602    psubusb              m7, m5, m6
603    psubusb              m0, m6, m5
604    por                  m7, m0 ; ldiff
605.w4_loop:
606    vpbroadcastq         m4, [tlq+hq-8]        ; 8 left pixels -> 8 rows/iteration
607    pshufb               m4, m9 ; left
608    PAETH
609    movd   [dstq+strideq*0], xm0
610    pextrd [dstq+strideq*1], xm0, 1
611    pextrd [dstq+strideq*2], xm0, 2
612    pextrd [dstq+stride3q ], xm0, 3
613    sub                  hd, 8
614    jl .w4_ret                                 ; h == 4: only 4 rows were valid
615    vextracti32x4       xm0, m0, 1
616    lea                dstq, [dstq+strideq*4]
617    movd   [dstq+strideq*0], xm0
618    pextrd [dstq+strideq*1], xm0, 1
619    pextrd [dstq+strideq*2], xm0, 2
620    pextrd [dstq+stride3q ], xm0, 3
621    lea                dstq, [dstq+strideq*4]
622    jg .w4_loop
623.w4_ret:
624    RET
625INIT_ZMM avx512icl
626.w8:
627    vpbroadcastq         m6, [topq]
628    movsldup             m9, [smooth_shuf]
629    psubusb              m7, m5, m6
630    psubusb              m0, m6, m5
631    por                  m7, m0                ; ldiff
632.w8_loop:
633    vpbroadcastq         m4, [tlq+hq-8]        ; 8 rows per iteration
634    pshufb               m4, m9
635    PAETH
636    vextracti32x4       xm1, m0, 2
637    vextracti32x4       xm2, ym0, 1
638    vextracti32x4       xm3, m0, 3
639    movq   [dstq+strideq*0], xm0
640    movq   [dstq+strideq*1], xm1
641    movq   [dstq+strideq*2], xm2
642    movq   [dstq+stride3q ], xm3
643    sub                  hd, 8
644    jl .w8_ret                                 ; h == 4: only 4 rows were valid
645    lea                dstq, [dstq+strideq*4]
646    movhps [dstq+strideq*0], xm0
647    movhps [dstq+strideq*1], xm1
648    movhps [dstq+strideq*2], xm2
649    movhps [dstq+stride3q ], xm3
650    lea                dstq, [dstq+strideq*4]
651    jg .w8_loop
652.w8_ret:
653    RET
654.w16:
655    vbroadcasti32x4      m6, [topq]
656    movsldup             m9, [smooth_shuf]
657    psubusb              m7, m5, m6
658    psubusb              m0, m6, m5
659    por                  m7, m0                ; ldiff
660.w16_loop:
661    vpbroadcastd         m4, [tlq+hq-4]        ; 4 rows per iteration
662    pshufb               m4, m9
663    PAETH
664    mova          [dstq+strideq*0], xm0
665    vextracti32x4 [dstq+strideq*1], m0, 2
666    vextracti32x4 [dstq+strideq*2], ym0, 1
667    vextracti32x4 [dstq+stride3q ], m0, 3
668    lea                dstq, [dstq+strideq*4]
669    sub                  hd, 4
670    jg .w16_loop
671    RET
672.w32:
673    vbroadcasti32x8      m6, [topq]
674    mova                ym9, ym8               ; low half: byte 1 (row 0's left);
                                                 ; zeroed high half: byte 0 (row 1's)
675    psubusb              m7, m5, m6
676    psubusb              m0, m6, m5
677    por                  m7, m0                ; ldiff
678.w32_loop:
679    vpbroadcastd         m4, [tlq+hq-2]        ; 2 rows per iteration
680    pshufb               m4, m9
681    PAETH
682    mova          [dstq+strideq*0], ym0
683    vextracti32x8 [dstq+strideq*1], m0, 1
684    lea                dstq, [dstq+strideq*2]
685    sub                  hd, 2
686    jg .w32_loop
687    RET
688.w64:
689    movu                 m6, [topq]
690    psubusb              m7, m5, m6
691    psubusb              m0, m6, m5
692    por                  m7, m0                ; ldiff
693.w64_loop:
694    vpbroadcastb         m4, [tlq+hq-1]        ; 1 row per iteration
695    PAETH
696    mova             [dstq], m0
697    add                dstq, strideq
698    dec                  hd
699    jg .w64_loop
700    RET
701
;-------------------------------------------------------------------------------
; void ipred_smooth_v_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                          int width, int height)
; SMOOTH_V intra prediction: each output pixel is a per-row weighted blend of
; the top neighbour row (tl[1..w]) and the bottom-left neighbour (tl[-h]).
; Per-row weights come from the smooth_weights table (pairs of w-128, 127-w;
; see SMOOTH_WEIGHT_TABLE). Dispatches on width via a jump table.
;-------------------------------------------------------------------------------
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-ipred_smooth_v_8bpc_avx512icl_table
    lea                  r6, [ipred_smooth_v_8bpc_avx512icl_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]       ; jump table entry for this width
    vpbroadcastd         m0, [base+pb_127_m127]
    vpbroadcastd         m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]
    neg                  hq                  ; h counts up from -h towards 0
    vpbroadcastb         m4, [tlq+hq] ; bottom
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4: ; 4 columns, 8 rows per iteration
    vpbroadcastd         m2, [tlq+1]         ; top 4 pixels
    movshdup             m5, [smooth_shuf]
    mova                ym6, [smooth_endA]
    punpcklbw            m2, m4 ; top, bottom
    pmaddubsw            m3, m2, m0
    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
    vbroadcasti32x4      m0, [weightsq+hq*2] ; per-row weight pairs
    pshufb               m0, m5
    pmaddubsw            m0, m2, m0
    paddw                m0, m3              ; w*top + (256-w)*bottom + 128
    vpermb               m0, m6, m0          ; take high bytes (>>8); table defined elsewhere
    vextracti32x4       xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    add                  hq, 8
    jg .ret                                  ; only 4 rows were needed
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    jl .w4_loop
.ret:
    RET
.w8: ; 8 columns, 4 rows per iteration
    vpbroadcastq         m2, [tlq+1]
    movshdup             m5, [smooth_shuf]
    mova                ym6, [smooth_endA]
    punpcklbw            m2, m4              ; top, bottom interleaved
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1              ; 128*top + 129*bottom + 128
.w8_loop:
    vpbroadcastq         m0, [weightsq+hq*2]
    pshufb               m0, m5
    pmaddubsw            m0, m2, m0
    paddw                m0, m3
    vpermb               m0, m6, m0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
.w16: ; 16 columns, 4 rows per iteration; low/high byte halves kept separate
    vbroadcasti32x4      m3, [tlq+1]
    movshdup             m6, [smooth_shuf]
    mova                 m7, [smooth_endB]
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0              ; constant term, low half
    paddw                m5, m1              ; constant term, high half
.w16_loop:
    vpbroadcastq         m1, [weightsq+hq*2]
    pshufb               m1, m6
    pmaddubsw            m0, m2, m1
    pmaddubsw            m1, m3, m1
    paddw                m0, m4
    paddw                m1, m5
    vpermt2b             m0, m7, m1          ; merge + pack both halves to bytes
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
.w32: ; 32 columns, 2 rows per iteration
    vbroadcasti32x8      m3, [tlq+1]
    movshdup             m6, [smooth_shuf]
    mova                 m7, [smooth_endB]
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w32_loop:
    vpbroadcastd         m1, [weightsq+hq*2]
    pshufb               m1, m6
    pmaddubsw            m0, m2, m1
    pmaddubsw            m1, m3, m1
    paddw                m0, m4
    paddw                m1, m5
    vpermt2b             m0, m7, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w32_loop
    RET
.w64: ; full zmm row, 1 row per iteration
    movu                 m3, [tlq+1]
    mova                 m6, [smooth_endB]
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w64_loop:
    vpbroadcastw         m1, [weightsq+hq*2] ; single weight pair for whole row
    pmaddubsw            m0, m2, m1
    pmaddubsw            m1, m3, m1
    paddw                m0, m4
    paddw                m1, m5
    vpermt2b             m0, m6, m1
    mova             [dstq], m0
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET
846
;-------------------------------------------------------------------------------
; void ipred_smooth_h_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                          int width, int height)
; SMOOTH_H intra prediction: each output pixel is a per-column weighted blend
; of the left neighbour column (tl[-1..-h]) and the top-right neighbour
; (tl[w]). Per-column weights come from the smooth_weights table, indexed by
; block width. Dispatches on width via a jump table.
;-------------------------------------------------------------------------------
cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
%define base r5-ipred_smooth_h_8bpc_avx512icl_table
    lea                  r5, [ipred_smooth_h_8bpc_avx512icl_table]
    mov                 r6d, wd
    tzcnt                wd, wd
    vpbroadcastb         m4, [tlq+r6] ; right
    mov                  hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m5, [base+pb_127_m127]
    vpbroadcastd         m6, [base+pw_128]
    sub                 tlq, hq                 ; tlq now points at tl[-h]
    add                  wq, r5
    vpmovb2m             k1, m6                 ; pw_128 -> MSB set in low byte of
                                                ; each word: k1 selects even bytes
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4: ; 4 columns, 8 rows per iteration
    movsldup             m3, [smooth_shuf]
    vpbroadcastq         m7, [smooth_weights+4*2] ; per-column weights for w=4
    mova                ym8, [smooth_endA]
.w4_loop:
    vpbroadcastq         m0, [tlq+hq-8]         ; 8 left-column pixels
    mova                 m2, m4
    vpshufb          m2{k1}, m0, m3 ; left, right
    pmaddubsw            m0, m2, m5             ; 127*left - 127*right
    pmaddubsw            m1, m2, m7             ; (w-128)*left + (127-w)*right
    paddw                m2, m6                 ; left + 256*right + 128
    paddw                m0, m2                 ; 128*left + 129*right + 128
    paddw                m0, m1                 ; w*left + (256-w)*right + 128
    vpermb               m0, m8, m0             ; take high bytes (>>8); table elsewhere
    vextracti32x4       xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    sub                  hd, 8
    jl .ret                                     ; only 4 rows were needed
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.ret:
    RET
.w8: ; 8 columns, 4 rows per iteration
    movsldup             m3, [smooth_shuf]
    vbroadcasti32x4      m7, [smooth_weights+8*2]
    mova                ym8, [smooth_endA]
.w8_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    mova                 m2, m4
    vpshufb          m2{k1}, m0, m3             ; interleave left, right
    pmaddubsw            m0, m2, m5
    pmaddubsw            m1, m2, m7
    paddw                m2, m6
    paddw                m0, m2
    paddw                m0, m1
    vpermb               m0, m8, m0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
.w16: ; 16 columns, 4 rows per iteration; two weight vectors per row
    movsldup             m7, [smooth_shuf]
    vbroadcasti32x4      m8, [smooth_weights+16*2]
    vbroadcasti32x4      m9, [smooth_weights+16*3]
    mova                m10, [smooth_endB]
.w16_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    mova                 m3, m4
    vpshufb          m3{k1}, m0, m7
    pmaddubsw            m2, m3, m5
    pmaddubsw            m0, m3, m8
    pmaddubsw            m1, m3, m9
    paddw                m3, m6
    paddw                m2, m3
    paddw                m0, m2
    paddw                m1, m2
    vpermt2b             m0, m10, m1            ; merge + pack both halves
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16_loop
    RET
.w32: ; 32 columns, 2 rows per iteration
    mova                m10, [smooth_endA]
    vpbroadcastd        ym7, [pb_1]
    vbroadcasti32x8      m8, [smooth_weights+32*2]
    vbroadcasti32x8      m9, [smooth_weights+32*3]
    vshufi32x4          m10, m10, q3120
.w32_loop:
    vpbroadcastd         m0, [tlq+hq-2]
    mova                 m3, m4
    vpshufb          m3{k1}, m0, m7
    pmaddubsw            m2, m3, m5
    pmaddubsw            m0, m3, m8
    pmaddubsw            m1, m3, m9
    paddw                m3, m6
    paddw                m2, m3
    paddw                m0, m2
    paddw                m1, m2
    vpermt2b             m0, m10, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64: ; full zmm row, 1 row per iteration; single left pixel broadcast per row
    mova                 m7, [smooth_weights+64*2]
    mova                 m8, [smooth_weights+64*3]
    mova                 m9, [smooth_endA]
.w64_loop:
    mova                 m3, m4
    vpbroadcastb     m3{k1}, [tlq+hq-1]         ; left pixel into even byte lanes
    pmaddubsw            m2, m3, m5
    pmaddubsw            m0, m3, m7
    pmaddubsw            m1, m3, m8
    paddw                m3, m6
    paddw                m2, m3
    paddw                m0, m2
    paddw                m1, m2
    vpermt2b             m0, m9, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
984
;-------------------------------------------------------------------------------
; void ipred_smooth_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                        int width, int height)
; SMOOTH intra prediction: the average (pavgw) of a vertical blend
; (top row vs bottom-left pixel, per-row weights) and a horizontal blend
; (left column vs top-right pixel, per-column weights). Uses pw_255 rounding
; so that the final pavgw yields the same rounding as the scalar form.
; Dispatches on width via a jump table.
;-------------------------------------------------------------------------------
cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
%define base r5-ipred_smooth_8bpc_avx512icl_table
    lea                  r5, [ipred_smooth_8bpc_avx512icl_table]
    mov                 r6d, wd
    tzcnt                wd, wd
    mov                  hd, hm
    vpbroadcastb         m6, [tlq+r6] ; right
    sub                 tlq, hq                 ; tlq now points at tl[-h]
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m7, [base+pb_127_m127]
    vpbroadcastb         m0, [tlq]    ; bottom
    vpbroadcastd         m1, [base+pw_255]
    add                  wq, r5
    lea          v_weightsq, [base+smooth_weights+hq*2]
    vpmovb2m             k1, m1                 ; pw_255 -> MSB set in low byte of
                                                ; each word: k1 selects even bytes
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4: ; 4 columns, 8 rows per iteration
    vpbroadcastd         m8, [tlq+hq+1]         ; top 4 pixels
    movsldup             m4, [smooth_shuf]
    movshdup             m5, [smooth_shuf]
    vpbroadcastq         m9, [smooth_weights+4*2] ; horizontal weights for w=4
    mova               ym11, [smooth_endA]

    punpcklbw            m8, m0     ; top, bottom
    pmaddubsw           m10, m8, m7
    paddw                m1, m8     ;   1 * top + 256 * bottom + 255
    paddw               m10, m1     ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq         m1, [tlq+hq-8]         ; 8 left-column pixels
    vbroadcasti32x4      m0, [v_weightsq]       ; per-row vertical weights
    add          v_weightsq, 16
    mova                 m2, m6
    vpshufb          m2{k1}, m1, m4 ; left, right
    pmaddubsw            m1, m2, m7 ; 127 * left - 127 * right
    pshufb               m0, m5
    pmaddubsw            m0, m8, m0             ; vertical weighted term
    paddw                m1, m2     ; 128 * left + 129 * right
    pmaddubsw            m2, m9                 ; horizontal weighted term
    paddw                m0, m10                ; full vertical prediction
    paddw                m1, m2                 ; full horizontal prediction
    pavgw                m0, m1                 ; average of v and h predictions
    vpermb               m0, m11, m0            ; take high bytes (>>8); table elsewhere
    vextracti32x4       xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    sub                  hd, 8
    jl .ret                                     ; only 4 rows were needed
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop
.ret:
    RET
.w8: ; 8 columns, 4 rows per iteration
    vpbroadcastq         m8, [tlq+hq+1]
    movsldup             m4, [smooth_shuf]
    movshdup             m5, [smooth_shuf]
    vbroadcasti32x4      m9, [smooth_weights+8*2]
    mova               ym11, [smooth_endA]
    punpcklbw            m8, m0                 ; top, bottom
    pmaddubsw           m10, m8, m7
    paddw                m1, m8
    paddw               m10, m1                 ; 128*top + 129*bottom + 255
.w8_loop:
    vpbroadcastd         m1, [tlq+hq-4]
    vpbroadcastq         m0, [v_weightsq]
    add          v_weightsq, 8
    mova                 m2, m6
    vpshufb          m2{k1}, m1, m4             ; interleave left, right
    pmaddubsw            m1, m2, m7
    pshufb               m0, m5
    pmaddubsw            m0, m8, m0
    paddw                m1, m2
    pmaddubsw            m2, m9
    paddw                m0, m10
    paddw                m1, m2
    pavgw                m0, m1                 ; average of v and h predictions
    vpermb               m0, m11, m0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
.w16: ; 16 columns, 4 rows per iteration; low/high byte halves kept separate
    vbroadcasti32x4      m9, [tlq+hq+1]
    movsldup             m5, [smooth_shuf]
    movshdup            m10, [smooth_shuf]
    vbroadcasti32x4     m11, [smooth_weights+16*2]
    vbroadcasti32x4     m12, [smooth_weights+16*3]
    mova                m15, [smooth_endB]
    punpcklbw            m8, m9, m0
    punpckhbw            m9, m0
    pmaddubsw           m13, m8, m7
    pmaddubsw           m14, m9, m7
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m13, m0                 ; vertical constant term, low half
    paddw               m14, m1                 ; vertical constant term, high half
.w16_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    vpbroadcastq         m1, [v_weightsq]
    add          v_weightsq, 8
    mova                 m4, m6
    vpshufb          m4{k1}, m0, m5
    pmaddubsw            m2, m4, m7
    pshufb               m1, m10
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m11
    pmaddubsw            m4, m12
    paddw                m0, m13
    paddw                m1, m14
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3                 ; average v/h, low half
    pavgw                m1, m4                 ; average v/h, high half
    vpermt2b             m0, m15, m1            ; merge + pack both halves
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16_loop
    RET
.w32: ; 32 columns, 2 rows per iteration
    vbroadcasti32x8      m9, [tlq+hq+1]
    movshdup            m10, [smooth_shuf]
    mova                m12, [smooth_weights+32*2]
    vpbroadcastd        ym5, [pb_1]
    mova                m15, [smooth_endB]
    punpcklbw            m8, m9, m0
    punpckhbw            m9, m0
    pmaddubsw           m13, m8, m7
    pmaddubsw           m14, m9, m7
    vshufi32x4          m11, m12, m12, q2020    ; split weights into two vectors
    vshufi32x4          m12, m12, q3131
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m13, m0
    paddw               m14, m1
.w32_loop:
    vpbroadcastd         m0, [tlq+hq-2]
    vpbroadcastd         m1, [v_weightsq]
    add          v_weightsq, 4
    mova                 m4, m6
    vpshufb          m4{k1}, m0, m5
    pmaddubsw            m2, m4, m7
    pshufb               m1, m10
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m11
    pmaddubsw            m4, m12
    paddw                m0, m13
    paddw                m1, m14
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3
    pavgw                m1, m4
    vpermt2b             m0, m15, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64: ; full zmm row, 1 row per iteration; one left pixel + one weight per row
    movu                 m9, [tlq+hq+1]
    mova                m11, [smooth_weights+64*2]
    mova                 m2, [smooth_weights+64*3]
    mova                m14, [smooth_endB]
    punpcklbw            m8, m9, m0
    punpckhbw            m9, m0
    pmaddubsw           m12, m8, m7
    pmaddubsw           m13, m9, m7
    vshufi32x4          m10, m11, m2, q2020     ; split weights into two vectors
    vshufi32x4          m11, m2, q3131
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m12, m0
    paddw               m13, m1
.w64_loop:
    mova                 m4, m6
    vpbroadcastb     m4{k1}, [tlq+hq-1]         ; left pixel into even byte lanes
    vpbroadcastw         m1, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw            m2, m4, m7
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m10
    pmaddubsw            m4, m11
    paddw                m0, m12
    paddw                m1, m13
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3
    pavgw                m1, m4
    vpermt2b             m0, m14, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
1201
;-------------------------------------------------------------------------------
; void pal_pred_8bpc(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                    const uint8_t *idx, int w, int h)
; Palette prediction: idx holds packed 4-bit palette indices (two per byte);
; each index is expanded and used to look up one of the 8 palette entries.
; w <= 8 uses nibble-unpack + pshufb on xmm registers; w >= 16 unpacks via
; vpmultishiftqb with the pal_unpack constants.
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
    movifnidn            wd, wm
    movifnidn            hd, hm
    lea            stride3q, [strideq*3]
    cmp                  wd, 8
    jg .w32                     ; .w32 also sets up for the .w16/.w64 paths
    movq               xmm3, [palq]           ; 8-entry palette
    je .w8
.w4: ; 4 columns, 4 rows per iteration (8 idx bytes = 16 pixels)
    movq               xmm0, [idxq]
    add                idxq, 8
    psrlw              xmm1, xmm0, 4          ; high nibbles
    punpcklbw          xmm0, xmm1             ; interleave low/high nibble indices
    pshufb             xmm0, xmm3, xmm0       ; palette lookup (pshufb uses bits 3:0)
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; 8 columns, 4 rows per iteration (16 idx bytes = 32 pixels)
    movu               xmm2, [idxq]
    add                idxq, 16
    pshufb             xmm1, xmm3, xmm2       ; lookup of low nibbles
    psrlw              xmm2, 4
    pshufb             xmm2, xmm3, xmm2       ; lookup of high nibbles
    punpcklbw          xmm0, xmm1, xmm2       ; re-interleave into pixel order
    punpckhbw          xmm1, xmm2
    movq   [dstq+strideq*0], xmm0
    movhps [dstq+strideq*1], xmm0
    movq   [dstq+strideq*2], xmm1
    movhps [dstq+stride3q ], xmm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16: ; 16 columns, 4 rows per iteration; m3/m5 were set up at .w32
    pmovzxdq             m0, [idxq]
    add                idxq, 32
    vpmultishiftqb       m0, m3, m0           ; expand packed nibbles to bytes
    pshufb               m0, m5, m0           ; palette lookup per 16-byte lane
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32: ; shared setup for all w > 8, then dispatch on exact width
    vpbroadcastq         m3, [pal_unpack+0]
    vpbroadcastq         m5, [palq]           ; palette broadcast to every lane
    cmp                  wd, 32
    jl .w16
    pmovzxbd             m2, [pal_perm]
    vpbroadcastq         m4, [pal_unpack+8]
    jg .w64
.w32_loop: ; 32 columns, 4 rows per iteration (64 idx bytes)
    vpermd               m1, m2, [idxq]       ; reorder idx dwords for unpacking
    add                idxq, 64
    vpmultishiftqb       m0, m3, m1           ; low-nibble expansion
    vpmultishiftqb       m1, m4, m1           ; high-nibble expansion
    pshufb               m0, m5, m0
    pshufb               m1, m5, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32_loop
    RET
.w64: ; 64 columns, 2 rows per iteration (64 idx bytes)
    vpermd               m1, m2, [idxq]
    add                idxq, 64
    vpmultishiftqb       m0, m3, m1
    vpmultishiftqb       m1, m4, m1
    pshufb               m0, m5, m0
    pshufb               m1, m5, m1
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET
1289
1290%if WIN64
1291    DECLARE_REG_TMP 4
1292%else
1293    DECLARE_REG_TMP 8
1294%endif
1295
1296cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
1297%define base r7-z_filter_t0
1298    lea                  r7, [z_filter_t0]
1299    tzcnt                wd, wm
1300    movifnidn        angled, anglem
1301    lea                  t0, [dr_intra_derivative]
1302    movsxd               wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
1303    inc                 tlq
1304    mov                 dxd, angled
1305    and                 dxd, 0x7e
1306    add              angled, 165 ; ~90
1307    movzx               dxd, word [t0+dxq]
1308    lea                  wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
1309    movifnidn            hd, hm
1310    xor              angled, 0x4ff ; d = 90 - angle
1311    mova                m14, [base+z_frac_table]
1312    vpbroadcastd        m15, [base+pw_512]
1313    jmp                  wq
1314.w4:
1315    mova                 m9, [pb_0to63]
1316    pminud               m8, m9, [base+pb_7] {1to16}
1317    vpbroadcastq         m7, [tlq]
1318    pshufb               m7, m8
1319    cmp              angleb, 40
1320    jae .w4_no_upsample
1321    lea                 r3d, [angleq-1024]
1322    sar                 r3d, 7
1323    add                 r3d, hd
1324    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1325    pshufb             xmm0, xm7, [base+z_filter_s4]
1326    mova               xmm1, [tlq-1]
1327    pshufb             xmm1, [base+z_xpos_off2a]
1328    vpbroadcastd       xmm2, [base+pb_m4_36]
1329    vpbroadcastq         m4, [pb_0to63]
1330    pmaddubsw          xmm0, xmm2
1331    pmaddubsw          xmm1, xmm2
1332    add                 dxd, dxd
1333    kxnorw               k1, k1, k1
1334    paddw              xmm0, xmm1
1335    pmulhrsw            xm0, xmm0, xm15
1336    packuswb            xm0, xm0
1337    punpcklbw       ym7{k1}, ym0
1338    jmp .w4_main2
1339.w4_no_upsample:
1340    test             angled, 0x400
1341    jnz .w4_main ; !enable_intra_edge_filter
1342    lea                 r3d, [hq+3]
1343    vpbroadcastb        xm0, r3d
1344    vpbroadcastb        xm1, angled
1345    shr              angled, 8 ; is_sm << 1
1346    vpcmpeqb             k1, xm0, [base+z_filter_wh]
1347    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
1348    kmovw               r5d, k1
1349    test                r5d, r5d
1350    jz .w4_main
1351    vbroadcasti32x4     ym0, [tlq-1]
1352    pshufb              ym0, [base+z_filter4_s1]
1353    popcnt              r5d, r5d ; filter_strength
1354    pshufb              ym1, ym7, [z_filter_s4]
1355    pshufb              ym7, [base+z_filter_s3]
1356    vpbroadcastd       ym11, [base+z_filter_k+(r5-1)*4+12*0]
1357    vpbroadcastd       ym12, [base+z_filter_k+(r5-1)*4+12*1]
1358    pmaddubsw           ym0, ym11
1359    pmaddubsw           ym1, ym11
1360    pmaddubsw           ym7, ym12
1361    paddw               ym0, ym1
1362    paddw               ym7, ym0
1363    pmulhrsw            ym7, ym15
1364    cmp                  hd, 4
1365    je .w4_filter_end
1366    vpbroadcastd         m8, [base+pb_9]
1367    pminub               m8, m9
1368.w4_filter_end:
1369    paddb                m8, m8
1370    vpermb               m7, m8, m7
1371.w4_main:
1372    vpbroadcastq         m4, [base+z_xpos_off1a]
1373.w4_main2:
1374    movsldup             m2, [base+z_xpos_mul]
1375    vpbroadcastw         m5, dxd
1376    vbroadcasti32x4      m3, [base+z_xpos_bc]
1377    lea                  r2, [strideq*3]
1378    pmullw               m2, m5      ; xpos
1379    psllw                m5, 5       ; dx*8
1380.w4_loop:
1381    psrlw                m1, m2, 3
1382    pshufb               m0, m2, m3
1383    vpermw               m1, m1, m14 ; 64-frac, frac
1384    paddsb               m0, m4      ; base, base+1
1385    vpermb               m0, m0, m7  ; top[base], top[base+1]
1386    paddsw               m2, m5      ; xpos += dx
1387    pmaddubsw            m0, m1      ; v
1388    pmulhrsw             m0, m15
1389    packuswb             m0, m0
1390    vextracti32x4       xm1, ym0, 1
1391    movd   [dstq+strideq*0], xm0
1392    pextrd [dstq+strideq*1], xm0, 1
1393    movd   [dstq+strideq*2], xm1
1394    pextrd [dstq+r2       ], xm1, 1
1395    sub                  hd, 8
1396    jl .w4_end
1397    vextracti32x4       xm1, m0, 2 ; top[max_base_x]
1398    lea                dstq, [dstq+strideq*4]
1399    vextracti32x4       xm0, m0, 3
1400    movd   [dstq+strideq*0], xm1
1401    pextrd [dstq+strideq*1], xm1, 1
1402    movd   [dstq+strideq*2], xm0
1403    pextrd [dstq+r2       ], xm0, 1
1404    lea                dstq, [dstq+strideq*4]
1405    jg .w4_loop
1406.w4_end:
1407    RET
1408.w8_filter:
1409    mova                ym0, [base+z_filter_s1]
1410    popcnt              r5d, r5d
1411    vbroadcasti32x4     ym1, [base+z_filter_s2]
1412    vbroadcasti32x4     ym3, [base+z_filter_s3]
1413    vbroadcasti32x4     ym4, [base+z_filter_s4]
1414    vpermi2b            ym0, ym7, ym2 ; al bl
1415    mova                ym5, [base+z_filter_s5]
1416    pshufb              ym1, ym7, ym1 ; ah bh
1417    vpbroadcastd       ym11, [base+z_filter_k+(r5-1)*4+12*0]
1418    pshufb              ym3, ym7, ym3 ; cl ch
1419    vpbroadcastd       ym12, [base+z_filter_k+(r5-1)*4+12*1]
1420    pshufb              ym4, ym7, ym4 ; el dl
1421    vpbroadcastd       ym13, [base+z_filter_k+(r5-1)*4+12*2]
1422    vpermb              ym5, ym5, ym7 ; eh dh
1423    pmaddubsw           ym0, ym11
1424    pmaddubsw           ym1, ym11
1425    pmaddubsw           ym2, ym3, ym12
1426    pmaddubsw           ym3, ym13
1427    pmaddubsw           ym4, ym11
1428    pmaddubsw           ym5, ym11
1429    paddw               ym0, ym2
1430    paddw               ym1, ym3
1431    paddw               ym0, ym4
1432    paddw               ym1, ym5
1433    pmulhrsw            ym0, ym15
1434    pmulhrsw            ym1, ym15
1435    packuswb            ym0, ym1
1436    ret
1437.w8:
1438    lea                 r3d, [angleq+216]
1439    mov                 r3b, hb
1440    cmp                 r3d, 8
1441    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1442    lea                 r3d, [hq-1]
1443    mova                xm1, [base+z_filter_s4]
1444    vpbroadcastb        xm2, r3d
1445    mova                xm7, [tlq-1]
1446    vinserti32x4        ym7, [tlq+7], 1
1447    vbroadcasti32x4     ym0, [base+z_xpos_off1a]
1448    vpbroadcastd        ym3, [base+pb_m4_36]
1449    pminub              xm2, xm1
1450    pshufb              ym0, ym7, ym0
1451    vinserti32x4        ym1, xm2, 1
1452    psrldq              ym7, 1
1453    pshufb              ym1, ym7, ym1
1454    pmaddubsw           ym0, ym3
1455    pmaddubsw           ym1, ym3
1456    vbroadcasti32x4      m8, [pb_0to63]
1457    add                 dxd, dxd
1458    paddw               ym0, ym1
1459    pmulhrsw            ym0, ym15
1460    packuswb            ym0, ym0
1461    punpcklbw           ym7, ym0
1462    jmp .w8_main2
1463.w8_no_upsample:
1464    lea                 r3d, [hq+7]
1465    mova                 m9, [pb_0to63]
1466    vpbroadcastb        ym0, r3d
1467    and                 r3d, 7
1468    vbroadcasti32x4      m7, [tlq]
1469    or                  r3d, 8 ; imin(h+7, 15)
1470    vpbroadcastb         m8, r3d
1471    pminub               m8, m9
1472    pshufb               m7, m8
1473    test             angled, 0x400
1474    jnz .w8_main
1475    vpbroadcastb        ym1, angled
1476    shr              angled, 8
1477    vpcmpeqb             k1, ym0, [base+z_filter_wh]
1478    mova                xm0, [base+z_filter_t0+angleq*8]
1479    vpcmpgtb         k1{k1}, ym1, ym0
1480    kmovd               r5d, k1
1481    test                r5d, r5d
1482    jz .w8_main
1483    vpbroadcastd        ym2, [tlq-4]
1484    call .w8_filter
1485    cmp                  hd, 8
1486    jle .w8_filter_end
1487    vpbroadcastd         m8, [base+pb_17]
1488    add                 r3d, 2
1489    pminub               m8, m9
1490.w8_filter_end:
1491    vpermb               m7, m8, m0
1492.w8_main:
1493    vbroadcasti32x4      m8, [base+z_xpos_off1a]
1494.w8_main2:
1495    movsldup             m4, [base+z_xpos_mul]
1496    vpbroadcastw         m9, dxd
1497    shl                 r3d, 6
1498    vpbroadcastd         m5, [base+z_xpos_bc+8*0]
1499    pmullw               m4, m9 ; xpos
1500    vpbroadcastd         m6, [base+z_xpos_bc+8*1]
1501    sub                 r3d, dxd
1502    shl                 dxd, 3
1503    psllw                m9, 5 ; dx*8
1504    lea                  r2, [strideq*3]
1505.w8_loop:
1506    psrlw                m3, m4, 3
1507    pshufb               m0, m4, m5
1508    pshufb               m1, m4, m6
1509    vpermw               m3, m3, m14
1510    paddsb               m0, m8
1511    paddsb               m1, m8
1512    vpermb               m0, m0, m7
1513    vpermb               m1, m1, m7
1514    paddsw               m4, m9
1515    punpcklqdq           m2, m3, m3
1516    pmaddubsw            m0, m2
1517    punpckhqdq           m3, m3
1518    pmaddubsw            m1, m3
1519    pmulhrsw             m0, m15
1520    pmulhrsw             m1, m15
1521    packuswb             m0, m1
1522    vextracti32x4       xm1, ym0, 1
1523    movq   [dstq+strideq*0], xm0
1524    movhps [dstq+strideq*1], xm0
1525    movq   [dstq+strideq*2], xm1
1526    movhps [dstq+r2       ], xm1
1527    sub                  hd, 8
1528    jl .w8_end
1529    vextracti32x8       ym0, m0, 1
1530    lea                dstq, [dstq+strideq*4]
1531    vextracti32x4       xm1, ym0, 1
1532    movq   [dstq+strideq*0], xm0
1533    movhps [dstq+strideq*1], xm0
1534    movq   [dstq+strideq*2], xm1
1535    movhps [dstq+r2       ], xm1
1536    jz .w8_end
1537    lea                dstq, [dstq+strideq*4]
1538    sub                 r3d, dxd
1539    jg .w8_loop
1540    vextracti32x4       xm7, m7, 3
1541.w8_end_loop:
1542    movq   [dstq+strideq*0], xm7
1543    movq   [dstq+strideq*1], xm7
1544    movq   [dstq+strideq*2], xm7
1545    movq   [dstq+r2       ], xm7
1546    lea                dstq, [dstq+strideq*4]
1547    sub                  hd, 4
1548    jg .w8_end_loop
1549.w8_end:
1550    RET
1551.w16_filter:
1552    mova                 m0, [base+z_filter_s1]
1553    popcnt              r5d, r5d
1554    vbroadcasti32x4      m1, [base+z_filter_s2]
1555    vbroadcasti32x4      m3, [base+z_filter_s3]
1556    vbroadcasti32x4      m4, [base+z_filter_s4]
1557    vpermi2b             m0, m7, m2 ; al bl
1558    mova                 m5, [base+z_filter_s5]
1559    pshufb               m1, m7, m1 ; ah bh
1560    vpbroadcastd        m11, [base+z_filter_k+(r5-1)*4+12*0]
1561    pshufb               m3, m7, m3 ; cl ch
1562    vpbroadcastd        m12, [base+z_filter_k+(r5-1)*4+12*1]
1563    pshufb               m4, m7, m4 ; el dl
1564    vpbroadcastd        m13, [base+z_filter_k+(r5-1)*4+12*2]
1565    vpermb               m5, m5, m7 ; eh dh
1566    pmaddubsw            m0, m11
1567    pmaddubsw            m1, m11
1568    pmaddubsw            m2, m3, m12
1569    pmaddubsw            m3, m13
1570    pmaddubsw            m4, m11
1571    pmaddubsw            m5, m11
1572    paddw                m0, m2
1573    paddw                m1, m3
1574    paddw                m0, m4
1575    paddw                m1, m5
1576    pmulhrsw             m0, m15
1577    pmulhrsw             m1, m15
1578    packuswb             m0, m1
1579    ret
;-------------------------------------------------------------------------
; z1 w=16: top-edge-only directional prediction.
; r3d = imin(h+15, 31) is the last readable edge index; the vpermb index
; clamp (pminub with pb_0to63) replicates the final edge byte past the end.
;-------------------------------------------------------------------------
1580.w16:
1581    lea                 r3d, [hq+15]
1582    mova                 m9, [pb_0to63]
1583    vpbroadcastb        ym0, r3d
1584    and                 r3d, 15
1585    movu                ym7, [tlq]
1586    or                  r3d, 16 ; imin(h+15, 31)
1587    vpbroadcastb         m8, r3d
1588    pminub               m8, m9
1589    vpermb               m7, m8, m7
1590    test             angled, 0x400
1591    jnz .w16_main
    ; decide whether to run the edge filter: compare block size/angle
    ; against the z_filter_wh / z_filter_t0 thresholds
1592    vpbroadcastb        ym1, angled
1593    shr              angled, 8
1594    vpcmpeqb             k1, ym0, [base+z_filter_wh]
1595    mova                xm0, [base+z_filter_t0+angleq*8]
1596    vpcmpgtb         k1{k1}, ym1, ym0
1597    kmovd               r5d, k1
1598    test                r5d, r5d
1599    jz .w16_main
1600    vpbroadcastd         m2, [tlq-4]
1601    call .w16_filter
1602    cmp                  hd, 16
1603    jle .w16_filter_end
1604    vpbroadcastd         m8, [base+pb_33]
1605    add                 r3d, 2
1606    pminub               m8, m9
1607.w16_filter_end:
1608    vpermb               m7, m8, m0
1609.w16_main:
    ; r3d<<6 - dx = remaining x budget in 1/64 units; the loop walks
    ; m3 (per-row x positions) forward by dx*4 every four output rows
1610    movshdup             m3, [base+z_xpos_mul]
1611    vpbroadcastw         m8, dxd
1612    shl                 r3d, 6
1613    vpbroadcastd         m4, [base+z_xpos_bc]
1614    pmullw               m3, m8 ; xpos
1615    vbroadcasti32x4      m5, [base+z_xpos_off1a]
1616    sub                 r3d, dxd
1617    shl                 dxd, 2
1618    vbroadcasti32x4      m6, [base+z_xpos_off1b]
1619    psllw                m8, 4 ; dx*4
1620    lea                  r2, [strideq*3]
1621.w16_loop:
    ; m2 = fractional weights looked up from the m14 table (presumably
    ; (64-frac, frac) pairs, as in the z2 path below); m0/m1 gather the
    ; two adjacent edge pixels which pmaddubsw then blends
1622    pshufb               m1, m3, m4
1623    psrlw                m2, m3, 3
1624    paddsb               m0, m1, m5
1625    vpermw               m2, m2, m14
1626    paddsb               m1, m6
1627    vpermb               m0, m0, m7
1628    vpermb               m1, m1, m7
1629    paddsw               m3, m8
1630    pmaddubsw            m0, m2
1631    pmaddubsw            m1, m2
1632    pmulhrsw             m0, m15
1633    pmulhrsw             m1, m15
1634    packuswb             m0, m1
1635    mova          [dstq+strideq*0], xm0
1636    vextracti32x4 [dstq+strideq*1], ym0, 1
1637    vextracti32x4 [dstq+strideq*2], m0, 2
1638    vextracti32x4 [dstq+r2       ], m0, 3
1639    sub                  hd, 4
1640    jz .w16_end
1641    lea                dstq, [dstq+strideq*4]
1642    sub                 r3d, dxd
1643    jg .w16_loop
    ; past the end of the edge: every remaining row is the replicated
    ; last edge pixel (top 16 bytes of m7)
1644    vextracti32x4       xm7, m7, 3
1645.w16_end_loop:
1646    mova   [dstq+strideq*0], xm7
1647    mova   [dstq+strideq*1], xm7
1648    mova   [dstq+strideq*2], xm7
1649    mova   [dstq+r2       ], xm7
1650    lea                dstq, [dstq+strideq*4]
1651    sub                  hd, 4
1652    jg .w16_end_loop
1653.w16_end:
1654    RET
;-------------------------------------------------------------------------
; z1 w32 intra-edge filter: same 5-tap smoothing as .w16_filter but with a
; fixed kernel row (z_filter_k+4*2, i.e. strength 3) and an edge that
; spills into a second register:
; In:  m7 = low 64 edge bytes, m8 = overflow bytes (used for the 'eh dh'
;      taps), m2 = broadcast of [tlq-4]
; Out: m7 = filtered edge packed to bytes; clobbers m0-m5, m11-m13.
;-------------------------------------------------------------------------
1655.w32_filter:
1656    mova                 m0, [base+z_filter_s1]
1657    vbroadcasti32x4      m1, [base+z_filter_s2]
1658    vbroadcasti32x4      m3, [base+z_filter_s3]
1659    vbroadcasti32x4      m4, [base+z_filter_s4]
1660    vpermi2b             m0, m7, m2 ; al bl
1661    mova                 m5, [base+z_filter_s5]
1662    pshufb               m1, m7, m1 ; ah bh
1663    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
1664    pshufb               m3, m7, m3 ; cl ch
1665    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
1666    pshufb               m4, m7, m4 ; el dl
1667    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
1668    vpermi2b             m5, m7, m8 ; eh dh
1669    pmaddubsw            m0, m11
1670    pmaddubsw            m1, m11
1671    pmaddubsw            m2, m3, m12
1672    pmaddubsw            m3, m13
1673    pmaddubsw            m4, m11
1674    pmaddubsw            m5, m11
    ; accumulate, round, and pack the result directly into m7
1675    paddw                m0, m2
1676    paddw                m1, m3
1677    paddw                m0, m4
1678    paddw                m1, m5
1679    pmulhrsw             m0, m15
1680    pmulhrsw             m1, m15
1681    packuswb             m7, m0, m1
1682    ret
;-------------------------------------------------------------------------
; z1 w=32: edge lives in m7 (indices clamped by m10), with the byte at
; index imin(h+31,63) broadcast into m8 as the out-of-range fill value.
;-------------------------------------------------------------------------
1683.w32:
1684    lea                 r3d, [hq+31]
1685    vpbroadcastb         m9, r3d
1686    and                 r3d, 31
1687    pminub              m10, m9, [pb_0to63]
1688    or                  r3d, 32 ; imin(h+31, 63)
1689    vpermb               m7, m10, [tlq]
1690    vpbroadcastb         m8, [tlq+r3]
1691    test             angled, 0x400 ; !enable_intra_edge_filter
1692    jnz .w32_main
1693    vpbroadcastd         m2, [tlq-4]
1694    call .w32_filter
1695    cmp                  hd, 64
1696    je .w32_h64_filter_end
1697    vpermb               m8, m9, m7
1698    vpermb               m7, m10, m7
1699    jmp .w32_main
1700.w32_h64_filter_end: ; edge case for 32x64
    ; re-filter the very last edge byte (which has only one neighbor)
    ; and merge it into m8 under the mask derived from pb_8_56_0_0
1701    movd               xmm0, [tlq+r3-1]
1702    movd               xmm1, [base+pb_8_56_0_0]
1703    add                 r3d, 2
1704    pmaddubsw          xmm0, xmm1
1705    vptestmw             k1, xmm1, xmm1 ; 0x01
1706    pmulhrsw            xm0, xmm0, xm15
1707    vmovdqu8         m8{k1}, m0
1708.w32_main:
    ; r2d = dx<<2 (rorx by 30 == rotate-left 2); two rows per iteration,
    ; so the x-position step m9 is 2*dx per lane pair
1709    rorx                r2d, dxd, 30
1710    vpbroadcastd         m4, [base+z_xpos_bc]
1711    vpbroadcastw         m3, r2d
1712    vbroadcasti32x8      m5, [base+z_xpos_off2a]
1713    shl                 r3d, 6
1714    vbroadcasti32x8      m6, [base+z_xpos_off2b]
1715    sub                 r3d, dxd
1716    paddw                m9, m3, m3
1717    add                 dxd, dxd
1718    vinserti32x8         m3, ym9, 1
1719.w32_loop:
1720    pshufb               m1, m3, m4
1721    psrlw                m2, m3, 3
1722    paddsb               m0, m1, m5
1723    vpermw               m2, m2, m14
1724    paddsb               m1, m6
    ; gather adjacent edge pixels from the m7/m8 pair and blend by the
    ; fractional weights in m2
1725    vpermi2b             m0, m7, m8
1726    vpermi2b             m1, m7, m8
1727    paddsw               m3, m9
1728    pmaddubsw            m0, m2
1729    pmaddubsw            m1, m2
1730    pmulhrsw             m0, m15
1731    pmulhrsw             m1, m15
1732    packuswb             m0, m1
1733    mova          [dstq+strideq*0], ym0
1734    vextracti32x8 [dstq+strideq*1], m0, 1
1735    sub                  hd, 2
1736    jz .w32_end
1737    lea                dstq, [dstq+strideq*2]
1738    sub                 r3d, dxd
1739    jg .w32_loop
    ; remaining rows are all the replicated last edge pixel
1740    punpckhqdq          ym8, ym8
1741.w32_end_loop:
1742    mova   [dstq+strideq*0], ym8
1743    mova   [dstq+strideq*1], ym8
1744    lea                dstq, [dstq+strideq*2]
1745    sub                  hd, 2
1746    jg .w32_end_loop
1747.w32_end:
1748    RET
;-------------------------------------------------------------------------
; z1 w64 intra-edge filter: smooths a 128-byte edge split across two
; registers, processing both halves in parallel.
; In:  m7 = edge bytes [0..63], m8 = edge bytes [64..127] (end-clamped),
;      m0 = [tlq+56], m11 = [tlq+8] (staggered loads for cross-half taps),
;      m2 = broadcast of [tlq-4], m13 = broadcast of h-1,
;      m12 = clamped 0..h-1 index vector
; Out: m7 = filtered low half, m8 = filtered high half (via m12 permute).
; Uses the fixed strength-3 kernel row (z_filter_k+4*2).
;-------------------------------------------------------------------------
1749.w64_filter:
1750    vbroadcasti32x4      m3, [base+z_filter_s2]
1751    mova                 m1, [base+z_filter_s1]
1752    pshufb               m0, m3      ; al bl
1753    vpermi2b             m1, m7, m2
1754    vbroadcasti32x4      m4, [base+z_filter_s4]
1755    pshufb               m6, m8, m4  ; el dl
1756    pshufb               m9, m7, m4
1757    pminub              m10, m13, [base+z_filter_s5]
1758    pshufb               m2, m8, m3  ; ah bh
1759    pshufb               m3, m7, m3
1760    vbroadcasti32x4      m5, [base+z_filter_s3]
1761    vpermb              m10, m10, m8 ; eh dh
1762    pshufb              m11, m4
1763    vpbroadcastd         m4, [base+z_filter_k+4*2+12*0]
1764    pshufb               m8, m5      ; cl ch
1765    pshufb               m7, m5
1766    vpbroadcastd         m5, [base+z_filter_k+4*2+12*1]
    ; outer taps (a/b/d/e) for both halves all use kernel m4
1767    REPX  {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
1768    pmaddubsw            m4, m8, m5
1769    pmaddubsw            m5, m7, m5
1770    paddw                m0, m6
1771    vpbroadcastd         m6, [base+z_filter_k+4*2+12*2]
1772    paddw                m1, m9
1773    pmaddubsw            m7, m6
1774    pmaddubsw            m8, m6
1775    paddw                m2, m10
1776    paddw                m3, m11
1777    paddw                m0, m4
1778    paddw                m1, m5
1779    paddw                m2, m8
1780    paddw                m3, m7
1781    REPX  {pmulhrsw x, m15}, m0, m2, m1, m3
1782    packuswb             m0, m2
1783    packuswb             m7, m1, m3
    ; re-clamp the high half so bytes past h-1 replicate the last pixel
1784    vpermb               m8, m12, m0
1785    ret
;-------------------------------------------------------------------------
; z1 w=64: full-width rows, one row per loop iteration. The edge occupies
; m7 (bytes 0..63) and m8 (bytes 64.., index-clamped to h-1 via m12).
;-------------------------------------------------------------------------
1786.w64:
1787    lea                 r3d, [hq-1]
1788    movu                 m7, [tlq+64*0]
1789    vpbroadcastb        m13, r3d
1790    pminub              m12, m13, [pb_0to63]
1791    or                  r3d, 64
1792    vpermb               m8, m12, [tlq+64*1]
1793    test             angled, 0x400 ; !enable_intra_edge_filter
1794    jnz .w64_main
    ; staggered edge loads feed the cross-half taps of .w64_filter
1795    movu                 m0, [tlq+56]
1796    vpbroadcastd         m2, [tlq-4]
1797    movu                m11, [tlq+8]
1798    call .w64_filter
1799.w64_main:
    ; r2d = dx<<2 (rorx 30 == rotate-left 2): per-lane x step covers the
    ; four 16-pixel groups of one 64-wide row
1800    rorx                r2d, dxd, 30
1801    vpbroadcastd         m4, [base+z_xpos_bc]
1802    vpbroadcastw         m3, r2d
1803    mova                 m5, [base+z_xpos_off2a]
1804    shl                 r3d, 6
1805    mova                 m6, [base+z_xpos_off2b]
1806    sub                 r3d, dxd
1807    mova                 m9, m3
1808.w64_loop:
1809    pshufb               m1, m3, m4
1810    psrlw                m2, m3, 3
1811    paddsb               m0, m1, m5
1812    vpermw               m2, m2, m14
1813    paddsb               m1, m6
    ; gather neighboring edge pixels from the m7:m8 pair, blend by the
    ; fractional weights, round and pack to one 64-byte row
1814    vpermi2b             m0, m7, m8
1815    vpermi2b             m1, m7, m8
1816    paddsw               m3, m9
1817    pmaddubsw            m0, m2
1818    pmaddubsw            m1, m2
1819    pmulhrsw             m0, m15
1820    pmulhrsw             m1, m15
1821    packuswb             m0, m1
1822    mova             [dstq], m0
1823    dec                  hd
1824    jz .w64_end
1825    add                dstq, strideq
1826    sub                 r3d, dxd
1827    jg .w64_loop
    ; edge exhausted: fill the remaining rows with the replicated last pixel
1828    vpermb               m8, m13, m8
1829.w64_end_loop:
1830    mova             [dstq], m8
1831    add                dstq, strideq
1832    dec                  hd
1833    jg .w64_end_loop
1834.w64_end:
1835    RET
1836
;-------------------------------------------------------------------------
; ipred_z2_8bpc: directional intra prediction for angles 90..180, which
; reference both the top and the left edge. Entry code looks up the two
; step values from dr_intra_derivative (dyd from angle-90, dxd from
; 180-angle; the low bit is masked since the table stores even entries),
; reverses the left edge into m8 (pb_63to0 permute of [tlq-64]), loads
; the shared fraction-weight table (m14) and rounding constant (m15),
; then tail-dispatches on block width via the jump table.
;-------------------------------------------------------------------------
1837cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
1838    tzcnt                wd, wm
1839    movifnidn        angled, anglem
1840    lea                 dxq, [dr_intra_derivative-90]
1841    movzx               dyd, angleb
1842    xor              angled, 0x400
1843    mov                  r7, dxq
1844    sub                 dxq, dyq
1845    movifnidn            hd, hm
1846    and                 dyd, ~1
1847    and                 dxq, ~1
1848    movzx               dyd, word [r7+dyq]  ; angle - 90
1849    lea                  r7, [z_filter_t0]
1850    movzx               dxd, word [dxq+270] ; 180 - angle
1851    movsxd               wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
1852    mova                 m8, [base+pb_63to0]
1853    neg                 dyd
1854    vpermb               m8, m8, [tlq-64] ; left
1855    lea                  wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
1856    mova                m14, [base+z_frac_table]
1857    inc                 tlq
1858    vpbroadcastd        m15, [base+pw_512]
1859    neg                 dxd
1860    jmp                  wq
;-------------------------------------------------------------------------
; z2 w=4: top edge in m7, reversed left edge in m8. Optional edge
; upsampling/filtering first (top, then left), then the main loop blends
; top-derived and left-derived predictions per pixel, selecting the left
; path wherever base_x < 0 (sign mask k1).
;-------------------------------------------------------------------------
1861.w4:
1862    movd                xm7, [tlq]
1863    vpbroadcastq        m10, [base+z_xpos_off2a]
1864    test             angled, 0x400
1865    jnz .w4_main ; !enable_intra_edge_filter
1866    lea                 r3d, [hq+2]
1867    add              angled, 1022
1868    shl                 r3d, 6
1869    test                r3d, angled
1870    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
1871    vpbroadcastd        xm2, [base+pb_4]
1872    sub              angled, 1075 ; angle - 53
1873    call .upsample_above
1874    lea                 r3d, [hq+3]
1875    vpbroadcastq        m10, [pb_0to63+1]
    ; interleave upsampled (xm0) with original (xm7) to form the 2x edge
1876    punpcklbw           xm7, xm0, xm7
1877    call .filter_strength
1878    jmp .w4_filter_left
1879.w4_upsample_left:
1880    call .upsample_left
1881    movsldup            m16, [base+z_ypos_off3]
1882    vpbroadcastd         m9, [base+pb_16]
1883    punpcklbw           xm8, xm0, xm8
1884    jmp .w4_main2
1885.w4_no_upsample_above:
1886    lea                 r3d, [hq+3]
1887    sub              angled, 1112 ; angle - 90
1888    call .filter_strength
1889    test                r3d, r3d
1890    jz .w4_no_filter_above
1891    vpbroadcastd        xm5, [base+pb_3]
1892    call .filter_top_w16
1893.w4_no_filter_above:
1894    lea                 r3d, [hq+2]
1895    add              angled, 973 ; angle + 883
1896    shl                 r3d, 6
1897    test                r3d, angled
1898    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    ; left-edge filter strength: reuse the .filter_strength compare state
    ; (k2/ym16/ym17) against the mirrored angle (90 - angle)
1899    vpbroadcastd        ym0, [base+pb_90]
1900    psubb               ym0, ym17
1901    vpcmpgtb         k2{k2}, ym0, ym16
1902    kmovd               r3d, k2
1903.w4_filter_left:
1904    test                r3d, r3d
1905    jz .w4_main
1906    popcnt              r3d, r3d
1907    call .filter_left_h16
1908.w4_main:
1909    movsldup            m16, [base+z_ypos_off1]
1910    vpbroadcastd         m9, [base+pb_8]
1911.w4_main2:
    ; m1 = per-row x positions (xpos0..3), m3/m4 = per-row left-edge
    ; fractional weights and byte indices (base, base+1)
1912    vpbroadcastq         m3, [base+z_ypos_mul1a]
1913    vpbroadcastw         m0, dyd
1914    movsldup             m1, [base+z_xpos_mul]
1915    vpbroadcastw         m5, dxd
1916    vinserti32x4         m7, [tlq-16], 3
1917    vinserti32x4         m8, [tlq-16], 3
1918    pmullw               m3, m0
1919    vbroadcasti32x4      m2, [base+z_xpos_bc]
1920    pmullw               m1, m5      ; xpos0..3
1921    psllw                m5, 5       ; dx*8
1922    psraw                m4, m3, 6
1923    psrlw                m3, 1
1924    packsswb             m4, m4
1925    vpermw               m3, m3, m14 ; 64-frac, frac
1926    punpcklbw            m4, m4
1927    lea                  r2, [strideq*3]
1928    paddb                m4, m16     ; base, base+1
1929.w4_loop:
1930    pshufb              m16, m1, m2
1931    psrlw                m0, m1, 3
1932    paddb               m16, m10
1933    vpermw               m0, m0, m14
    ; k1 selects lanes whose x index went negative: those take the
    ; left-edge prediction (m8/m3) instead of the top one (m7/m0)
1934    vpmovw2m             k1, m16     ; base_x < 0
1935    vpermb              m16, m16, m7
1936    pmaddubsw           m16, m0
1937    vpermb               m0, m4, m8
1938    pmaddubsw       m16{k1}, m0, m3
1939    pmulhrsw            m16, m15
1940    vpmovwb            ym16, m16
1941    movd   [dstq+strideq*0], xm16
1942    pextrd [dstq+strideq*1], xm16, 1
1943    pextrd [dstq+strideq*2], xm16, 2
1944    pextrd [dstq+r2       ], xm16, 3
    ; eight rows are computed per iteration; store the second four only
    ; when h didn't just run out
1945    sub                  hd, 8
1946    jl .w4_end
1947    paddsw               m1, m5
1948    vextracti128       xm16, ym16, 1
1949    lea                dstq, [dstq+strideq*4]
1950    paddb                m4, m9
1951    movd   [dstq+strideq*0], xm16
1952    pextrd [dstq+strideq*1], xm16, 1
1953    pextrd [dstq+strideq*2], xm16, 2
1954    pextrd [dstq+r2       ], xm16, 3
1955    lea                dstq, [dstq+strideq*4]
1956    jg .w4_loop
1957.w4_end:
1958    RET
;-------------------------------------------------------------------------
; Shared 2x edge upsampler (w4/w8 above, h4/h8 left). Both entry points
; stage the shifted edge in xm0 and fall into .upsample, which applies
; the (-4, 36) interpolation kernel (pb_m4_36) and returns the new
; half-sample pixels packed in xm0. Doubles the corresponding step
; (dx or dy) since edge indices now have one extra fractional bit.
;-------------------------------------------------------------------------
1959.upsample_above: ; w4/w8
1960    mova                xm0, [tlq-1]
1961    xor              angled, 0x7f ; 180 - angle
1962    add                 dxd, dxd
1963    jmp .upsample
1964.upsample_left: ; h4/h8
1965    palignr             xm0, xm8, [tlq-16], 15
1966    vpbroadcastb        xm2, hd
1967    add                 dyd, dyd
1968.upsample:
1969    pshufb              xm1, xm0, [base+z_filter4_s1]
    ; clamp the shuffle indices so reads don't go past the edge end
1970    pminub              xm2, [base+z_filter_s4]
1971    vpbroadcastd        xm3, [base+pb_m4_36]
1972    pshufb              xm0, xm2
1973    pmaddubsw           xm1, xm3
1974    pmaddubsw           xm0, xm3
1975    paddw               xm0, xm1
1976    pmulhrsw            xm0, xm15
1977    packuswb            xm0, xm0
1978    ret
;-------------------------------------------------------------------------
; Computes the top-edge filter-strength bitmask.
; In:  r3d = size metric (e.g. h+3), angled = biased angle
; Out: r3d = strength bitmask (0 = no filtering); also leaves
;      k2/ym16/ym17 and m9 = pb_0to63 live for the callers' left-edge
;      strength check and the filter helpers. Clobbers m2 (tl bytes).
;-------------------------------------------------------------------------
1979.filter_strength:
1980    vpbroadcastb       ym16, r3d
1981    mov                 r3d, angled
1982    vpbroadcastd         m2, [tlq-4]
1983    vpbroadcastb       ym17, angled
1984    shr                 r3d, 8
1985    vpcmpeqb             k2, ym16, [base+z_filter_wh]
1986    mova               xm16, [base+z_filter_t0+r3*8]
1987    vpcmpgtb         k1{k2}, ym17, ym16
1988    mova                 m9, [pb_0to63]
1989    kmovd               r3d, k1
1990    ret
;-------------------------------------------------------------------------
; z2 w=8: same structure as .w4 — optional top upsample/filter, optional
; left upsample/filter (h32 uses the 64-pixel left filter), then a blended
; top/left loop producing four rows per iteration. r3d tracks the x budget
; so the loop can switch to the left-only fast path once every lane has
; base_x < 0.
;-------------------------------------------------------------------------
1991.w8:
1992    movq                xm7, [tlq]
1993    vbroadcasti32x4     m10, [base+z_xpos_off2a]
1994    test             angled, 0x400
1995    jnz .w8_main
    ; r3 = packed (angle+126, h) compare: upsample only for small h and
    ; shallow angles (see eol note below)
1996    lea                 r3d, [angleq+126]
1997    mov                 r3b, hb
1998    cmp                 r3d, 8
1999    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
2000    vpbroadcastd        xm2, [base+pb_8]
2001    sub              angled, 53 ; angle - 53
2002    call .upsample_above
2003    lea                 r3d, [hq+7]
2004    vbroadcasti32x4     m10, [pb_0to63+1]
2005    punpcklbw           xm7, xm0, xm7
2006    call .filter_strength
2007    jmp .w8_filter_left
2008.w8_upsample_left:
2009    call .upsample_left
2010    movshdup            m16, [base+z_ypos_off3]
2011    vpbroadcastd         m9, [base+pb_8]
2012    punpcklbw           xm8, xm0, xm8
2013    jmp .w8_main2
2014.w8_no_upsample_above:
2015    lea                 r3d, [hq+7]
2016    sub              angled, 90 ; angle - 90
2017    call .filter_strength
2018    test                r3d, r3d
2019    jz .w8_no_filter_above
2020    vpbroadcastd        xm5, [base+pb_7]
2021    call .filter_top_w16
2022.w8_no_filter_above:
2023    lea                 r3d, [angleq-51]
2024    mov                 r3b, hb
2025    cmp                 r3d, 8
2026    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
2027    vpbroadcastd        ym0, [base+pb_90]
2028    psubb               ym0, ym17
2029    vpcmpgtb         k2{k2}, ym0, ym16
2030    kmovd               r3d, k2
2031.w8_filter_left:
2032    test                r3d, r3d
2033    jz .w8_main
2034    cmp                  hd, 32
2035    je .w8_filter_left_h32
2036    popcnt              r3d, r3d
2037    call .filter_left_h16
2038    jmp .w8_main
2039.w8_filter_left_h32:
2040    call .filter_left_h64
2041.w8_main:
2042    movshdup            m16, [base+z_ypos_off2]
2043    vpbroadcastd         m9, [base+pb_4]
2044.w8_main2:
    ; set up per-row x positions (m1), left-edge weights (m3) and
    ; (base, base+1) index pairs (m4), as in .w4_main2
2045    vbroadcasti32x4      m3, [base+z_ypos_mul1a]
2046    vpbroadcastw         m0, dyd
2047    movshdup             m1, [base+z_xpos_mul]
2048    vpbroadcastw         m5, dxd
2049    vinserti32x4         m7, [tlq-16], 3
2050    vinserti32x4         m8, [tlq-16], 3
2051    pmullw               m3, m0
2052    vpbroadcastd         m2, [base+pb_1]
2053    pmullw               m1, m5      ; xpos0..3
2054    psllw                m5, 4       ; dx*4
2055    psraw                m4, m3, 6
2056    psrlw                m3, 1
2057    packsswb             m4, m4
2058    vpermw               m3, m3, m14 ; 64-frac, frac
    ; r3d tracks 8<<6 + n*dx: once it stays negative, all lanes reference
    ; the left edge only and the loop falls through to the fast path
2059    lea                 r3d, [dxq+(8<<6)]
2060    paddsb               m4, m16
2061    shl                 dxd, 2
2062    paddsb               m0, m4, m2
2063    lea                  r2, [strideq*3]
2064    punpcklbw            m4, m0      ; base, base+1
2065.w8_loop:
2066    pshufb              m16, m1, m2
2067    psrlw                m0, m1, 3
2068    paddb               m16, m10
2069    vpermw               m0, m0, m14
2070    vpmovw2m             k1, m16     ; base_x < 0
2071    vpermb              m16, m16, m7
2072    pmaddubsw           m16, m0
2073    vpermb               m0, m4, m8
2074    pmaddubsw       m16{k1}, m0, m3
2075    pmulhrsw            m16, m15
2076    vpmovwb            ym16, m16
2077    vextracti128       xm17, ym16, 1
2078    movq   [dstq+strideq*0], xm16
2079    movhps [dstq+strideq*1], xm16
2080    movq   [dstq+strideq*2], xm17
2081    movhps [dstq+r2       ], xm17
2082    sub                  hd, 4
2083    jz .w8_end
2084    paddw                m1, m5
2085    lea                dstq, [dstq+strideq*4]
2086    paddb                m4, m9
2087    add                 r3d, dxd
2088    jge .w8_loop
    ; left-only path: no top reference remains, so skip the x-position
    ; bookkeeping and just walk the left-edge indices down
2089.w8_leftonly_loop:
2090    vpermb              m16, m4, m8
2091    pmaddubsw           m16, m3
2092    paddb                m4, m9
2093    pmulhrsw            m16, m15
2094    vpmovwb            ym16, m16
2095    vextracti128       xm17, ym16, 1
2096    movq   [dstq+strideq*0], xm16
2097    movhps [dstq+strideq*1], xm16
2098    movq   [dstq+strideq*2], xm17
2099    movhps [dstq+r2       ], xm17
2100    lea                dstq, [dstq+strideq*4]
2101    sub                  hd, 4
2102    jg .w8_leftonly_loop
2103.w8_end:
2104    RET
;-------------------------------------------------------------------------
; z2 top-edge filter (up to 16 pixels): 5-tap smoothing of xm7.
; In:  xm7 = top edge, xm2 = broadcast of [tlq-4], xm5 = broadcast of the
;      last valid index (pb_3/pb_7/pb_15 depending on width),
;      r3d = strength bitmask (popcnt -> kernel row), m9 = pb_0to63
; Out: xm7 = filtered edge, merge-masked so only pixels with
;      x < max_width (stack arg r7m) are replaced.
; Clobbers xm0, xm1, xm3-xm6, xm11-xm13.
;-------------------------------------------------------------------------
2105.filter_top_w16:
2106    mova                xm0, [base+z_filter_s1]
2107    popcnt              r3d, r3d
2108    pminub              xm4, xm5, [base+z_filter_s4]
2109    vpermi2b            xm0, xm7, xm2
2110    pminub              xm5, [base+z_filter_s5]
2111    pshufb              xm1, xm7, [base+z_filter_s2]
2112    vpbroadcastd       xm11, [base+z_filter_k+(r3-1)*4+12*0]
2113    pshufb              xm3, xm7, [base+z_filter_s3]
2114    vpbroadcastd       xm12, [base+z_filter_k+(r3-1)*4+12*1]
2115    pshufb              xm4, xm7, xm4
2116    vpbroadcastd       xm13, [base+z_filter_k+(r3-1)*4+12*2]
2117    pshufb              xm5, xm7, xm5
2118    pmaddubsw           xm0, xm11
2119    pmaddubsw           xm1, xm11
2120    pmaddubsw           xm6, xm3, xm12
2121    vpbroadcastd       xm12, r7m ; max_width
2122    pmaddubsw           xm3, xm13
2123    pmaddubsw           xm4, xm11
2124    pmaddubsw           xm5, xm11
    ; packssdw+packsswb narrows the 32-bit max_width down to a byte
    ; broadcast for the per-pixel compare below
2125    packssdw           xm12, xm12
2126    paddw               xm0, xm6
2127    paddw               xm1, xm3
2128    paddw               xm0, xm4
2129    paddw               xm1, xm5
2130    packsswb           xm12, xm12
2131    pmulhrsw            xm0, xm15
2132    pmulhrsw            xm1, xm15
2133    vpcmpgtb             k1, xm12, xm9 ; x < max_width
2134    packuswb        xm7{k1}, xm0, xm1
2135    ret
;-------------------------------------------------------------------------
; z2 left-edge filter (up to 16 pixels): mirror of .filter_top_w16 but
; operating on the reversed left edge in xm8, clamped to index h-1.
; In:  xm8 = left edge, xm2 = broadcast of [tlq-4], r3d = strength
;      (popcnt already applied by callers), m9 = pb_0to63
; Out: xm8 = filtered edge, merge-masked so only pixels with
;      y < max_height (stack arg r8m) are replaced.
; Clobbers xm0, xm1, xm3-xm6, xm11-xm13; r5d.
;-------------------------------------------------------------------------
2136.filter_left_h16:
2137    lea                 r5d, [hq-1]
2138    mova                xm0, [base+z_filter_s1]
2139    vpbroadcastb        xm5, r5d
2140    vpermi2b            xm0, xm8, xm2
2141    pminub              xm4, xm5, [base+z_filter_s4]
2142    pshufb              xm1, xm8, [base+z_filter_s2]
2143    pminub              xm5, [base+z_filter_s5]
2144    pshufb              xm3, xm8, [base+z_filter_s3]
2145    vpbroadcastd       xm11, [base+z_filter_k+(r3-1)*4+12*0]
2146    pshufb              xm4, xm8, xm4
2147    vpbroadcastd       xm12, [base+z_filter_k+(r3-1)*4+12*1]
2148    pshufb              xm5, xm8, xm5
2149    vpbroadcastd       xm13, [base+z_filter_k+(r3-1)*4+12*2]
2150    pmaddubsw           xm0, xm11
2151    pmaddubsw           xm1, xm11
2152    pmaddubsw           xm6, xm3, xm12
2153    vpbroadcastd       xm12, r8m ; max_height
2154    pmaddubsw           xm3, xm13
2155    pmaddubsw           xm4, xm11
2156    pmaddubsw           xm5, xm11
2157    packssdw           xm12, xm12
2158    paddw               xm0, xm6
2159    paddw               xm1, xm3
2160    paddw               xm0, xm4
2161    paddw               xm1, xm5
2162    packsswb           xm12, xm12
2163    pmulhrsw            xm0, xm15
2164    pmulhrsw            xm1, xm15
2165    vpcmpgtb             k1, xm12, xm9 ; y < max_height
2166    packuswb        xm8{k1}, xm0, xm1
2167    ret
;-------------------------------------------------------------------------
; z2 w=16: no upsampling at this size — only optional top/left filtering.
; The main loop computes two 16-pixel row pairs per iteration from the
; top edge, then overwrites the lanes with base_x < 0 (masks k1/k2) with
; the left-edge prediction. Once r3d (x budget) crosses the left-only
; threshold r5d, it drops into .w16_leftonly_loop.
;-------------------------------------------------------------------------
2168.w16:
2169    movu                xm7, [tlq] ; top
2170    test             angled, 0x400
2171    jnz .w16_main
2172    lea                 r3d, [hq+15]
2173    sub              angled, 90
2174    call .filter_strength
2175    test                r3d, r3d
2176    jz .w16_no_filter_above
2177    vpbroadcastd        xm5, [base+pb_15]
2178    call .filter_top_w16
2179.w16_no_filter_above:
2180    cmp                  hd, 16
2181    jg .w16_filter_left_h64
    ; left-edge strength check mirrors .filter_strength's result for the
    ; complementary angle (90 - angle)
2182    vpbroadcastd        ym0, [base+pb_90]
2183    psubb               ym0, ym17
2184    vpcmpgtb         k2{k2}, ym0, ym16
2185    kmovd               r3d, k2
2186    test                r3d, r3d
2187    jz .w16_main
2188    popcnt              r3d, r3d
2189    call .filter_left_h16
2190    jmp .w16_main
2191.w16_filter_left_h64:
2192    call .filter_left_h64
2193.w16_main:
    ; left-edge y positions: rows 1..8 (mul1a) and 9..15 (mul1b) scaled
    ; by dy, split into integer base (m5/m6, as base|base+1 byte pairs)
    ; and fractional weights (m10/m11 via the m14 LUT)
2194    vbroadcasti32x4      m6, [base+z_ypos_mul1a] ; 1.. 8
2195    vbroadcasti32x4      m5, [base+z_ypos_mul1b] ; 9..15
2196    vpbroadcastw         m0, dyd
2197    vinserti32x4         m7, [tlq-16], 3
2198    vpbroadcastd         m2, [base+pb_1]
2199    vpbroadcastw        m12, dxd
2200    movshdup             m1, [base+z_xpos_mul]
2201    pmullw               m6, m0
2202    vbroadcasti32x4      m3, [base+z_xpos_off2a]
2203    pmullw               m5, m0
2204    vbroadcasti32x4      m4, [base+z_xpos_off2b]
2205    pmullw               m1, m12      ; xpos0 xpos1 xpos2 xpos3
2206    vpbroadcastd         m9, [base+pb_4]
2207    psllw               m12, 4        ; dx*4
2208    movshdup            m16, [base+z_ypos_off2]
2209    psrlw               m10, m6, 1
2210    psrlw               m11, m5, 1
2211    vpermw              m10, m10, m14 ; 64-frac, frac
2212    psraw                m6, 6
2213    vpermw              m11, m11, m14
2214    psraw                m5, 6
2215    mov                 r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
2216    packsswb             m6, m5
2217    mov                 r3d, 1<<6
2218    paddsb               m6, m16
2219    sub                 r5d, dxd      ; left-only threshold
2220    paddsb               m0, m6, m2
2221    shl                 dxd, 2
2222    punpcklbw            m5, m6, m0   ; base, base+1
2223    lea                  r2, [strideq*3]
2224    punpckhbw            m6, m0
2225.w16_loop:
2226    pshufb              m17, m1, m2
2227    psrlw                m0, m1, 3
2228    paddb               m16, m3, m17
2229    vpermw               m0, m0, m14
2230    paddb               m17, m4
    ; k1/k2 = lanes whose base_x went negative (word sign bit)
2231    vpmovw2m             k1, m16
2232    vpermb              m16, m16, m7
2233    vpmovw2m             k2, m17
2234    vpermb              m17, m17, m7
2235    pmaddubsw           m16, m0
2236    pmaddubsw           m17, m0
2237    add                 r3d, dxd
2238    jge .w16_toponly
    ; merge in the left-edge prediction for the negative-x lanes
2239    mova                 m0, m8
2240    vpermt2b             m0, m5, m7
2241    pmaddubsw       m16{k1}, m0, m10
2242    mova                 m0, m8
2243    vpermt2b             m0, m6, m7
2244    pmaddubsw       m17{k2}, m0, m11
2245.w16_toponly:
2246    pmulhrsw            m16, m15
2247    pmulhrsw            m17, m15
2248    packuswb            m16, m17
2249    mova          [dstq+strideq*0], xm16
2250    vextracti128  [dstq+strideq*1], ym16, 1
2251    vextracti32x4 [dstq+strideq*2], m16, 2
2252    vextracti32x4 [dstq+r2       ], m16, 3
2253    sub                  hd, 4
2254    jz .w16_end
2255    paddw                m1, m12
2256    lea                dstq, [dstq+strideq*4]
2257    paddb                m5, m9
2258    paddb                m6, m9
2259    cmp                 r3d, r5d
2260    jge .w16_loop
2261.w16_leftonly_loop:
2262    vpermb              m16, m5, m8
2263    vpermb              m17, m6, m8
2264    pmaddubsw           m16, m10
2265    pmaddubsw           m17, m11
2266    paddb                m5, m9
2267    paddb                m6, m9
2268    pmulhrsw            m16, m15
2269    pmulhrsw            m17, m15
2270    packuswb            m16, m17
2271    mova          [dstq+strideq*0], xm16
2272    vextracti128  [dstq+strideq*1], ym16, 1
2273    vextracti32x4 [dstq+strideq*2], m16, 2
2274    vextracti32x4 [dstq+r2       ], m16, 3
2275    lea                dstq, [dstq+strideq*4]
2276    sub                  hd, 4
2277    jg .w16_leftonly_loop
2278.w16_end:
2279    RET
;-------------------------------------------------------------------------
; z2 w=32: the top-edge filter is inlined here (ymm-width version of
; .filter_top_w16 with the fixed strength-3 kernel row and a max_width
; merge mask from stack arg r6m), followed by the shared left filter.
; The main loop emits two 32-pixel rows per iteration, blending in the
; left edge for lanes with base_x < 0, with a left-only fast path.
;-------------------------------------------------------------------------
2280.w32:
2281    movu                ym7, [tlq]
2282    test             angled, 0x400
2283    jnz .w32_main
2284    vpbroadcastd         m2, [tlq-4]
2285    mova                ym0, [base+z_filter_s1]
2286    vbroadcasti32x4     ym1, [base+z_filter_s2]
2287    vbroadcasti32x4     ym3, [base+z_filter_s3]
2288    vbroadcasti32x4     ym4, [base+z_filter_s4]
2289    vpermi2b            ym0, ym7, ym2 ; al bl
2290    vpbroadcastd        ym5, [base+pb_31]
2291    pminub              ym5, [base+z_filter_s5]
2292    pshufb              ym1, ym7, ym1 ; ah bh
2293    vpbroadcastd       ym11, [base+z_filter_k+4*2+12*0]
2294    pshufb              ym3, ym7, ym3 ; cl ch
2295    vpbroadcastd       ym12, [base+z_filter_k+4*2+12*1]
2296    pshufb              ym4, ym7, ym4 ; el dl
2297    vpbroadcastd       ym13, [base+z_filter_k+4*2+12*2]
2298    vpermb              ym5, ym5, ym7 ; eh dh
2299    pmaddubsw           ym0, ym11
2300    pmaddubsw           ym1, ym11
2301    pmaddubsw           ym6, ym3, ym12
2302    vpbroadcastd       ym12, r6m
2303    pmaddubsw           ym3, ym13
2304    pmaddubsw           ym4, ym11
2305    pmaddubsw           ym5, ym11
2306    mova                 m9, [pb_0to63]
2307    packssdw           ym12, ym12
2308    paddw               ym0, ym6
2309    paddw               ym1, ym3
2310    paddw               ym0, ym4
2311    paddw               ym1, ym5
2312    packsswb           ym12, ym12
2313    pmulhrsw            ym0, ym15
2314    pmulhrsw            ym1, ym15
2315    vpcmpgtb             k1, ym12, ym9 ; x < max_width
2316    packuswb        ym7{k1}, ym0, ym1
2317    cmp                  hd, 16
2318    jg .w32_filter_h64
    ; h <= 16: force full strength (r3d = 3) for the 16-pixel left filter
2319    mov                 r3d, 3
2320    call .filter_left_h16
2321    jmp .w32_main
2322.w32_filter_h64:
2323    call .filter_left_h64
2324.w32_main:
    ; left-edge y positions for one 32-pixel row split across two ymm
    ; halves; m1 holds interleaved (xpos1, xpos0) and m12 = 2*dx steps
    ; both rows each iteration
2325    vbroadcasti32x8      m6, [base+z_ypos_mul1a] ; 1.. 8
2326    vbroadcasti32x8      m5, [base+z_ypos_mul1b] ; 9..15
2327    vpbroadcastw         m0, dyd
2328    vinserti32x4         m7, [tlq-16], 3
2329    rorx                r2q, dxq, 62 ; dx << 2
2330    vpbroadcastd         m2, [base+pb_1]
2331    vpbroadcastw         m1, r2d
2332    pmullw               m6, m0
2333    vbroadcasti32x8      m3, [base+z_xpos_off2a]
2334    pmullw               m5, m0
2335    vbroadcasti32x8      m4, [base+z_xpos_off2b]
2336    mova                ym0, ym1
2337    paddw               m12, m1, m1
2338    vpbroadcastd         m9, [base+pb_2]
2339    paddw                m1, m0       ; xpos1 xpos0
2340    mova                ym0, ym2
2341    psrlw               m10, m6, 1
2342    psrlw               m11, m5, 1
2343    vpermw              m10, m10, m14 ; 64-frac, frac
2344    psraw                m6, 6
2345    vpermw              m11, m11, m14
2346    psraw                m5, 6
2347    mov                 r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
2348    packsswb             m6, m5
2349    mov                 r3d, 1<<6
2350    paddsb               m6, m0
2351    sub                 r5d, dxd      ; left-only threshold
2352    paddsb               m0, m6, m2
2353    add                 dxd, dxd
2354    punpcklbw            m5, m6, m0   ; base, base+1
2355    punpckhbw            m6, m0
2356.w32_loop:
2357    pshufb              m17, m1, m2
2358    psrlw                m0, m1, 3
2359    paddb               m16, m3, m17
2360    vpermw               m0, m0, m14
2361    paddb               m17, m4
    ; k1/k2 flag lanes whose base_x went negative (word sign bit)
2362    vpmovw2m             k1, m16
2363    vpermb              m16, m16, m7
2364    vpmovw2m             k2, m17
2365    vpermb              m17, m17, m7
2366    pmaddubsw           m16, m0
2367    pmaddubsw           m17, m0
2368    add                 r3d, dxd
2369    jge .w32_toponly
    ; merge in the left-edge prediction for the masked lanes
2370    mova                 m0, m8
2371    vpermt2b             m0, m5, m7
2372    pmaddubsw       m16{k1}, m0, m10
2373    mova                 m0, m8
2374    vpermt2b             m0, m6, m7
2375    pmaddubsw       m17{k2}, m0, m11
2376.w32_toponly:
2377    pmulhrsw            m16, m15
2378    pmulhrsw            m17, m15
2379    packuswb            m16, m17
2380    vextracti32x8 [dstq+strideq*0], m16, 1
2381    mova          [dstq+strideq*1], ym16
2382    sub                  hd, 2
2383    jz .w32_end
2384    paddw                m1, m12
2385    lea                dstq, [dstq+strideq*2]
2386    paddb                m5, m9
2387    paddb                m6, m9
2388    cmp                 r3d, r5d
2389    jge .w32_loop
2390.w32_leftonly_loop:
2391    vpermb              m16, m5, m8
2392    vpermb              m17, m6, m8
2393    pmaddubsw           m16, m10
2394    pmaddubsw           m17, m11
2395    paddb                m5, m9
2396    paddb                m6, m9
2397    pmulhrsw            m16, m15
2398    pmulhrsw            m17, m15
2399    packuswb            m16, m17
2400    vextracti32x8 [dstq+strideq*0], m16, 1
2401    mova          [dstq+strideq*1], ym16
2402    lea                dstq, [dstq+strideq*2]
2403    sub                  hd, 2
2404    jg .w32_leftonly_loop
2405.w32_end:
2406    RET
;-------------------------------------------------------------------------
; z2 left-edge filter for tall blocks (up to 64 pixels): zmm-width 5-tap
; smoothing of the reversed left edge in m8, with shuffle indices clamped
; to h-1 and the fixed strength-3 kernel row (z_filter_k+4*2).
; In:  m8 = left edge, m2 = broadcast of [tlq-4], m9 = pb_0to63
; Out: m8 = filtered edge, merge-masked so only pixels with
;      y < max_height (stack arg r8m) are replaced.
; Clobbers m0, m1, m3-m6, m11-m13; r3d.
;-------------------------------------------------------------------------
2407.filter_left_h64:
2408    mova                 m0, [base+z_filter_s1]
2409    lea                 r3d, [hq-1]
2410    vbroadcasti32x4      m4, [base+z_filter_s4]
2411    vpbroadcastb         m5, r3d
2412    vbroadcasti32x4      m1, [base+z_filter_s2]
2413    vbroadcasti32x4      m3, [base+z_filter_s3]
2414    vpermi2b             m0, m8, m2 ; al bl
2415    pminub               m5, [base+z_filter_s5]
2416    pshufb               m1, m8, m1 ; ah bh
2417    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
2418    pshufb               m3, m8, m3 ; cl ch
2419    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
2420    pshufb               m4, m8, m4 ; el dl
2421    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
2422    vpermb               m5, m5, m8 ; eh dh
2423    pmaddubsw            m0, m11
2424    pmaddubsw            m1, m11
2425    pmaddubsw            m6, m3, m12
2426    vpbroadcastd        m12, r8m    ; max_height
2427    pmaddubsw            m3, m13
2428    pmaddubsw            m4, m11
2429    pmaddubsw            m5, m11
2430    packssdw            m12, m12
2431    paddw                m0, m6
2432    paddw                m1, m3
2433    paddw                m0, m4
2434    paddw                m1, m5
2435    packsswb            m12, m12
2436    pmulhrsw             m0, m15
2437    pmulhrsw             m1, m15
2438    vpcmpgtb             k1, m12, m9 ; y < max_height
2439    packuswb         m8{k1}, m0, m1
2440    ret
2441.w64:
2442    movu                 m7, [tlq]
2443    test             angled, 0x400
2444    jnz .w64_main
2445    vpbroadcastd         m2, [tlq-4]
2446    mova                 m0, [base+z_filter_s1]
2447    vbroadcasti32x4      m1, [base+z_filter_s2]
2448    vbroadcasti32x4      m3, [base+z_filter_s3]
2449    vbroadcasti32x4      m4, [base+z_filter_s4]
2450    vpermi2b             m0, m7, m2 ; al bl
2451    vpbroadcastd         m5, [base+pb_63]
2452    pminub               m5, [base+z_filter_s5]
2453    pshufb               m1, m7, m1 ; ah bh
2454    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
2455    pshufb               m3, m7, m3 ; cl ch
2456    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
2457    pshufb               m4, m7, m4 ; el dl
2458    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
2459    vpermb               m5, m5, m7 ; eh dh
2460    pmaddubsw            m0, m11
2461    pmaddubsw            m1, m11
2462    pmaddubsw            m6, m3, m12
2463    vpbroadcastd        m12, r6m
2464    pmaddubsw            m3, m13
2465    pmaddubsw            m4, m11
2466    pmaddubsw            m5, m11
2467    mova                 m9, [pb_0to63]
2468    packssdw            m12, m12
2469    paddw                m0, m6
2470    paddw                m1, m3
2471    paddw                m0, m4
2472    paddw                m1, m5
2473    packsswb            m12, m12
2474    pmulhrsw             m0, m15
2475    pmulhrsw             m1, m15
2476    vpcmpgtb             k1, m12, m9 ; x < max_width
2477    packuswb         m7{k1}, m0, m1
2478    call .filter_left_h64 ; always filter the full 64 pixels for simplicity
2479.w64_main:
2480    vpbroadcastw         m5, dyd
2481    vpbroadcastd         m9, [tlq-4]
2482    rorx                r2q, dxq, 62 ; dx << 2
2483    pmullw               m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
2484    pmullw               m5, [base+z_ypos_mul1b]     ; pixels aren't selected from the left edge
2485    vpbroadcastw         m1, r2d     ; xpos
2486    mova                 m3, [base+z_xpos_off2a]
2487    mova                 m4, [base+z_xpos_off2b]
2488    mova                m12, m1
2489    vpbroadcastd         m2, [base+pb_1]
2490    psrlw               m10, m6, 1
2491    psrlw               m11, m5, 1
2492    vpermw              m10, m10, m14 ; 64-frac, frac
2493    psraw                m6, 6
2494    vpermw              m11, m11, m14
2495    psraw                m5, 6
2496    mov                 r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
2497    packsswb             m6, m5
2498    mov                 r3d, 1<<6
2499    paddsb               m0, m6, m2
2500    sub                 r5d, dxd      ; left-only threshold
2501    punpcklbw            m5, m6, m0   ; base, base+1
2502    punpckhbw            m6, m0
2503.w64_loop:
2504    pshufb              m17, m1, m2
2505    psrlw                m0, m1, 3
2506    paddb               m16, m3, m17
2507    vpermw               m0, m0, m14
2508    paddb               m17, m4
2509    vpmovw2m             k1, m16      ; base_x < 0
2510    vpermi2b            m16, m7, m9
2511    vpmovw2m             k2, m17
2512    vpermi2b            m17, m7, m9
2513    pmaddubsw           m16, m0
2514    pmaddubsw           m17, m0
2515    add                 r3d, dxd
2516    jge .w64_toponly
2517    mova                 m0, m8
2518    vpermt2b             m0, m5, m9
2519    pmaddubsw       m16{k1}, m0, m10
2520    mova                 m0, m8
2521    vpermt2b             m0, m6, m9
2522    pmaddubsw       m17{k2}, m0, m11
2523.w64_toponly:
2524    pmulhrsw            m16, m15
2525    pmulhrsw            m17, m15
2526    packuswb            m16, m17
2527    mova             [dstq], m16
2528    dec                  hd
2529    jz .w64_end
2530    paddw                m1, m12
2531    add                dstq, strideq
2532    paddb                m5, m2
2533    paddb                m6, m2
2534    cmp                 r3d, r5d
2535    jge .w64_loop
2536.w64_leftonly_loop:
2537    vpermb              m16, m5, m8
2538    vpermb              m17, m6, m8
2539    pmaddubsw           m16, m10
2540    pmaddubsw           m17, m11
2541    paddb                m5, m2
2542    paddb                m6, m2
2543    pmulhrsw            m16, m15
2544    pmulhrsw            m17, m15
2545    packuswb            m16, m17
2546    mova             [dstq], m16
2547    add                dstq, strideq
2548    dec                  hd
2549    jg .w64_leftonly_loop
2550.w64_end:
2551    RET
2552
;-----------------------------------------------------------------------
; Z3 directional intra prediction (left-edge based), 8bpc, AVX512-ICL.
; args: dst, stride, tl (top-left edge pointer), w, h, angle, dy
; The per-row step dy is looked up in dr_intra_derivative from the
; (angle - 180)-derived index, then scaled by 64 (shl dyd, 6); dispatch
; is through ipred_z3_8bpc_avx512icl_table indexed by log2(w).
; m14 = z_frac_table (64-frac/frac weight pairs), m15 = pw_512 rounding
; constant for pmulhrsw. Left-edge pixels are loaded reversed through
; vpermb using pb_63to0-derived indices, clamped so out-of-range rows
; replicate the last valid pixel.
;-----------------------------------------------------------------------
2553cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
2554    lea                  r7, [z_filter_t0]
2555    tzcnt                wd, wm
2556    movifnidn        angled, anglem
2557    lea                  t0, [dr_intra_derivative+45*2-1]
2558    movsxd               wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
2559    sub              angled, 180
2560    mov                 dyd, angled
2561    neg                 dyd
2562    xor              angled, 0x400
2563    or                  dyq, ~0x7e
2564    mova                 m0, [base+pb_63to0]
2565    movzx               dyd, word [t0+dyq]
2566    lea                  wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
2567    movifnidn            hd, hm
2568    mova                m14, [base+z_frac_table]
2569    shl                 dyd, 6
2570    vpbroadcastd        m15, [base+pw_512]
2571    jmp                  wq
2572.w4:
2573    cmp              angleb, 40
2574    jae .w4_no_upsample
2575    lea                 r3d, [angleq-1024]
2576    sar                 r3d, 7
2577    add                 r3d, hd
2578    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
2579    lea                 r3d, [hq+4]
2580    call .upsample
2581    movshdup             m1, [base+z_ypos_off1]
2582    vpbroadcastd         m6, [base+pb_16]
2583    jmp .w4_main2
2584.w4_no_upsample:
2585    lea                 r3d, [hq+3]
2586    vpbroadcastb         m9, r3d
2587    vpxord               m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
2588    pmaxub               m1, m0
2589    vpermb               m7, m1, [tlq-64*1]
2590    test             angled, 0x400 ; !enable_intra_edge_filter
2591    jnz .w4_main
2592    vpbroadcastb        xm1, angled
2593    shr              angled, 8
2594    vpcmpeqb             k1, xm9, [base+z_filter_wh]
2595    vpbroadcastd         m2, [tlq-3]
2596    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
2597    kmovw               r5d, k1
2598    test                r5d, r5d
2599    jz .w4_main
2600    pminub               m9, [pb_0to63]
2601    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
2602    vpermb               m7, m9, m0
2603.w4_main:
2604    movsldup             m1, [base+z_ypos_off1]
2605    vpbroadcastd         m6, [base+pb_8]
2606.w4_main2:
2607    vpbroadcastw         m0, dyd
2608    vpbroadcastq         m2, [base+z_ypos_mul2a] ; 1..4
2609    pmulhuw              m2, m0 ; ypos >> 1
2610    lea                  r2, [strideq*3]
2611    vpermw               m3, m2, m14 ; 64-frac, frac
2612    psrlw                m2, 5
2613    packsswb             m2, m2
2614    punpcklbw            m2, m2
2615    paddsb               m2, m1 ; base, base+1
2616.w4_loop:
2617    vpermb               m0, m2, m7
2618    pmaddubsw            m0, m3
2619    paddsb               m2, m6
2620    pmulhrsw             m0, m15
2621    vpmovwb             ym0, m0
2622    movd   [dstq+strideq*0], xm0
2623    pextrd [dstq+strideq*1], xm0, 1
2624    pextrd [dstq+strideq*2], xm0, 2
2625    pextrd [dstq+r2       ], xm0, 3
2626    sub                  hd, 8
2627    jl .w4_end
2628    vextracti32x4       xm0, ym0, 1
2629    lea                dstq, [dstq+strideq*4]
2630    movd   [dstq+strideq*0], xm0
2631    pextrd [dstq+strideq*1], xm0, 1
2632    pextrd [dstq+strideq*2], xm0, 2
2633    pextrd [dstq+r2       ], xm0, 3
2634    lea                dstq, [dstq+strideq*4]
2635    jg .w4_loop
2636.w4_end:
2637    RET
; Subroutine: 2x upsample of the left edge (doubles dy to compensate);
; shared by the small-w/small-h mild-angle paths of .w4 and .w8.
2638.upsample:
2639    xor                 r3d, 31 ; 31 - (h + imin(w, h))
2640    vbroadcasti32x4     ym0, [base+z_xpos_off2a]
2641    vpbroadcastb        ym7, r3d
2642    pmaxub              ym7, [base+z3_upsample]
2643    vbroadcasti32x4     ym1, [base+z_filter_s4]
2644    vpermb              ym7, ym7, [tlq-31]
2645    vpbroadcastd        ym2, [base+pb_m4_36]
2646    pshufb              ym0, ym7, ym0
2647    psrldq              ym7, 1
2648    pshufb              ym1, ym7, ym1
2649    pmaddubsw           ym0, ym2
2650    pmaddubsw           ym1, ym2
2651    add                 dyd, dyd
2652    paddw               ym0, ym1
2653    pmulhrsw            ym0, ym15
2654    packuswb            ym0, ym0
2655    punpcklbw           ym7, ym0
2656    ret
2657.w8:
2658    lea                 r3d, [angleq+216]
2659    mov                 r3b, hb
2660    cmp                 r3d, 8
2661    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
2662    lea                 r3d, [hq*2]
2663    call .upsample
2664    pshufd               m1, [base+z_ypos_off1], q0000
2665    vpbroadcastd         m6, [base+pb_8]
2666    jmp .w8_main2
2667.w8_no_upsample:
2668    mov                 r3d, 8
2669    cmp                  hd, 4
2670    cmove               r3d, hd
2671    lea                 r3d, [r3+hq-1]
2672    xor                 r3d, 63 ; 63 - (h + imin(w, h))
2673    vpbroadcastb         m1, r3d ; bugfix: was wd; broadcast the clamp index computed above
2674    pmaxub               m1, m0
2675    vpermb               m7, m1, [tlq-64*1]
2676    test             angled, 0x400 ; !enable_intra_edge_filter
2677    jnz .w8_main
2678    lea                 r3d, [hq+7]
2679    call .filter_strength
2680    test                r5d, r5d
2681    jz .w8_main
2682    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
2683    vpermb               m7, m10, m0
2684.w8_main:
2685    movsldup             m1, [base+z_ypos_off2]
2686    vpbroadcastd         m6, [base+pb_4]
2687.w8_main2:
2688    vpbroadcastw         m0, dyd
2689    vbroadcasti32x4      m2, [base+z_ypos_mul2a] ; 1..8
2690    pmulhuw              m2, m0 ; ypos >> 1
2691    lea                  r2, [strideq*3]
2692    vpermw               m3, m2, m14 ; 64-frac, frac
2693    psrlw                m2, 5
2694    packsswb             m2, m2
2695    punpcklbw            m2, m2
2696    paddsb               m2, m1 ; base, base+1
2697.w8_loop:
2698    vpermb               m0, m2, m7
2699    pmaddubsw            m0, m3
2700    paddsb               m2, m6
2701    pmulhrsw             m0, m15
2702    vpmovwb             ym0, m0
2703    vextracti32x4       xm1, ym0, 1
2704    movq   [dstq+strideq*0], xm0
2705    movhps [dstq+strideq*1], xm0
2706    movq   [dstq+strideq*2], xm1
2707    movhps [dstq+r2       ], xm1
2708    lea                dstq, [dstq+strideq*4]
2709    sub                  hd, 4
2710    jg .w8_loop
2711    RET
; Subroutine: edge-filter strength test. On return r5d != 0 iff the edge
; should be filtered (comparison mask also left in k1); m10 = clamped
; 0..r3d index vector for the callers' vpermb.
2712.filter_strength:
2713    vpbroadcastd         m2, [tlq-3]
2714.filter_strength2:
2715    vpbroadcastb         m9, r3d
2716    vpbroadcastb        ym1, angled
2717    shr              angled, 8
2718    vpcmpeqb             k1, ym9, [base+z_filter_wh]
2719    mova                xm0, [base+z_filter_t0+angleq*8]
2720    vpcmpgtb         k1{k1}, ym1, ym0
2721    pminub              m10, m9, [pb_0to63]
2722    kmovd               r5d, k1
2723    ret
; Subroutine: load up to 128 left-edge pixels, reversed and clamped, into
; m7 (near half) and m8 (far half / h==64 second cacheline).
2724.w16_load:
2725    cmp                 r3d, hd
2726    cmovae              r3d, hd
2727    add                 r3d, hd
2728    mova                 m7, [tlq-64*1]
2729    neg                 r3d ; -(h + imin(w, h))
2730    and                 r3d, 63
2731    vpbroadcastb         m1, r3d
2732    pmaxub               m2, m0, m1
2733    cmp                  hd, 64
2734    je .w16_load_h64
2735    vpermb               m8, m1, m7
2736    vpermb               m7, m2, m7
2737    ret
2738.w16_load_h64:
2739    vpermb               m7, m0, m7
2740    vpermb               m8, m2, [tlq-64*2]
2741    ret
2742.w16:
2743    mov                 r3d, 16
2744    call .w16_load
2745    test             angled, 0x400 ; !enable_intra_edge_filter
2746    jnz .w16_main
2747    vpbroadcastd         m2, [tlq-3]
2748    cmp                  hd, 64
2749    je .w16_filter64
2750    lea                 r3d, [hq+15]
2751    call .filter_strength2
2752    test                r5d, r5d
2753    jz .w16_main
2754    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
2755    pminub              m10, m9, [pb_0to63]
2756    vpermb               m8, m9, m0
2757    vpermb               m7, m10, m0
2758    jmp .w16_main
2759.w16_filter64:
2760    vpbroadcastd        m13, [base+pb_15]
2761    valignq              m0, m8, m7, 7
2762    pminub              m12, m13, [pb_0to63]
2763    valignq             m11, m8, m7, 1
2764    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2765.w16_main:
2766    vbroadcasti32x4      m3, [base+z_ypos_mul2a] ; 1.. 8
2767    vbroadcasti32x4      m2, [base+z_ypos_mul2b] ; 9..15
2768    vpbroadcastw         m0, dyd
2769    vpbroadcastd         m6, [base+pb_4]
2770    pmulhuw              m3, m0 ; ypos >> 1
2771    pmulhuw              m2, m0
2772    movshdup             m0, [base+z_ypos_off2]
2773    lea                  r2, [strideq*3]
2774    vpbroadcastd         m1, [base+pb_1]
2775    vpermw               m4, m3, m14 ; 64-frac, frac
2776    psrlw                m3, 5
2777    vpermw               m5, m2, m14
2778    psrlw                m2, 5
2779    packsswb             m3, m2
2780    paddsb               m3, m0
2781    paddsb               m1, m3
2782    punpcklbw            m2, m3, m1 ; base, base+1
2783    punpckhbw            m3, m1
2784.w16_loop:
; Z3_PERM2: gather base/base+1 pixel pairs from m7/m8 (indices m2/m3),
; weight with m4/m5, advance the indices by m6, round (pmulhrsw m15) and
; pack the two halves into m0.
2785%macro Z3_PERM2 0
2786    mova                 m0, m7
2787    vpermt2b             m0, m2, m8
2788    mova                 m1, m7
2789    vpermt2b             m1, m3, m8
2790    pmaddubsw            m0, m4
2791    pmaddubsw            m1, m5
2792    paddsb               m2, m6
2793    paddsb               m3, m6
2794    pmulhrsw             m0, m15
2795    pmulhrsw             m1, m15
2796    packuswb             m0, m1
2797%endmacro
2798    Z3_PERM2
2799    mova          [dstq+strideq*0], xm0
2800    vextracti32x4 [dstq+strideq*1], ym0, 1
2801    vextracti32x4 [dstq+strideq*2], m0, 2
2802    vextracti32x4 [dstq+r2       ], m0, 3
2803    lea                dstq, [dstq+strideq*4]
2804    sub                  hd, 4
2805    jg .w16_loop
2806    RET
2807.w32:
2808    mov                  r3d, 32
2809    call .w16_load
2810    test             angled, 0x400 ; !enable_intra_edge_filter
2811    jnz .w32_main
2812    vpbroadcastd         m2, [tlq-3]
2813    cmp                  hd, 64
2814    je .w32_filter64
2815    lea                 r3d, [hq+31]
2816    vpbroadcastb         m9, r3d
2817    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
2818    vpermb               m8, m9, m7
2819    jmp .w32_main
2820.w32_filter64:
2821    vpbroadcastd        m13, [base+pb_31]
2822    valignq              m0, m8, m7, 7
2823    pminub              m12, m13, [pb_0to63]
2824    valignq             m11, m8, m7, 1
2825    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2826.w32_main:
2827    vbroadcasti32x8      m3, [base+z_ypos_mul2a] ; 1.. 8
2828    vbroadcasti32x8      m2, [base+z_ypos_mul2b] ; 9..15
2829    vpbroadcastw         m0, dyd
2830    vpbroadcastd         m1, [base+pb_1]
2831    pmulhuw              m3, m0 ; ypos >> 1
2832    pmulhuw              m2, m0
2833    vpbroadcastd         m6, [base+pb_2]
2834    mova                ym0, ym1
2835    vpermw               m4, m3, m14 ; 64-frac, frac
2836    psrlw                m3, 5
2837    vpermw               m5, m2, m14
2838    psrlw                m2, 5
2839    packsswb             m3, m2
2840    paddsb               m3, m0
2841    paddsb               m1, m3
2842    punpcklbw            m2, m3, m1 ; base, base+1
2843    punpckhbw            m3, m1
2844.w32_loop:
2845    Z3_PERM2
2846    vextracti32x8 [dstq+strideq*0], m0, 1
2847    mova          [dstq+strideq*1], ym0
2848    lea                dstq, [dstq+strideq*2]
2849    sub                  hd, 2
2850    jg .w32_loop
2851    RET
2852.w64:
2853    mova                 m7, [tlq-64*1]
2854    cmp                  hd, 64
2855    je .w64_h64
2856    lea                 r3d, [hq*2-1]
2857    xor                 r3d, 63 ; -(h + imin(w, h)) & 63
2858    vpbroadcastb         m1, r3d
2859    pmaxub               m0, m1
2860    vpermb               m8, m1, m7
2861    jmp .w64_filter
2862.w64_h64:
2863    vpermb               m8, m0, [tlq-64*2]
2864.w64_filter:
2865    vpermb               m7, m0, m7
2866    test             angled, 0x400 ; !enable_intra_edge_filter
2867    jnz .w64_main
2868    lea                 r3d, [hq-1]
2869    vpbroadcastd         m2, [tlq-3]
2870    vpbroadcastb        m13, r3d
2871    valignq              m0, m8, m7, 7
2872    pminub              m12, m13, [pb_0to63]
2873    valignq             m11, m8, m7, 1
2874    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2875.w64_main:
2876    vpbroadcastw         m2, dyd
2877    pmulhuw              m3, m2, [base+z_ypos_mul2a]
2878    pmulhuw              m2, [base+z_ypos_mul2b]
2879    vpbroadcastd         m6, [base+pb_1]
2880    vpermw               m4, m3, m14 ; 64-frac, frac
2881    psrlw                m3, 5
2882    vpermw               m5, m2, m14
2883    psrlw                m2, 5
2884    packsswb             m3, m2
2885    paddsb               m1, m3, m6
2886    punpcklbw            m2, m3, m1 ; base, base+1
2887    punpckhbw            m3, m1
2888.w64_loop:
2889    Z3_PERM2
2890    mova             [dstq], m0
2891    add                dstq, strideq
2892    dec                  hd
2893    jg .w64_loop
2894    RET
2895
2896; The ipred_filter code processes 4x2 blocks in the following order
2897; which increases parallelism compared to doing things row by row.
2898; Some redundant blocks are calculated for w > 4.
2899;     w4     w8       w16             w32
2900;     1     1 2     1 2 3 4     1 2 3 4 9 a b c
2901;     2     2 3     2 3 4 5     2 3 4 5 a b c d
2902;     3     3 4     3 4 5 6     3 4 5 6 b c d e
2903;     4     4 5     4 5 6 7     4 5 6 7 c d e f
2904;     5     5 6     5 6 7 8     5 6 7 8 d e f g
2905;     6     6 7     6 7 8 9     6 7 8 9 e f g h
2906;     7     7 8     7 8 9 a     7 8 9 a f g h i
2907; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
2908;           9       9 a b               h i j
2909;                   a b                 i j
2910;                   b                   j
2911
;-----------------------------------------------------------------------
; FILTER_PRED intra prediction, 8bpc, AVX512-ICL.
; args: dst, stride, tl (top-left edge pointer), w, h, flt (filter set)
; The four 16-byte tap vectors for filter set 'flt' are loaded from
; filter_taps (fltd << 6 bytes in) into m7-m10 and applied with vpdpbusd
; (u8 x s8 dot products) against pixel vectors, using pd_8 as rounding
; bias and a >>4 shift (psraw). 4x2 output blocks are processed in the
; diagonal order shown in the diagram above this function, so up to four
; blocks are computed in parallel 128-bit lanes; m11/m12 hold the
; filter_perm/filter_end shuffles that route each block's outputs to the
; next iteration's inputs and to dst.
; NOTE(review): the per-lane letter comments (a0, b1, tN, lN, ...) track
; which 4x2 block / edge pixels occupy each lane — inherited notation.
;-----------------------------------------------------------------------
2912cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
2913%define base r6-filter_taps
2914    lea                  r6, [filter_taps]
2915%ifidn fltd, fltm
2916    movzx              fltd, fltb
2917%else
2918    movzx              fltd, byte fltm
2919%endif
2920    vpbroadcastd       xmm2, [tlq+1]        ; t0 t0 t0 t0
2921    movifnidn            hd, hm
2922    shl                fltd, 6
2923    vpbroadcastd         m6, [base+pd_8]
2924    vpbroadcastd       xmm3, [tlq-2]        ; l1 l0 tl __
2925    vbroadcasti32x4      m7, [r6+fltq+16*0] ; p1 p2 p3 p4
2926    vbroadcasti32x4      m8, [r6+fltq+16*1]
2927    vbroadcasti32x4      m9, [r6+fltq+16*2] ; p6 p5 p0 __
2928    vbroadcasti32x4     m10, [r6+fltq+16*3]
2929    mova               xmm0, xm6
2930    vpdpbusd           xmm0, xmm2, xm7
2931    mova               xmm1, xm6
2932    vpdpbusd           xmm1, xmm2, xm8
2933    vpdpbusd           xmm0, xmm3, xm9
2934    vpdpbusd           xmm1, xmm3, xm10
2935    packssdw           xmm0, xmm1
2936    cmp                  wd, 8
2937    jb .w4
2938    vpbroadcastd        ym2, [tlq+5]
2939    mova                m11, [base+filter_perm]
2940    mov                  r5, 0xffffffffffff000f
2941    psrldq             xmm2, 1           ; __ t0
2942    kmovq                k1, r5          ; 0x000f
2943    psraw               xm5, xmm0, 4
2944    packuswb           xmm2, xm5         ; __ t0 a0 b0
2945    pshufd          ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0   t1 t1 t1 t1
2946    je .w8
; w >= 16: set up the 4-blocks-in-flight pipeline before entering .w16_loop
2947    kxnorb               k3, k3, k3      ; 0x00ff
2948    vpbroadcastd        xm3, [tlq-4]
2949    kandnq               k2, k3, k1      ; 0xffffffffffff0000
2950    vpermb          ym3{k2}, ym11, ymm2  ; l3 l2 l1 __   b3 a3 t3 __
2951    mova                ym0, ym6
2952    vpdpbusd            ym0, ym2, ym7
2953    mova                ym1, ym6
2954    vpdpbusd            ym1, ym2, ym8
2955    pshufb          ym5{k2}, ym2, ym11   ; a0 b0   __ t0
2956    vpbroadcastd         m2, [tlq+9]
2957    vpdpbusd            ym0, ym3, ym9
2958    vpdpbusd            ym1, ym3, ym10
2959    vpbroadcastd        xm3, [tlq-6]     ; l5 l4 l3 __
2960    kunpckbw             k4, k1, k3      ; 0x0fff
2961    packssdw            ym0, ym1
2962    psraw               ym0, 4           ; c0 d0         a1 b1
2963    packuswb            ym5, ym0         ; a0 b0 c0 d0   __ t1 a1 b1
2964    pshufd           m2{k3}, m5, q3333   ; d0 d0 d0 d0   b1 b1 b1 b1   t2 t2 t2 t2
2965    vpermb           m3{k2}, m11, m5     ; l5 l4 l3 __   d3 c3 b3 __   b7 a7 t7 __
2966    mova                 m4, m6
2967    vpdpbusd             m4, m2, m7
2968    mova                 m1, m6
2969    vpdpbusd             m1, m2, m8
2970    psrldq               m0, m2, 1       ; __ d0         __ b1         __ t2
2971    vpbroadcastd         m2, [tlq+13]
2972    vpdpbusd             m4, m3, m9
2973    vpdpbusd             m1, m3, m10
2974    mova                m12, [base+filter_end]
2975    lea                 r5d, [hq-6]
2976    mov                  r6, dstq
2977    cmovp                hd, r5d         ; w == 16 ? h : h - 6
2978    packssdw             m4, m1
2979    psraw                m4, 4           ; e0 f0         c1 d1         a2 b2
2980    packuswb             m0, m4          ; __ d0 e0 f0   __ b1 c1 d1   __ t2 a2 b2
2981    pshufd           m2{k4}, m0, q3333   ; f0 f0 f0 f0   d1 d1 d1 d1   b2 b2 b2 b2   t3 t3 t3 t3
; steady-state: four diagonal blocks per iteration, two output rows stored
2982.w16_loop:
2983    vpbroadcastd        xm3, [tlq-8]
2984    vpermb           m3{k2}, m11, m0     ; l7 l6 l5 __   f3 e3 d3 __   d7 c7 b7 __   bb ab tb __
2985    mova                 m1, m6
2986    vpdpbusd             m1, m2, m7
2987    mova                 m0, m6
2988    vpdpbusd             m0, m2, m8
2989    sub                 tlq, 2
2990    vpdpbusd             m1, m3, m9
2991    vpdpbusd             m0, m3, m10
2992    packssdw             m1, m0
2993    mova                 m0, m4
2994    psraw                m4, m1, 4       ; g0 h0         e1 f1         c2 d2         a3 b3
2995    packuswb             m0, m4          ; e0 f0 g0 h0   c1 d1 e1 f1   a2 b2 c2 d2   __ __ a3 b3
2996    pshufd               m2, m0, q3333   ; h0 h0 h0 h0   f1 f1 f1 f1   d2 d2 d2 d2   b3 b3 b3 b3
2997    vpermt2d             m5, m12, m0     ; c0 d0 e0 f0   __ __ c1 d1   a0 a1 a2 a3   b0 b1 b2 b3
2998    vextracti32x4 [dstq+strideq*0], m5, 2
2999    vextracti32x4 [dstq+strideq*1], m5, 3
3000    lea                dstq, [dstq+strideq*2]
3001    sub                  hd, 2
3002    jg .w16_loop
3003    cmp                  wd, 16
3004    je .ret
; w == 32: the right half re-reads the already-written left-half rows
; (via r6) as its left neighbours
3005    mova               xm13, [filter_perm+16]
3006    mova               xmm3, [r6+strideq*0]
3007    punpckhdq          xmm3, [r6+strideq*1]
3008    vpbroadcastd     m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4   f1 f1 f1 f1   d2 d2 d2 d2   b3 b3 b3 b3
3009    pinsrb              xm3, xmm3, [tlq+r5+16], 7
3010    pshufb              xm3, xm13
3011    vpermb           m3{k2}, m11, m0     ; bf af tf __   h3 g3 f3 __   f7 e7 d7 __   db cb bb __
3012    mova                 m0, m6
3013    vpdpbusd             m0, m2, m7
3014    mova                 m1, m6
3015    vpdpbusd             m1, m2, m8
3016    kunpckbw             k5, k3, k1      ; 0xff0f
3017    lea                  r3, [strideq*3]
3018    vpdpbusd             m0, m3, m9
3019    vpdpbusd             m1, m3, m10
3020    packssdw             m0, m1
3021    psraw                m0, 4           ; a4 b4         g1 h1         e2 f2         c3 d3
3022    packuswb             m4, m0          ; g0 h0 a4 b4   e1 f1 g1 h1   c2 d2 e2 f2   __ __ c3 d3
3023    vpblendmb        m1{k3}, m4, m2      ; __ t4 a4 b4   e1 f1 g1 h1   c2 d2 e2 f2   __ __ c3 d3
3024    vpbroadcastd        ym2, [tlq+r5+21]
3025    pshufd           m2{k5}, m4, q3333   ; b4 b4 b4 b4   t5 t5 t5 t5   f2 f2 f2 f2   d3 d3 d3 d3
3026    vpermt2d             m5, m12, m4     ; e0 f0 g0 h0   __ __ e1 f1   c0 c1 c2 c3   d0 d1 d2 d3
3027    vextracti32x4 [dstq+strideq*0], m5, 2
3028    vextracti32x4 [dstq+strideq*1], m5, 3
3029    punpckhqdq         xmm3, [r6+r3]
3030    pinsrb             xmm3, [r6+strideq*2+15], 11
3031    pshufb              xm3, xmm3, xm13
3032    vpermb           m3{k2}, m11, m1     ; df cf bf __   bj aj tj __   h7 g7 f7 __   fb eb db __
3033    mova                 m4, m6
3034    vpdpbusd             m4, m2, m7
3035    mova                 m1, m6
3036    vpdpbusd             m1, m2, m8
3037    kxnord               k3, k3, k4      ; 0xfffff0ff
3038    lea                  r4, [strideq*5]
3039    vpdpbusd             m4, m3, m9
3040    vpdpbusd             m1, m3, m10
3041    packssdw             m4, m1
3042    psraw                m4, 4           ; c4 d4         a5 b5         g2 h2         e3 f3
3043    packuswb             m0, m4          ; a4 b4 c4 d4   g1 h1 a5 b5   e2 f2 g2 h2   __ __ e3 f3
3044    vpblendmw        m1{k3}, m2, m0      ; a4 b4 c4 d4   __ t5 a5 b5   e2 f2 g2 h2   __ __ e3 f3
3045    vpbroadcastd         m2, [tlq+r5+25]
3046    pshufd           m2{k3}, m0, q3333   ; d4 d4 d4 d4   b5 b5 b5 b5   t6 t6 t6 t6   f3 f3 f3 f3
3047    vpermt2d             m5, m12, m0     ; g0 h0 a4 b4   __ __ g1 h1   e0 e1 e2 e3   f0 f1 f2 f3
3048    vextracti32x4 [dstq+strideq*2], m5, 2
3049    vextracti32x4 [dstq+r3       ], m5, 3
3050    punpckhqdq         xmm3, [r6+r4]
3051    pinsrb             xmm3, [r6+strideq*4+15], 11
3052    pshufb              xm3, xmm3, xm13
3053    vpermb           m3{k2}, m11, m1     ; ff ef df __   dj cj bj __   bn an tn __   hb gb fb __
3054    mova                 m0, m6
3055    vpdpbusd             m0, m2, m7
3056    mova                 m1, m6
3057    vpdpbusd             m1, m2, m8
3058    kunpckwd             k1, k1, k2      ; 0x000f0000
3059    vpdpbusd             m0, m3, m9
3060    vpdpbusd             m1, m3, m10
3061    packssdw             m0, m1
3062    psraw                m0, 4           ; e4 f4         c5 d5         a6 b6         g3 h3
3063    packuswb             m4, m0          ; c4 d4 e4 f4   a5 b5 c5 d5   g2 h2 a6 b6   __ __ g3 h3
3064    vpblendmw        m1{k1}, m4, m2      ; c4 d4 e4 f4   a5 b5 c5 d5   __ t6 a6 b6   __ __ g3 h3
3065    vpbroadcastd         m2, [tlq+r5+29]
3066    pshufd           m2{k4}, m4, q3333   ; f4 f4 f4 f4   d5 d5 d5 d5   b6 b6 b6 b6   t7 t7 t7 t7
3067    vpermt2d             m5, m12, m4     ; a4 b4 c4 d4   __ __ a5 b5   g0 g1 g2 g3   h0 h1 h2 h3
3068    vextracti32x4 [dstq+strideq*4], m5, 2
3069    vextracti32x4 [dstq+r4       ], m5, 3
3070    lea                  r0, [strideq+r3*2]
; right-half steady state; stores land at +16 bytes relative to the rows
; written by .w16_loop
3071.w32_loop:
3072    punpckhqdq         xmm3, [r6+r0]
3073    pinsrb             xmm3, [r6+r3*2+15], 11
3074    pshufb              xm3, xmm3, xm13
3075    vpermb           m3{k2}, m11, m1     ; hf gf ff __   fj ej dj __   dn cn bn __   br ar tr __
3076.w32_loop_tail:
3077    mova                 m4, m6
3078    vpdpbusd             m4, m2, m7
3079    mova                 m1, m6
3080    vpdpbusd             m1, m2, m8
3081    vpdpbusd             m4, m3, m9
3082    vpdpbusd             m1, m3, m10
3083    packssdw             m4, m1
3084    mova                 m1, m0
3085    psraw                m0, m4, 4       ; g4 h4         e5 f5         c6 d6         a7 b7
3086    packuswb             m1, m0          ; e4 f4 g4 h4   c5 d5 e5 f5   a6 b6 c6 d6   __ __ a7 b7
3087    pshufd               m2, m1, q3333   ; h4 h4 h4 h4   f5 f5 f5 f5   d6 d6 d6 d6   b7 b7 b7 b7
3088    vpermt2d             m5, m12, m1     ; c4 d4 e4 f4   __ __ c5 d5   a4 a5 a6 a7   b4 b5 b6 b7
3089    vextracti32x4 [r6+strideq*0+16], m5, 2
3090    vextracti32x4 [r6+strideq*1+16], m5, 3
3091    lea                  r6, [r6+strideq*2]
3092    sub                 r5d, 2
3093    jg .w32_loop
3094    vpermb               m3, m11, m1
3095    cmp                 r5d, -6
3096    jg .w32_loop_tail
3097.ret:
3098    RET
; w == 8: two blocks in flight in a ymm register, two rows per iteration
3099.w8:
3100    vpermb              ym3, ym11, ymm2
3101.w8_loop:
3102    vpbroadcastd    ym3{k1}, [tlq-4]     ; l3 l2 l1 __   b3 a3 t3 __
3103    mova                ym0, ym6
3104    vpdpbusd            ym0, ym2, ym7
3105    mova                ym1, ym6
3106    vpdpbusd            ym1, ym2, ym8
3107    sub                 tlq, 2
3108    vpdpbusd            ym0, ym3, ym9
3109    vpdpbusd            ym1, ym3, ym10
3110    mova                ym3, ym5
3111    packssdw            ym0, ym1
3112    psraw               ym5, ym0, 4      ; c0 d0         a1 b1
3113    packuswb            ym3, ym5         ; a0 b0 c0 d0   __ __ a1 b1
3114    pshufd              ym2, ym3, q3333  ; d0 d0 d0 d0   b1 b1 b1 b1
3115    vpermb              ym3, ym11, ym3   ; a0 a1 b0 b1
3116    movq   [dstq+strideq*0], xm3
3117    movhps [dstq+strideq*1], xm3
3118    lea                dstq, [dstq+strideq*2]
3119    sub                  hd, 2
3120    jg .w8_loop
3121    RET
; w == 4: a single block per iteration, strictly row by row
3122.w4_loop:
3123    vpbroadcastd       xmm3, [tlq-4]     ; l3 l2 l1 __
3124    mova               xmm0, xm6
3125    vpdpbusd           xmm0, xmm2, xm7
3126    mova               xmm1, xm6
3127    vpdpbusd           xmm1, xmm2, xm8
3128    sub                 tlq, 2
3129    vpdpbusd           xmm0, xmm3, xm9
3130    vpdpbusd           xmm1, xmm3, xm10
3131    packssdw           xmm0, xmm1
3132.w4:
3133    psraw              xmm0, 4           ; a0 b0
3134    packuswb           xmm0, xmm0
3135    movd   [dstq+strideq*0], xmm0
3136    pshufd             xmm2, xmm0, q1111 ; b0 b0 b0 b0
3137    movd   [dstq+strideq*1], xmm2
3138    lea                dstq, [dstq+strideq*2]
3139    sub                  hd, 2
3140    jg .w4_loop
3141    RET
3142
3143%endif ; ARCH_X86_64
3144