; xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; Emits one (w-128, 127-w) signed byte pair per argument. pmaddubsw
; treats its second operand as signed, so the pair encodes the weights
; w and 256-w with a -128/-129 offset; the SMOOTH macro below adds the
; compensating 128*a + 129*b term back in.
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1               ; (w - 128), (127 - w)
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
; Four groups of 4, 8, 16 and 32 weights (one group per block dimension);
; each weight expands to a 2-byte pair, giving the hq*4 indexing used by
; ipred_smooth_v below.
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; Shuffle masks (for pshufb), filter strength tables and scalar constants
; shared by the intra predictors below. The *_shuf labels are 16-byte
; pshufb control masks; -1 entries zero the corresponding output byte.
ipred_v_shuf:     db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
ipred_h_shuf:     db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
ipred_paeth_shuf: db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
z_upsample1:      db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z_upsample2:      db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
z_transpose4:     db  8, 12,  0,  4,  9, 13,  1,  5, 10, 14,  2,  6, 11, 15,  3,  7
z3_shuf:          db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z3_shuf_h4:       db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
filter_shuf1:     db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
filter_shuf2:     db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
; NOTE(review): removed a stray trailing comma after the 4-entry wh4 list.
; Presumably the edge-filter threshold tables indexed by angle; the w4
; variant only needs 4 entries -- TODO confirm against the z1/z2/z3 users.
z_filter_wh4:     db  7,  7, 19,  7
z_filter_wh8:     db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
pd_32768:         dd 32768
z3_filter_k_tail: db 64,  0, 64,  0, 64,  0, 56,  8
z1_shuf_w4:       db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
pb_0to15:         db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
pb_15to0:         db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
z_base_inc:       dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
z3_base_inc:      dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64
z_filter_wh16:    db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filter_t_w48:   db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
                  db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
z_filter_t_w16:   db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
z_filter_s:       db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
                  db  7,  8,  8,  9,  9, 10, 10, 11
z_filter_k_tail:  db  0, 64,  0, 64,  8, 56,  0, 64
z2_h_shuf:        db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
z2_upsample:      db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
z2_dy_offset:     dw 88*64, 88*64, 87*64, 87*64
pw_m1to4:         dw -1, -2, -3, -4
z_filter_k:       times  4 db  0, 16
                  times  4 db  0, 20
                  times  4 db  8, 16
                  times  4 db 32, 16
                  times  4 db 24, 20
                  times  4 db 16, 16
                  times  4 db  0,  0
                  times  4 db  0,  0
; pw_8 is deliberately written as little-endian byte pairs (8, 0) == dw 8.
pw_8:             times  8 db  8,  0
pb_3:             times 16 db 3
pb_16:            times 16 db 16
pw_62:            times  8 dw 62
pw_64:            times  8 dw 64
pw_256:           times  8 dw 256
pw_512:           times  8 dw 512
pw_m256:          times  8 dw -256
; The 8-byte constants below are broadcast with movddup at their use sites.
pb_2:             times  8 db 2
pb_4:             times  8 db 4
pb_8:             times  8 db 8
pb_128:           times  8 db 128
pb_m16:           times  8 db -16
pw_128:           times  4 dw 128
pw_255:           times  4 dw 255
pb_36_m4:         times  4 db 36, -4
pb_127_m127:      times  4 db 127, -127

; JMP_TABLE name, isa_suffix, label1 [, label2 ...]
; Builds a table of 32-bit offsets, one per .label of the corresponding
; cglobal function, for "movsxd wq, [table+wq*4]; add wq, table" dispatch.
; The exported symbol is biased by -2*4 so that tzcnt(width) (always >= 2,
; since the smallest block dimension is 4) indexes the first entry directly.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)   ; offset of .label from biased base
        %rotate 1
    %endrep
%endmacro

; The dc table lists 5 hN entries, then 5 wN entries, then the sN (splat)
; entries, so the splat sub-table starts 10*4 bytes in; cfl has 4+4
; entries before its splat sub-table, hence 8*4.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)

JMP_TABLE ipred_h,          ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left,    ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3,         ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left,   ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter,     ssse3, w4, w8, w16, w32

; Tables shared with the C code / other asm files.
cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; IPRED_SET width, row_offset, pshuflw_imm8
; Broadcasts one word of m0 (a left pixel duplicated into a byte pair)
; across a full row of `width` pixels at dstq+row_offset.
%macro IPRED_SET   3                                          ; width, row offset, pshuflw imm8
    pshuflw                      m1, m0, %3                   ; extend 8 byte for 2 pos
    punpcklqdq                   m1, m1                       ; duplicate into the high half
    mova           [dstq +      %2], m1
%if %1 > 16
    mova           [dstq + 16 + %2], m1
%endif
%if %1 > 32
    mova           [dstq + 32 + %2], m1
    mova           [dstq + 48 + %2], m1
%endif
%endmacro

; IPRED_H width
; Emits the body of one .wN case of ipred_h: replicates 4 left-column
; pixels (read downwards from tlq) across 4 output rows, then loops until
; hd rows are done. tlq is pre-decremented, so rows are consumed in
; reverse byte order within each 4-byte load.
%macro IPRED_H 1                                            ; width
    sub                         tlq, 4
    movd                         m0, [tlq]                  ; get 4 bytes of topleft data
    punpcklbw                    m0, m0                     ; extend 2 byte
%if %1 == 4
    pshuflw                      m1, m0, q2233              ; rows 0-1 (topmost of the 4)
    movd           [dstq+strideq*0], m1
    psrlq                        m1, 32
    movd           [dstq+strideq*1], m1
    pshuflw                      m0, m0, q0011              ; rows 2-3
    movd           [dstq+strideq*2], m0
    psrlq                        m0, 32
    movd           [dstq+stride3q ], m0

%elif %1 == 8
    punpcklwd                    m0, m0
    punpckhdq                    m1, m0, m0                 ; m1 = rows 0-1, m0 = rows 2-3
    punpckldq                    m0, m0
    movq           [dstq+strideq*1], m1
    movhps         [dstq+strideq*0], m1
    movq           [dstq+stride3q ], m0
    movhps         [dstq+strideq*2], m0
%else
    IPRED_SET                    %1,         0, q3333
    IPRED_SET                    %1,   strideq, q2222
    IPRED_SET                    %1, strideq*2, q1111
    IPRED_SET                    %1,  stride3q, q0000
%endif
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .w%1
    RET
%endmacro

INIT_XMM ssse3
; Horizontal prediction: every output row is the left neighbour pixel
; replicated across the row. Dispatches on log2(width) via the jump
; table; each .wN label expands to a self-contained loop ending in RET.
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
    LEA                          r5, ipred_h_ssse3_table
    tzcnt                        wd, wm                     ; log2(width), >= 2
    movifnidn                    hd, hm
    movsxd                       wq, [r5+wq*4]              ; table is biased by -2*4
    add                          wq, r5
    lea                    stride3q, [strideq*3]
    jmp                          wq
.w4:
    IPRED_H                       4
.w8:
    IPRED_H                       8
.w16:
    IPRED_H                      16
.w32:
    IPRED_H                      32
.w64:
    IPRED_H                      64

;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Vertical prediction: every output row is a copy of the top neighbour
; row. Loads up to 64 top pixels into m0-m3 (extra loads are harmless
; over-reads for narrow blocks) and tail-jumps into the shared dc splat
; store loops (.sN in ipred_dc), which write m0-m3 to every row.
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movu                 m0, [tlq+ 1]           ; top row starts 1 past topleft
    movu                 m1, [tlq+17]
    movu                 m2, [tlq+33]
    movu                 m3, [tlq+49]
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction: fills the block with the rounded average of the `width`
; top and `height` left neighbour pixels. Dispatches twice through the
; jump table: first to .hN (accumulate the left column into m0), which
; jumps to .wN (add the top row, divide, splat the byte), falling through
; to the shared .sN store loop (also reused by ipred_v/ipred_dc_*).
;
; Register roles: m3 = all-ones words (pmaddubsw against -1 negates+pairs
; bytes; pmaddwd against -1 sums word pairs, negated), m4 = (w+h)>>1
; rounding bias, m5 = log2(w+h) shift count. For rectangular blocks the
; sum is additionally scaled via pmulhuw with 0x5556 ~= 2^16/3 or
; 0x3334 ~= 2^16/5 to divide by the non-power-of-2 factor.
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn                    hd, hm
    movifnidn                    wd, wm
    tzcnt                       r6d, hd
    lea                         r5d, [wq+hq]
    movd                         m4, r5d
    tzcnt                       r5d, r5d
    movd                         m5, r5d
    LEA                          r5, ipred_dc_ssse3_table
    tzcnt                        wd, wd
    movsxd                       r6, [r5+r6*4]                     ; .hN entry
    movsxd                       wq, [r5+wq*4+20]                  ; .wN entry (past 5 hN slots)
    pcmpeqd                      m3, m3
    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
    add                          r6, r5
    add                          wq, r5
    lea                    stride3q, [strideq*3]
    jmp r6
.h4:
    movd                         m0, [tlq-4]
    pmaddubsw                    m0, m3
    jmp                          wq
.w4:
    movd                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    psubw                        m0, m4                            ; fold in rounding bias
    paddw                        m0, m1
    pmaddwd                      m0, m3
    cmp                          hd, 4
    jg .w4_mul
    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    punpckhqdq                   m1, m0, m0
    paddw                        m0, m1
    psrlq                        m1, m0, 32
    paddw                        m0, m1
    psrlw                        m0, 2
    mov                         r6d, 0x5556                       ; ~2^16/3
    mov                         r2d, 0x3334                       ; ~2^16/5
    test                         hd, 8
    cmovz                       r6d, r2d
    movd                         m5, r6d
    pmulhuw                      m0, m5
.w4_end:
    pxor                         m1, m1
    pshufb                       m0, m1                            ; splat dc byte
.s4:
    movd           [dstq+strideq*0], m0
    movd           [dstq+strideq*1], m0
    movd           [dstq+strideq*2], m0
    movd           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    movq                         m0, [tlq-8]
    pmaddubsw                    m0, m3
    jmp                          wq
.w8:
    movq                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    paddw                        m0, m1
    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 8
    je .w8_end
    mov                         r6d, 0x5556                    ; ~2^16/3
    mov                         r2d, 0x3334                    ; ~2^16/5
    cmp                          hd, 32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w8_end:
    pxor                         m1, m1
    pshufb                       m0, m1                         ; splat dc byte
.s8:
    movq           [dstq+strideq*0], m0
    movq           [dstq+strideq*1], m0
    movq           [dstq+strideq*2], m0
    movq           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                         m0, [tlq-16]
    pmaddubsw                    m0, m3
    jmp                          wq
.w16:
    movu                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 16
    je .w16_end
    mov                         r6d, 0x5556                    ; ~2^16/3
    mov                         r2d, 0x3334                    ; ~2^16/5
    test                         hd, 8|32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w16_end:
    pxor                         m1, m1
    pshufb                       m0, m1                         ; splat dc byte
.s16:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m0
    mova           [dstq+strideq*2], m0
    mova           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                         m0, [tlq-32]
    pmaddubsw                    m0, m3
    mova                         m2, [tlq-16]
    pmaddubsw                    m2, m3
    paddw                        m0, m2
    jmp wq
.w32:
    movu                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    movu                         m2, [tlq+17]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 32
    je .w32_end
    ; (removed a dead "lea r2d, [hq*2]" that was immediately overwritten)
    mov                         r6d, 0x5556                     ; ~2^16/3
    mov                         r2d, 0x3334                     ; ~2^16/5
    test                         hd, 64|16
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w32_end:
    pxor                         m1, m1
    pshufb                       m0, m1                          ; splat dc byte
    mova                         m1, m0
.s32:
    mova                     [dstq], m0
    mova                  [dstq+16], m1
    mova             [dstq+strideq], m0
    mova          [dstq+strideq+16], m1
    mova           [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova            [dstq+stride3q], m0
    mova         [dstq+stride3q+16], m1
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                         m0, [tlq-64]
    mova                         m1, [tlq-48]
    pmaddubsw                    m0, m3
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    mova                         m1, [tlq-32]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    mova                         m1, [tlq-16]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    jmp wq
.w64:
    movu                         m1, [tlq+ 1]
    movu                         m2, [tlq+17]
    pmaddubsw                    m1, m3
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    movu                         m2, [tlq+33]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    movu                         m2, [tlq+49]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 64
    je .w64_end
    mov                         r6d, 0x5556                     ; ~2^16/3
    mov                         r2d, 0x3334                     ; ~2^16/5
    test                         hd, 32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w64_end:
    pxor                         m1, m1
    pshufb                       m0, m1                          ; splat dc byte
    mova                         m1, m0
    mova                         m2, m0
    mova                         m3, m0
.s64:
    mova                     [dstq], m0
    mova                  [dstq+16], m1
    mova                  [dstq+32], m2
    mova                  [dstq+48], m3
    mova             [dstq+strideq], m0
    mova          [dstq+strideq+16], m1
    mova          [dstq+strideq+32], m2
    mova          [dstq+strideq+48], m3
    lea                        dstq, [dstq+strideq*2]
    sub                          hd, 2
    jg .s64
    RET

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC-left prediction: dst is filled with the rounded average of the
; `height` left neighbour pixels only. .hN sums the pixels, .h4 finishes
; the average (pmulhrsw with m3 = 32768 >> log2(h) both divides and
; rounds), splats the byte into m0-m3 and jumps to the splat store loops
; (.sN in ipred_dc). ipred_dc_top reuses the .hN chain for the top row.
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_ssse3_table
    mov                  hd, hm                ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq                ; tlq -> start of left column
    tzcnt                wd, wm
    movu                 m0, [tlq]
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, r6d
    psrld                m3, m2                ; 32768 >> log2(h): pmulhrsw divisor
    movsxd               r6, [r5+r6*4]
    pcmpeqd              m2, m2                ; all-ones for pmaddubsw/pmaddwd sums
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:
    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h32:
    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232                          ; psrlq               m1, m0, 16
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2
    pmulhrsw             m0, m3                                 ; (sum * 2 * 32768>>log2(h) + 32768) >> 16
    lea            stride3q, [strideq*3]
    pxor                 m1, m1
    pshufb               m0, m1                                 ; splat dc byte
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC-128 prediction (no neighbours available): fill the block with the
; constant 128. Broadcasts pb_128 into m0-m3 and jumps to the shared
; splat store loops (.sN in ipred_dc).
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC-top prediction: average of the `width` top neighbour pixels only.
; Mirrors ipred_dc_left but points tlq at the top row (tlq+1) and indexes
; the .hN accumulation chain by log2(width) instead of log2(height);
; the splat/store tail is again shared via the dc splat table.
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
    LEA                  r5, ipred_dc_left_ssse3_table
    tzcnt                wd, wm
    inc                 tlq                    ; tlq -> first top pixel
    movu                 m0, [tlq]
    movifnidn            hd, hm
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, wd
    psrld                m3, m2                ; 32768 >> log2(w)
    movsxd               r6, [r5+wq*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6

;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH src[1-2], mul[1-2], add[1-2]
; Computes (w*a + (256-w)*b + 128) >> 8 for 16 pixels into m6, using the
; offset weight pairs from SMOOTH_WEIGHT_TABLE. src regs hold interleaved
; (a, b) byte pairs, mul regs the (w-128, 127-w) pairs, add regs the
; precomputed 128*a + 129*b + 128 correction. Clobbers m0 and m6.
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
                ;            w * a         = (w - 128) * a + 128 * a
                ;            (256 - w) * b = (127 - w) * b + 129 * b
                ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
    pmaddubsw            m6, m%3, m%1
    pmaddubsw            m0, m%4, m%2                    ; (w - 128) * a + (127 - w) * b
    paddw                m6, m%5
    paddw                m0, m%6                         ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
    psrlw                m6, 8
    psrlw                m0, 8
    packuswb             m6, m0
%endmacro

; Smooth-vertical prediction: each output pixel is a weighted blend of
; the top-row pixel above it and the bottom-left pixel, with the weight
; taken from smooth_weights[height] per row. hq counts up from -h to 0;
; m5 starts as the splatted bottom-left pixel.
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
    LEA                  r6, ipred_smooth_v_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]  ; weight group for this height
    neg                  hq
    movd                 m5, [tlq+hq]                    ; bottom-left pixel
    pxor                 m2, m2
    pshufb               m5, m2                          ; splat bottom
    add                  wq, r6
    jmp                  wq
.w4:
    movd                 m2, [tlq+1]
    punpckldq            m2, m2
    punpcklbw            m2, m5                          ; top, bottom
    lea                  r3, [strideq*3]
    mova                 m4, [base+ipred_v_shuf]
    mova                 m5, m4
    punpckldq            m4, m4
    punpckhdq            m5, m5
    pmaddubsw            m3, m2, m0                      ; m3: 127 * top - 127 * bottom
    paddw                m1, m2                          ; m1:   1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1                          ; m3: 128 * top + 129 * bottom + 128
.w4_loop:
    movu                 m1, [weightsq+hq*2]             ; weights for 4 rows
    pshufb               m0, m1, m4                      ;m2, m3, m4 and m5 should be stable in loop
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    movd   [dstq+strideq*0], m6
    pshuflw              m1, m6, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m6, m6
    movd   [dstq+strideq*2], m6
    psrlq                m6, 32
    movd   [dstq+r3       ], m6
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    movq                 m2, [tlq+1]
    punpcklbw            m2, m5                          ; top, bottom
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    pshufd               m4, m5, q0000
    pshufd               m5, m5, q1111
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1                           ; m3 is output for loop
.w8_loop:
    movq                 m1, [weightsq+hq*2]             ; weights for 2 rows
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    movq   [dstq+strideq*0], m6
    movhps [dstq+strideq*1], m6
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m5                       ; low/high top, bottom pairs
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1                           ; m4 and m5 is output for loop
.w16_loop:
    movd                 m1, [weightsq+hq*2]              ; one row's weight pair
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH 1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                dstq, strideq
    add                  hq, 1
    jl .w16_loop
    RET
ALIGN function_align
.w32:
    WIN64_PUSH_XMM        8, 7
    mova                 m7, m5                           ; keep bottom pixel; m5 is clobbered below
.w32_loop_init:
    mov                 r3d, 2                            ; 2 x 16-pixel columns per row
.w32_loop:
    ; Per-column setup is redone each iteration since m0-m5 are clobbered.
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w32_loop
    lea                dstq, [dstq-32+strideq]            ; rewind to row start, next row
    sub                 tlq, 32
    add                  hq, 1
    jl .w32_loop_init
    RET
ALIGN function_align
.w64:
    WIN64_PUSH_XMM        8, 7
    mova                 m7, m5                           ; keep bottom pixel; m5 is clobbered below
.w64_loop_init:
    mov                 r3d, 4                            ; 4 x 16-pixel columns per row
.w64_loop:
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w64_loop
    lea                dstq, [dstq-64+strideq]            ; rewind to row start, next row
    sub                 tlq, 64
    add                  hq, 1
    jl .w64_loop_init
    RET

736;---------------------------------------------------------------------------------------
737;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
738;                                    const int width, const int height, const int a);
739;---------------------------------------------------------------------------------------
; SMOOTH_H prediction: every output pixel blends the left neighbour of its
; row with the fixed "right" pixel topleft[w]:
;   dst[x] = (w[x]*left + (256-w[x])*right + 128) >> 8
; The smooth_weights table stores signed (x-128, 127-x) byte pairs so the
; whole blend can be done with pmaddubsw on interleaved (left,right) pairs:
; 128*left + 129*right (from pb_127_m127) plus left*(x-128) + right*(127-x)
; (from the table) plus pw_128 rounding gives the sum above.
; Dispatches on log2(width) through ipred_smooth_h_ssse3_table.
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
    LEA                  r6, ipred_smooth_h_ssse3_table
    mov                  wd, wm
    movd                 m3, [tlq+wq]
    pxor                 m1, m1
    pshufb               m3, m1                          ; right
    tzcnt                wd, wd                          ; wd = log2(width)
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]                   ; jump-table offset for this width
    movddup              m4, [base+pb_127_m127]
    movddup              m5, [base+pw_128]
    add                  wq, r6
    jmp                  wq
.w4:                                                     ; width == 4
    movddup              m6, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq                          ; [tlq+hq] = left pixels, consumed bottom-up
    lea                  r3, [strideq*3]
.w4_loop:
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m7                          ; splat one left pixel per output row
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6                          ; * horizontal weights
    paddw                m1, m5                          ; + 128 rounding bias
    paddw                m0, m1
    pmaddubsw            m1, m2, m4                      ; same for the other two rows
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1                          ; 4 rows x 4 pixels in m0
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:                                                     ; width == 8, 2 rows per iteration
    mova                 m6, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq                          ; [tlq+hq] = left pixels, consumed bottom-up
    punpckldq            m7, m7                          ; widen the shuffle: 8 copies per left pixel
.w8_loop:
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m7
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6                          ; * horizontal weights
    paddw                m1, m5                          ; + 128 rounding bias
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1                          ; 2 rows x 8 pixels in m0
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:                                                    ; width == 16, 1 row per iteration
    mova                 m6, [base+smooth_weights+16*2]  ; weights for pixels 0-7
    mova                 m7, [base+smooth_weights+16*3]  ; weights for pixels 8-15
    sub                 tlq, 1
    sub                 tlq, hq                          ; [tlq+hq] = left pixel of the current row
.w16_loop:
    pxor                 m1, m1
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m1                          ; splat it across the row
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6                          ; * horizontal weights (low 8)
    paddw                m1, m5                          ; + 128 rounding bias
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m7                          ; * horizontal weights (high 8)
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:                                                    ; width == 32: 2 chunks of 16 per row
    sub                 tlq, 1
    sub                 tlq, hq                          ; [tlq+hq] = left pixel of the current row
    pxor                 m6, m6
.w32_loop_init:
    mov                  r5, 2                           ; chunk counter
    lea                  r3, [base+smooth_weights+16*4]  ; r3 walks the per-chunk weight vectors
.w32_loop:
    mova                 m7, [r3]                        ; weights for this chunk's low 8 pixels
    add                  r3, 16
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m6                          ; splat it across the row
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m7                          ; * horizontal weights
    paddw                m1, m5                          ; + 128 rounding bias
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    mova                 m7, [r3]                        ; weights for this chunk's high 8 pixels
    add                  r3, 16
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 16
    dec                  r5
    jg .w32_loop
    lea                dstq, [dstq-32+strideq]           ; rewind row, step one line down
    sub                  hd, 1
    jg .w32_loop_init
    RET
ALIGN function_align
.w64:                                                    ; width == 64: 4 chunks of 16 per row
    sub                 tlq, 1
    sub                 tlq, hq                          ; [tlq+hq] = left pixel of the current row
    pxor                 m6, m6
.w64_loop_init:
    mov                  r5, 4                           ; chunk counter
    lea                  r3, [base+smooth_weights+16*8]  ; r3 walks the per-chunk weight vectors
.w64_loop:
    mova                 m7, [r3]                        ; weights for this chunk's low 8 pixels
    add                  r3, 16
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m6                          ; splat it across the row
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m7                          ; * horizontal weights
    paddw                m1, m5                          ; + 128 rounding bias
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    mova                 m7, [r3]                        ; weights for this chunk's high 8 pixels
    add                  r3, 16
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 16
    dec                  r5
    jg .w64_loop
    lea                dstq, [dstq-64+strideq]           ; rewind row, step one line down
    sub                  hd, 1
    jg .w64_loop_init
    RET
926
927;---------------------------------------------------------------------------------------
928;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
929;                                    const int width, const int height, const int a);
930;---------------------------------------------------------------------------------------
; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2, m3op
; Final blend step of the 2-D smooth predictor:
;   m0 = pmaddubsw(m<mul1>, m<src1>) + <add1>
;   m1 = pmaddubsw(m<mul2>, m<src2>) + <add2>
; then each is pavgw-averaged (rounding) with the caller-provided
; accumulators in m2/m3, shifted right by 8 and packed to 16 bytes in m0.
; %5/%6 may be register numbers or memory operands; %7 selects the m3
; operand: if it is not a register number it is first loaded into m3.
; Clobbers m0, m1, m6 (and m3 when %7 is a memory operand).
%macro SMOOTH_2D_END  7                                  ; src[1-2], mul[1-2], add[1-2], m3
    pmaddubsw            m6, m%3, m%1
    mova                 m0, m6
    pmaddubsw            m6, m%4, m%2
    mova                 m1, m6
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5                          ; %5 passed as a memory operand
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6                          ; %6 passed as a memory operand
%endif
%ifnum %7
%else
    mova                 m3, %7                          ; non-numeric %7: fetch the m3 operand
%endif
    pavgw                m0, m2                          ; rounding average with horizontal sums
    pavgw                m1, m3
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1                          ; 16 output pixels in m0
%endmacro
956
; SMOOTH_OUTPUT_16B: emit one 16-pixel-wide chunk of a 2-D smooth output row.
; %1      = rsp slot holding the 16 top pixels for this chunk
; %2-%5   = scratch rsp slots (interleaved top/bottom pairs and their
;           vertical accumulators)
; %6, %7  = horizontal weight tables (memory operands, low/high 8 pixels)
; %8      = scratch rsp slot for the high horizontal accumulator
; %9      = rsp slot holding the v_weights broadcast shuffle mask
; %10-%12 = rsp slots used to restore m0/m4/m5 at the end ("recovery")
; Expects on entry: m0 = bottom splat, m3 = pw_255, m4 = right splat,
; m5 = pb_127_m127 (as set up by ipred_smooth_8bpc).
%macro SMOOTH_OUTPUT_16B  12      ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
    mova                 m1, [rsp+16*%1]                  ; top
    punpckhbw            m6, m1, m0                       ; top, bottom
    punpcklbw            m1, m0                           ; top, bottom
    pmaddubsw            m2, m1, m5
    mova        [rsp+16*%2], m1
    paddw                m1, m3                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m1                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%3], m2
    pmaddubsw            m2, m6, m5
    mova        [rsp+16*%4], m6
    paddw                m6, m3                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%5], m2
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
    punpcklbw            m1, m4                           ; left, right
    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
    paddw                m2, m1                           ; 128 * left + 129 * right
    mova                 m3, m2
    pmaddubsw            m0, m1, %6                       ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, %7
    paddw                m2, m3, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    mova                 m7, [rsp+16*%9]
    pshufb               m1, m7                           ; broadcast this row's vertical weight pair
    mova        [rsp+16*%8], m3
    mova                 m4, [rsp+16*%2]
    mova                 m5, [rsp+16*%3]
    mova                 m3, [rsp+16*%4]
    mova                 m7, [rsp+16*%5]
    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*%8]
    mova             [dstq], m0
    movddup              m3, [base+pw_255]                ; recovery
    mova                 m0, [rsp+16*%10]                 ; recovery
    mova                 m4, [rsp+16*%11]                 ; recovery
    mova                 m5, [rsp+16*%12]                 ; recovery
%endmacro
996
; 2-D SMOOTH prediction: blends a vertical (top/bottom) gradient and a
; horizontal (left/right) gradient; SMOOTH_2D_END averages the two sums
; with pavgw.  13*16 bytes of stack spill precomputed constants and
; accumulators so the 8 available XMM registers suffice.
; Dispatches on log2(width) through ipred_smooth_ssse3_table.
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
    mov                  wd, wm
    mov                  hd, hm
    LEA                  r6, ipred_smooth_ssse3_table
    movd                 m4, [tlq+wq]                     ; right
    pxor                 m2, m2
    pshufb               m4, m2                           ; splat right pixel
    tzcnt                wd, wd                           ; wd = log2(width)
    mov                  r5, tlq
    sub                  r5, hq                           ; r5 = topleft - h
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pb_127_m127]
    movd                 m0, [r5]
    pshufb               m0, m2                           ; bottom
    movddup              m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]   ; weights_ver = &dav1d_sm_weights[height]
    jmp                  wq
.w4:                                                     ; width == 4, 4 rows per iteration
    mova                 m7, [base+ipred_v_shuf]
    movd                 m1, [tlq+1]                      ; left
    pshufd               m1, m1, q0000
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq                           ; [tlq+hq] = left pixels, consumed bottom-up
    punpcklbw            m1, m0                           ; top, bottom
    pshufd               m6, m7, q1100                    ; v_weight shuffles for rows 0-1 / 2-3
    pshufd               m7, m7, q3322
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*0], m1                           ; spill: top/bottom pairs
    mova         [rsp+16*1], m2                           ; spill: vertical accumulator
    movq                 m1,  [base+smooth_weights+4*2]   ; weights_hor = &dav1d_sm_weights[width];
    punpcklqdq           m1, m1
    mova         [rsp+16*2], m1                           ; spill: horizontal weights
    mova         [rsp+16*3], m4                           ; spill: right splat
    mova         [rsp+16*4], m6                           ; spill: v_weight shuffle
    mova         [rsp+16*5], m5                           ; spill: pb_127_m127
.w4_loop:
    movd                 m1, [tlq+hq]                 ; left
    pshufb               m1, [base+ipred_h_shuf]
    punpcklbw            m0, m1, m4                   ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5                   ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0                       ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4, [rsp+16*2]
    pmaddubsw            m0, m4                       ; * horizontal weights
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movq                 m1, [v_weightsq]             ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 8
    pshufb               m0, m1, m6                   ; vertical weights, rows 0-1
    pshufb               m1, m7                       ; vertical weights, rows 2-3
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]               ; restore clobbered constants
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:                                                     ; width == 8, 2 rows per iteration
    mova                 m7, [base+ipred_v_shuf]
    movq                 m1, [tlq+1]                  ; left
    punpcklqdq           m1, m1
    sub                 tlq, 4
    sub                 tlq, hq                       ; [tlq+hq] = left pixels, consumed bottom-up
    punpcklbw            m1, m0                       ; top, bottom
    pshufd               m6, m7, q0000                ; v_weight shuffles for row 0 / row 1
    pshufd               m7, m7, q1111
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                       ;   1 * top + 255 * bottom + 255
    paddw                m2, m3                       ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*0], m1                       ; spill: top/bottom pairs
    mova         [rsp+16*1], m2                       ; spill: vertical accumulator
    mova                 m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
    mova         [rsp+16*2], m1                       ; spill: horizontal weights
    mova         [rsp+16*3], m4                       ; spill: right splat
    mova         [rsp+16*4], m6                       ; spill: v_weight shuffle
    mova         [rsp+16*5], m5                       ; spill: pb_127_m127
.w8_loop:
    movd                 m1, [tlq+hq]                  ; left
    pshufb               m1, [base+ipred_h_shuf]
    pshufd               m1, m1, q1100
    punpcklbw            m0, m1, m4                    ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5                    ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0                        ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4,  [rsp+16*2]
    pmaddubsw            m0, m4                        ; * horizontal weights
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]              ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 4
    pshufb               m0, m1, m6                    ; vertical weights, row 0
    pshufb               m1, m7                        ; vertical weights, row 1
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]                ; restore clobbered constants
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:                                                    ; width == 16, 1 row per iteration
    mova                 m7, [base+ipred_v_shuf]
    movu                 m1, [tlq+1]                     ; left
    sub                 tlq, 4
    sub                 tlq, hq                          ; [tlq+hq] = left pixels, consumed bottom-up
    punpckhbw            m6, m1, m0                      ; top, bottom
    punpcklbw            m1, m0                          ; top, bottom
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7                          ; spill: v_weight broadcast shuffle
    pmaddubsw            m2, m6, m5
    mova         [rsp+16*5], m6                          ; spill: top/bottom pairs (high)
    paddw                m6, m3                          ;   1 * top + 255 * bottom + 255
    paddw                m2, m6                          ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*6], m2                          ; spill: vertical accumulator (high)
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                          ;   1 * top + 255 * bottom + 255
    mova         [rsp+16*0], m1                          ; spill: top/bottom pairs (low)
    paddw                m2, m3                          ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*1], m2                          ; spill: vertical accumulator (low)
    mova         [rsp+16*3], m4                          ; spill: right splat
    mova         [rsp+16*4], m5                          ; spill: pb_127_m127
.w16_loop:
    movd                 m1, [tlq+hq]                    ; left
    pshufb               m1, [base+pb_3]                 ; topleft[-(1 + y)]
    punpcklbw            m1, m4                          ; left, right
    pmaddubsw            m2, m1, m5                      ; 127 * left - 127 * right
    paddw                m2, m1                          ; 128 * left + 129 * right
    mova                 m0, m1
    mova                 m3, m2
    pmaddubsw            m0, [base+smooth_weights+16*2]  ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, [base+smooth_weights+16*3]
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 2
    mova                 m7, [rsp+16*2]
    pshufb               m1, m7                          ; broadcast this row's vertical weight pair
    mova         [rsp+16*7], m3                          ; spill: horizontal accumulator (high)
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    mova                 m3, [rsp+16*5]
    mova                 m7, [rsp+16*6]
    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
    mova                 m4, [rsp+16*3]                  ; restore clobbered constants
    mova                 m5, [rsp+16*4]
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:                                                    ; width == 32: 2 chunks via SMOOTH_OUTPUT_16B
    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                    ; top
    mova         [rsp+16*0], m1                          ; spill top pixels, chunk 0
    mova         [rsp+16*1], m2                          ; spill top pixels, chunk 1
    sub                 tlq, 4
    sub                 tlq, hq                          ; [tlq+hq] = left pixels, consumed bottom-up
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7                          ; spill: v_weight broadcast shuffle
    mova         [rsp+16*3], m0                          ; spill: bottom splat (recovery slot)
    mova         [rsp+16*4], m4                          ; spill: right splat (recovery slot)
    mova         [rsp+16*5], m5                          ; spill: pb_127_m127 (recovery slot)
.w32_loop:
    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
    lea                dstq, [dstq-16+strideq]           ; rewind row, step one line down
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w32_loop
    RET
ALIGN function_align
.w64:                                                    ; width == 64: 4 chunks via SMOOTH_OUTPUT_16B
    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                    ; top
    mova         [rsp+16*0], m1                          ; spill top pixels, chunk 0
    mova         [rsp+16*1], m2                          ; spill top pixels, chunk 1
    movu                 m1, [tlq+33]                    ; top
    movu                 m2, [tlq+49]                    ; top
    mova        [rsp+16*11], m1                          ; spill top pixels, chunk 2
    mova        [rsp+16*12], m2                          ; spill top pixels, chunk 3
    sub                 tlq, 4
    sub                 tlq, hq                          ; [tlq+hq] = left pixels, consumed bottom-up
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7                          ; spill: v_weight broadcast shuffle
    mova         [rsp+16*3], m0                          ; spill: bottom splat (recovery slot)
    mova         [rsp+16*4], m4                          ; spill: right splat (recovery slot)
    mova         [rsp+16*5], m5                          ; spill: pb_127_m127 (recovery slot)
.w64_loop:
    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9,  [base+smooth_weights+16*8],  [base+smooth_weights+16*9], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
    lea                dstq, [dstq-48+strideq]           ; rewind row, step one line down
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w64_loop
    RET
1228
1229%if ARCH_X86_64
1230cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
1231    %define            base  r7-$$
1232    lea                  r7, [$$]
1233    mova                 m8, [base+pw_62]
1234    mova                 m9, [base+pw_64]
1235    mova                m10, [base+pw_512]
1236%else
1237cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
1238    %define            base  r1-$$
1239    %define              m8  [base+pw_62]
1240    %define              m9  [base+pw_64]
1241    %define             m10  [base+pw_512]
1242    %define         strideq  r3
1243    %define        stridemp  dword [rsp+16*12]
1244    mov            stridemp, r1
1245    LEA                  r1, $$
1246%endif
1247    tzcnt                wd, wm
1248    movifnidn        angled, anglem
1249    movifnidn            hd, hm
1250    inc                 tlq
1251    movsxd               wq, [base+ipred_z1_ssse3_table+wq*4]
1252    mov                 dxd, angled
1253    and                 dxd, 0x7e
1254    add              angled, 165 ; ~90
1255    lea                  wq, [base+wq+ipred_z1_ssse3_table]
1256    movzx               dxd, word [base+dr_intra_derivative+dxq]
1257    xor              angled, 0x4ff ; d = 90 - angle
1258    jmp                  wq
1259.w4:
1260    lea                 r3d, [angleq+88]
1261    test                r3d, 0x480
1262    jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
1263    sar                 r3d, 9
1264    add                 r3d, hd
1265    cmp                 r3d, 8
1266    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
1267    mova                 m1, [tlq-1]
1268    pshufb               m0, m1, [base+z_upsample1]
1269    pshufb               m1, [base+z_upsample2]
1270    movddup              m2, [base+pb_36_m4]
1271    add                 dxd, dxd
1272    pmaddubsw            m0, m2
1273    pshufd               m7, m1, q3333
1274    movd           [rsp+16], m7 ; top[max_base_x]
1275    pmaddubsw            m1, m2
1276    movd                 m6, dxd
1277    mov                 r5d, dxd ; xpos
1278    pshufb               m6, [base+pw_256]
1279    paddw                m1, m0
1280    movq                 m0, [tlq]
1281    pmulhrsw             m1, m10
1282    paddw                m7, m6, m6
1283    punpcklqdq           m6, m7 ; xpos0 xpos1
1284    packuswb             m1, m1
1285    punpcklbw            m0, m1
1286    movifnidn       strideq, stridemp
1287    mova              [rsp], m0
1288.w4_upsample_loop:
1289    lea                 r2d, [r5+dxq]
1290    shr                 r5d, 6      ; base0
1291    movq                 m0, [rsp+r5]
1292    lea                 r5d, [r2+dxq]
1293    shr                 r2d, 6      ; base1
1294    movhps               m0, [rsp+r2]
1295    pand                 m2, m8, m6 ; frac
1296    psubw                m1, m9, m2 ; 64-frac
1297    psllw                m2, 8
1298    por                  m1, m2     ; 64-frac, frac
1299    pmaddubsw            m0, m1
1300    paddw                m6, m7     ; xpos += dx
1301    pmulhrsw             m0, m10
1302    packuswb             m0, m0
1303    movd   [dstq+strideq*0], m0
1304    pshuflw              m0, m0, q1032
1305    movd   [dstq+strideq*1], m0
1306    lea                dstq, [dstq+strideq*2]
1307    sub                  hd, 2
1308    jg .w4_upsample_loop
1309    RET
1310.w4_no_upsample:
1311    mov                 r3d, 7     ; max_base
1312    test             angled, 0x400 ; !enable_intra_edge_filter
1313    jnz .w4_main
1314    lea                 r3d, [hq+3]
1315    movd                 m0, r3d
1316    movd                 m2, angled
1317    shr              angled, 8 ; is_sm << 1
1318    pxor                 m1, m1
1319    pshufb               m0, m1
1320    pshufb               m2, m1
1321    pcmpeqb              m1, m0, [base+z_filter_wh4]
1322    pand                 m1, m2
1323    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
1324    pmovmskb            r5d, m1
1325    mov                 r3d, 7
1326    test                r5d, r5d
1327    jz .w4_main ; filter_strength == 0
1328    mova                 m3, [tlq-1]
1329    imul                r5d, 0x55555555
1330    movu                 m7, [base+z_filter_s+8]
1331    shr                 r5d, 30 ; filter_strength
1332    movddup              m0, [base+pb_8]
1333    pminub               m7, m0
1334    pshufb               m0, m3, [base+z_filter_s]
1335    movddup              m4, [base+z_filter_k-8+r5*8+24*0]
1336    pshufb               m3, m7
1337    movddup              m5, [base+z_filter_k-8+r5*8+24*1]
1338    shufps               m2, m0, m3, q2121
1339    movddup              m6, [base+z_filter_k-8+r5*8+24*2]
1340    pmaddubsw            m0, m4
1341    pmaddubsw            m1, m2, m4
1342    pmaddubsw            m2, m5
1343    paddd                m5, m6
1344    pmaddubsw            m4, m3, m5
1345    pmaddubsw            m3, m6
1346    paddw                m0, m2
1347    paddw                m1, m4
1348    paddw                m0, m3
1349    pshufd               m1, m1, q3333
1350    pmulhrsw             m0, m10
1351    pmulhrsw             m1, m10
1352    mov                 r5d, 9
1353    mov                 tlq, rsp
1354    cmp                  hd, 4
1355    cmovne              r3d, r5d
1356    packuswb             m0, m1
1357    mova              [tlq], m0
1358.w4_main:
1359    add                 tlq, r3
1360    movd                 m5, dxd
1361    movddup              m0, [base+z_base_inc] ; base_inc << 6
1362    movd                 m7, [tlq] ; top[max_base_x]
1363    shl                 r3d, 6
1364    movd                 m4, r3d
1365    pshufb               m5, [base+pw_256]
1366    mov                 r5d, dxd ; xpos
1367    pshufb               m7, [base+pw_m256]
1368    sub                  r5, r3
1369    pshufb               m4, [base+pw_256]
1370    mova                 m3, [base+z1_shuf_w4]
1371    paddw                m6, m5, m5
1372    psubw                m4, m0 ; max_base_x
1373    punpcklqdq           m5, m6 ; xpos0 xpos1
1374.w4_loop:
1375    lea                  r3, [r5+dxq]
1376    sar                  r5, 6      ; base0
1377    movq                 m0, [tlq+r5]
1378    lea                  r5, [r3+dxq]
1379    sar                  r3, 6      ; base1
1380    movhps               m0, [tlq+r3]
1381    pand                 m2, m8, m5 ; frac
1382    psubw                m1, m9, m2 ; 64-frac
1383    psllw                m2, 8
1384    pshufb               m0, m3
1385    por                  m1, m2     ; 64-frac, frac
1386    pmaddubsw            m0, m1
1387    movifnidn       strideq, stridemp
1388    pcmpgtw              m1, m4, m5 ; base < max_base_x
1389    pmulhrsw             m0, m10
1390    paddw                m5, m6     ; xpos += dx
1391    pand                 m0, m1
1392    pandn                m1, m7
1393    por                  m0, m1
1394    packuswb             m0, m0
1395    movd   [dstq+strideq*0], m0
1396    pshuflw              m0, m0, q1032
1397    movd   [dstq+strideq*1], m0
1398    sub                  hd, 2
1399    jz .w4_end
1400    lea                dstq, [dstq+strideq*2]
1401    test                r5d, r5d
1402    jl .w4_loop
1403    packuswb             m7, m7
1404.w4_end_loop:
1405    movd   [dstq+strideq*0], m7
1406    movd   [dstq+strideq*1], m7
1407    lea                dstq, [dstq+strideq*2]
1408    sub                  hd, 2
1409    jg .w4_end_loop
1410.w4_end:
1411    RET
1412.w8:
1413    lea                 r3d, [angleq+88]
1414    and                 r3d, ~0x7f
1415    or                  r3d, hd
1416    cmp                 r3d, 8
1417    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1418    mova                 m5, [base+z_upsample1]
1419    movu                 m3, [base+z_filter_s+6]
1420    movd                 m4, hd
1421    mova                 m0, [tlq-1]
1422    movu                 m1, [tlq+7]
1423    pxor                 m7, m7
1424    pshufb               m4, m7
1425    movddup              m7, [base+pb_36_m4]
1426    pminub               m4, m3
1427    add                 dxd, dxd
1428    pshufb               m2, m0, m5
1429    pmaddubsw            m2, m7
1430    pshufb               m0, m3
1431    pmaddubsw            m0, m7
1432    movd                 m6, dxd
1433    pshufb               m3, m1, m5
1434    pmaddubsw            m3, m7
1435    pshufb               m1, m4
1436    pmaddubsw            m1, m7
1437    pshufb               m6, [base+pw_256]
1438    mov                 r5d, dxd
1439    paddw                m2, m0
1440    paddw                m7, m6, m6
1441    paddw                m3, m1
1442    punpcklqdq           m6, m7 ; xpos0 xpos1
1443    movu                 m1, [tlq]
1444    pmulhrsw             m2, m10
1445    pmulhrsw             m3, m10
1446    packuswb             m2, m3
1447    punpcklbw            m0, m1, m2
1448    punpckhbw            m1, m2
1449    movifnidn       strideq, stridemp
1450    mova         [rsp+16*0], m0
1451    mova         [rsp+16*1], m1
1452.w8_upsample_loop:
1453    lea                 r2d, [r5+dxq]
1454    shr                 r5d, 6 ; base0
1455    movu                 m0, [rsp+r5]
1456    lea                 r5d, [r2+dxq]
1457    shr                 r2d, 6 ; base1
1458    movu                 m1, [rsp+r2]
1459    pand                 m2, m8, m6
1460    psubw                m3, m9, m2
1461    psllw                m2, 8
1462    por                  m3, m2
1463    punpcklqdq           m2, m3, m3 ; frac0
1464    pmaddubsw            m0, m2
1465    punpckhqdq           m3, m3     ; frac1
1466    pmaddubsw            m1, m3
1467    paddw                m6, m7
1468    pmulhrsw             m0, m10
1469    pmulhrsw             m1, m10
1470    packuswb             m0, m1
1471    movq   [dstq+strideq*0], m0
1472    movhps [dstq+strideq*1], m0
1473    lea                dstq, [dstq+strideq*2]
1474    sub                  hd, 2
1475    jg .w8_upsample_loop
1476    RET
    ; w8 without upsampling: look up the edge-filter strength from the
    ; (angle, h) tables; strength 0 skips the filtering stage entirely
1477.w8_no_upsample:
1478    lea                 r3d, [hq+7]
1479    movd                 m0, r3d
1480    and                 r3d, 7
1481    or                  r3d, 8 ; imin(h+7, 15)
1482    test             angled, 0x400
1483    jnz .w8_main ; edge filter disabled by the angle flag
1484    movd                 m2, angled
1485    shr              angled, 8 ; is_sm << 1
1486    pxor                 m1, m1
1487    pshufb               m0, m1
1488    pshufb               m2, m1
1489    movu                 m1, [base+z_filter_wh8]
1490    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
1491    pcmpeqb              m1, m0
1492    pand                 m1, m2
1493    pcmpgtb              m1, m3
1494    pmovmskb            r5d, m1
1495    test                r5d, r5d
1496    jz .w8_main ; filter_strength == 0
    ; copy the edge to stack scratch with both ends replicated, then
    ; smooth it in place via .filter_edge (which takes strength-3 in r5)
1497    movd                 m3, [tlq-1]
1498    movu                 m0, [tlq+16*0]
1499    imul                r5d, 0x55555555
1500    movu                 m1, [tlq+16*1]
1501    shr                 r5d, 30 ; filter_strength
1502    movd                 m2, [tlq+r3]
1503    lea                 tlq, [rsp+16*4]
1504    sub                  r5, 3 ; r5 = filter_strength - 3
1505    mova         [tlq-16*1], m0
1506    pxor                 m7, m7
1507    mova         [tlq+16*0], m1
1508    pshufb               m3, m7
1509    pshufb               m2, m7
1510    mova         [tlq-16*2], m3
1511    movq        [tlq+r3-15], m2
1512    call .filter_edge
1513    sar                 r5d, 1
1514    add                 r5d, 17
1515    cmp                  hd, 8
1516    cmova               r3d, r5d
    ; generic w8 loop: base = r5>>6 indexes the edge, frac blends each
    ; pixel with its right neighbour; columns past the edge end (limit in
    ; m4) are clamped to the last edge pixel kept replicated in m7
1517.w8_main:
1518    add                 tlq, r3
1519    movd                 m5, dxd
1520    movd                 m7, [tlq] ; last edge pixel, used for clamping
1521    shl                 r3d, 6
1522    movu                 m3, [base+z_filter_s+2]
1523    movd                 m4, r3d
1524    pshufb               m5, [base+pw_256]
1525    mov                 r5d, dxd
1526    pshufb               m7, [base+pw_m256]
1527    sub                  r5, r3
1528    pshufb               m4, [base+pw_256]
1529    psubw                m4, [base+z_base_inc] ; per-column position limit
1530    mova                 m6, m5
1531.w8_loop:
1532    mov                  r3, r5
1533    sar                  r3, 6
1534    movu                 m0, [tlq+r3]
1535    pand                 m1, m8, m5
1536    psubw                m2, m9, m1
1537    psllw                m1, 8
1538    pshufb               m0, m3
1539    por                  m1, m2 ; (64-frac, frac) weights
1540    pmaddubsw            m0, m1
1541    pcmpgtw              m1, m4, m5 ; mask: column still inside the edge
1542    paddw                m5, m6
1543    pmulhrsw             m0, m10
1544    pand                 m0, m1
1545    pandn                m1, m7 ; out-of-range lanes take the last pixel
1546    por                  m0, m1
1547    packuswb             m0, m0
1548    movq             [dstq], m0
1549    dec                  hd
1550    jz .w8_end
1551    movifnidn       strideq, stridemp
1552    add                dstq, strideq
1553    add                  r5, dxq
1554    jl .w8_loop
    ; remaining rows lie entirely past the edge: fill with the last pixel
1555    packuswb             m7, m7
1556.w8_end_loop:
1557    movq             [dstq], m7
1558    add                dstq, strideq
1559    dec                  hd
1560    jg .w8_end_loop
1561.w8_end:
1562    RET
    ; w16: same structure as w8, but base positions fit in bytes so the
    ; in-range clamp uses packed byte compares against a 0..15 ramp
1563.w16:
1564    lea                 r3d, [hq+15]
1565    movd                 m0, r3d
1566    and                 r3d, 15
1567    or                  r3d, 16 ; imin(h+15, 31)
1568    test             angled, 0x400
1569    jnz .w16_main
1570    movd                 m2, angled
1571    shr              angled, 8 ; is_sm << 1
1572    pxor                 m1, m1
1573    pshufb               m0, m1
1574    pshufb               m2, m1
1575    movq                 m3, [base+z_filter_t_w16+angleq*4]
1576    pcmpeqb              m0, [base+z_filter_wh16]
1577    pand                 m0, m2
1578    pcmpgtb              m0, m3
1579    pmovmskb            r5d, m0
1580    test                r5d, r5d
1581    jz .w16_main ; filter_strength == 0
1582    movd                 m4, [tlq-1]
1583    movu                 m0, [tlq+16*0]
1584    imul                r5d, 0x24924924
1585    movu                 m1, [tlq+16*1]
1586    shr                 r5d, 30
1587    movd                 m2, [tlq+30]
1588    adc                  r5, -4 ; filter_strength-3
1589    movd                 m3, [tlq+r3]
1590    lea                 tlq, [rsp+16*4]
1591    mova         [tlq-16*1], m0
1592    pxor                 m7, m7
1593    mova         [tlq+16*0], m1
1594    pshufb               m4, m7
1595    movd              [rsp], m2 ; save unfiltered bytes at tlq+30 for the tail fixup
1596    pshufb               m3, m7
1597    mova         [tlq-16*2], m4
1598    movd        [tlq+r3-16], m3
1599    call .filter_edge
1600    cmp                  hd, 16
1601    jle .w16_main
    ; h > 16: synthesize one extra filtered edge pixel at tlq+32
1602    pshuflw              m0, [rsp], q0000
1603    sar                  r5, 1
1604    movd                 m1, [base+z_filter_k_tail+4+r5*4]
1605    lea                 r3d, [r5+33]
1606    pmaddubsw            m0, m1
1607%if ARCH_X86_64
1608    pmulhrsw             m0, m10
1609%else
1610    pmulhrsw             m0, m4 ; m4 still holds the rounding constant from .filter_edge
1611%endif
1612    packuswb             m0, m0
1613    movd           [tlq+32], m0
1614.w16_main:
1615    add                 tlq, r3
1616    movd                 m5, dxd
1617    movd                 m7, [tlq] ; last edge pixel for clamping
1618    movd                 m4, r3d
1619    shl                 r3d, 6
1620    pshufb               m5, [base+pw_256]
1621    pxor                 m6, m6
1622    pshufb               m7, m6
1623    mov                 r5d, dxd
1624    pshufb               m4, m6
1625    sub                  r5, r3
1626    psubb                m4, [base+pb_0to15] ; per-column byte position limit
1627    mova                 m6, m5
1628.w16_loop:
1629    mov                  r3, r5
1630    sar                  r3, 6
1631    movu                 m1, [tlq+r3+0]
1632    pand                 m0, m8, m5
1633    movu                 m2, [tlq+r3+1]
1634    psubw                m3, m9, m0
1635    psllw                m0, 8
1636    por                  m3, m0 ; (64-frac, frac) weights
1637    punpcklbw            m0, m1, m2
1638    pmaddubsw            m0, m3
1639    punpckhbw            m1, m2
1640    pmaddubsw            m1, m3
1641    psrlw                m3, m5, 6
1642    packsswb             m3, m3 ; base positions packed to bytes
1643    pmulhrsw             m0, m10
1644    pmulhrsw             m1, m10
1645    paddw                m5, m6
1646    pcmpgtb              m2, m4, m3 ; mask: column still inside the edge
1647    packuswb             m0, m1
1648    pand                 m0, m2
1649    pandn                m2, m7
1650    por                  m0, m2
1651    mova             [dstq], m0
1652    dec                  hd
1653    jz .w16_end
1654    movifnidn       strideq, stridemp
1655    add                dstq, strideq
1656    add                  r5, dxq
1657    jl .w16_loop
    ; remaining rows lie entirely past the edge: fill with the last pixel
1658.w16_end_loop:
1659    mova             [dstq], m7
1660    add                dstq, strideq
1661    dec                  hd
1662    jg .w16_end_loop
1663.w16_end:
1664    RET
    ; w32: when edge filtering is enabled it always runs at strength 3
    ; (r5 = 0 = strength-3) over 64 edge bytes, i.e. two .filter_edge calls
1665.w32:
1666    lea                 r3d, [hq+31]
1667    and                 r3d, 31
1668    or                  r3d, 32    ; imin(h+31, 63)
1669    test             angled, 0x400 ; !enable_intra_edge_filter
1670    jnz .w32_main
1671    movd                 m6, [tlq-1]
1672    movu                 m0, [tlq+16*0]
1673    movu                 m1, [tlq+16*1]
1674    movu                 m2, [tlq+16*2]
1675    movu                 m3, [tlq+16*3]
1676    movd                 m4, [tlq+62]
1677    movd                 m5, [tlq+r3]
1678    lea                 tlq, [rsp+16*6]
1679    mova         [tlq-16*3], m0
1680    pxor                 m7, m7
1681    mova         [tlq-16*2], m1
1682    pshufb               m6, m7
1683    mova         [tlq-16*1], m2
1684    xor                 r5d, r5d ; filter_strength = 3
1685    mova         [tlq+16*0], m3
1686    movd              [rsp], m4 ; save unfiltered bytes at tlq+62 for the tail fixup
1687    pshufb               m5, m7
1688    mova         [tlq-16*4], m6
1689    movd        [tlq+r3-48], m5
1690    call .filter_edge ; upper 32 bytes
1691    sub                 tlq, 16*2
1692    call .filter_edge ; lower 32 bytes
1693    cmp                  hd, 32
1694    jle .w32_main
    ; h > 32: synthesize one extra filtered edge pixel at tlq+64
1695    pshuflw              m0, [rsp], q0000
1696    movd                 m1, [base+z_filter_k_tail+4]
1697    add                 r3d, 2
1698    pmaddubsw            m0, m1
1699%if ARCH_X86_64
1700    pmulhrsw             m0, m10
1701%else
1702    pmulhrsw             m0, m4 ; m4 still holds the rounding constant from .filter_edge
1703%endif
1704    packuswb             m0, m0
1705    movd           [tlq+64], m0
    ; main loop processes 32 pixels/row; the per-column clamp limits for
    ; the two 16-byte halves are kept at [rsp+16*0] and [rsp+16*1]
1706.w32_main:
1707    add                 tlq, r3
1708    movd                 m0, r3d
1709    movd                 m7, [tlq] ; last edge pixel for clamping
1710    shl                 r3d, 6
1711    movd                 m5, dxd
1712    pxor                 m6, m6
1713    mov                 r5d, dxd
1714    pshufb               m0, m6
1715    pshufb               m5, [base+pw_256]
1716    sub                  r5, r3
1717    pshufb               m7, m6
1718    psubb                m0, [base+pb_0to15]
1719    movddup              m1, [base+pb_m16]
1720    mova         [rsp+16*0], m0 ; limit for columns 0..15
1721    paddb                m0, m1
1722    mova         [rsp+16*1], m0 ; limit for columns 16..31
1723    mova                 m6, m5
1724.w32_loop:
1725    mov                  r3, r5
1726    sar                  r3, 6
1727    movu                 m1, [tlq+r3+16*0+0]
1728    pand                 m0, m8, m5
1729    movu                 m2, [tlq+r3+16*0+1]
1730    psubw                m3, m9, m0
1731    psllw                m0, 8
1732    por                  m3, m0 ; (64-frac, frac) weights, shared by all columns
1733    punpcklbw            m0, m1, m2
1734    pmaddubsw            m0, m3
1735    punpckhbw            m1, m2
1736    pmaddubsw            m1, m3
1737    psrlw                m4, m5, 6
1738    pmulhrsw             m0, m10
1739    pmulhrsw             m1, m10
1740    packsswb             m4, m4 ; base position packed to bytes
1741    pcmpgtb              m2, [rsp+16*0], m4
1742    packuswb             m0, m1
1743    pand                 m0, m2
1744    pandn                m2, m7
1745    por                  m0, m2
1746    movu                 m1, [tlq+r3+16*1+0]
1747    movu                 m2, [tlq+r3+16*1+1]
1748    mova        [dstq+16*0], m0
1749    punpcklbw            m0, m1, m2
1750    pmaddubsw            m0, m3
1751    punpckhbw            m1, m2
1752    pmaddubsw            m1, m3
1753    paddw                m5, m6
1754    pmulhrsw             m0, m10
1755    pmulhrsw             m1, m10
1756    pcmpgtb              m2, [rsp+16*1], m4
1757    packuswb             m0, m1
1758    pand                 m0, m2
1759    pandn                m2, m7
1760    por                  m0, m2
1761    mova        [dstq+16*1], m0
1762    dec                  hd
1763    jz .w32_end
1764    movifnidn       strideq, stridemp
1765    add                dstq, strideq
1766    add                  r5, dxq
1767    jl .w32_loop
    ; remaining rows lie entirely past the edge: fill with the last pixel
1768.w32_end_loop:
1769    mova        [dstq+16*0], m7
1770    mova        [dstq+16*1], m7
1771    add                dstq, strideq
1772    dec                  hd
1773    jg .w32_end_loop
1774.w32_end:
1775    RET
    ; w64: filters up to 128 edge bytes with up to four .filter_edge calls
    ; (the topmost call is skipped when h < 64 since those bytes are unused);
    ; the main loop handles 64 pixels/row as four 16-byte groups with
    ; per-group clamp limits at [rsp+16*0..3]
1776.w64:
1777    lea                 r3d, [hq+63]
1778    test             angled, 0x400 ; !enable_intra_edge_filter
1779    jnz .w64_main
1780    movd                 m4, [tlq-1]
1781    movu                 m0, [tlq+16*0]
1782    movu                 m1, [tlq+16*1]
1783    movu                 m2, [tlq+16*2]
1784    movu                 m3, [tlq+16*3]
1785    mova         [rsp+16*3], m0
1786    pxor                 m7, m7
1787    mova         [rsp+16*4], m1
1788    pshufb               m4, m7
1789    mova         [rsp+16*5], m2
1790    mova         [rsp+16*6], m3
1791    mova         [rsp+16*2], m4 ; replicated left-end pixel
1792    movu                 m0, [tlq+16*4]
1793    movu                 m1, [tlq+16*5]
1794    movu                 m2, [tlq+16*6]
1795    movu                 m3, [tlq+16*7]
1796    movd                 m4, [tlq+r3]
1797    lea                 tlq, [rsp+16*10]
1798    mova         [tlq-16*3], m0
1799    xor                 r5d, r5d ; filter_strength = 3
1800    mova         [tlq-16*2], m1
1801    pshufb               m4, m7
1802    mova         [tlq-16*1], m2
1803    mova         [tlq+16*0], m3
1804    movd      [tlq+r3-16*7], m4 ; replicated right-end pixel
1805    cmp                  hd, 64
1806    jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
1807    call .filter_edge
1808.w64_filter96:
1809    sub                 tlq, 16*2
1810    call .filter_edge
1811    sub                 tlq, 16*2
1812    call .filter_edge
1813    sub                 tlq, 16*2
1814    call .filter_edge
1815.w64_main:
1816    add                 tlq, r3
1817    movd                 m0, r3d
1818    movd                 m7, [tlq] ; last edge pixel for clamping
1819    shl                 r3d, 6
1820    movd                 m5, dxd
1821    pxor                 m6, m6
1822    mov                 r5d, dxd
1823    pshufb               m0, m6
1824    sub                  r5, r3
1825    pshufb               m5, [base+pw_256]
1826    pshufb               m7, m6
1827    psubb                m0, [base+pb_0to15]
1828    movddup              m1, [base+pb_m16]
1829    mova         [rsp+16*0], m0 ; limit for columns 0..15
1830    paddb                m0, m1
1831    mova         [rsp+16*1], m0 ; limit for columns 16..31
1832    paddb                m0, m1
1833    mova         [rsp+16*2], m0 ; limit for columns 32..47
1834    paddb                m0, m1
1835    mova         [rsp+16*3], m0 ; limit for columns 48..63
1836    mova                 m6, m5
1837.w64_loop:
1838    mov                  r3, r5
1839    sar                  r3, 6
1840    movu                 m1, [tlq+r3+16*0+0]
1841    pand                 m0, m8, m5
1842    movu                 m2, [tlq+r3+16*0+1]
1843    psubw                m3, m9, m0
1844    psllw                m0, 8
1845    por                  m3, m0 ; (64-frac, frac) weights, shared by all columns
1846    punpcklbw            m0, m1, m2
1847    pmaddubsw            m0, m3
1848    punpckhbw            m1, m2
1849    pmaddubsw            m1, m3
1850    psrlw                m4, m5, 6
1851    pmulhrsw             m0, m10
1852    pmulhrsw             m1, m10
1853    packsswb             m4, m4 ; base position packed to bytes
1854    pcmpgtb              m2, [rsp+16*0], m4
1855    packuswb             m0, m1
1856    pand                 m0, m2
1857    pandn                m2, m7
1858    por                  m0, m2
1859    movu                 m1, [tlq+r3+16*1+0]
1860    movu                 m2, [tlq+r3+16*1+1]
1861    mova        [dstq+16*0], m0
1862    punpcklbw            m0, m1, m2
1863    pmaddubsw            m0, m3
1864    punpckhbw            m1, m2
1865    pmaddubsw            m1, m3
1866    pmulhrsw             m0, m10
1867    pmulhrsw             m1, m10
1868    pcmpgtb              m2, [rsp+16*1], m4
1869    packuswb             m0, m1
1870    pand                 m0, m2
1871    pandn                m2, m7
1872    por                  m0, m2
1873    movu                 m1, [tlq+r3+16*2+0]
1874    movu                 m2, [tlq+r3+16*2+1]
1875    mova        [dstq+16*1], m0
1876    punpcklbw            m0, m1, m2
1877    pmaddubsw            m0, m3
1878    punpckhbw            m1, m2
1879    pmaddubsw            m1, m3
1880    pmulhrsw             m0, m10
1881    pmulhrsw             m1, m10
1882    pcmpgtb              m2, [rsp+16*2], m4
1883    packuswb             m0, m1
1884    pand                 m0, m2
1885    pandn                m2, m7
1886    por                  m0, m2
1887    movu                 m1, [tlq+r3+16*3+0]
1888    movu                 m2, [tlq+r3+16*3+1]
1889    mova        [dstq+16*2], m0
1890    punpcklbw            m0, m1, m2
1891    pmaddubsw            m0, m3
1892    punpckhbw            m1, m2
1893    pmaddubsw            m1, m3
1894    paddw                m5, m6 ; xpos += dx
1895    pmulhrsw             m0, m10
1896    pmulhrsw             m1, m10
1897    pcmpgtb              m2, [rsp+16*3], m4
1898    packuswb             m0, m1
1899    pand                 m0, m2
1900    pandn                m2, m7
1901    por                  m0, m2
1902    mova        [dstq+16*3], m0
1903    dec                  hd
1904    jz .w64_end
1905    movifnidn       strideq, stridemp
1906    add                dstq, strideq
1907    add                  r5, dxq
1908    jl .w64_loop
    ; remaining rows lie entirely past the edge: fill with the last pixel
1909.w64_end_loop:
1910    mova        [dstq+16*0], m7
1911    mova        [dstq+16*1], m7
1912    mova        [dstq+16*2], m7
1913    mova        [dstq+16*3], m7
1914    add                dstq, strideq
1915    dec                  hd
1916    jg .w64_end_loop
1917.w64_end:
1918    RET
    ; edge-smoothing subroutine shared by all block sizes.
    ; in:  tlq = scratch pointer; pixels in [tlq-18, tlq+18) are read,
    ;      r5  = filter_strength - 3 (range -2..0, selects z_filter_k taps)
    ; out: 32 filtered bytes at [tlq, tlq+32); output byte [tlq+i] is the
    ;      filtered pixel centered at input byte [tlq-16+i]
    ; r5 == 0 (strength 3) adds the outer +-2 taps for a 5-tap kernel;
    ; otherwise only the +-1 and center taps are applied (3-tap).
    ; clobbers m0-m7; on x86-32 leaves the rounding constant in m4
1919ALIGN function_align
1920.filter_edge: ; 32 pixels/iteration
1921    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0]
1922    movu                 m2, [tlq-18] ; taps at center-2/center-1 (low half)
1923    movu                 m1, [tlq-17]
1924    movu                 m3, [tlq- 2] ; taps at center-2/center-1 (high half)
1925    movu                 m4, [tlq- 1]
1926    punpcklbw            m0, m2, m1
1927    pmaddubsw            m0, m7
1928    punpckhbw            m2, m1
1929    pmaddubsw            m2, m7
1930    punpcklbw            m1, m3, m4
1931    pmaddubsw            m1, m7
1932    punpckhbw            m3, m4
1933    pmaddubsw            m3, m7
1934    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1]
1935    mova                 m5, [tlq-16] ; taps at center/center+1 (low half)
1936    movu                 m6, [tlq-15]
1937    punpcklbw            m4, m5, m6
1938    pmaddubsw            m4, m7
1939    punpckhbw            m5, m6
1940    pmaddubsw            m5, m7
1941    paddw                m0, m4
1942    paddw                m2, m5
1943    mova                 m5, [tlq+ 0] ; taps at center/center+1 (high half)
1944    movu                 m6, [tlq+ 1]
1945    punpcklbw            m4, m5, m6
1946    pmaddubsw            m4, m7
1947    punpckhbw            m5, m6
1948    pmaddubsw            m5, m7
1949    paddw                m1, m4
1950    paddw                m3, m5
1951    test                r5d, r5d
1952    jnz .filter_end ; 3-tap
1953    movddup              m7, [base+z_filter_k+8*8]
1954    movu                 m5, [tlq-14] ; center+2 tap (low half)
1955    movu                 m6, [tlq+ 2] ; center+2 tap (high half)
1956    punpcklbw            m4, m5, m5
1957    pmaddubsw            m4, m7
1958    punpckhbw            m5, m5
1959    pmaddubsw            m5, m7
1960    paddw                m0, m4
1961    paddw                m2, m5
1962    punpcklbw            m5, m6, m6
1963    pmaddubsw            m5, m7
1964    punpckhbw            m6, m6
1965    pmaddubsw            m6, m7
1966    paddw                m1, m5
1967    paddw                m3, m6
1968.filter_end:
1969%if ARCH_X86_64
1970    REPX  {pmulhrsw x, m10}, m0, m2, m1, m3
1971%else
1972    mova                 m4, m10 ; m10 is a memory operand on x86-32
1973    REPX  {pmulhrsw x, m4 }, m0, m2, m1, m3
1974%endif
1975    packuswb             m0, m2
1976    packuswb             m1, m3
1977    mova         [tlq+16*0], m0
1978    mova         [tlq+16*1], m1
1979    ret
1980
    ; z2 directional prediction entry point: pixels are interpolated from
    ; both the top and left edges. On x86-32 the registers m8-m12 and the
    ; extra GPRs r9-r11 are emulated via %defines over stack slots.
1981%if ARCH_X86_64
1982cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
1983    %define            base  r7-$$
1984    %define           maxwm  r6m
1985    %define           maxhm  r7m
1986    lea                  r7, [$$]
1987    mov                  hd, hm
1988    mova                 m8, [base+pw_62]
1989    mova                 m9, [base+pw_64]
1990    lea                 r9d, [wq-4]
1991    mova                m10, [base+pw_512]
1992    shl                 r9d, 6
1993    mova                m11, [base+z1_shuf_w4]
1994    or                  r9d, hd ; r9d = (w-4)<<6 | h (strip counter | height)
1995    mova                m12, [base+z2_h_shuf]
1996%else
1997cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
1998    %define            base  r1-$$
1999    %define              m8  [base+pw_62]
2000    %define              m9  [base+pw_64]
2001    %define             m10  [base+pw_512]
2002    %define             m11  [rsp+16*16]
2003    %define             m12  [rsp+16*17]
2004    %define             r9b  byte [rsp+16*18+4*0]
2005    %define             r9d  dword [rsp+16*18+4*0]
2006    %define            r10d  dword [rsp+16*18+4*1]
2007    %define            r11d  dword [rsp+16*18+4*2]
2008    %define           maxwm  [rsp+16*18+4*3]
2009    %define           maxhm  [rsp+16*19+4*0]
2010    %define        stridemp  [rsp+16*19+4*1]
2011    %define         strideq  r3
2012    %define             dyd  r4
2013    %define             dyq  r4
2014    mov            stridemp, r1
2015    mov                 r1d, r6m
2016    mov                 r4d, r7m
2017    mov               maxwm, r1d
2018    mov               maxhm, r4d
2019    LEA                  r1, $$
2020    lea                  hd, [wq-4]
2021    mova                 m0, [base+z1_shuf_w4]
2022    shl                  hd, 6
2023    mova                 m1, [base+z2_h_shuf]
2024    or                   hd, hm
2025    mova                m11, m0
2026    mov                 r9d, hd ; r9d = (w-4)<<6 | h, same packing as x86-64
2027    mova                m12, m1
2028%endif
2029    tzcnt                wd, wd
2030    movifnidn        angled, anglem
2031    movsxd               wq, [base+ipred_z2_ssse3_table+wq*4]
2032%if ARCH_X86_64
2033    movzx               dxd, angleb
2034%else
2035    movzx               dxd, byte anglem
2036%endif
2037    xor              angled, 0x400
    ; copy the top-left neighbourhood into stack scratch:
    ; [rsp+16*2..5] = 64 bytes below/left of tl, [rsp+16*6..7] = the
    ; top-left pixel replicated, [rsp+16*8..9] = 32 bytes right of tl
2038    mova                 m0, [tlq-16*4]
2039    mov                 dyd, dxd
2040    mova                 m1, [tlq-16*3]
2041    neg                 dxq
2042    mova                 m2, [tlq-16*2]
2043    and                 dyd, ~1
2044    mova                 m3, [tlq-16*1]
2045    and                 dxq, ~1
2046    movd                 m4, [tlq]
2047    movu                 m5, [tlq+16*0+1]
2048    movu                 m6, [tlq+16*1+1]
2049    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
2050    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
2051    mova         [rsp+16*2], m0
2052    pxor                 m7, m7
2053    mova         [rsp+16*3], m1
2054    pshufb               m4, m7
2055    mova         [rsp+16*4], m2
2056    lea                  wq, [base+ipred_z2_ssse3_table+wq]
2057    mova         [rsp+16*5], m3
2058    neg                 dxd
2059    mova         [rsp+16*6], m4
2060    or                  dyd, 4<<16 ; low word = dy, second word = 4 (split via pshuflw later)
2061    mova         [rsp+16*7], m4
2062    mova         [rsp+16*8], m5
2063    mova         [rsp+16*9], m6
2064    movq                 m0, [base+z_base_inc+2]
2065    movsldup             m1, [base+z2_dy_offset]
2066    movq                 m2, [base+pw_256] ; 4<<6
2067    movq    [rsp+16*14+8*0], m0
2068    movq    [rsp+16*15+8*0], m1
2069    movq    [rsp+16*15+8*1], m2
2070%if ARCH_X86_64
2071    lea                r10d, [dxq+(128<<6)] ; xpos
2072%else
2073    mov      [rsp+16*7+4*1], dyd
2074    lea                 r4d, [dxq+(128<<6)]
2075    mov                r10d, r4d
2076    movzx                hd, r9b
2077%endif
2078    mov                r11d, (128-4)<<6
2079    jmp                  wq ; dispatch on log2(width)
    ; z2 w4: decide whether the top edge gets upsampled or filtered,
    ; then fall into the shared .w8_filter_left / .w4_main path
2080.w4:
2081    test             angled, 0x400
2082    jnz .w4_main ; edge filter disabled by the angle flag
2083    movd                 m5, [tlq+4]
2084    lea                 r3d, [hq+2]
2085    add              angled, 1022
2086    pshufb               m5, m7
2087    shl                 r3d, 6
2088    movd       [rsp+16*8+4], m5 ; pad the top edge with its last pixel
2089    test                r3d, angled
2090    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
2091    call .upsample_above
2092    sub              angled, 1075 ; angle - 53
2093    lea                 r3d, [hq+3]
2094    xor              angled, 0x7f ; 180 - angle
2095    movd                 m0, r3d
2096    movd                 m6, angled
2097    shr              angled, 8 ; is_sm << 1
2098    pshufb               m0, m7
2099    pshufb               m6, m7
2100    pcmpeqb              m0, [base+z_filter_wh4]
2101    pand                 m6, m0
2102    pcmpgtb              m6, [base+z_filter_t_w48+angleq*8]
2103    jmp .w8_filter_left
    ; upsample the top edge in [rsp+16*8] in place with a 4-tap
    ; (-4,36,36,-4)/64 filter (pb_36_m4 weights + pmulhrsw pw_512),
    ; interleaving the new half-pel samples with the originals; also
    ; doubles dx, doubles the z_base_inc copy, advances the xpos origin,
    ; and tightens the base_x limit to (128-7)<<6
2104.upsample_above: ; w4/w8
2105    movq                 m3, [rsp+gprsize+16*8-2]
2106    movq                 m1, [rsp+gprsize+16*8-1]
2107    movq                 m0, [rsp+gprsize+16*8+0]
2108    movq                 m4, [rsp+gprsize+16*8+1]
2109    movddup              m5, [base+pb_36_m4]
2110    punpcklbw            m1, m3
2111    punpcklbw            m2, m0, m4
2112    pmaddubsw            m1, m5
2113    pmaddubsw            m2, m5
2114%if ARCH_X86_64
2115    mova                m11, [base+pb_0to15]
2116    lea                r10d, [r10+dxq+(1<<6)]
2117    mov                r11d, (128-7)<<6
2118%else
2119    mova                 m3, [base+pb_0to15]
2120    mov                 r3d, [rsp+gprsize+16*18+4*1] ; r10d spill slot
2121    mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6    ; r11d spill slot
2122    lea                 r3d, [r3+dxq+(1<<6)]
2123    mov [rsp+gprsize+16*18+4*1], r3d
2124    mova [rsp+gprsize+16*16], m3
2125%endif
2126    add                 dxd, dxd ; upsampling doubles the x step
2127    paddw                m1, m2
2128    pmulhrsw             m1, m10
2129    movq                 m2, [rsp+gprsize+16*14]
2130    paddw                m2, m2 ; double the base increments to match
2131    movq [rsp+gprsize+16*14], m2
2132    packuswb             m1, m1
2133    punpcklbw            m1, m0 ; interleave upsampled/original samples
2134    mova [rsp+gprsize+16*8], m1
2135    ret
    ; top edge is filtered (not upsampled): build the strength predicate
    ; and defer to .w8_filter_top, then decide what to do with the left edge
2136.w4_no_upsample_above:
2137    lea                 r3d, [hq+3]
2138    mov               [rsp], angled ; preserve angle across the call
2139    sub              angled, 1112 ; angle - 90
2140    movd                 m0, r3d
2141    mov                 r3d, 90
2142    movd                 m1, angled
2143    sub                 r3d, angled ; 180 - angle
2144    shr              angled, 8 ; is_sm << 1
2145    movu                 m3, [base+z_filter_wh4]
2146    mova                 m4, [base+z_filter_t_w48+angleq*8]
2147    call .w8_filter_top
2148    mov              angled, [rsp]
2149    lea                 r3d, [hq+2]
2150    sub              angled, 139
2151    shl                 r3d, 6
2152    test                r3d, angled
2153    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    ; upsample the left edge in [rsp+16*5] with the same 4-tap
    ; (-4,36,36,-4)/64 filter as .upsample_above, doubling dy and
    ; switching the dy offset / load shuffle to their upsampled variants
2154.upsample_left: ; w4/w8
2155    neg                  hq
2156    movd                 m0, [tlq+hq]
2157    pshufb               m0, m7
2158    movd    [rsp+16*6+hq-4], m0 ; pad the left edge with its last pixel
2159    movq                 m3, [rsp+16*5+7]
2160    movq                 m0, [rsp+16*5+8]
2161    movq                 m2, [rsp+16*5+9]
2162    movq                 m4, [rsp+16*5+10]
2163    movddup              m5, [base+pb_36_m4]
2164    punpcklbw            m1, m0, m3
2165    punpcklbw            m2, m4
2166    pmaddubsw            m1, m5
2167    pmaddubsw            m2, m5
2168    movshdup             m3, [base+z2_dy_offset]
2169%if ARCH_X86_64
2170    mova                m12, [base+z2_upsample]
2171    add                 dyd, dyd ; upsampling doubles the y step
2172%else
2173    mova                 m4, [base+z2_upsample]
2174    shl dword [rsp+16*7+4*1], 1 ; dy spill slot: dy *= 2
2175    mova                m12, m4
2176%endif
2177    paddw                m1, m2
2178    pmulhrsw             m1, m10
2179    movq        [rsp+16*15], m3
2180    packuswb             m1, m1
2181    punpcklbw            m0, m1 ; interleave original/upsampled samples
2182    mova         [rsp+16*5], m0
    ; z2 w4 main: processes the block in 4-column strips, 4 rows at a time.
    ; m6 holds xpos for rows 0/1, m5 (= m6+m7) for rows 2/3, m7 = 2*dx.
    ; For each pixel, base_x = xpos>>6 indexes the top-edge stack buffer;
    ; pixels whose base_x falls left of the topleft instead take samples
    ; from the left edge, addressed per row via base_y bytes at [rsp+8*2].
2183.w4_main:
2184    movd                 m6, dxd
2185%if ARCH_X86_64
2186    movd                 m3, dyd
2187%else
2188    movd                 m3, [rsp+16*7+4*1] ; dy spill slot
2189%endif
2190    movddup              m0, [rsp+16*14+8*0]
2191    pshufb               m6, [base+pw_256]
2192    paddw                m7, m6, m6
2193    movq                 m5, [base+pw_m1to4]
2194    pshuflw              m4, m3, q0000 ; broadcast dy (low word of the packed pair)
2195    punpcklqdq           m6, m7
2196    pmullw               m4, m5 ; first multiples of dy
2197    pshuflw              m3, m3, q1111 ; broadcast the packed 4 (rows per iteration)
2198    paddw                m6, m0
2199    mov                 r2d, r10d ; r2 = xpos origin for this strip
2200    pshuflw              m0, m4, q3333
2201    psubw                m4, [rsp+16*15]
2202    movq     [rsp+16*6+8*1], m3
2203    movq          [rsp+8*1], m0 ; dy*4
2204    mov                  r5, dstq ; remember the strip's top-left dst
2205.w4_loop0:
2206    mova        [rsp+16*12], m6 ; save strip-initial xpos
2207    movq          [rsp+8*0], m4
2208    pand                 m0, m4, m8
2209    psraw                m4, 6
2210    psubw                m1, m9, m0
2211    psllw                m0, 8
2212    por                  m0, m1       ; 64-frac_y, frac_y
2213    movq          [rsp+8*3], m0
2214    pabsw                m4, m4
2215    movq          [rsp+8*2], m4 ; per-row base_y values
2216    movzx                hd, r9b ; reload h from the packed counter
2217.w4_loop:
2218    lea                 r3d, [r2+dxq]
2219    shr                 r2d, 6        ; base_x0
2220    movq                 m0, [rsp+r2]
2221    lea                 r2d, [r3+dxq]
2222    shr                 r3d, 6        ; base_x1
2223    movhps               m0, [rsp+r3]
2224    lea                 r3d, [r2+dxq]
2225    shr                 r2d, 6        ; base_x2
2226    movq                 m1, [rsp+r2]
2227    lea                 r2d, [r3+dxq]
2228    shr                 r3d, 6        ; base_x3
2229    movhps               m1, [rsp+r3]
2230    pand                 m2, m8, m6
2231    paddsw               m5, m6, m7 ; xpos for rows 2/3
2232    psubw                m3, m9, m2
2233    psllw                m2, 8
2234    pshufb               m0, m11
2235    por                  m2, m3 ; (64-frac_x, frac_x) weights, rows 0/1
2236    pmaddubsw            m0, m2
2237    pand                 m2, m8, m5
2238    psubw                m3, m9, m2
2239    psllw                m2, 8
2240    pshufb               m1, m11
2241    por                  m2, m3 ; (64-frac_x, frac_x) weights, rows 2/3
2242    pmaddubsw            m1, m2
2243    cmp                 r3d, 127 ; topleft
2244    jge .w4_toponly ; every base_x is right of the topleft: skip left blend
2245    movzx               r3d, byte [rsp+8*2+0] ; base_y0
2246    movq                 m3, [rsp+r3]
2247    movzx               r3d, byte [rsp+8*2+2] ; base_y1
2248    movhps               m3, [rsp+r3]
2249    movzx               r3d, byte [rsp+8*2+4] ; base_y2
2250    movq                 m4, [rsp+r3]
2251    movzx               r3d, byte [rsp+8*2+6] ; base_y3
2252    movhps               m4, [rsp+r3]
2253    pshufb               m3, m12
2254    pshufb               m4, m12
2255    punpckldq            m2, m3, m4
2256    punpckhdq            m3, m4
2257    movddup              m4, [rsp+8*3] ; (64-frac_y, frac_y)
2258    pmaddubsw            m2, m4
2259    pmaddubsw            m3, m4
2260    psraw                m6, 15       ; base_x < topleft
2261    pand                 m2, m6
2262    pandn                m6, m0
2263    por                  m0, m2, m6 ; select left-edge result where base_x < topleft
2264    psraw                m6, m5, 15
2265    pand                 m3, m6
2266    pandn                m6, m1
2267    por                  m1, m3, m6
2268.w4_toponly:
2269    pmulhrsw             m0, m10
2270    pmulhrsw             m1, m10
2271    movifnidn       strideq, stridemp
2272    packuswb             m0, m1
2273    movd   [dstq+strideq*0], m0
2274    pshuflw              m1, m0, q1032
2275    movd   [dstq+strideq*1], m1
2276    lea                dstq, [dstq+strideq*2]
2277    punpckhqdq           m0, m0
2278    movd   [dstq+strideq*0], m0
2279    psrlq                m0, 32
2280    movd   [dstq+strideq*1], m0
2281    sub                  hd, 4
2282    jz .w4_end
2283    movq                 m4, [rsp+8*2]
2284    movq                 m3, [rsp+16*6+8*1]
2285    paddw                m6, m5, m7   ; xpos += dx
2286    psubw                m4, m3 ; advance base_y by 4 rows
2287    movq          [rsp+8*2], m4
2288    lea                dstq, [dstq+strideq*2]
2289    cmp                 r2d, r11d
2290    jge .w4_loop
    ; xpos dropped below the limit: all remaining rows read only the left
    ; edge, so skip the base_x work entirely
2291    movddup              m5, [rsp+8*3]
2292.w4_leftonly_loop:
2293    movzx               r2d, byte [rsp+8*2+0] ; base_y0
2294    movq                 m1, [rsp+r2]
2295    movzx               r2d, byte [rsp+8*2+2] ; base_y1
2296    movhps               m1, [rsp+r2]
2297    movzx               r2d, byte [rsp+8*2+4] ; base_y2
2298    movq                 m2, [rsp+r2]
2299    movzx               r2d, byte [rsp+8*2+6] ; base_y3
2300    movhps               m2, [rsp+r2]
2301    psubw                m4, m3
2302    pshufb               m1, m12
2303    pshufb               m2, m12
2304    movq          [rsp+8*2], m4
2305    punpckldq            m0, m1, m2
2306    punpckhdq            m1, m2
2307    pmaddubsw            m0, m5
2308    pmaddubsw            m1, m5
2309    pmulhrsw             m0, m10
2310    pmulhrsw             m1, m10
2311    packuswb             m0, m1
2312    movd   [dstq+strideq*0], m0
2313    pshuflw              m1, m0, q1032
2314    movd   [dstq+strideq*1], m1
2315    lea                dstq, [dstq+strideq*2]
2316    punpckhqdq           m0, m0
2317    movd   [dstq+strideq*0], m0
2318    psrlq                m0, 32
2319    movd   [dstq+strideq*1], m0
2320    lea                dstq, [dstq+strideq*2]
2321    sub                  hd, 4
2322    jg .w4_leftonly_loop
    ; strip done: the strip count lives in the upper bits of r9d (h in the
    ; low byte), so 1<<8 decrements it; advance dst and the xpos origin
2323.w4_end:
2324    sub                 r9d, 1<<8
2325    jl .w4_ret
2326    movq                 m4, [rsp+8*1]
2327    add                  r5, 4
2328    mov                dstq, r5
2329    paddw                m4, [rsp+8*0] ; base_y += 4*dy
2330    movzx               r2d, word [rsp+16*15+8*1]
2331    movddup              m6, [rsp+16*15+8*1]
2332    paddw                m6, [rsp+16*12] ; base_x += (4 << upsample_above)
2333    add                 r2d, r10d
2334    mov                r10d, r2d
2335    jmp .w4_loop0
2336.w4_ret:
2337    RET
; z2 w=8 entry: decide whether the top edge gets upsampled (2x interpolation)
; or smoothed, then compute the left-edge filter mask (m6) and fall through
; to .w8_filter_left.  angled bit 0x400 = intra edge filter disabled.
.w8:
    test             angled, 0x400
    jnz .w4_main
    movd                 m5, [tlq+8]           ; top[8], replicated below as padding
    lea                 r3d, [angleq+126]
    pshufb               m5, m7                ; m7 = zero (broadcast byte 0)
%if ARCH_X86_64
    mov                 r3b, hb                ; pack h into low byte for one compare
%else
    xor                 r3b, r3b
    or                  r3d, hd
%endif
    movd       [rsp+16*8+8], m5
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    call .upsample_above
    ; after upsampling, evaluate the left-edge filter strength tables
    sub              angled, 53
    lea                 r3d, [hq+7]
    xor              angled, 0x7f ; 180 - angle
    movu                 m1, [base+z_filter_wh8]
    movd                 m0, r3d
    movd                 m6, angled
    shr              angled, 8 ; is_sm << 1
    psrldq               m2, [base+z_filter_t_w48+angleq*8], 4
    pshufb               m0, m7
    pshufb               m6, m7
    pcmpeqb              m0, m1                ; select the table row matching h+7
    pand                 m6, m0
    pcmpgtb              m6, m2                ; m6 = per-strength mask for the left edge
%if ARCH_X86_64
    movq    [rsp+16*15+8*1], m10 ; 8<<6
%else
    movq                 m0, m10
    movq    [rsp+16*15+8*1], m0
%endif
    jmp .w8_filter_left
.w8_no_upsample_above:
    ; no upsampling: filter the top edge via the shared .w8_filter_top helper,
    ; then check whether the *left* edge qualifies for upsampling instead
    lea                 r3d, [hq+7]
    mov               [rsp], angled            ; preserve angle across the call
    sub              angled, 90
    movd                 m0, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movu                 m3, [base+z_filter_wh8]
    psrldq               m4, [base+z_filter_t_w48+angleq*8], 4
    call .w8_filter_top
    mov                 r3d, [rsp]
    sub                 r3d, 141
%if ARCH_X86_64
    mov                 r3b, hb
%else
    xor                 r3b, r3b
    or                  r3d, hd
%endif
    cmp                 r3d, 8
    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
.w8_filter_left:
    ; m6 = left-edge filter mask computed above; derive filter_strength-3
    ; from its bit pattern (2 bits per strength level)
    pmovmskb            r5d, m6
    test                r5d, r5d
    jz .w4_main
    imul                r5d, 0x55555555
    mov                  r3, tlq
    shr                 r5d, 30
    sub                  r5, 3 ; filter_strength-3
    jmp .filter_left
; Helper (call target): smooth the 8 top-edge pixels staged at [rsp+16*8].
; In:  m0 = broadcast h+7, m1 = broadcast angle delta, r3d = 180-angle,
;      m3 = z_filter_wh8 row, m4 = strength thresholds, m7 = zero.
; Out: filtered pixels written back to [rsp+gprsize+16*8]; pixels at or
;      beyond maxw are restored from the unfiltered edge.
; Clobbers m0-m2, m6, m7, r3, r5.  gprsize offsets account for the
; return address pushed by the call.
.w8_filter_top:
    movd                 m6, r3d
    REPX     {pshufb x, m7}, m0, m1, m6
    pcmpeqb              m0, m3
    pand                 m1, m0
    pand                 m6, m0
    pcmpgtb              m1, m4                ; m1 = top-edge strength mask
    pcmpgtb              m6, m4                ; m6 = left-edge strength mask (used by caller)
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w8_filter_top_end ; filter_strength == 0
    imul                r5d, 0x55555555        ; spread 2-bit strength fields
    movq                 m0, [rsp+gprsize+16*8-2]
    shr                 r5d, 30
    movq                 m1, [rsp+gprsize+16*8-1]
    sub                  r5, 3 ; filter_strength-3
    ; 5-tap smoothing: pair taps at offsets (-2,-1), (0,+1), (+2,+2) and
    ; weight each pair with the z_filter_k kernel for this strength
    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0]
    punpcklbw            m0, m1
    pmaddubsw            m0, m7
    movq                 m1, [rsp+gprsize+16*8+0]
    movq                 m2, [rsp+gprsize+16*8+1]
    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1]
    punpcklbw            m1, m2
    pmaddubsw            m1, m7
    movq                 m2, [rsp+gprsize+16*8+2]
    movddup              m7, [base+z_filter_k+8*2+r5*8+24*2]
    punpcklbw            m2, m2
    pmaddubsw            m2, m7
    paddw                m0, m1
    paddw                m0, m2
%if ARCH_X86_64
    mov                 r3d, r7m ; maxw, offset due to call
%else
    mov                 r3d, [rsp+gprsize+16*18+4*3]
%endif
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10               ; high half unused; only 8 bytes stored
    packuswb             m0, m1
    movq [rsp+gprsize+16*8], m0
    cmp                 r3d, 8
    jge .w8_filter_top_end
    movq                 m0, [tlq+r3+1]        ; restore out-of-range pixels unfiltered
    movq [rsp+gprsize+r3+16*8], m0
.w8_filter_top_end:
    ret
; z2 w=16: no upsampling at this size.  Compute top/left filter-strength
; masks from the wh16 tables, smooth the 16 staged top pixels (3- or 5-tap
; depending on strength), then continue with the left edge.
.w16:
    test             angled, 0x400
    jnz .w4_main
    lea                 r3d, [hq+15]
    sub              angled, 90
    movd                 m0, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movd                 m6, r3d
    REPX     {pshufb x, m7}, m0, m1, m6        ; m7 = zero: broadcast each scalar
    movq                 m3, [base+z_filter_t_w16+angleq*4]
    pcmpeqb              m0, [base+z_filter_wh16]
    pand                 m1, m0
    pand                 m6, m0
    pcmpgtb              m1, m3                ; m1 = top strength mask
    pcmpgtb              m6, m3                ; m6 = left strength mask (consumed below)
    pmovmskb            r5d, m1
    mov                  r3, tlq
    test                r5d, r5d
    jz .w16_filter_left ; filter_strength == 0
    imul                r5d, 0x24924924        ; spread 3-level strength fields
    pshufb               m5, [base+z_filter_t_w16] ; tlq[16]
    shr                 r5d, 30
    adc                  r5, -4 ; filter_strength-3
    movd         [rsp+16*9], m5                ; pad with replicated edge pixel
    ; taps at offsets -2/-1 and 0/+1 are always applied
    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0]
    movu                 m1, [rsp+16*8-2]
    movu                 m2, [rsp+16*8-1]
    punpcklbw            m0, m1, m2
    pmaddubsw            m0, m7
    punpckhbw            m1, m2
    pmaddubsw            m1, m7
    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1]
    mova                 m3, [rsp+16*8+0]
    movu                 m4, [rsp+16*8+1]
    punpcklbw            m2, m3, m4
    pmaddubsw            m2, m7
    punpckhbw            m3, m4
    pmaddubsw            m3, m7
    paddw                m0, m2
    paddw                m1, m3
    test                r5d, r5d
    jnz .w16_filter_end ; 3-tap
    ; strength 3 only: add the +2 tap pair for the 5-tap kernel
    movddup              m7, [base+z_filter_k+8*8]
    movu                 m3, [rsp+16*8+2]
    punpcklbw            m2, m3, m3
    pmaddubsw            m2, m7
    punpckhbw            m3, m3
    pmaddubsw            m3, m7
    paddw                m0, m2
    paddw                m1, m3
.w16_filter_end:
    mov                 r2d, maxwm
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m0, m1
    mova         [rsp+16*8], m0
    cmp                 r2d, 16
    jge .w16_filter_left
    movu                 m0, [r3+r2+1]         ; restore pixels beyond maxw unfiltered
    movu      [rsp+r2+16*8], m0
.w16_filter_left:
    pmovmskb            r5d, m6
    test                r5d, r5d
    jz .w4_main
    imul                r5d, 0x24924924
    shr                 r5d, 30
    adc                  r5, -4 ; filter_strength-3
    jmp .filter_left
; z2 w=32 / w=64: always use maximum filter strength (r5d = 0 selects
; strength 3 in filter_edge).  The top edge is copied to the stack buffer
; at [rsp+16*8..], filtered via the shared z1 filter_edge helper, and any
; pixels at or beyond maxw are restored from the raw edge afterwards.
.w32:
    test             angled, 0x400
    jnz .w4_main
    pshufb               m6, [base+z_filter_t_w16] ; tlq[32]
    mov                  r3, tlq                ; keep raw edge pointer for restores
    lea                 tlq, [rsp+16*9]
    movd         [tlq+16*1], m6                 ; replicate edge pixel as padding
    xor                 r5d, r5d ; filter_strength = 3
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    mova                 m0, [tlq+16*0]
    mova                 m1, [tlq+16*1]
    mov                 r2d, maxwm
    mova         [rsp+16*8], m0
    mova         [rsp+16*9], m1
    cmp                 r2d, 32
    jge .filter_left
    movu                 m0, [r3+r2+16*0+1]
    movu                 m1, [r3+r2+16*1+1]
    movu      [rsp+r2+16*8], m0
    movu      [rsp+r2+16*9], m1
    jmp .filter_left
.w64:
    ; stage the upper 32 top pixels first (they are live even if the edge
    ; filter is disabled), then filter both 32-pixel halves
    movu                 m0, [tlq+16*2+1]
    movu                 m1, [tlq+16*3+1]
    mova        [rsp+16*10], m0
    mova        [rsp+16*11], m1
    test             angled, 0x400
    jnz .w4_main
    pshufb               m1, [base+z_filter_t_w16] ; tlq[64]
    mov                  r3, tlq
    lea                 tlq, [rsp+16*11]
    movd         [tlq+16*1], m1
    xor                 r5d, r5d ; filter_strength = 3
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2               ; second half (lower 32 pixels)
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    mova                 m0, [tlq+16*0]
    mova                 m1, [tlq+16*1]
    mova                 m2, [tlq+16*2]
    mova                 m3, [tlq+16*3]
    mov                 r2d, maxwm
    mova        [rsp+16* 8], m0
    mova        [rsp+16* 9], m1
    mova        [rsp+16*10], m2
    mova        [rsp+16*11], m3
    cmp                 r2d, 64
    jge .filter_left
    ; restore unfiltered pixels past maxw, 32 at a time
    movu                 m0, [r3+r2+16*0+1]
    movu                 m1, [r3+r2+16*1+1]
    movu     [rsp+r2+16* 8], m0
    movu     [rsp+r2+16* 9], m1
    cmp                 r2d, 32
    jge .filter_left
    movu                 m0, [r3+r2+16*2+1]
    movu                 m1, [r3+r2+16*3+1]
    movu     [rsp+r2+16*10], m0
    movu     [rsp+r2+16*11], m1
; Shared z2 left-edge filter.  The left column lives below [rsp+16*7]
; (indexed by negative h); the bottom-most pixel is replicated as padding,
; then filter_edge smooths up to 64 pixels.  The result is shifted down
; one 16-byte slot ([rsp+16*5..7] -> [rsp+16*4..6]) and pixels beyond
; maxh are restored from the raw edge (r3 = original tlq).
.filter_left:
    neg                  hq
    movd                 m0, [r3+hq]            ; bottom-most left pixel
    pxor                 m1, m1
    pshufb               m0, m1                 ; broadcast for padding
    movd    [rsp+16*6+hq-4], m0
    lea                 tlq, [rsp+16*5]
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    cmp                  hd, -32
    jge .filter_left_end
    sub                 tlq, 16*2               ; h > 32: filter the second 32 pixels
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    mova                 m0, [tlq+16*0]
    mova                 m1, [tlq+16*1]
    mova         [rsp+16*2], m0
    mova         [rsp+16*3], m1
.filter_left_end:
    mov                 r2d, maxhm
    mova                 m0, [rsp+16*5]
    mova                 m1, [rsp+16*6]
    mova                 m2, [rsp+16*7]
    neg                  r2
    mova         [rsp+16*4], m0
    mova         [rsp+16*5], m1
    mova         [rsp+16*6], m2
    cmp                 r2d, hd
    jle .w4_main
    ; maxh < h: overwrite the out-of-range tail with unfiltered pixels
    movu                 m0, [r3+r2-16*2]
    movu                 m1, [r3+r2-16*1]
    movu      [rsp+r2+16*4], m0
    movu      [rsp+r2+16*5], m1
    cmp                 r2d, -32
    jle .w4_main
    movu                 m0, [r3+r2-16*4]
    movu                 m1, [r3+r2-16*3]
    movu      [rsp+r2+16*2], m0
    movu      [rsp+r2+16*3], m1
    jmp .w4_main
2616
; ipred_z3_8bpc(dst, stride, topleft, w, h, angle, dy, ...)
; Directional intra prediction for angles > 180 (left-edge only).
; Dispatches on log2(h) through ipred_z3_ssse3_table; dy is looked up in
; dr_intra_derivative from (180 - angle).  angle bit 0x400 is toggled so
; the .hN paths can test "edge filter disabled" with a single bit test.
%if ARCH_X86_64
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
    %define            base  r7-$$
    lea                  r7, [$$]
    mova                 m8, [base+pw_62]
    mova                 m9, [base+pw_64]
    mova                m10, [base+pw_512]
    mov              org_wd, wd
%else
cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
    %define            base  r1-$$
    %define              m8  [base+pw_62]
    %define              m9  [base+pw_64]
    %define             m10  [base+pw_512]
    %define          org_wd  r5
    %define          org_wq  r5
    ; x86-32 is register-starved: stash stride and w in the first two dst
    ; rows (they are fully overwritten later) and reload them in the
    ; transpose stages via mov strideq, [dstq] / mov org_wd, [dstq+strideq]
    mov    [dstq+strideq*0], strideq
    mov    [dstq+strideq*1], wd
    LEA                  r1, $$
%endif
    tzcnt                hd, hm
    movifnidn        angled, anglem
    dec                 tlq                     ; tlq now points at the left edge
    movsxd               hq, [base+ipred_z3_ssse3_table+hq*4]
    sub              angled, 180
    mov                 dyd, angled
    neg                 dyd
    xor              angled, 0x400              ; invert edge-filter flag for the tests below
    or                  dyq, ~0x7e              ; clamp index for the derivative table
    lea                  hq, [base+ipred_z3_ssse3_table+hq]
    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
    jmp                  hq
; z3 w<=8, h=4 upsample path: double the left edge resolution with the
; (-4,36,36,-4)/32 interpolation kernel, double dy to match, then produce
; the prediction column-major into a stack buffer (transposed to dst later).
.h4:
    lea                 r4d, [angleq+88]
    test                r4d, 0x480
    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar                 r4d, 9
    add                 r4d, wd
    cmp                 r4d, 8
    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
    movu                 m3, [tlq-7]            ; left edge, bottom-up
    movu                 m1, [base+z_upsample1-4]
    movu                 m4, [base+z_filter_s+2]
    pshufb               m0, m3, m1
    pxor                 m1, m1
    pshufb               m2, m3, m1
    pshufb               m1, m3, m4
    mova           [rsp+16], m2 ; top[max_base_y]
    movddup              m2, [base+pb_36_m4]
    add                 dyd, dyd                ; upsampled edge => step twice as far
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    movd                 m5, dyd
    mov                 r5d, dyd
    pshufb               m5, [base+pw_256]      ; broadcast dy to all words
    paddw                m0, m1
    pmulhrsw             m0, m10
    shl                  wd, 2
    mov                 tlq, rsp
    sub                 rsp, wq                 ; per-column output buffer
    packuswb             m0, m0
    punpcklbw            m0, m3                 ; interleave interpolated & original samples
    paddw                m6, m5, m5             ; m5 = {ypos, ypos+dy}, m6 = 2*dy step
    punpcklqdq           m5, m6
    pshufb               m0, [base+pb_15to0]    ; reverse to bottom-up order
    mova              [tlq], m0
.h4_upsample_loop:
    ; two columns per iteration: r5/r4 are 6.6 fixed-point y positions
    lea                 r4d, [r5+dyq]
    shr                 r5d, 6
    movq                 m0, [tlq+r5]
    lea                 r5d, [r4+dyq]
    shr                 r4d, 6
    movhps               m0, [tlq+r4]
    pand                 m2, m8, m5             ; frac = ypos & 62
    psubw                m1, m9, m2             ; 64 - frac
    psllw                m2, 8
    por                  m1, m2                 ; packed (64-frac, frac) weights
    pmaddubsw            m0, m1
    paddw                m5, m6
    pmulhrsw             m0, m10
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jg .h4_upsample_loop
    jmp .h4_transpose
; z3 h=4, no upsampling: decide filter strength from the wh4/t_w48 tables
; and, if non-zero, smooth the 8 left-edge pixels into [rsp] with the
; 3-position z_filter_k kernel.  r4d = max_base_y (7, or 9 when w > 4
; because the filtered edge gains padding).
.h4_no_upsample:
    mov                 r4d, 7
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h4_main
    lea                 r4d, [wq+3]
    movd                 m0, r4d
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1                 ; broadcast w+3
    pshufb               m2, m1                 ; broadcast angle delta
    pcmpeqb              m1, m0, [base+z_filter_wh4]
    pand                 m1, m2
    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r4d, 7
    test                r5d, r5d
    jz .h4_main ; filter_strength == 0
    movu                 m2, [tlq-7]
    imul                r5d, 0x55555555
    movu                 m3, [base+z_filter_s-2]
    shr                 r5d, 30 ; filter_strength
    mova                 m4, [base+z_upsample2]
    ; three tap-pair kernels for this strength
    movddup              m5, [base+z_filter_k-8+r5*8+24*0]
    movddup              m6, [base+z_filter_k-8+r5*8+24*1]
    movddup              m7, [base+z_filter_k-8+r5*8+24*2]
    pshufb               m0, m2, m3
    shufps               m3, m4, q2121
    pmaddubsw            m1, m0, m5
    pmaddubsw            m0, m6
    pshufb               m5, m2, m3
    pmaddubsw            m3, m5, m6
    pmaddubsw            m5, m7
    pshufb               m2, m4
    pmaddubsw            m2, m7
    paddw                m0, m1
    paddw                m1, m3
    paddw                m0, m5
    paddw                m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    lea                 r2d, [r4+2]
    cmp                  wd, 4
    cmovne              r4d, r2d               ; w > 4: max_base_y = 9
    pshufd               m0, m0, q0000
    lea                 tlq, [rsp+15]
    packuswb             m0, m1
    mova              [rsp], m0
; z3 h=4 main loop: walk down the left edge in 6.6 fixed point, two
; columns per iteration, clamping positions past max_base_y to the
; bottom-most pixel (m7).  Columns are written right-to-left into the
; stack buffer and transposed to dst at the end.
.h4_main:
    movd                 m5, dyd
    movddup              m0, [base+z_base_inc] ; base_inc << 6
    sub                 tlq, r4                 ; tlq -> bottom of (filtered) edge
    shl                 r4d, 6
    movd                 m7, [tlq]              ; bottom-most pixel, used past max_base_y
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf_h4]
    lea                  r5, [dyq+r4+63] ; ypos
    pshufb               m4, [base+pw_256]
    psubw                m4, m0 ; max_base_y
    shl                  wd, 2
    paddw                m6, m5, m5             ; m5 = {dy,2dy} offsets, m6 = step
    sub                 rsp, wq
    punpcklqdq           m5, m6
.h4_loop:
    lea                  r4, [r5+dyq]
    sar                  r5, 6                  ; integer part of ypos
    movq                 m0, [tlq+r5-4]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movhps               m0, [tlq+r4-4]
    pand                 m2, m8, m5             ; frac = ypos & 62
    psubw                m1, m9, m2             ; 64 - frac
    psllw                m2, 8
    pshufb               m0, m3                 ; pair adjacent edge samples
    por                  m1, m2
    pmaddubsw            m0, m1
    pcmpgtw              m1, m4, m5             ; in-range mask vs max_base_y
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1
    pandn                m1, m7                 ; out-of-range lanes get the bottom pixel
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h4_transpose
    test                r5d, r5d
    jg .h4_loop
    packuswb             m7, m7
.h4_end_loop:
    ; remaining columns are entirely past the edge: fill with bottom pixel
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h4_end_loop
.h4_transpose:
    ; transpose the column-major buffer into 4-pixel rows of dst,
    ; walking dst right-to-left, 4 columns per iteration
    mova                 m1, [base+z_transpose4]
%if ARCH_X86_32
    mov             strideq, [dstq]             ; reload values stashed in dst rows
    mov              org_wd, [dstq+strideq]
%endif
    lea                  r2, [strideq*3]
    lea                dstq, [dstq+org_wq-4]
.h4_transpose_loop:
    mova                 m0, [rsp]
    add                 rsp, 16                 ; pop one 4x4 tile
    pshufb               m0, m1
    movd   [dstq+strideq*0], m0
    pshuflw              m2, m0, q1032
    movd   [dstq+strideq*1], m2
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    sub                dstq, 4
    sub              org_wd, 4
    jg .h4_transpose_loop
    RET
; z3 h=8: either upsample the 8 (or 4, clipped) left-edge pixels to 16, or
; optionally smooth them, then fall through to .h8_main.
.h8:
    lea                 r4d, [angleq+88]
    and                 r4d, ~0x7f
    or                  r4d, wd
    cmp                 r4d, 8
    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    mova                 m4, [tlq-15]           ; left edge, bottom-up
    and                 r4d, 4
    movu                 m3, [tlq- 9]
    movd                 m1, r4d
    movu                 m2, [base+z_filter_s+2]
    pxor                 m0, m0
    movu                 m5, [base+z_filter_s+6]
    movddup              m7, [base+pb_36_m4]    ; (-4,36) interpolation taps
    pshufb               m1, m0 ; w & 4
    movu                 m0, [base+z_upsample1-4]
    pmaxub               m1, m0 ; clip 4x8
    add                 dyd, dyd                ; upsampled edge => double step
    ; four pmaddubsw groups interpolate the 16 half-sample positions
    pshufb               m0, m4, m1
    pmaddubsw            m0, m7
    pshufb               m1, m4, m2
    pmaddubsw            m1, m7
    pshufb               m2, m3, [base+z_upsample1]
    pmaddubsw            m2, m7
    pshufb               m3, m5
    pmaddubsw            m3, m7
    movd                 m5, dyd
    neg                 dyq
    paddw                m1, m0
    paddw                m2, m3
    pmulhrsw             m1, m10
    pmulhrsw             m2, m10
    shl                  wd, 3
    lea                 tlq, [rsp+16]
    pshufb               m5, [base+pw_256]
    sub                 rsp, wq
    packuswb             m1, m2
    lea                  r5, [dyq+63]           ; starting ypos (6.6 fixed point)
    punpcklbw            m0, m1, m4             ; interleave interpolated & original
    punpckhbw            m1, m4
    mova         [tlq-16*1], m0
    mova         [tlq-16*0], m1
    paddw                m6, m5, m5             ; m5 = {dy,2dy}, m6 = per-iter step
    punpcklqdq           m5, m6
.h8_upsample_loop:
    ; two 8-pixel columns per iteration
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movu                 m0, [tlq+r5]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movu                 m1, [tlq+r4]
    pand                 m3, m8, m5
    psubw                m2, m9, m3
    psllw                m2, 8
    por                  m3, m2                 ; packed (frac, 64-frac) weights
    pshufd               m2, m3, q1010          ; column 0 weights
    pmaddubsw            m0, m2
    punpckhqdq           m3, m3                 ; column 1 weights
    pmaddubsw            m1, m3
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m1, m0
    mova        [rsp+wq-16], m1
    sub                  wd, 16
    jg .h8_upsample_loop
    jmp .h8_transpose
.h8_no_upsample:
    lea                 r4d, [wq+7]
    movd                 m0, r4d
    and                 r4d, 7
    or                  r4d, 8 ; imin(w+7, 15)
    test             angled, 0x400
    jnz .h8_main
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movu                 m1, [base+z_filter_wh8]
    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
    pcmpeqb              m1, m0
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h8_main ; filter_strength == 0
    ; stage the edge plus replicated end pixels on the stack, then smooth
    ; it in place with the shared z1 filter_edge helper
    mova                 m0, [tlq-15]
    imul                r5d, 0x55555555
    movd                 m1, [tlq+1]            ; pixel above the edge (padding)
    neg                  r4
    movd                 m2, [tlq+r4]           ; bottom-most pixel (padding)
    shr                 r5d, 30
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    sub                  r5, 3 ; filter_strength-3
    mova         [tlq+16*0], m0
    pshufb               m1, m7
    mova         [tlq+16*1], m1
    pshufb               m2, m7
    movq         [tlq+r4+8], m2
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sar                 r5d, 1
    add                 tlq, 31
    add                 r5d, 17                 ; extended max_base_y when w > 8
    cmp                  wd, 8
    cmova               r4d, r5d
; z3 h=8 main loop: one 8-pixel column per iteration, 6.6 fixed-point walk
; down the edge with clamping to the bottom pixel (m7) past max_base_y (m4).
.h8_main:
    movd                 m5, dyd
    sub                 tlq, r4                 ; tlq -> bottom of edge
    shl                 r4d, 6
    movd                 m7, [tlq]              ; clamp pixel
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]        ; ypos
    pshufb               m4, [base+pw_256]
    psubw                m4, [base+z3_base_inc] ; per-lane max_base_y
    shl                  wd, 3
    mova                 m6, m5                 ; step = dy per column
    sub                 rsp, wq
.h8_loop:
    mov                  r4, r5
    sar                  r4, 6
    movu                 m0, [tlq+r4-8]
    pand                 m2, m8, m5             ; frac
    psubw                m1, m9, m2             ; 64 - frac
    psllw                m2, 8
    pshufb               m0, m3                 ; pair adjacent samples
    por                  m1, m2
    pmaddubsw            m0, m1
    pcmpgtw              m1, m4, m5             ; in-range mask
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h8_transpose
    add                  r5, dyq
    jg .h8_loop
    packuswb             m7, m7
.h8_end_loop:
    ; remaining columns lie entirely below the edge: fill with clamp pixel
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h8_end_loop
.h8_transpose:
%if ARCH_X86_32
    mov             strideq, [dstq]             ; reload stashed stride/w
    mov              org_wd, [dstq+strideq]
%endif
    or                  r3d, 8                  ; block height for the shared transpose tail
    cmp              org_wd, 4
%if ARCH_X86_64
    jne .end_transpose_main
%else
    jne .end_transpose_loop
%endif
    ; w == 4: transpose the two buffered tiles directly via .write_4x8
    mova                 m1, [rsp+16*1]
    mova                 m0, [rsp+16*0]
    lea                  r2, [strideq*3]
    add                 rsp, 16*2
    punpcklbw            m2, m1, m0
    punpckhbw            m1, m0
    punpckhbw            m0, m1, m2
    punpcklbw            m1, m2
.write_4x8_end:
    call .write_4x8
    RET
; Helper (call target): store a transposed 4x8 block.
; In:  m0/m1 = four 4-pixel rows each (dword lanes), dstq, strideq,
;      r2 = strideq*3.  Rows are written bottom-up within each group of 4.
; Out: dstq advanced by 4 rows.  Clobbers m4; m0/m1 are shifted in place.
.write_4x8:
    movd   [dstq+r2       ], m0
    pshuflw              m4, m0, q1032
    movd   [dstq+strideq*2], m4
    punpckhqdq           m0, m0
    movd   [dstq+strideq*1], m0
    psrlq                m0, 32
    movd   [dstq+strideq*0], m0
    lea                dstq, [dstq+strideq*4]
    movd   [dstq+r2       ], m1
    pshuflw              m4, m1, q1032
    movd   [dstq+strideq*2], m4
    punpckhqdq           m1, m1
    movd   [dstq+strideq*1], m1
    psrlq                m1, 32
    movd   [dstq+strideq*0], m1
    ret
; z3 h=16: optional edge smoothing via filter_edge; when w > 16 the edge is
; extended past 32 samples with a small kernel (z3_filter_k_tail) so the
; main loop can read beyond the filtered region.
.h16:
    lea                 r4d, [wq+15]
    movd                 m0, r4d
    and                 r4d, 15
    or                  r4d, 16 ; imin(w+15, 31)
    test             angled, 0x400
    jnz .h16_main
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movq                 m3, [base+z_filter_t_w16+angleq*4]
    pcmpeqb              m1, m0, [base+z_filter_wh16]
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h16_main ; filter_strength == 0
    ; stage 32 edge pixels plus replicated padding on the stack
    mova                 m0, [tlq-16*2+1]
    imul                r5d, 0x24924924
    mova                 m1, [tlq-16*1+1]
    neg                  r4
    movd                 m2, [tlq-16*0+1]       ; pixel above the edge
    shr                 r5d, 30
    movd                 m3, [tlq+r4]           ; bottom-most pixel
    adc                  r5, -4 ; filter_strength-3
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    mova         [tlq-16*1], m0
    pshufb               m2, m7
    mova         [tlq+16*0], m1
    pshufb               m3, m7
    mova         [tlq+16*1], m2
    movq         [tlq+r4+8], m3
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    add                 tlq, 31
    cmp                  wd, 16
    jle .h16_main
    ; w > 16: synthesize extra edge samples below the filtered region
    pshuflw              m0, [tlq-47], q0000
    sar                  r5, 1
    movq                 m1, [base+z3_filter_k_tail+r5*4]
    lea                 r4d, [r5+33]            ; extended max_base_y
    pmaddubsw            m0, m1
%if ARCH_X86_64
    pmulhrsw             m0, m10
%else
    pmulhrsw             m0, m4                 ; m10 is memory on x86-32; m4 holds pw_512 here
%endif
    packuswb             m0, m0
    movd           [tlq-35], m0
; z3 h=16 main loop: one 16-pixel column per iteration.  The in/out-of-range
; test is done on bytes here (psrlw/packsswb vs m4 = max_base_y bytes)
; instead of words, since 16 lanes don't fit a word mask.
.h16_main:
    movd                 m5, dyd
    sub                 tlq, r4
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]              ; clamp pixel
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]        ; ypos
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]    ; per-byte max_base_y
    shl                  wd, 4
    mova                 m6, m5                 ; step = dy
    sub                 rsp, wq
.h16_loop:
    mov                  r4, r5
    pand                 m2, m8, m5             ; frac
    sar                  r4, 6
    psubw                m1, m9, m2
    psllw                m2, 8
    movu                 m0, [tlq+r4-8*2]       ; lower 8 samples
    por                  m2, m1                 ; packed weights
    movu                 m1, [tlq+r4-8*1]       ; upper 8 samples
    pshufb               m0, m3
    pmaddubsw            m0, m2
    pshufb               m1, m3
    pmaddubsw            m1, m2
    psrlw                m2, m5, 6              ; integer base offset per lane
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packsswb             m2, m2
    packuswb             m0, m1
    pcmpgtb              m1, m4, m2             ; byte-wise in-range mask
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    mova        [rsp+wq-16], m0
    sub                  wd, 16
    jz .h16_transpose
    add                  r5, dyq
    jg .h16_loop
.h16_end_loop:
    ; remaining columns: fill with clamp pixel (m7 already broadcast)
    mova        [rsp+wq-16], m7
    sub                  wd, 16
    jg .h16_end_loop
.h16_transpose:
%if ARCH_X86_32
    mov             strideq, [dstq]             ; reload stashed stride/w
    mov              org_wd, [dstq+strideq]
%endif
    or                  r3d, 16                 ; block height for the shared transpose tail
    cmp              org_wd, 4
%if ARCH_X86_64
    jne .end_transpose_main
%else
    jne .end_transpose_loop
%endif
.h16_transpose_w4:
    ; w == 4: byte/word interleaves build two 4x8 groups for .write_4x8
    mova                 m2, [rsp+16*3]
    mova                 m4, [rsp+16*2]
    mova                 m3, [rsp+16*1]
    mova                 m0, [rsp+16*0]
    lea                  r2, [strideq*3]
    add                 rsp, 16*4
    punpckhbw            m1, m2, m4
    punpcklbw            m2, m4
    punpckhbw            m4, m3, m0
    punpcklbw            m3, m0
    punpckhwd            m0, m1, m4
    punpcklwd            m1, m4
    call .write_4x8
    lea                dstq, [dstq+strideq*4]
    punpckhwd            m0, m2, m3
    punpcklwd            m1, m2, m3
    jmp .write_4x8_end
; z3 h=32: always filter at maximum strength (r5d = 0); 64 edge pixels are
; staged with replicated padding and smoothed in two filter_edge passes.
; When w > 32 the edge is extended with z3_filter_k_tail, as in .h16.
.h32:
    lea                 r4d, [wq+31]
    and                 r4d, 31
    or                  r4d, 32 ; imin(w+31, 63)
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .h32_main
    mova                 m0, [tlq-16*4+1]
    mova                 m1, [tlq-16*3+1]
    mova                 m2, [tlq-16*2+1]
    mova                 m3, [tlq-16*1+1]
    movd                 m4, [tlq-16*0+1]       ; pixel above the edge
    neg                  r4
    movd                 m5, [tlq+r4]           ; bottom-most pixel
    pxor                 m7, m7
    lea                 tlq, [rsp+16*4]
    mova         [tlq-16*3], m0
    mova         [tlq-16*2], m1
    xor                 r5d, r5d ; filter_strength = 3
    mova         [tlq-16*1], m2
    pshufb               m4, m7
    mova         [tlq+16*0], m3
    pshufb               m5, m7
    mova         [tlq+16*1], m4
    movq         [tlq+r4+8], m5
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2               ; second 32-pixel half
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    add                 tlq, 63
    cmp                  wd, 32
    jle .h32_main
    ; w > 32: synthesize extra edge samples below the filtered region
    pshuflw              m0, [tlq-79], q0000
    movq                 m1, [base+z3_filter_k_tail]
    add                 r4d, 2
    pmaddubsw            m0, m1
%if ARCH_X86_64
    pmulhrsw             m0, m10
%else
    pmulhrsw             m0, m4                 ; m10 is memory on x86-32; m4 holds pw_512 here
%endif
    packuswb             m0, m0
    movd           [tlq-67], m0
.h32_main:
    ; loop state setup, mirroring .h16_main (the loop itself follows below)
    movd                 m5, dyd
    sub                 tlq, r4
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]              ; clamp pixel
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]        ; ypos
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]    ; per-byte max_base_y
    mova                 m6, m5
3200.h32_loop:
3201    mov                  r4, r5
3202    pand                 m2, m8, m5
3203    sar                  r4, 6
3204    psubw                m1, m9, m2
3205    psllw                m2, 8
3206    movu                 m0, [tlq+r4-8*4]
3207    por                  m2, m1
3208    movu                 m1, [tlq+r4-8*3]
3209    pshufb               m0, m3
3210    pmaddubsw            m0, m2
3211    pshufb               m1, m3
3212    pmaddubsw            m1, m2
3213    pmulhrsw             m0, m10
3214    pmulhrsw             m1, m10
3215    sub                 rsp, 32
3216    packuswb             m0, m1
3217    mova         [rsp+16*0], m0
3218    movu                 m0, [tlq+r4-8*2]
3219    movu                 m1, [tlq+r4-8*1]
3220    pshufb               m0, m3
3221    pshufb               m1, m3
3222    pmaddubsw            m0, m2
3223    pmaddubsw            m1, m2
3224    pmulhrsw             m0, m10
3225    pmulhrsw             m1, m10
3226    psrlw                m2, m5, 6
3227    paddw                m5, m6
3228    packsswb             m2, m2
3229    packuswb             m0, m1
3230    pcmpgtb              m1, m4, m2
3231    paddsb               m2, [base+pb_16]
3232    pand                 m0, m1
3233    pandn                m1, m7
3234    por                  m0, m1
3235    pcmpgtb              m1, m4, m2
3236    mova         [rsp+16*1], m0
3237    pand                 m0, m1, [rsp+16*0]
3238    pandn                m1, m7
3239    por                  m0, m1
3240    mova         [rsp+16*0], m0
3241    dec                  wd
3242    jz .h32_transpose
3243    add                  r5, dyq
3244    jg .h32_loop
3245.h32_end_loop:
3246    sub                 rsp, 32
3247    mova         [rsp+16*1], m7
3248    mova         [rsp+16*0], m7
3249    dec                  wd
3250    jg .h32_end_loop
3251.h32_transpose:
3252    or                  r3d, 32
3253    jmp .end_transpose_main
3254.h64:
3255    lea                 r4d, [wq+63]
3256    test             angled, 0x400 ; !enable_intra_edge_filter
3257    jnz .h64_main
3258    mova                 m0, [tlq-16*8+1]
3259    mova                 m1, [tlq-16*7+1]
3260    mova                 m2, [tlq-16*6+1]
3261    mova                 m3, [tlq-16*5+1]
3262    mova         [rsp+16*1], m0
3263    mova         [rsp+16*2], m1
3264    mova         [rsp+16*3], m2
3265    mova         [rsp+16*4], m3
3266    mova                 m0, [tlq-16*4+1]
3267    mova                 m1, [tlq-16*3+1]
3268    mova                 m2, [tlq-16*2+1]
3269    mova                 m3, [tlq-16*1+1]
3270    movd                 m4, [tlq-16*0+1]
3271    neg                  r4
3272    movd                 m5, [tlq+r4]
3273    pxor                 m7, m7
3274    lea                 tlq, [rsp+16*8]
3275    mova         [tlq-16*3], m0
3276    mova         [tlq-16*2], m1
3277    xor                 r5d, r5d ; filter_strength = 3
3278    mova         [tlq-16*1], m2
3279    pshufb               m4, m7
3280    mova         [tlq+16*0], m3
3281    pshufb               m5, m7
3282    mova         [tlq+16*1], m4
3283    movq         [tlq+r4+8], m5
3284    neg                 r4d
3285    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3286    sub                 tlq, 16*2
3287    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3288    sub                 tlq, 16*2
3289    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3290    sub                 tlq, 16*2
3291    cmp                  wd, 64
3292    jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
3293    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3294.h64_filter96:
3295    add                 tlq, 127
3296.h64_main:
3297    movd                 m5, dyd
3298    sub                 tlq, r4
3299    movd                 m4, r4d
3300    shl                 r4d, 6
3301    movd                 m7, [tlq]
3302    pxor                 m6, m6
3303    pshufb               m5, [base+pw_256]
3304    neg                 dyq
3305    pshufb               m7, m6
3306    mova                 m3, [base+z3_shuf]
3307    lea                  r5, [dyq+r4+63]
3308    pshufb               m4, m6
3309    psubb                m4, [base+pb_15to0]
3310    mova                 m6, m5
3311.h64_loop:
3312    mov                  r4, r5
3313    pand                 m2, m8, m5
3314    sar                  r4, 6
3315    psubw                m1, m9, m2
3316    psllw                m2, 8
3317    movu                 m0, [tlq+r4-8*8]
3318    por                  m2, m1
3319    movu                 m1, [tlq+r4-8*7]
3320    pshufb               m0, m3
3321    pmaddubsw            m0, m2
3322    pshufb               m1, m3
3323    pmaddubsw            m1, m2
3324    pmulhrsw             m0, m10
3325    pmulhrsw             m1, m10
3326    sub                 rsp, 64
3327    packuswb             m0, m1
3328    mova         [rsp+16*0], m0
3329    movu                 m0, [tlq+r4-8*6]
3330    movu                 m1, [tlq+r4-8*5]
3331    pshufb               m0, m3
3332    pshufb               m1, m3
3333    pmaddubsw            m0, m2
3334    pmaddubsw            m1, m2
3335    pmulhrsw             m0, m10
3336    pmulhrsw             m1, m10
3337    packuswb             m0, m1
3338    mova         [rsp+16*1], m0
3339    movu                 m0, [tlq+r4-8*4]
3340    movu                 m1, [tlq+r4-8*3]
3341    pshufb               m0, m3
3342    pshufb               m1, m3
3343    pmaddubsw            m0, m2
3344    pmaddubsw            m1, m2
3345    pmulhrsw             m0, m10
3346    pmulhrsw             m1, m10
3347    packuswb             m0, m1
3348    mova         [rsp+16*2], m0
3349    movu                 m0, [tlq+r4-8*2]
3350    movu                 m1, [tlq+r4-8*1]
3351    pshufb               m0, m3
3352    pshufb               m1, m3
3353    pmaddubsw            m0, m2
3354    pmaddubsw            m1, m2
3355    pmulhrsw             m0, m10
3356    pmulhrsw             m1, m10
3357    psrlw                m2, m5, 6
3358    paddw                m5, m6
3359    packsswb             m2, m2
3360    packuswb             m0, m1
3361    pcmpgtb              m1, m4, m2
3362    paddsb               m2, [base+pb_16]
3363    pand                 m0, m1
3364    pandn                m1, m7
3365    por                  m0, m1
3366    pcmpgtb              m1, m4, m2
3367    paddsb               m2, [base+pb_16]
3368    mova         [rsp+16*3], m0
3369    pand                 m0, m1, [rsp+16*2]
3370    pandn                m1, m7
3371    por                  m0, m1
3372    pcmpgtb              m1, m4, m2
3373    paddsb               m2, [base+pb_16]
3374    mova         [rsp+16*2], m0
3375    pand                 m0, m1, [rsp+16*1]
3376    pandn                m1, m7
3377    por                  m0, m1
3378    pcmpgtb              m1, m4, m2
3379    mova         [rsp+16*1], m0
3380    pand                 m0, m1, [rsp+16*0]
3381    pandn                m1, m7
3382    por                  m0, m1
3383    mova         [rsp+16*0], m0
3384    dec                  wd
3385    jz .h64_transpose
3386    add                  r5, dyq
3387    jg .h64_loop
3388.h64_end_loop:
3389    sub                 rsp, 64
3390    mova         [rsp+16*3], m7
3391    mova         [rsp+16*2], m7
3392    mova         [rsp+16*1], m7
3393    mova         [rsp+16*0], m7
3394    dec                  wd
3395    jg .h64_end_loop
3396.h64_transpose:
3397    or                  r3d, 64
3398.end_transpose_main:
3399%if ARCH_X86_64
3400    lea                  r5, [r3*3]
3401    lea                  r7, [strideq*3]
3402%else
3403    mov             strideq, [dstq]
3404    mov              org_wd, [dstq+strideq]
3405%endif
3406.end_transpose_loop:
3407    lea                  r4, [rsp+r3-8]
3408    lea                  r6, [dstq+org_wq-8]
3409.end_transpose_loop_y:
3410    movq                 m0, [r4+r3*1]
3411    movq                 m4, [r4+r3*0]
3412%if ARCH_X86_64
3413    movq                 m1, [r4+r5  ]
3414    movq                 m5, [r4+r3*2]
3415    lea                  r2, [r4+r3*4]
3416%else
3417    lea                  r2, [r4+r3*2]
3418    movq                 m1, [r2+r3*1]
3419    movq                 m5, [r2+r3*0]
3420    lea                  r2, [r2+r3*2]
3421%endif
3422    movq                 m2, [r2+r3*1]
3423    movq                 m6, [r2+r3*0]
3424%if ARCH_X86_64
3425    movq                 m3, [r2+r5  ]
3426    movq                 m7, [r2+r3*2]
3427%else
3428    lea                  r2, [r2+r3*2]
3429    movq                 m3, [r2+r3*1]
3430    movq                 m7, [r2+r3*0]
3431%endif
3432    sub                  r4, 8
3433    punpcklbw            m0, m4
3434    punpcklbw            m1, m5
3435    punpcklbw            m2, m6
3436    punpcklbw            m3, m7
3437    punpckhwd            m4, m1, m0
3438    punpcklwd            m1, m0
3439    punpckhwd            m0, m3, m2
3440    punpcklwd            m3, m2
3441    punpckhdq            m2, m3, m1
3442    punpckldq            m3, m1
3443    punpckldq            m1, m0, m4
3444    punpckhdq            m0, m4
3445    movhps   [r6+strideq*0], m0
3446    movq     [r6+strideq*1], m0
3447%if ARCH_X86_64
3448    movhps   [r6+strideq*2], m1
3449    movq     [r6+r7       ], m1
3450    lea                  r6, [r6+strideq*4]
3451%else
3452    lea                  r6, [r6+strideq*2]
3453    movhps   [r6+strideq*0], m1
3454    movq     [r6+strideq*1], m1
3455    lea                  r6, [r6+strideq*2]
3456%endif
3457    movhps   [r6+strideq*0], m2
3458    movq     [r6+strideq*1], m2
3459%if ARCH_X86_64
3460    movhps   [r6+strideq*2], m3
3461    movq     [r6+r7       ], m3
3462    lea                  r6, [r6+strideq*4]
3463%else
3464    lea                  r6, [r6+strideq*2]
3465    movhps   [r6+strideq*0], m3
3466    movq     [r6+strideq*1], m3
3467    lea                  r6, [r6+strideq*2]
3468%endif
3469    cmp                  r4, rsp
3470    jae .end_transpose_loop_y
3471    lea                 rsp, [rsp+r3*8]
3472    sub              org_wd, 8
3473    jg .end_transpose_loop
3474    RET
3475
3476;-------------------------------------------------------------------------------
3477;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
3478;                         const uint8_t *idx, int w, int h);
3479;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
; Expand packed 4-bit palette indices into pixels. idx stores two indices
; per byte (low nibble = first pixel); the 8-byte palette in m4 serves as a
; pshufb lookup table. pshufb only reads bits 0-3 (and bit 7) of each index
; byte, so stray upper bits left over from the nibble shifts are harmless.
    movq                 m4, [palq]          ; m4 = palette LUT for pshufb
    LEA                  r2, pal_pred_ssse3_table
    tzcnt                wd, wm              ; log2(w) selects the width handler
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4: ; 8 idx bytes -> 16 pixels = four 4-pixel rows per iteration
    movq                 m1, [idxq]
    add                idxq, 8
    psrlw                m0, m1, 4           ; bring high nibbles down
    punpcklbw            m1, m0              ; interleave -> one index byte per pixel
    pshufb               m0, m4, m1          ; palette lookup
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; 16 idx bytes -> 32 pixels = four 8-pixel rows per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0          ; look up low-nibble pixels
    psrlw                m0, 4
    pshufb               m2, m4, m0          ; look up high-nibble pixels
    punpcklbw            m0, m1, m2          ; interleave back into raster order
    punpckhbw            m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16: ; 16 idx bytes -> 32 pixels = two 16-pixel rows per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32: ; 16 idx bytes -> one 32-pixel row per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64: ; 32 idx bytes -> one 64-pixel row per iteration
    movu                 m0, [idxq+16*0]
    movu                 m2, [idxq+16*1]
    add                idxq, 32
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m3, m4, m0
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m3, m4, m2
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    sub                  hd, 1
    jg .w64
    RET
3572
3573;---------------------------------------------------------------------------------------
3574;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3575;                           const int width, const int height, const int16_t *ac, const int alpha);
3576;---------------------------------------------------------------------------------------
; Compute m%1 = dc + sign(ac*alpha) * ((|ac*alpha| + 32) >> 6) on 8 words.
; Expects: m0 = dc (broadcast words), m1 = alpha (broadcast words),
;          m2 = |alpha| << 9, so that pmulhrsw(|ac|, |alpha| << 9)
;               == (|ac| * |alpha| + 32) >> 6 (round-half-away-from-zero).
; Clobbers m3.
%macro IPRED_CFL 1                   ; ac in, unpacked pixels out
    psignw               m3, m%1, m1        ; m3 = ac with alpha's sign applied
    pabsw               m%1, m%1            ; |ac|
    pmulhrsw            m%1, m2             ; (|ac| * |alpha| + 32) >> 6
    psignw              m%1, m3             ; restore sign of ac*alpha (0 if either is 0)
    paddw               m%1, m0             ; + dc
%endmacro
3584
3585%if UNIX64
3586DECLARE_REG_TMP 7
3587%else
3588DECLARE_REG_TMP 5
3589%endif
3590
; CfL prediction with DC computed from both the top and left edges.
; Dispatch: r6 -> .hN handler (sums the left edge, indexed by log2(h)),
; which falls through to wq -> .wN handler (adds the top-row sum, derives
; dc, then runs the .sN store loop). Until the splat stage:
;   m3 = all -1 (pmaddubsw/pmaddwd multiplier; edge sums are kept negated
;        until a final pmaddwd by -1 flips them positive)
;   m4 = (w+h)/2 rounding bias, m5 = log2(w+h) shift count
; (m4/m5 are recycled as ac registers inside the store loops.)
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn            wd, wm
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d
    tzcnt               t0d, t0d
    movd                 m5, t0d                           ; m5 = log2(w+h)
    LEA                  t0, ipred_cfl_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+16]
    pcmpeqd              m3, m3                            ; m3 = all -1
    psrlw                m4, 1                             ; m4 = (w+h)/2 rounding bias
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:
    movd                 m0, [tlq-4]                       ; 4 left-edge pixels
    pmaddubsw            m0, m3                            ; negated pairwise sums
    jmp                  wq
.w4:
    movd                 m1, [tlq+1]                       ; 4 top-row pixels
    pmaddubsw            m1, m3
    psubw                m0, m4                            ; fold in rounding bias (sums negated)
    paddw                m0, m1
    pmaddwd              m0, m3                            ; *-1 and pair-add: sums go positive
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3                             ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul: ; w+h is not a power of two (h = 8 or 16)
    punpckhqdq           m1, m0, m0
    paddw                m0, m1
    pshuflw              m1, m0, q1032                     ; psrlq                m1, m0, 32
    paddw                m0, m1
    psrlw                m0, 2
    mov                 r6d, 0x5556                        ; ~65536/3
    mov                 r2d, 0x3334                        ; ~65536/5
    test                 hd, 8
    cmovz               r6d, r2d
    movd                 m5, r6d
    pmulhuw              m0, m5                            ; finish dividing by w+h
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                            ; broadcast dc to all words
.s4:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1                            ; m1 = alpha (broadcast)
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                             ; m2 = |alpha| << 9 for IPRED_CFL
.s4_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movd   [dstq+strideq*0], m4
    pshuflw              m4, m4, q1032
    movd   [dstq+strideq*1], m4
    punpckhqdq           m4, m4
    movd   [dstq+strideq*2], m4
    psrlq                m4, 32
    movd   [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    movq                 m0, [tlq-8]                       ; 8 left-edge pixels
    pmaddubsw            m0, m3
    jmp                  wq
.w8:
    movq                 m1, [tlq+1]                       ; 8 top-row pixels
    pmaddubsw            m1, m3
    psubw                m4, m0                            ; fold bias + both sum halves
    punpckhqdq           m0, m0
    psubw                m0, m4
    paddw                m0, m1
    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3                         ; flip negated sums positive
    psrlw                m0, m5                         ; dc >>= log2(w+h)
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556                     ; ~65536/3
    mov                 r2d, 0x3334                     ; ~65536/5
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1                         ; finish dividing by w+h
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                         ; broadcast dc
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                          ; |alpha| << 9 for IPRED_CFL
.s8_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq   [dstq          ], m4
    movhps [dstq+strideq  ], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-16]                   ; 16 left-edge pixels
    pmaddubsw            m0, m3
    jmp                  wq
.w16:
    movu                 m1, [tlq+1]                    ; 16 top-row pixels
    pmaddubsw            m1, m3
    paddw                m0, m1
    psubw                m4, m0                         ; fold bias + both sum halves
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3                         ; flip negated sums positive
    psrlw                m0, m5                         ; dc >>= log2(w+h)
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556                     ; ~65536/3
    mov                 r2d, 0x3334                     ; ~65536/5
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1                         ; finish dividing by w+h
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                         ; broadcast dc
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9                          ; |alpha| << 9 for IPRED_CFL
.s16_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova     [dstq+strideq], m4
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]                   ; 32 left-edge pixels
    pmaddubsw            m0, m3
    mova                 m2, [tlq-16]
    pmaddubsw            m2, m3
    paddw                m0, m2
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]                    ; 32 top-row pixels
    pmaddubsw            m1, m3
    movu                 m2, [tlq+17]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0                         ; fold bias + both sum halves
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3                          ; flip negated sums positive
    psrlw                m0, m5                          ; dc >>= log2(w+h)
    cmp                  hd, 32
    je .w32_end
    ; (a dead "lea r2d, [hq*2]" was removed here: r2d was unconditionally
    ;  overwritten by the mov below before any read, and lea sets no flags)
    mov                 r6d, 0x5556                     ; ~65536/3
    mov                 r2d, 0x3334                     ; ~65536/5
    test                 hd, 64|16
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1                         ; finish dividing by w+h
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                         ; broadcast dc
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9                          ; |alpha| << 9 for IPRED_CFL
.s32_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova          [dstq+16], m4
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET
3826
3827;---------------------------------------------------------------------------------------
3828;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3829;                           const int width, const int height, const int16_t *ac, const int alpha);
3830;---------------------------------------------------------------------------------------
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; DC computed from the left edge only; the .hN chain sums h pixels, then
; tail-jumps into the shared cfl splat code (selected by log2(w)).
    mov                  hd, hm                                 ; zero upper half
    tzcnt               r6d, hd                                 ; log2(h)
    sub                 tlq, hq                                 ; tlq -> first left-edge pixel
    tzcnt                wd, wm
    movu                 m0, [tlq]
    mov                 t0d, 0x8000
    movd                 m3, t0d
    movd                 m2, r6d
    psrld                m3, m2                                 ; m3 = 0x8000>>log2(h): pmulhrsw factor for /h
    LEA                  t0, ipred_cfl_left_ssse3_table
    movsxd               r6, [t0+r6*4]
    pcmpeqd              m2, m2                                 ; m2 = all -1
    pmaddubsw            m0, m2                                 ; negated pairwise pixel sums
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h32:
    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232                          ; bring high qword down (psrldq m1, m0, 8)
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2                                 ; *-1 and pair-add: sum goes positive
    pmulhrsw             m0, m3                                 ; dc = rounded sum/h
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                                 ; broadcast dc to all words
    jmp                  wq
3867
3868;---------------------------------------------------------------------------------------
3869;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3870;                           const int width, const int height, const int16_t *ac, const int alpha);
3871;---------------------------------------------------------------------------------------
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; DC computed from the top row only; reuses ipred_cfl_left's .hN summing
; chain (indexed here by log2(w)) before the shared splat code.
    LEA                  t0, ipred_cfl_left_ssse3_table
    tzcnt                wd, wm              ; log2(w)
    inc                 tlq                  ; tlq -> first top-row pixel
    movu                 m0, [tlq]
    movifnidn            hd, hm
    mov                 r6d, 0x8000
    movd                 m3, r6d
    movd                 m2, wd
    psrld                m3, m2              ; m3 = 0x8000>>log2(w): pmulhrsw factor for /w
    movsxd               r6, [t0+wq*4]
    pcmpeqd              m2, m2              ; m2 = all -1
    pmaddubsw            m0, m2              ; negated pairwise pixel sums
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
3891
3892;---------------------------------------------------------------------------------------
3893;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3894;                           const int width, const int height, const int16_t *ac, const int alpha);
3895;---------------------------------------------------------------------------------------
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
; No usable edge pixels: dc is the fixed mid-gray 128; jump straight to
; the shared splat code for this width.
    tzcnt                wd, wm
    movifnidn            hd, hm
    LEA                  r6, ipred_cfl_splat_ssse3_table
    movsxd               wq, [r6+wq*4]
    movddup              m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] ; m0 = dc = 128 (words)
    add                  wq, r6
    movifnidn           acq, acmp
    jmp                  wq
3905
; Restore acq from its backup register/stack slot.
; NOTE(review): the declared parameter %1 is never referenced in the body.
%macro RELOAD_ACQ_32 1
    mov                 acq, ac_bakq       ; restore acq
%endmacro
3909
3910%if ARCH_X86_64
3911cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
3912DECLARE_REG_TMP 7
3913    movddup              m2, [pb_2]
3914%else
3915cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
3916DECLARE_REG_TMP 4
3917%define ac_bakq acmp
3918    mov                 t0d, 0x02020202
3919    movd                 m2, t0d
3920    pshufd               m2, m2, q0000
3921%endif
3922    movifnidn            wd, wm
3923    mov                 t0d, hm
3924    mov                  hd, t0d
3925    imul                t0d, wd
3926    movd                 m5, t0d
3927    movifnidn         hpadd, hpadm
3928%if ARCH_X86_64
3929    mov             ac_bakq, acq
3930%endif
3931    shl               hpadd, 2
3932    sub                  hd, hpadd
3933    pxor                 m4, m4
3934    cmp                  wd, 8
3935    jg .w16
3936    je .w8
3937    ; fall-through
3938%if ARCH_X86_64
3939    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
3940%else
3941    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
3942%endif
3943.w4:
3944    lea            stride3q, [strideq*3]
3945.w4_loop:
3946    movq                 m0, [yq]
3947    movq                 m1, [yq+strideq]
3948    movhps               m0, [yq+strideq*2]
3949    movhps               m1, [yq+stride3q]
3950    pmaddubsw            m0, m2
3951    pmaddubsw            m1, m2
3952    paddw                m0, m1
3953    mova              [acq], m0
3954    paddw                m4, m0
3955    lea                  yq, [yq+strideq*4]
3956    add                 acq, 16
3957    sub                  hd, 2
3958    jg .w4_loop
3959    test              hpadd, hpadd
3960    jz .calc_avg_4_8
3961    punpckhqdq           m0, m0
3962.w4_hpad_loop:
3963    mova              [acq], m0
3964    paddw                m4, m0
3965    add                 acq, 16
3966    sub               hpadd, 2
3967    jg .w4_hpad_loop
3968    jmp .calc_avg_4_8
3969.w8:
3970    lea            stride3q, [strideq*3]
3971    test              wpadd, wpadd
3972    jnz .w8_wpad
3973.w8_loop:
3974    mova                 m0, [yq]
3975    mova                 m1, [yq+strideq]
3976    pmaddubsw            m0, m2
3977    pmaddubsw            m1, m2
3978    paddw                m0, m1
3979    mova              [acq], m0
3980    paddw                m4, m0
3981    mova                 m0, [yq+strideq*2]
3982    mova                 m1, [yq+stride3q]
3983    pmaddubsw            m0, m2
3984    pmaddubsw            m1, m2
3985    paddw                m0, m1
3986    mova           [acq+16], m0
3987    paddw                m4, m0
3988    lea                  yq, [yq+strideq*4]
3989    add                 acq, 32
3990    sub                  hd, 2
3991    jg .w8_loop
3992    test              hpadd, hpadd
3993    jz .calc_avg_4_8
3994    jmp .w8_hpad
3995.w8_wpad:                                              ; wpadd=1
3996    movddup              m0, [yq]
3997    movddup              m1, [yq+strideq]
3998    pmaddubsw            m0, m2
3999    pmaddubsw            m1, m2
4000    paddw                m0, m1
4001    pshufhw              m0, m0, q3333
4002    mova              [acq], m0
4003    paddw                m4, m0
4004    lea                  yq, [yq+strideq*2]
4005    add                 acq, 16
4006    sub                  hd, 1
4007    jg .w8_wpad
4008    test              hpadd, hpadd
4009    jz .calc_avg_4_8
4010.w8_hpad:
4011    mova              [acq], m0
4012    paddw                m4, m0
4013    add                 acq, 16
4014    sub               hpadd, 1
4015    jg .w8_hpad
4016    jmp .calc_avg_4_8
4017.w16:
4018    test              wpadd, wpadd
4019    jnz .w16_wpad
4020.w16_loop:
4021    mova                 m0, [yq]
4022    mova                 m1, [yq+strideq]
4023    pmaddubsw            m0, m2
4024    pmaddubsw            m1, m2
4025    paddw                m0, m1
4026    mova              [acq], m0
4027    paddw                m4, m0
4028    mova                 m6, [yq+16]
4029    mova                 m1, [yq+strideq+16]
4030    pmaddubsw            m6, m2
4031    pmaddubsw            m1, m2
4032    paddw                m6, m1
4033    mova           [acq+16], m6
4034    paddw                m4, m6
4035    lea                  yq, [yq+strideq*2]
4036    add                 acq, 32
4037    dec                  hd
4038    jg .w16_loop
4039    test              hpadd, hpadd
4040    jz .calc_avg16
4041    jmp .w16_hpad_loop
4042.w16_wpad:
4043    cmp               wpadd, 2
4044    jl .w16_pad1
4045    je .w16_pad2
4046.w16_pad3:
4047    movddup              m0, [yq]
4048    movddup              m1, [yq+strideq]
4049    pmaddubsw            m0, m2
4050    pmaddubsw            m1, m2
4051    paddw                m0, m1
4052    pshufhw              m0, m0, q3333
4053    mova              [acq], m0
4054    paddw                m4, m0
4055    mova                 m6, m0
4056    punpckhqdq           m6, m0, m0
4057    mova           [acq+16], m6
4058    paddw                m4, m6
4059    lea                  yq, [yq+strideq*2]
4060    add                 acq, 32
4061    dec                  hd
4062    jg .w16_pad3
4063    jmp .w16_wpad_done
4064.w16_pad2:
4065    mova                 m0, [yq]
4066    mova                 m1, [yq+strideq]
4067    pmaddubsw            m0, m2
4068    pmaddubsw            m1, m2
4069    paddw                m0, m1
4070    mova              [acq], m0
4071    paddw                m4, m0
4072    pshufhw              m6, m0, q3333
4073    punpckhqdq           m6, m6
4074    mova           [acq+16], m6
4075    paddw                m4, m6
4076    lea                  yq, [yq+strideq*2]
4077    add                 acq, 32
4078    dec                  hd
4079    jg .w16_pad2
4080    jmp .w16_wpad_done
4081.w16_pad1:
4082    mova                 m0, [yq]
4083    mova                 m1, [yq+strideq]
4084    pmaddubsw            m0, m2
4085    pmaddubsw            m1, m2
4086    paddw                m0, m1
4087    mova              [acq], m0
4088    paddw                m4, m0
4089    movddup              m6, [yq+16]
4090    movddup              m1, [yq+strideq+16]
4091    pmaddubsw            m6, m2
4092    pmaddubsw            m1, m2
4093    paddw                m6, m1
4094    pshufhw              m6, m6, q3333
4095    mova           [acq+16], m6
4096    paddw                m4, m6
4097    lea                  yq, [yq+strideq*2]
4098    add                 acq, 32
4099    dec                  hd
4100    jg .w16_pad1
4101.w16_wpad_done:
4102    test              hpadd, hpadd
4103    jz .calc_avg16
4104.w16_hpad_loop:
4105    mova              [acq], m0
4106    paddw                m4, m0
4107    mova           [acq+16], m6
4108    paddw                m4, m6
4109    add                 acq, 32
4110    dec               hpadd
4111    jg .w16_hpad_loop
4112    jmp .calc_avg16
4113
4114%if ARCH_X86_64
4115    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
4116%else
4117    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
4118%endif
4119.calc_avg_4_8:
4120    psrlw                m2, 9
4121    pmaddwd              m4, m2
4122    jmp .calc_avg
4123.calc_avg16:
4124    psrld                m0, m4, 16
4125    pslld                m4, 16
4126    psrld                m4, 16
4127    paddd                m4, m0
4128.calc_avg:
4129    movd                szd, m5
4130    psrad                m5, 1
4131    tzcnt               r1d, szd
4132    paddd                m4, m5
4133    movd                 m1, r1d
4134    pshufd               m0, m4, q2301
4135    paddd                m0, m4
4136    pshufd               m4, m0, q1032
4137    paddd                m0, m4
4138    psrad                m0, m1                        ; sum >>= log2sz;
4139    packssdw             m0, m0
4140    RELOAD_ACQ_32       acq
4141.sub_loop:
4142    mova                 m1, [acq]
4143    psubw                m1, m0                        ; ac[x] -= sum;
4144    mova              [acq], m1
4145    add                 acq, 16
4146    sub                 szd, 8
4147    jg .sub_loop
4148    RET
4149
;---------------------------------------------------------------------
; ipred_cfl_ac_422_8bpc -- build the chroma-from-luma AC buffer for
; 4:2:2-subsampled 8bpc input, then subtract the block average
; in-place.
;
; Arguments (names from the cglobal line below):
;   ac     = int16_t* output buffer; rewritten by .sub_loop at the end
;   y      = source luma pixels (8-bit), stride = luma row stride
;   wpad   = right-edge padding, in units of 4 output words
;            (padded columns replicate the last valid output)
;   hpad   = bottom padding; multiplied by 4 below to get a row count
;   w, h   = output block width/height (powers of two -- the average
;            is divided via tzcnt/shift in .calc_avg)
;
; Each output word is 4*(y[2x] + y[2x+1]): pmaddubsw with an all-0x04
; byte multiplier sums horizontal luma pairs and scales them by 4
; (horizontal 2:1 subsampling, no vertical subsampling).  m4/m5
; accumulate word sums of every value stored so the average can be
; formed at the end without re-reading the buffer.
; NOTE(review): argument semantics above are inferred from usage in
; this chunk; confirm against the C-side cfl_ac declaration.
;---------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
    movddup              m2, [pb_4]                    ; m2 = 16 x 0x04 (pmaddubsw multiplier)
%else
cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
    mov                 t0d, 0x04040404               ; same all-0x04 vector, built in a
    movd                 m2, t0d                       ; register on the x86-32 path
    pshufd               m2, m2, q0000
%endif
    movifnidn            wd, wm
    mov                 t0d, hm
    mov                  hd, t0d
    imul                t0d, wd
    movd                 m6, t0d                       ; m6 = w*h = total AC entry count
    movifnidn         hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq                       ; keep original ac for .sub_loop
%endif
    shl               hpadd, 2                         ; hpad units of 4 rows -> row count
    sub                  hd, hpadd                     ; hd = rows actually read from y
    pxor                 m4, m4                        ; m4/m5: running word-sum accumulators
    pxor                 m5, m5
    cmp                  wd, 8
    jg .w16                                            ; w > 8  -> 16-wide path
    je .w8                                             ; w == 8 -> 8-wide path
    ; fall-through                                     ; w == 4

%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:                                                   ; 4 outputs/row (8 luma bytes/row);
    lea            stride3q, [strideq*3]               ; two rows are packed per xmm store
.w4_loop:
    movq                 m1, [yq]                      ; m1 = rows 0 (lo) and 1 (hi)
    movhps               m1, [yq+strideq]
    movq                 m0, [yq+strideq*2]            ; m0 = rows 2 (lo) and 3 (hi)
    movhps               m0, [yq+stride3q]
    pmaddubsw            m0, m2                        ; 4*(horizontal pair sums)
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_4
    punpckhqdq           m0, m0                        ; replicate last row into both halves
.w4_hpad_loop:                                         ; bottom padding: repeat last row
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 2                         ; one 16-byte store = 2 padded rows
    jg .w4_hpad_loop
    jmp .calc_avg_4
.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:                                              ; 8 outputs/row, 4 rows per iteration
    mova                 m1, [yq]
    mova                 m0, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m4, m0
    paddw                m5, m1
    mova                 m1, [yq+strideq*2]
    mova                 m0, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_8_16
    jmp .w8_hpad
.w8_wpad:                                              ; right half padded: words 0-3 valid,
    movddup              m1, [yq]                      ; pshufhw q3333 replicates the last
    pmaddubsw            m1, m2                        ; valid word into words 4-7
    pshufhw              m1, m1, q3333
    mova              [acq], m1
    paddw                m5, m1
    movddup              m0, [yq+strideq]
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+16], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_wpad
    test              hpadd, hpadd
    jz .calc_avg_8_16
.w8_hpad:                                              ; bottom padding: repeat last row (m0)
    mova              [acq], m0
    paddw                m4, m0
    mova           [acq+16], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 2
    jg .w8_hpad
    jmp .calc_avg_8_16
.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:                                             ; 16 outputs/row from 32 luma bytes,
    mova                 m1, [yq]                      ; 2 rows per iteration
    mova                 m0, [yq+16]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m5, m0
    paddw                m5, m1
    mova                 m1, [yq+strideq]
    mova                 m0, [yq+strideq+16]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m0
    paddw                m4, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg_8_16
    jmp .w16_hpad_loop
.w16_wpad:                                             ; wpad in units of 4 outputs
    cmp               wpadd, 2
    jl .w16_pad1                                       ; wpad 1: 12 valid + 4 padded
    je .w16_pad2                                       ; wpad 2:  8 valid + 8 padded
.w16_pad3:                                             ; wpad 3:  4 valid + 12 padded
    movddup              m1, [yq]                      ; words 0-3 valid, 4-7 = last word
    pmaddubsw            m1, m2
    pshufhw              m1, m1, q3333
    mova              [acq], m1
    paddw                m5, m1
    punpckhqdq           m1, m1                        ; second xmm = last word throughout
    mova           [acq+16], m1
    paddw                m5, m1
    movddup              m1, [yq+strideq]              ; same for the second row
    pmaddubsw            m1, m2
    pshufhw              m1, m1, q3333
    mova           [acq+32], m1
    paddw                m4, m1
    punpckhqdq           m0, m1, m1
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad3
    jmp .w16_wpad_done
.w16_pad2:
    mova                 m1, [yq]                      ; words 0-7 valid
    pmaddubsw            m1, m2
    mova              [acq], m1
    paddw                m5, m1
    pshufhw              m1, m1, q3333                 ; second xmm = last valid word
    punpckhqdq           m1, m1
    mova           [acq+16], m1
    paddw                m5, m1
    mova                 m1, [yq+strideq]              ; same for the second row
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    paddw                m4, m1
    mova                 m0, m1
    pshufhw              m0, m0, q3333
    punpckhqdq           m0, m0
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad2
    jmp .w16_wpad_done
.w16_pad1:
    mova                 m1, [yq]                      ; words 0-7 valid
    pmaddubsw            m1, m2
    mova              [acq], m1
    paddw                m5, m1
    movddup              m0, [yq+16]                   ; words 8-11 valid, 12-15 = last word
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+16], m0
    paddw                m5, m0
    mova                 m1, [yq+strideq]              ; same for the second row
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    paddw                m4, m1
    movddup              m0, [yq+strideq+16]
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad1
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg_8_16
.w16_hpad_loop:                                        ; bottom padding: repeat the last
    mova              [acq], m1                        ; computed row (m1:m0), 2 rows/iter
    mova           [acq+16], m0
    paddw                m4, m1
    paddw                m5, m0
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m1
    paddw                m5, m0
    add                 acq, 64
    sub               hpadd, 2
    jg .w16_hpad_loop
    jmp .calc_avg_8_16

%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4:
    psrlw                m2, 10                        ; words 0x0404 >> 10 = 1: m2 = all-ones
    pmaddwd              m5, m2                        ; add adjacent word pairs -> dwords
    pmaddwd              m0, m4, m2
    jmp .calc_avg
.calc_avg_8_16:                                        ; widen word sums to dwords (split
    mova                 m0, m5                        ; even/odd 16-bit lanes, then add) so
    psrld                m5, 16                        ; the grand total cannot overflow
    pslld                m0, 16
    psrld                m0, 16
    paddd                m5, m0
    mova                 m0, m4
    psrld                m0, 16
    pslld                m4, 16
    psrld                m4, 16
    paddd                m0, m4
.calc_avg:
    paddd                m5, m0                        ; m5 = 4 partial dword sums
    movd                szd, m6                        ; szd = w*h
    psrad                m6, 1                         ; m6.d[0] = sz/2 = rounding bias
    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
    paddd                m5, m6
    movd                 m1, r1d
    pshufd               m0, m5, q2301                 ; horizontal dword add: after these
    paddd                m0, m5                        ; two steps every lane holds the total
    pshufd               m5, m0, q1032
    paddd                m0, m5
    psrad                m0, m1                        ; sum >>= log2sz;
    packssdw             m0, m0                        ; broadcast average to 8 words
    RELOAD_ACQ_32       acq                            ; ac = ac_orig
.sub_loop:
    mova                 m1, [acq]
    psubw                m1, m0                        ; ac[x] -= average;
    mova              [acq], m1
    add                 acq, 16
    sub                 szd, 8                         ; 8 int16 entries per iteration
    jg .sub_loop
    RET
4422
4423%if ARCH_X86_64
4424cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
4425    movddup              m2, [pb_4]
4426%else
4427cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
4428%define ac_bakq [rsp+16*4]
4429    mov                 t0d, 0x04040404
4430    movd                 m2, t0d
4431    pshufd               m2, m2, q0000
4432%endif
4433    movifnidn            wd, wm
4434    movifnidn         hpadd, hpadm
4435    movd                 m0, hpadd
4436    mov                 t0d, hm
4437    mov                  hd, t0d
4438    imul                t0d, wd
4439    movd                 m6, t0d
4440    movd              hpadd, m0
4441    mov             ac_bakq, acq
4442    shl               hpadd, 2
4443    sub                  hd, hpadd
4444    pxor                 m5, m5
4445    pxor                 m4, m4
4446    cmp                  wd, 16
4447    jg .w32
4448    cmp                  wd, 8
4449    jg .w16
4450    je .w8
4451    ; fall-through
4452
4453%if ARCH_X86_64
4454    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
4455%else
4456    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
4457%endif
4458.w4:
4459    lea            stride3q, [strideq*3]
4460.w4_loop:
4461    movd                 m1, [yq]
4462    movd                 m3, [yq+strideq]
4463    punpckldq            m1, m3
4464    punpcklbw            m1, m1
4465    movd                 m0, [yq+strideq*2]
4466    movd                 m3, [yq+stride3q]
4467    punpckldq            m0, m3
4468    punpcklbw            m0, m0
4469    pmaddubsw            m1, m2
4470    pmaddubsw            m0, m2
4471    mova              [acq], m1
4472    mova           [acq+16], m0
4473    paddw                m5, m0
4474    paddw                m5, m1
4475    lea                  yq, [yq+strideq*4]
4476    add                 acq, 32
4477    sub                  hd, 4
4478    jg .w4_loop
4479    test              hpadd, hpadd
4480    jz .calc_avg_4
4481    punpckhqdq           m0, m0
4482.w4_hpad_loop:
4483    mova              [acq], m0
4484    paddw                m5, m0
4485    add                 acq, 16
4486    sub               hpadd, 2
4487    jg .w4_hpad_loop
4488.calc_avg_4:
4489    psrlw                m2, 10
4490    pmaddwd              m5, m2
4491    jmp .calc_avg
4492
4493.w8:
4494    lea            stride3q, [strideq*3]
4495    test              wpadd, wpadd
4496    jnz .w8_wpad
4497.w8_loop:
4498    movq                 m1, [yq]
4499    punpcklbw            m1, m1
4500    pmaddubsw            m1, m2
4501    mova              [acq], m1
4502    paddw                m5, m1
4503    movq                 m0, [yq+strideq]
4504    punpcklbw            m0, m0
4505    pmaddubsw            m0, m2
4506    mova           [acq+16], m0
4507    paddw                m5, m0
4508    movq                 m1, [yq+strideq*2]
4509    punpcklbw            m1, m1
4510    pmaddubsw            m1, m2
4511    mova           [acq+32], m1
4512    paddw                m4, m1
4513    movq                 m0, [yq+stride3q]
4514    punpcklbw            m0, m0
4515    pmaddubsw            m0, m2
4516    mova           [acq+48], m0
4517    paddw                m4, m0
4518    lea                  yq, [yq+strideq*4]
4519    add                 acq, 64
4520    sub                  hd, 4
4521    jg .w8_loop
4522    test              hpadd, hpadd
4523    jz .calc_avg_8_16
4524    jmp .w8_hpad
4525.w8_wpad:
4526    movd                 m1, [yq]
4527    punpcklbw            m1, m1
4528    punpcklqdq           m1, m1
4529    pmaddubsw            m1, m2
4530    pshufhw              m1, m1, q3333
4531    mova              [acq], m1
4532    paddw                m5, m1
4533    movd                 m0, [yq+strideq]
4534    punpcklbw            m0, m0
4535    punpcklqdq           m0, m0
4536    pmaddubsw            m0, m2
4537    pshufhw              m0, m0, q3333
4538    mova           [acq+16], m0
4539    paddw                m4, m0
4540    lea                  yq, [yq+strideq*2]
4541    add                 acq, 32
4542    sub                  hd, 2
4543    jg .w8_wpad
4544    test              hpadd, hpadd
4545    jz .calc_avg_8_16
4546.w8_hpad:
4547    mova              [acq], m0
4548    paddw                m5, m0
4549    mova           [acq+16], m0
4550    paddw                m4, m0
4551    add                 acq, 32
4552    sub               hpadd, 2
4553    jg .w8_hpad
4554    jmp .calc_avg_8_16
4555
4556.w16:
4557    test              wpadd, wpadd
4558    jnz .w16_wpad
4559.w16_loop:
4560    mova                 m0, [yq]
4561    mova                 m1, m0
4562    punpcklbw            m1, m1
4563    pmaddubsw            m1, m2
4564    mova              [acq], m1
4565    paddw                m5, m1
4566    punpckhbw            m0, m0
4567    pmaddubsw            m0, m2
4568    mova           [acq+16], m0
4569    paddw                m5, m0
4570    mova                 m0, [yq+strideq]
4571    mova                 m1, m0
4572    punpcklbw            m1, m1
4573    pmaddubsw            m1, m2
4574    mova           [acq+32], m1
4575    paddw                m4, m1
4576    punpckhbw            m0, m0
4577    pmaddubsw            m0, m2
4578    mova           [acq+48], m0
4579    paddw                m4, m0
4580    lea                  yq, [yq+strideq*2]
4581    add                 acq, 64
4582    sub                  hd, 2
4583    jg .w16_loop
4584    test              hpadd, hpadd
4585    jz .calc_avg_8_16
4586    jmp .w16_hpad_loop
4587.w16_wpad:
4588    cmp               wpadd, 2
4589    jl .w16_pad1
4590    je .w16_pad2
4591.w16_pad3:
4592    movd                 m1, [yq]
4593    punpcklbw            m1, m1
4594    punpcklqdq           m1, m1
4595    pshufhw              m1, m1, q3333
4596    pmaddubsw            m1, m2
4597    mova              [acq], m1
4598    paddw                m5, m1
4599    punpckhqdq           m1, m1
4600    mova           [acq+16], m1
4601    paddw                m5, m1
4602    movd                 m1, [yq+strideq]
4603    punpcklbw            m1, m1
4604    punpcklqdq           m1, m1
4605    pshufhw              m1, m1, q3333
4606    pmaddubsw            m1, m2
4607    mova           [acq+32], m1
4608    paddw                m4, m1
4609    punpckhqdq           m0, m1, m1
4610    mova           [acq+48], m0
4611    paddw                m4, m0
4612    lea                  yq, [yq+strideq*2]
4613    add                 acq, 64
4614    sub                  hd, 2
4615    jg .w16_pad3
4616    jmp .w16_wpad_done
4617.w16_pad2:
4618    movq                 m1, [yq]
4619    punpcklbw            m1, m1
4620    pmaddubsw            m1, m2
4621    mova              [acq], m1
4622    paddw                m5, m1
4623    pshufhw              m1, m1, q3333
4624    punpckhqdq           m1, m1
4625    mova           [acq+16], m1
4626    paddw                m5, m1
4627    movq                 m1, [yq+strideq]
4628    punpcklbw            m1, m1
4629    pmaddubsw            m1, m2
4630    mova           [acq+32], m1
4631    paddw                m4, m1
4632    mova                 m0, m1
4633    pshufhw              m0, m0, q3333
4634    punpckhqdq           m0, m0
4635    mova           [acq+48], m0
4636    paddw                m4, m0
4637    lea                  yq, [yq+strideq*2]
4638    add                 acq, 64
4639    sub                  hd, 2
4640    jg .w16_pad2
4641    jmp .w16_wpad_done
4642.w16_pad1:
4643    mova                 m0, [yq]
4644    mova                 m1, m0
4645    punpcklbw            m1, m1
4646    pmaddubsw            m1, m2
4647    mova              [acq], m1
4648    paddw                m5, m1
4649    punpckhbw            m0, m0
4650    punpcklqdq           m0, m0
4651    pshufhw              m0, m0, q3333
4652    pmaddubsw            m0, m2
4653    mova           [acq+16], m0
4654    paddw                m5, m0
4655    mova                 m0, [yq+strideq]
4656    mova                 m1, m0
4657    punpcklbw            m1, m1
4658    pmaddubsw            m1, m2
4659    mova           [acq+32], m1
4660    paddw                m4, m1
4661    punpckhbw            m0, m0
4662    punpcklqdq           m0, m0
4663    pshufhw              m0, m0, q3333
4664    pmaddubsw            m0, m2
4665    mova           [acq+48], m0
4666    paddw                m4, m0
4667    lea                  yq, [yq+strideq*2]
4668    add                 acq, 64
4669    sub                  hd, 2
4670    jg .w16_pad1
4671.w16_wpad_done:
4672    test              hpadd, hpadd
4673    jz .calc_avg_8_16
4674.w16_hpad_loop:
4675    mova              [acq], m1
4676    mova           [acq+16], m0
4677    paddw                m4, m1
4678    paddw                m5, m0
4679    mova           [acq+32], m1
4680    mova           [acq+48], m0
4681    paddw                m4, m1
4682    paddw                m5, m0
4683    add                 acq, 64
4684    sub               hpadd, 2
4685    jg .w16_hpad_loop
4686.calc_avg_8_16:
4687    mova                 m0, m5
4688    psrld                m5, 16
4689    pslld                m0, 16
4690    psrld                m0, 16
4691    paddd                m5, m0
4692    mova                 m0, m4
4693    psrld                m0, 16
4694    pslld                m4, 16
4695    psrld                m4, 16
4696    paddd                m0, m4
4697    paddd                m5, m0
4698    jmp .calc_avg
4699
4700.w32:
4701    pxor                 m0, m0
4702    mova           [rsp   ], m0
4703    mova           [rsp+16], m0
4704    mova           [rsp+32], m0
4705    mova           [rsp+48], m0
4706    test              wpadd, wpadd
4707    jnz .w32_wpad
4708.w32_loop:
4709    mova                 m0, [yq]
4710    mova                 m1, m0
4711    punpcklbw            m1, m1
4712    pmaddubsw            m1, m2
4713    mova              [acq], m1
4714    paddw                m5, m1, [rsp]
4715    mova           [rsp   ], m5
4716    punpckhbw            m0, m0
4717    pmaddubsw            m0, m2
4718    mova           [acq+16], m0
4719    paddw                m5, m0, [rsp+16]
4720    mova           [rsp+16], m5
4721    mova                 m4, [yq+16]
4722    mova                 m3, m4
4723    punpcklbw            m3, m3
4724    pmaddubsw            m3, m2
4725    mova           [acq+32], m3
4726    paddw                m5, m3, [rsp+32]
4727    mova           [rsp+32], m5
4728    punpckhbw            m4, m4
4729    pmaddubsw            m4, m2
4730    mova           [acq+48], m4
4731    paddw                m5, m4, [rsp+48]
4732    mova           [rsp+48], m5
4733    lea                  yq, [yq+strideq]
4734    add                 acq, 64
4735    sub                  hd, 1
4736    jg .w32_loop
4737    test              hpadd, hpadd
4738    jz .calc_avg_32
4739    jmp .w32_hpad_loop
4740.w32_wpad:
4741    cmp               wpadd, 2
4742    jl .w32_pad1
4743    je .w32_pad2
4744    cmp               wpadd, 4
4745    jl .w32_pad3
4746    je .w32_pad4
4747    cmp               wpadd, 6
4748    jl .w32_pad5
4749    je .w32_pad6
4750.w32_pad7:
4751    movd                 m1, [yq]
4752    punpcklbw            m1, m1
4753    punpcklqdq           m1, m1
4754    pshufhw              m1, m1, q3333
4755    pmaddubsw            m1, m2
4756    mova              [acq], m1
4757    paddw                m5, m1, [rsp]
4758    mova           [rsp   ], m5
4759    mova                 m0, m1
4760    punpckhqdq           m0, m0
4761    mova           [acq+16], m0
4762    paddw                m5, m0, [rsp+16]
4763    mova           [rsp+16], m5
4764    mova                 m3, m0
4765    mova           [acq+32], m3
4766    paddw                m5, m3, [rsp+32]
4767    mova           [rsp+32], m5
4768    mova                 m4, m3
4769    mova           [acq+48], m4
4770    paddw                m5, m4, [rsp+48]
4771    mova           [rsp+48], m5
4772    lea                  yq, [yq+strideq]
4773    add                 acq, 64
4774    sub                  hd, 1
4775    jg .w32_pad7
4776    jmp .w32_wpad_done
4777.w32_pad6:
4778    mova                 m0, [yq]
4779    mova                 m1, m0
4780    punpcklbw            m1, m1
4781    pmaddubsw            m1, m2
4782    mova              [acq], m1
4783    paddw                m5, m1, [rsp]
4784    mova           [rsp   ], m5
4785    pshufhw              m0, m1, q3333
4786    punpckhqdq           m0, m0
4787    mova           [acq+16], m0
4788    paddw                m5, m0, [rsp+16]
4789    mova           [rsp+16], m5
4790    mova                 m3, m0
4791    mova           [acq+32], m3
4792    paddw                m5, m3, [rsp+32]
4793    mova           [rsp+32], m5
4794    mova                 m4, m3
4795    mova           [acq+48], m4
4796    paddw                m5, m4, [rsp+48]
4797    mova           [rsp+48], m5
4798    lea                  yq, [yq+strideq]
4799    add                 acq, 64
4800    sub                  hd, 1
4801    jg .w32_pad6
4802    jmp .w32_wpad_done
4803.w32_pad5:
4804    mova                 m0, [yq]
4805    mova                 m1, m0
4806    punpcklbw            m1, m1
4807    pmaddubsw            m1, m2
4808    mova              [acq], m1
4809    mova                 m5, [rsp]
4810    paddw                m5, m1
4811    mova           [rsp   ], m5
4812    punpckhbw            m0, m0
4813    punpcklqdq           m0, m0
4814    pshufhw              m0, m0, q3333
4815    pmaddubsw            m0, m2
4816    mova           [acq+16], m0
4817    paddw                m5, m0, [rsp+16]
4818    mova           [rsp+16], m5
4819    mova                 m3, m0
4820    punpckhqdq           m3, m3
4821    mova           [acq+32], m3
4822    paddw                m5, m3, [rsp+32]
4823    mova           [rsp+32], m5
4824    mova                 m4, m3
4825    mova           [acq+48], m4
4826    paddw                m5, m4, [rsp+48]
4827    mova           [rsp+48], m5
4828    lea                  yq, [yq+strideq]
4829    add                 acq, 64
4830    sub                  hd, 1
4831    jg .w32_pad5
4832    jmp .w32_wpad_done
4833.w32_pad4:
4834    mova                 m0, [yq]
4835    mova                 m1, m0
4836    punpcklbw            m1, m1
4837    pmaddubsw            m1, m2
4838    mova              [acq], m1
4839    paddw                m5, m1, [rsp]
4840    mova           [rsp   ], m5
4841    punpckhbw            m0, m0
4842    pmaddubsw            m0, m2
4843    mova           [acq+16], m0
4844    paddw                m5, m0, [rsp+16]
4845    mova           [rsp+16], m5
4846    mova                 m3, m0
4847    pshufhw              m3, m3, q3333
4848    punpckhqdq           m3, m3
4849    mova           [acq+32], m3
4850    paddw                m5, m3, [rsp+32]
4851    mova           [rsp+32], m5
4852    mova                 m4, m3
4853    mova           [acq+48], m4
4854    paddw                m5, m4, [rsp+48]
4855    mova           [rsp+48], m5
4856    lea                  yq, [yq+strideq]
4857    add                 acq, 64
4858    sub                  hd, 1
4859    jg .w32_pad4
4860    jmp .w32_wpad_done
4861.w32_pad3:
4862    mova                 m0, [yq]
4863    mova                 m1, m0
4864    punpcklbw            m1, m1
4865    pmaddubsw            m1, m2
4866    mova              [acq], m1
4867    paddw                m5, m1, [rsp]
4868    mova           [rsp   ], m5
4869    punpckhbw            m0, m0
4870    pmaddubsw            m0, m2
4871    mova           [acq+16], m0
4872    paddw                m5, m0, [rsp+16]
4873    mova           [rsp+16], m5
4874    movd                 m3, [yq+16]
4875    punpcklbw            m3, m3
4876    punpcklqdq           m3, m3
4877    pshufhw              m3, m3, q3333
4878    pmaddubsw            m3, m2
4879    mova           [acq+32], m3
4880    paddw                m5, m3, [rsp+32]
4881    mova           [rsp+32], m5
4882    mova                 m4, m3
4883    punpckhqdq           m4, m4
4884    mova           [acq+48], m4
4885    paddw                m5, m4, [rsp+48]
4886    mova           [rsp+48], m5
4887    lea                  yq, [yq+strideq]
4888    add                 acq, 64
4889    sub                  hd, 1
4890    jg .w32_pad3
4891    jmp .w32_wpad_done
4892.w32_pad2:
4893    mova                 m0, [yq]
4894    mova                 m1, m0
4895    punpcklbw            m1, m1
4896    pmaddubsw            m1, m2
4897    mova              [acq], m1
4898    paddw                m5, m1, [rsp]
4899    mova           [rsp   ], m5
4900    punpckhbw            m0, m0
4901    pmaddubsw            m0, m2
; NOTE(review): tail of a cfl_ac function whose entry point and earlier
; labels are above this excerpt. Register/stack roles inferred from the
; visible code - confirm against the full function:
;   yq = luma source row, acq = AC coefficient output, m2 = pmaddubsw
;   multipliers, [rsp+0..63] = four vectors of running 16-bit column sums,
;   m6 = element count sz (also used for rounding), hpadd = bottom padding.
4902    mova           [acq+16], m0
4903    paddw                m5, m0, [rsp+16]    ; keep column sums up to date
4904    mova           [rsp+16], m5
4905    mova                 m3, [yq+16]
4906    punpcklbw            m3, m3
4907    pmaddubsw            m3, m2
4908    mova           [acq+32], m3
4909    paddw                m5, m3, [rsp+32]
4910    mova           [rsp+32], m5
4911    pshufhw              m4, m3, q3333       ; pad right edge by replicating
4912    punpckhqdq           m4, m4              ; the last valid column
4913    mova           [acq+48], m4
4914    paddw                m5, m4, [rsp+48]
4915    mova           [rsp+48], m5
4916    lea                  yq, [yq+strideq]
4917    add                 acq, 64
4918    sub                  hd, 1
4919    jg .w32_pad2
4920    jmp .w32_wpad_done
; wpad == 1: less right-edge padding; only the last output vector needs
; the last valid column replicated into its padded lanes
4921.w32_pad1:
4922    mova                 m0, [yq]
4923    mova                 m1, m0
4924    punpcklbw            m1, m1
4925    pmaddubsw            m1, m2
4926    mova              [acq], m1
4927    paddw                m5, m1, [rsp]
4928    mova           [rsp   ], m5
4929    punpckhbw            m0, m0
4930    pmaddubsw            m0, m2
4931    mova           [acq+16], m0
4932    paddw                m5, m0, [rsp+16]
4933    mova           [rsp+16], m5
4934    mova                 m4, [yq+16]
4935    mova                 m3, m4
4936    punpcklbw            m3, m3
4937    pmaddubsw            m3, m2
4938    mova           [acq+32], m3
4939    paddw                m5, m3, [rsp+32]
4940    mova           [rsp+32], m5
4941    punpckhbw            m4, m4
4942    punpcklqdq           m4, m4              ; replicate last valid column
4943    pshufhw              m4, m4, q3333
4944    pmaddubsw            m4, m2
4945    mova           [acq+48], m4
4946    paddw                m5, m4, [rsp+48]
4947    mova           [rsp+48], m5
4948    lea                  yq, [yq+strideq]
4949    add                 acq, 64
4950    sub                  hd, 1
4951    jg .w32_pad1
4952.w32_wpad_done:
4953    test              hpadd, hpadd
4954    jz .calc_avg_32
; bottom padding: repeat the last produced row (still live in m0/m1/m3/m4)
; hpad times, still accumulating the column sums
4955.w32_hpad_loop:
4956    mova              [acq], m1
4957    mova           [acq+16], m0
4958    paddw                m5, m1, [rsp]
4959    mova           [rsp   ], m5
4960    paddw                m5, m0, [rsp+16]
4961    mova           [rsp+16], m5
4962    mova           [acq+32], m3
4963    mova           [acq+48], m4
4964    paddw                m5, m3, [rsp+32]
4965    mova           [rsp+32], m5
4966    paddw                m5, m4, [rsp+48]
4967    mova           [rsp+48], m5
4968    add                 acq, 64
4969    sub               hpadd, 1
4970    jg .w32_hpad_loop
4971
4972%if ARCH_X86_64
4973    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
4974%else
4975    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
4976%endif
4977
; widen the four 16-bit sum vectors to 32 bits (split into odd/even words)
; and reduce them all into a single dword vector in m5
4978.calc_avg_32:
4979    mova                 m5, [rsp]
4980    mova                 m0, m5
4981    psrld                m5, 16              ; odd words
4982    pslld                m0, 16
4983    psrld                m0, 16              ; even words, zero-extended
4984    paddd                m5, m0
4985    mova                 m0, [rsp+16]
4986    mova                 m3, m0
4987    psrld                m0, 16
4988    pslld                m3, 16
4989    psrld                m3, 16
4990    paddd                m0, m3
4991    paddd                m5, m0
4992    mova                 m0, [rsp+32]
4993    mova                 m3, m0
4994    psrld                m0, 16
4995    pslld                m3, 16
4996    psrld                m3, 16
4997    paddd                m0, m3
4998    mova                 m1, [rsp+48]
4999    mova                 m3, m1
5000    psrld                m1, 16
5001    pslld                m3, 16
5002    psrld                m3, 16
5003    paddd                m1, m3
5004    paddd                m1, m0
5005    paddd                m5, m1
; avg = (sum + sz/2) >> log2sz, then every AC element is de-meaned in place
5006.calc_avg:
5007    movd                szd, m6
5008    psrad                m6, 1                          ; sz >> 1: rounding bias
5009    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
5010    paddd                m5, m6
5011    movd                 m1, r1d
5012    pshufd               m0, m5, q2301
5013    paddd                m0, m5                        ; horizontal dword reduction
5014    pshufd               m5, m0, q1032
5015    paddd                m0, m5
5016    psrad                m0, m1                        ; sum >>= log2sz;
5017    packssdw             m0, m0                        ; avg broadcast as words
5018    RELOAD_ACQ_32       acq                            ; ac = ac_orig
5019.sub_loop:
5020    mova                 m1, [acq]
5021    psubw                m1, m0                        ; ac[i] -= avg
5022    mova              [acq], m1
5023    add                 acq, 16
5024    sub                 szd, 8
5025    jg .sub_loop
5026    RET
5027
5028; %1 simd register that holds the mask and will hold the result
5029; %2 simd register that holds the "true" values
5030; %3 location of the "false" values (simd register/memory)
5031%macro BLEND 3 ; mask, true, false
5032    pand  %2, %1          ; true &= mask
5033    pandn %1, %3          ; %1 = ~mask & false
5034    por   %1, %2          ; %1 = (mask & true) | (~mask & false)
5035%endmacro
5036
; Paeth select: m1 = top (m%1), left (m3) or topleft (m5), whichever is
; closest to top + left - topleft.
; Inputs: m3 = left, m5 = topleft, m%2 = precomputed |top - topleft| (ldiff),
; m4 = byte mask used to extract pavgb's rounding bit (loaded from the low
; half of ipred_paeth_shuf - presumably 0x01 bytes; confirm against table).
; Clobbers m0, m1, m2.
5037%macro PAETH 2                                 ; top, ldiff
5038    pavgb                m1, m%1, m3          ; (top + left + 1) >> 1
5039    pxor                 m0, m%1, m3
5040    pand                 m0, m4               ; pavgb's round-up bit
5041    psubusb              m2, m5, m1
5042    psubb                m1, m0               ; (top + left) >> 1
5043    psubusb              m1, m5
5044    por                  m1, m2               ; |(top + left)/2 - topleft|
5045    paddusb              m1, m1               ; doubled, with saturation
5046    por                  m1, m0               ; min(tldiff, 255)
5047    psubusb              m2, m5, m3
5048    psubusb              m0, m3, m5
5049    por                  m2, m0               ; tdiff
5050%ifnum %2
5051    pminub               m2, m%2
5052    pcmpeqb              m0, m%2, m2          ; ldiff <= tdiff
5053%else
5054    mova                 m0, %2
5055    pminub               m2, m0
5056    pcmpeqb              m0, m2
5057%endif
5058    pminub               m1, m2
5059    pcmpeqb              m1, m2               ; ldiff <= tldiff && tdiff <= tldiff
5060    mova                 m2, m3
5061    BLEND                m0, m2, m%1          ; m0 = ldiff <= tdiff ? left : top
5062    BLEND                m1, m0, m5           ; m1 = m0 or topleft if it wins
5063%endmacro
5064
;-----------------------------------------------------------------------
; void ipred_paeth_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                       int w, int h)
; Paeth intra prediction: every output pixel becomes top, left or topleft,
; whichever is closest to top + left - topleft (see the PAETH macro above).
; m5 = broadcast topleft pixel; per column block, m6 = top row and
; m7 = |top - topleft| (ldiff) are precomputed once, then the left edge
; is walked bottom-up via tlq.
;-----------------------------------------------------------------------
5065cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
5066%define base r5-ipred_paeth_ssse3_table
5067    tzcnt                wd, wm
5068    movifnidn            hd, hm
5069    pxor                 m0, m0
5070    movd                 m5, [tlq]
5071    pshufb               m5, m0                 ; broadcast topleft
5072    LEA                  r5, ipred_paeth_ssse3_table
5073    movsxd               wq, [r5+wq*4]
5074    movddup              m4, [base+ipred_paeth_shuf]
5075    add                  wq, r5
5076    jmp                  wq                     ; dispatch on log2(width)
5077.w4:
5078    movd                 m6, [tlq+1]            ; top
5079    pshufd               m6, m6, q0000
5080    lea                  r3, [strideq*3]
5081    psubusb              m7, m5, m6
5082    psubusb              m0, m6, m5
5083    por                  m7, m0                 ; ldiff
5084.w4_loop:
5085    sub                 tlq, 4                 ; next 4 left-edge pixels
5086    movd                 m3, [tlq]
5087    mova                 m1, [base+ipred_h_shuf]
5088    pshufb               m3, m1                 ; left
5089    PAETH                 6, 7
5090    movd   [dstq          ], m1
5091    pshuflw              m0, m1, q1032
5092    movd   [dstq+strideq  ], m0
5093    punpckhqdq           m1, m1
5094    movd   [dstq+strideq*2], m1
5095    psrlq                m1, 32
5096    movd   [dstq+r3       ], m1
5097    lea                dstq, [dstq+strideq*4]
5098    sub                  hd, 4
5099    jg .w4_loop
5100    RET
5101ALIGN function_align
5102.w8:
5103    movddup              m6, [tlq+1]
5104    psubusb              m7, m5, m6
5105    psubusb              m0, m6, m5
5106    por                  m7, m0
5107.w8_loop:
5108    sub                 tlq, 2
5109    movd                 m3, [tlq]
5110    pshufb               m3, [base+ipred_paeth_shuf] ; one left pixel per row half
5111    PAETH                 6, 7
5112    movq     [dstq        ], m1
5113    movhps   [dstq+strideq], m1
5114    lea                dstq, [dstq+strideq*2]
5115    sub                  hd, 2
5116    jg .w8_loop
5117    RET
5118ALIGN function_align
5119.w16:
5120    movu                 m6, [tlq+1]
5121    psubusb              m7, m5, m6
5122    psubusb              m0, m6, m5
5123    por                  m7, m0
5124.w16_loop:
5125    sub                 tlq, 1
5126    movd                 m3, [tlq]
5127    pxor                 m1, m1
5128    pshufb               m3, m1                 ; broadcast the left pixel
5129    PAETH                 6, 7
5130    mova             [dstq], m1
5131    add                dstq, strideq
5132    sub                  hd, 1
5133    jg .w16_loop
5134    RET
5135ALIGN function_align
; w >= 32: eight XMM registers are not enough for every top/ldiff vector,
; so they are spilled to the stack and reloaded per 16-pixel column block
5136.w32:
5137    movu                 m6, [tlq+1]
5138    psubusb              m7, m5, m6
5139    psubusb              m0, m6, m5
5140    por                  m7, m0
5141    mova           [rsp   ], m6
5142    mova           [rsp+16], m7
5143    movu                 m6, [tlq+17]
5144    psubusb              m7, m5, m6
5145    psubusb              m0, m6, m5
5146    por                  m7, m0
5147    mova           [rsp+32], m6
5148.w32_loop:
5149    dec                 tlq
5150    movd                 m3, [tlq]
5151    pxor                 m1, m1
5152    pshufb               m3, m1
5153    mova                 m6, [rsp]
5154    PAETH                 6, [rsp+16]
5155    mova          [dstq   ], m1
5156    mova                 m6, [rsp+32]
5157    PAETH                 6, 7                  ; last block's ldiff stays in m7
5158    mova          [dstq+16], m1
5159    add                dstq, strideq
5160    dec                  hd
5161    jg .w32_loop
5162    RET
5163ALIGN function_align
5164.w64:
5165    movu                 m6, [tlq+1]
5166    psubusb              m7, m5, m6
5167    psubusb              m0, m6, m5
5168    por                  m7, m0
5169    mova           [rsp   ], m6
5170    mova           [rsp+16], m7
5171    movu                 m6, [tlq+17]
5172    psubusb              m7, m5, m6
5173    psubusb              m0, m6, m5
5174    por                  m7, m0
5175    mova           [rsp+32], m6
5176    mova           [rsp+48], m7
5177    movu                 m6, [tlq+33]
5178    psubusb              m7, m5, m6
5179    psubusb              m0, m6, m5
5180    por                  m7, m0
5181    mova           [rsp+64], m6
5182    mova           [rsp+80], m7
5183    movu                 m6, [tlq+49]
5184    psubusb              m7, m5, m6
5185    psubusb              m0, m6, m5
5186    por                  m7, m0
5187    mova           [rsp+96], m6
5188.w64_loop:
5189    dec                 tlq
5190    movd                 m3, [tlq]
5191    pxor                 m1, m1
5192    pshufb               m3, m1
5193    mova                 m6, [rsp]
5194    PAETH                 6, [rsp+16]
5195    mova          [dstq   ], m1
5196    mova                 m6, [rsp+32]
5197    PAETH                 6, [rsp+48]
5198    mova          [dstq+16], m1
5199    mova                 m6, [rsp+64]
5200    PAETH                 6, [rsp+80]
5201    mova          [dstq+32], m1
5202    mova                 m6, [rsp+96]
5203    PAETH                 6, 7                  ; last block's ldiff stays in m7
5204    mova          [dstq+48], m1
5205    add                dstq, strideq
5206    dec                  hd
5207    jg .w64_loop
5208    RET
5209
5210
; Apply the 7-tap filter-intra kernel to one 4x2 sub-block.
; %1 = dst reg (receives the 8 predicted pixels, packed to bytes),
; %2 = reg holding the 7 neighbour pixels, %3 = scratch reg,
; %4 = shuffle (reg number or memory operand) arranging the inputs into
; the (p0 p1 | p2 p3 | p4 p5 | p6 _) byte pairs that pmaddubsw expects.
; m2-m5 = the four tap vectors; rounding is +8 then >> 4 (pw_8, psraw 4).
5211%macro FILTER 4  ;dst, src, tmp, shuf
5212%ifnum %4
5213    pshufb               m%2, m%4
5214%else
5215    pshufb               m%2, %4
5216%endif
5217    pshufd               m%1, m%2, q0000           ;p0 p1
5218    pmaddubsw            m%1, m2
5219    pshufd               m%3, m%2, q1111           ;p2 p3
5220    pmaddubsw            m%3, m3
5221    paddw                m%1, [base+pw_8]          ; rounding bias for >> 4
5222    paddw                m%1, m%3
5223    pshufd               m%3, m%2, q2222           ;p4 p5
5224    pmaddubsw            m%3, m4
5225    paddw                m%1, m%3
5226    pshufd               m%3, m%2, q3333           ;p6 __
5227    pmaddubsw            m%3, m5
5228    paddw                m%1, m%3
5229    psraw                m%1, 4
5230    packuswb             m%1, m%1                  ; clamp to unsigned bytes
5231%endmacro
5232
;-----------------------------------------------------------------------
; void ipred_filter_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                        int w, int h, int filter_idx)
; Recursive filter intra prediction: each 4x2 sub-block is computed by
; the FILTER macro from its 7 neighbours (topleft, 4 top, 2 left), and
; the freshly predicted pixels feed the sub-blocks to the right and
; below - hence the long chained FILTER sequences per row pair.
; m2-m5 hold the four tap vectors of filter_intra_taps[filter_idx].
;-----------------------------------------------------------------------
5233cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
5234%define base r6-$$
5235    LEA                   r6, $$
5236    tzcnt                 wd, wm
5237%ifidn filterd, filterm
5238    movzx            filterd, filterb
5239%else
5240    movzx            filterd, byte filterm
5241%endif
5242    shl              filterd, 6                      ; 64 bytes of taps per filter
5243    lea              filterq, [base+filter_intra_taps+filterq]
5244    movq                  m0, [tlq-3]                     ;_ 6 5 0 1 2 3 4
5245    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
5246    mova                  m2, [filterq+16*0]
5247    mova                  m3, [filterq+16*1]
5248    mova                  m4, [filterq+16*2]
5249    mova                  m5, [filterq+16*3]
5250    lea                   wq, [base+ipred_filter_ssse3_table+wq]
5251    mov                   hd, hm
5252    jmp                   wq                            ; dispatch on log2(w)
5253.w4:
5254    mova                  m1, [base+filter_shuf1]
5255    sub                  tlq, 3
5256    sub                  tlq, hq                       ; [tlq+hq] walks up the left edge
5257    jmp .w4_loop_start
5258.w4_loop:
5259    movd                  m0, [tlq+hq]                  ; next two left pixels
5260    punpckldq             m0, m6                        ; merged with previous output row
5261    lea                 dstq, [dstq+strideq*2]
5262.w4_loop_start:
5263    FILTER                 6, 0, 7, 1
5264    movd    [dstq+strideq*0], m6
5265    pshuflw               m6, m6, q1032
5266    movd    [dstq+strideq*1], m6
5267    sub                   hd, 2
5268    jg .w4_loop
5269    RET
5270
5271ALIGN function_align
5272.w8:
5273    movq                  m6, [tlq+1]                   ;_ _ _ 0 1 2 3 4
5274    sub                  tlq, 5
5275    sub                  tlq, hq
5276
; two chained FILTER calls per row pair: left 4x2, then right 4x2 fed by
; the pixels just predicted
5277.w8_loop:
5278    FILTER                 7, 0, 1, [base+filter_shuf1]
5279    punpcklqdq            m6, m7                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5280    FILTER                 0, 6, 1, [base+filter_shuf2]
5281
5282    punpckldq             m6, m7, m0
5283    movq    [dstq+strideq*0], m6
5284    punpckhqdq            m6, m6
5285    movq    [dstq+strideq*1], m6
5286
5287    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5288    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5289
5290    lea                 dstq, [dstq+strideq*2]
5291    sub                   hd, 2
5292    jg .w8_loop
5293    RET
5294
5295ALIGN function_align
5296.w16:
5297    movu                  m6, [tlq+1]                   ;top row
5298    sub                  tlq, 5
5299    sub                  tlq, hq
5300
; four chained FILTER calls per row pair, marching 4 columns at a time
5301.w16_loop:
5302    FILTER                 7, 0, 1, [base+filter_shuf1]
5303    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5304    movd    [dstq+strideq*0], m7
5305    psrlq                 m7, 32
5306    palignr               m7, m6, 4
5307
5308    FILTER                 6, 0, 1, [base+filter_shuf2]
5309    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5310    movd  [dstq+4+strideq*0], m6
5311    psrlq                 m6, 32
5312    palignr               m6, m7, 4
5313
5314    FILTER                 7, 0, 1, [base+filter_shuf2]
5315    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5316    movd  [dstq+8+strideq*0], m7
5317    psrlq                 m7, 32
5318    palignr               m7, m6, 4
5319
5320    FILTER                 6, 0, 1, [base+filter_shuf2]
5321    movd [dstq+12+strideq*0], m6
5322    psrlq                 m6, 32
5323    palignr               m6, m7, 4
5324    mova    [dstq+strideq*1], m6                        ; full second row
5325
5326    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5327    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5328
5329    lea                 dstq, [dstq+strideq*2]
5330    sub                   hd, 2
5331    jg .w16_loop
5332    RET
5333
5334ALIGN function_align
5335.w32:
5336    movu                  m6, [tlq+1]                   ;top row
5337    lea              filterq, [tlq+17]                  ; filterq reused: right half of top row
5338    sub                  tlq, 5
5339    sub                  tlq, hq
5340
; eight chained FILTER calls per row pair: left half, then right half
5341.w32_loop:
5342    FILTER                 7, 0, 1, [base+filter_shuf1]
5343    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5344    movd    [dstq+strideq*0], m7
5345    psrlq                 m7, 32
5346    palignr               m7, m6, 4
5347
5348    FILTER                 6, 0, 1, [base+filter_shuf2]
5349    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5350    movd  [dstq+4+strideq*0], m6
5351    psrlq                 m6, 32
5352    palignr               m6, m7, 4
5353
5354    FILTER                 7, 0, 1, [base+filter_shuf2]
5355    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5356    movd  [dstq+8+strideq*0], m7
5357    psrlq                 m7, 32
5358    palignr               m7, m6, 4
5359
5360    FILTER                 6, 0, 1, [base+filter_shuf2]
5361    movu                  m1, [filterq]                 ; right-half top pixels
5362    punpckldq             m0, m7, m1                    ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
5363    punpcklqdq            m0, m6                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5364    movd [dstq+12+strideq*0], m6
5365    psrlq                 m6, 32
5366    palignr               m6, m7, 4
5367    mova    [dstq+strideq*1], m6
5368
5369    mova                  m6, m1
5370
5371    FILTER                 7, 0, 6, [base+filter_shuf2]
5372    punpcklqdq            m0, m1, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5373    movd [dstq+16+strideq*0], m7
5374    psrlq                 m7, 32
5375    palignr               m7, m1, 4
5376
5377    FILTER                 6, 0, 1, [base+filter_shuf2]
5378    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5379    movd [dstq+20+strideq*0], m6
5380    psrlq                 m6, 32
5381    palignr               m6, m7, 4
5382
5383    FILTER                 7, 0, 1, [base+filter_shuf2]
5384    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5385    movd [dstq+24+strideq*0], m7
5386    psrlq                 m7, 32
5387    palignr               m7, m6, 4
5388
5389    FILTER                 6, 0, 1, [base+filter_shuf2]
5390    movd [dstq+28+strideq*0], m6
5391    psrlq                 m6, 32
5392    palignr               m6, m7, 4
5393    mova [dstq+16+strideq*1], m6
5394
5395    mova                  m6, [dstq+strideq*1]          ; reload left half of new bottom row
5396    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5397    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5398    lea              filterq, [dstq+16+strideq*1]      ; next iteration's right-half "top"
5399    lea                 dstq, [dstq+strideq*2]
5400    sub                   hd, 2
5401    jg .w32_loop
5402    RET
5403
5403