; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db  0,  0,  0,  0
            ; 2 @4
            db 45, 19, 64,  0
            ; 4 @8
            db 39, 25, 50, 14, 59,  5, 64,  0
            ; 8 @16
            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
            ; 16 @32
            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
            ; 32 @64
            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2

warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufD: db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
subpel_h_shufE: db 2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
subpel_h_shufF: db 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7

wm_420_sign:    times 4 dw 258
                times 4 dw 257
wm_422_sign:    times 8 db 128
                times 8 db 127

pb_8x0_8x8: times 8 db 0
            times 8 db 8
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

pb_64:    times 16 db 64
pw_m256:  times 8 dw -256
pw_1:     times 8 dw 1
pw_2:     times 8 dw 2
pw_8:     times 8 dw 8
pw_15:    times 8 dw 15
pw_26:    times 8 dw 26
pw_34:    times 8 dw 34
pw_512:   times 8 dw 512
pw_1024:  times 8 dw 1024
pw_2048:  times 8 dw 2048
pw_6903:  times 8 dw 6903
pw_8192:  times 8 dw 8192
pd_32:    times 4 dd 32
pd_63:    times 4 dd 63
pd_512:   times 4 dd 512
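; each (a, b) byte pair below sums to 64 (e.g. 45,19 or 39,25), so pmaddubsw
; over interleaved source-pixel pairs computes p0*m + p1*(64-m) in one step;
; a pmulhrsw against pw_512 then performs the rounded (x + 32) >> 6
; (annotation, not part of upstream dav1d)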
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000:times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000

const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
    ; [-1, 0)
    db 0, 127,   0, 0,   0,   1, 0, 0, 0, 127,   0, 0,  -1,   2, 0, 0
    db 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1, 0
    db 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1, 0
    db 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1, 0
    db 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1, 0
    db 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2, 0
    db 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2, 0
    db 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2, 0
    db 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3, 0
    db 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3, 0
    db 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3, 0
    db 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4, 0
    db 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4, 0
    db 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4, 0
    db 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4, 0
    db 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4, 0
    db 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4, 0
    db 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4, 0
    db 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4, 0
    db 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4, 0
    db 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4, 0
    db 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4, 0
    db 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4, 0
    db 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3, 0
    db 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3, 0
    db 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3, 0
    db 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2, 0
    db 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2, 0
    db 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2, 0
    db 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1, 0
    db 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1, 0
    db 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0, 0
    ; [0, 1)
    db  0,   0,   1, 0, 0, 127,   0,  0,  0,  -1,   2, 0, 0, 127,   0,  0
    db  0,  -3,   4, 1, 1, 127,  -2,  0,  0,  -5,   6, 1, 1, 127,  -2,  0
    db  0,  -6,   8, 1, 2, 126,  -3,  0, -1,  -7,  11, 2, 2, 126,  -4, -1
    db -1,  -8,  13, 2, 3, 125,  -5, -1, -1, -10,  16, 3, 3, 124,  -6, -1
    db -1, -11,  18, 3, 4, 123,  -7, -1, -1, -12,  20, 3, 4, 122,  -7, -1
    db -1, -13,  23, 3, 4, 121,  -8, -1, -2, -14,  25, 4, 5, 120,  -9, -1
    db -1, -15,  27, 4, 5, 119, -10, -1, -1, -16,  30, 4, 5, 118, -11, -1
    db -2, -17,  33, 5, 6, 116, -12, -1, -2, -17,  35, 5, 6, 114, -12, -1
    db -2, -18,  38, 5, 6, 113, -13, -1, -2, -19,  41, 6, 7, 111, -14, -2
    db -2, -19,  43, 6, 7, 110, -15, -2, -2, -20,  46, 6, 7, 108, -15, -2
    db -2, -20,  49, 6, 7, 106, -16, -2, -2, -21,  51, 7, 7, 104, -16, -2
    db -2, -21,  54, 7, 7, 102, -17, -2, -2, -21,  56, 7, 8, 100, -18, -2
    db -2, -22,  59, 7, 8,  98, -18, -2, -2, -22,  62, 7, 8,  96, -19, -2
    db -2, -22,  64, 7, 8,  94, -19, -2, -2, -22,  67, 8, 8,  91, -20, -2
    db -2, -22,  69, 8, 8,  89, -20, -2, -2, -22,  72, 8, 8,  87, -21, -2
    db -2, -21,  74, 8, 8,  84, -21, -2, -2, -22,  77, 8, 8,  82, -21, -2
    db -2, -21,  79, 8, 8,  79, -21, -2, -2, -21,  82, 8, 8,  77, -22, -2
    db -2, -21,  84, 8, 8,  74, -21, -2, -2, -21,  87, 8, 8,  72, -22, -2
    db -2, -20,  89, 8, 8,  69, -22, -2, -2, -20,  91, 8, 8,  67, -22, -2
    db -2, -19,  94, 8, 7,  64, -22, -2, -2, -19,  96, 8, 7,  62, -22, -2
    db -2, -18,  98, 8, 7,  59, -22, -2, -2, -18, 100, 8, 7,  56, -21, -2
    db -2, -17, 102, 7, 7,  54, -21, -2, -2, -16, 104, 7, 7,  51, -21, -2
    db -2, -16, 106, 7, 6,  49, -20, -2, -2, -15, 108, 7, 6,  46, -20, -2
    db -2, -15, 110, 7, 6,  43, -19, -2, -2, -14, 111, 7, 6,  41, -19, -2
    db -1, -13, 113, 6, 5,  38, -18, -2, -1, -12, 114, 6, 5,  35, -17, -2
    db -1, -12, 116, 6, 5,  33, -17, -2, -1, -11, 118, 5, 4,  30, -16, -1
    db -1, -10, 119, 5, 4,  27, -15, -1, -1,  -9, 120, 5, 4,  25, -14, -2
    db -1,  -8, 121, 4, 3,  23, -13, -1, -1,  -7, 122, 4, 3,  20, -12, -1
    db -1,  -7, 123, 4, 3,  18, -11, -1, -1,  -6, 124, 3, 3,  16, -10, -1
    db -1,  -5, 125, 3, 2,  13,  -8, -1, -1,  -4, 126, 2, 2,  11,  -7, -1
    db  0,  -3, 126, 2, 1,   8,  -6,  0,  0,  -2, 127, 1, 1,   6,  -5,  0
    db  0,  -2, 127, 1, 1,   4,  -3,  0,  0,   0, 127, 0, 0,   2,  -1,  0
    ; [1, 2)
    db 0, 0, 127,   0, 0,   1,   0, 0, 0, 0, 127,   0, 0,  -1,   2, 0
    db 0, 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1
    db 0, 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1
    db 0, 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1
    db 0, 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1
    db 0, 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2
    db 0, 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2
    db 0, 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2
    db 0, 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3
    db 0, 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3
    db 0, 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3
    db 0, 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4
    db 0, 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4
    db 0, 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4
    db 0, 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4
    db 0, 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4
    db 0, 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4
    db 0, 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4
    db 0, 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4
    db 0, 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4
    db 0, 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4
    db 0, 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4
    db 0, 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4
    db 0, 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3
    db 0, 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3
    db 0, 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3
    db 0, 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2
    db 0, 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2
    db 0, 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2
    db 0, 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1
    db 0, 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1
    db 0, 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0
    db 0, 0,   2,  -1, 0,   0, 127, 0

pw_258:  times 2 dw 258

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

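    ; annotation: each 8-byte row is one filter with its taps regrouped as
    ; the pairs (c0,c2), (c4,c6), (c1,c3), (c5,c7), matching the even/odd
    ; source-pixel interleave built by warp_8x8_shufA-D, so that pmaddubsw
    ; accumulates two taps per 16-bit lane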
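; the -8 bias makes the 1-based fraction index line up: row f of filter set
; s lives at dav1d_mc_subpel_filters + (s*15 + f - 1)*8, while the code
; below computes (s*15 + f)*8 before adding this symbol (annotation)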
%macro BIDIR_JMP_TABLE 2-*
    ; %3 expands to the first (smallest) width argument at definition time
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2 ; one entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
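; worked example of the bias above (annotation): entries are 4 bytes and the
; callers index with tzcnt(w)*4, so for a minimum width of 4 (tzcnt = 2) the
; first entry sits at offset 8 = 2*4; subtracting 2*%3 from the base makes
; that land on entry 0 (2*%3 equals 4*log2(%3) for the minimum widths 2 and
; 4 used here)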

BIDIR_JMP_TABLE avg, ssse3,        4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3,      4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, ssse3,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3,      4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

SECTION .text

INIT_XMM ssse3

%if ARCH_X86_32
 DECLARE_REG_TMP 1
 %define base t0-put_ssse3
%else
 DECLARE_REG_TMP 7
 %define base 0
%endif

%macro RESTORE_DSQ_32 1
 %if ARCH_X86_32
   mov                  %1, dsm ; restore dsq
 %endif
%endmacro

cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn          mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn          srcq, srcmp
    movifnidn           ssq, ssmp
    tzcnt                wd, wm
    mov                  hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    movzx                wd, word [t0+wq*2+table_offset(put,)]
    add                  wq, t0
    RESTORE_DSQ_32       t0
    jmp                  wq
.put_w2:
    movzx               r4d, word [srcq+ssq*0]
    movzx               r6d, word [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4w
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    imul               mxyd, 0x00ff00ff
    mova                 m4, [base+subpel_h_shufD]
    mova                 m0, [base+bilin_h_shuf4]
    add                mxyd, 0x00100010
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    pshufd               m5, m5, q0000
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
    mova                 m3, [base+pw_2048]
    add                  wq, t0
    movifnidn           dsq, dsmp
    jmp                  wq
.h_w2:
    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
.h_w2_loop:
    movd                 m0, [srcq+ssq*0]
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpckldq            m0, m1
    pshufb               m0, m4
    pmaddubsw            m0, m5
    pmulhrsw             m0, m3
    packuswb             m0, m0
    movd                r6d, m0
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movq                 m4, [srcq+ssq*0]
    movhps               m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m4, m0
    pmaddubsw            m4, m5
    pmulhrsw             m4, m3
    packuswb             m4, m4
    movd       [dstq+dsq*0], m4
    psrlq                m4, 32
    movd       [dstq+dsq*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w16
    RET
.h_w32:
    movu                 m0, [srcq+mmsize*0+8*0]
    movu                 m1, [srcq+mmsize*0+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movu                 m1, [srcq+mmsize*1+8*0]
    movu                 m2, [srcq+mmsize*1+8*1]
    add                srcq, ssq
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    packuswb             m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
    mov                  r6, -16*3
.h_w64_loop:
    movu                 m0, [srcq+r6+16*3+8*0]
    movu                 m1, [srcq+r6+16*3+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+16*3], m0
    add                  r6, 16
    jle .h_w64_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    mov                  r6, -16*7
.h_w128_loop:
    movu                 m0, [srcq+r6+16*7+8*0]
    movu                 m1, [srcq+r6+16*7+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+16*7], m0
    add                  r6, 16
    jle .h_w128_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w128
    RET
.v:
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
    imul               mxyd, 0x00ff00ff
    mova                 m5, [base+pw_2048]
    add                mxyd, 0x00100010
    add                  wq, t0
    movd                 m4, mxyd
    pshufd               m4, m4, q0000
    movifnidn           dsq, dsmp
    jmp                  wq
.v_w2:
    movd                 m0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
    lea                srcq, [srcq+ssq*2]
    pshuflw              m1, m0, q2301
    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
    punpcklbw            m1, m0
    pmaddubsw            m1, m4
    pmulhrsw             m1, m5
    packuswb             m1, m1
    movd                r6d, m1
    mov        [dstq+dsq*1], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*0], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                 m0, [srcq+ssq*0]
.v_w4_loop:
    movd                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m2 ; 0 1
    punpckldq            m2, m0 ; 1 2
    punpcklbw            m1, m2
    pmaddubsw            m1, m4
    pmulhrsw             m1, m5
    packuswb             m1, m1
    movd       [dstq+dsq*0], m1
    psrlq                m1, 32
    movd       [dstq+dsq*1], m1
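    ; annotation: mxy*0x00ff00ff leaves (mxy << 8) - mxy in each 16-bit
    ; half, and adding 0x00100010 turns that into the byte pair
    ; (16 - mxy, mxy), so a single pmaddubsw over interleaved
    ; (src[x], src[x+1]) bytes applies both weights at once;
    ; e.g. mx = 8 yields the pair (8, 8), a plain average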
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                 m0, [srcq+ssq*0]
.v_w8_loop:
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    movq                 m0, [srcq+ssq*0]
    punpcklbw            m1, m2
    punpcklbw            m2, m0
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
%macro PUT_BILIN_V_W16 0
    movu                 m0, [srcq+ssq*0]
%%loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    mova                 m2, m0
    movu                 m0, [srcq+ssq*0]
    punpcklbw            m1, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m2, m4
    pmaddubsw            m3, m4
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    packuswb             m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg %%loop
%endmacro
.v_w16:
    PUT_BILIN_V_W16
    RET
.v_w128:
    lea                 r6d, [hq+(7<<16)]
    jmp .v_w16gt
.v_w64:
    lea                 r6d, [hq+(3<<16)]
    jmp .v_w16gt
.v_w32:
    lea                 r6d, [hq+(1<<16)]
.v_w16gt:
    mov                  r4, srcq
%if ARCH_X86_64
    mov                  r7, dstq
%endif
.v_w16gt_loop:
    PUT_BILIN_V_W16
%if ARCH_X86_64
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%else
    mov                dstq, dstmp
    add                  r4, 16
    movzx                hd, r6w
    add                dstq, 16
    mov                srcq, r4
    mov               dstmp, dstq
%endif
    sub                 r6d, 1<<16
    jg .v_w16gt
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM       8
    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    mova                 m7, [base+pw_15]
    movd                 m6, mxyd
    add                  wq, t0
    pshuflw              m6, m6, q0000
    paddb                m5, m5
    punpcklqdq           m6, m6
    jmp                  wq
.hv_w2:
    RESTORE_DSQ_32       t0
    movd                 m0, [srcq+ssq*0]
    punpckldq            m0, m0
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w2_loop:
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    punpckldq            m1, m2
    pshufb               m1, m4
    pmaddubsw            m1, m5             ; 1 _ 2 _
    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
    mova                 m0, m1
    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
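    ; annotation: the horizontal pass doubles its coefficients (paddb m5, m5
    ; below), so psubw yields 2*(b - a) at 5 fractional bits; pmulhw by
    ; my << 11 then gives (my * (b - a)) >> 4 (a shift by 12 would overflow
    ; the signed multiply), and pavgw with pw_15 computes (2*a + 16) >> 1
    ; = a + 8, restoring the 4-bit scale with the +8 rounding built in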
    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
    pavgw                m2, m7   ; src[x] + 8
    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
    psrlw                m1, 4
    packuswb             m1, m1
%if ARCH_X86_64
    movq                 r6, m1
%else
    pshuflw              m1, m1, q2020
    movd                r6d, m1
%endif
    mov        [dstq+dsq*0], r6w
    shr                  r6, gprsize*4
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                 m4, [base+bilin_h_shuf4]
    movddup              m0, [srcq+ssq*0]
    movifnidn           dsq, dsmp
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w4_loop:
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*0]
    pshufb               m1, m4
    pmaddubsw            m1, m5            ; 1 2
    shufps               m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhw               m1, m6
    pavgw                m2, m7
    paddw                m1, m2
    psrlw                m1, 4
    packuswb             m1, m1
    movd       [dstq+dsq*0], m1
    psrlq                m1, 32
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu                 m0, [srcq+ssq*0]
    movifnidn           dsq, dsmp
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m2, m4
    pmaddubsw            m2, m5
    psubw                m1, m2, m0
    pmulhw               m1, m6
    pavgw                m0, m7
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
    psubw                m3, m0, m2
    pmulhw               m3, m6
    pavgw                m2, m7
    paddw                m3, m2
    psrlw                m1, 4
    psrlw                m3, 4
    packuswb             m1, m3
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea                 r6d, [hq+(7<<16)]
    jmp .hv_w16_start
.hv_w64:
    lea                 r6d, [hq+(3<<16)]
    jmp .hv_w16_start
.hv_w32:
    lea                 r6d, [hq+(1<<16)]
.hv_w16_start:
    mov                  r4, srcq
%if ARCH_X86_32
    %define m8 [dstq]
%else
    mov                  r7, dstq
%endif
.hv_w16:
    movifnidn           dsq, dsmp
%if WIN64
    movaps              r4m, m8
%endif
.hv_w16_loop0:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w16_loop:
    add                srcq, ssq
    movu                 m2, [srcq+8*0]
    movu                 m3, [srcq+8*1]
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova                 m8, m2
    psubw                m2, m0
    pmulhw               m2, m6
    pavgw                m0, m7
    paddw                m2, m0
    mova                 m0, m3
    psubw                m3, m1
    pmulhw               m3, m6
    pavgw                m1, m7
    paddw                m3, m1
    mova                 m1, m0
    mova                 m0, m8
    psrlw                m2, 4
    psrlw                m3, 4
    packuswb             m2, m3
    mova             [dstq], m2
    add                dstq, dsmp
    dec                  hd
    jg .hv_w16_loop
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 16
    movzx                hd, r6w
    add                dstq, 16
    mov                srcq, r4
    mov                dstm, dstq
%else
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%endif
    sub                 r6d, 1<<16
    jg .hv_w16_loop0
%if WIN64
    movaps               m8, r4m
%endif
    RET

%if ARCH_X86_32
    %define base r6-prep%+SUFFIX
%else
    %define base 0
%endif

cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    LEA                  r6, prep_ssse3
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    pxor                 m4, m4
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movd                 m0, [srcq+strideq*0]
    movd                 m1, [srcq+strideq*1]
    movd                 m2, [srcq+strideq*2]
    movd                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpckldq            m0, m1
    punpckldq            m2, m3
    punpcklbw            m0, m4
    punpcklbw            m2, m4
    psllw                m0, 4
    psllw                m2, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq                 m0, [srcq+strideq*0]
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*2]
    movq                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m0, m4
    punpcklbw            m1, m4
    punpcklbw            m2, m4
    punpcklbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu                 m1, [srcq+strideq*0]
    movu                 m3, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .prep_w16
    RET
.prep_w128:
    mov                  r3, -128
    jmp .prep_w32_start
.prep_w64:
    mov                  r3, -64
    jmp .prep_w32_start
.prep_w32:
    mov                  r3, -32
.prep_w32_start:
    sub                srcq, r3
.prep_w32_vloop:
    mov                  r6, r3
.prep_w32_hloop:
    movu                 m1, [srcq+r6+16*0]
    movu                 m3, [srcq+r6+16*1]
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    add                  r6, 32
    jl .prep_w32_hloop
    add                srcq, strideq
    dec                  hd
    jg .prep_w32_vloop
    RET
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
    imul               mxyd, 0x00ff00ff
    mova                 m4, [base+subpel_h_shufD]
    add                mxyd, 0x00100010
    movd                 m5, mxyd
    mov                mxyd, r6m ; my
    pshufd               m5, m5, q0000
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    jmp                  wq
.h_w4:
    mova                 m4, [base+bilin_h_shuf4]
    lea            stride3q, [strideq*3]
.h_w4_loop:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*2]
    movhps               m1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    mova          [tmpq+0 ], m0
    mova          [tmpq+16], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    lea            stride3q, [strideq*3]
.h_w8_loop:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX  {pshufb    x, m4}, m0, m1, m2, m3
    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    movu                 m0, [srcq+strideq*0+8*0]
    movu                 m1, [srcq+strideq*0+8*1]
    movu                 m2, [srcq+strideq*1+8*0]
    movu                 m3, [srcq+strideq*1+8*1]
    lea                srcq, [srcq+strideq*2]
    REPX  {pshufb    x, m4}, m0, m1, m2, m3
    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .h_w16
    RET
.h_w128:
    mov                  r3, -128
    jmp .h_w32_start
.h_w64:
    mov                  r3, -64
    jmp .h_w32_start
.h_w32:
    mov                  r3, -32
.h_w32_start:
    sub                srcq, r3
.h_w32_vloop:
    mov                  r6, r3
.h_w32_hloop:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    movu                 m2, [srcq+r6+8*2]
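    ; same weight-packing trick as in put_bilin_8bpc's .h path; prep keeps
    ; the full 16*pel intermediate (4 fractional bits) with no rounding
    ; shift, which the second (compound) pass consumes (annotation)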
    movu                 m3, [srcq+r6+8*3]
    REPX  {pshufb    x, m4}, m0, m1, m2, m3
    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    add                  r6, 32
    jl .h_w32_hloop
    add                srcq, strideq
    dec                  hd
    jg .h_w32_vloop
    RET
.v:
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    imul               mxyd, 0x00ff00ff
    add                mxyd, 0x00100010
    add                  wq, r6
    lea            stride3q, [strideq*3]
    movd                 m5, mxyd
    pshufd               m5, m5, q0000
    jmp                  wq
.v_w4:
    movd                 m0, [srcq+strideq*0]
.v_w4_loop:
    movd                 m1, [srcq+strideq*1]
    movd                 m2, [srcq+strideq*2]
    movd                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpckldq            m0, m1
    punpckldq            m1, m2
    punpcklbw            m0, m1 ; 01 12
    pmaddubsw            m0, m5
    mova        [tmpq+16*0], m0
    movd                 m0, [srcq+strideq*0]
    punpckldq            m2, m3
    punpckldq            m3, m0
    punpcklbw            m2, m3 ; 23 34
    pmaddubsw            m2, m5
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                 m0, [srcq+strideq*0]
.v_w8_loop:
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*2]
    movq                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m0, m1 ; 01
    punpcklbw            m1, m2 ; 12
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    mova        [tmpq+16*0], m0
    movq                 m0, [srcq+strideq*0]
    punpcklbw            m2, m3 ; 23
    punpcklbw            m3, m0 ; 34
    pmaddubsw            m2, m5
    mova        [tmpq+16*1], m1
    pmaddubsw            m3, m5
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu                 m0, [srcq+strideq*0]
.v_w16_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m4, m0, m1
    punpckhbw            m0, m1
    pmaddubsw            m4, m5
    pmaddubsw            m0, m5
    mova        [tmpq+16*0], m4
    punpcklbw            m4, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m4, m5
    mova        [tmpq+16*1], m0
    movu                 m0, [srcq+strideq*0]
    pmaddubsw            m1, m5
    mova        [tmpq+16*2], m4
    punpcklbw            m4, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m4, m5
    mova        [tmpq+16*3], m1
    pmaddubsw            m2, m5
    mova        [tmpq+16*4], m4
    punpcklbw            m4, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m4, m5
    mova        [tmpq+16*5], m2
    pmaddubsw            m3, m5
    mova        [tmpq+16*6], m4
    mova        [tmpq+16*7], m3
    add                tmpq, 16*8
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w128:
    lea                 r3d, [hq+(3<<8)]
    mov                 r6d, 256
    jmp .v_w32_start
.v_w64:
    lea                 r3d, [hq+(1<<8)]
    mov                 r6d, 128
    jmp .v_w32_start
.v_w32:
    xor                 r3d, r3d
    mov                 r6d, 64
.v_w32_start:
%if ARCH_X86_64
 %if WIN64
    PUSH                 r7
 %endif
    mov                  r7, tmpq
%endif
    mov                  r5, srcq
.v_w32_hloop:
    movu                 m0, [srcq+strideq*0+16*0]
    movu                 m1, [srcq+strideq*0+16*1]
.v_w32_vloop:
    movu                 m2, [srcq+strideq*1+16*0]
    movu                 m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m2
    punpckhbw            m0, m2
    pmaddubsw            m4, m5
    pmaddubsw            m0, m5
    mova        [tmpq+16*0], m4
    mova        [tmpq+16*1], m0
    movu                 m0, [srcq+strideq*0+16*0]
    punpcklbw            m4, m1, m3
    punpckhbw            m1, m3
    pmaddubsw            m4, m5
    pmaddubsw            m1, m5
    mova        [tmpq+16*2], m4
    mova        [tmpq+16*3], m1
    movu                 m1, [srcq+strideq*0+16*1]
    add                tmpq, r6
    punpcklbw            m4, m2, m0
    punpckhbw            m2, m0
    pmaddubsw            m4, m5
    pmaddubsw            m2, m5
    mova        [tmpq+16*0], m4
    mova        [tmpq+16*1], m2
    punpcklbw            m4, m3, m1
    punpckhbw            m3, m1
    pmaddubsw            m4, m5
    pmaddubsw            m3, m5
    mova        [tmpq+16*2], m4
    mova        [tmpq+16*3], m3
    add                tmpq, r6
    sub                  hd, 2
    jg .v_w32_vloop
    add                  r5, 32
    movzx                hd, r3b
    mov                srcq, r5
%if ARCH_X86_64
    add                  r7, 16*4
    mov                tmpq, r7
%else
    mov                tmpq, tmpmp
    add                tmpq, 16*4
    mov               tmpmp, tmpq
%endif
    sub                 r3d, 1<<8
    jg .v_w32_hloop
%if WIN64
    POP                  r7
%endif
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    imul               mxyd, 0x08000800
    WIN64_SPILL_XMM 8
    movd                 m6, mxyd
    add                  wq, r6
    pshufd               m6, m6, q0000
    jmp                  wq
.hv_w4:
    mova                 m4, [base+bilin_h_shuf4]
    movddup              m0, [srcq+strideq*0]
    lea                  r3, [strideq*3]
    pshufb               m0, m4
    pmaddubsw            m0, m5            ; _ 0
.hv_w4_loop:
    movq                 m1, [srcq+strideq*1]
    movhps               m1, [srcq+strideq*2]
    movq                 m2, [srcq+r3       ]
    lea                srcq, [srcq+strideq*4]
    movhps               m2, [srcq+strideq*0]
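    ; annotation: mxy*0x08000800 replicates my << 11 into both words, and
    ; pmulhrsw by that value computes (d * (my << 11) * 2 + 0x8000) >> 16
    ; = (d * my + 8) >> 4, i.e. exactly the rounded term above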
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5            ; 1 2
    pmaddubsw            m2, m5            ; 3 4
    shufpd               m0, m1, 0x01      ; 0 1
    shufpd               m3, m1, m2, 0x01  ; 2 3
    psubw                m1, m0
    pmulhrsw             m1, m6
    paddw                m1, m0
    mova                 m0, m2
    psubw                m2, m3
    pmulhrsw             m2, m6
    paddw                m2, m3
    mova        [tmpq+16*0], m1
    mova        [tmpq+16*1], m2
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    movu                 m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5 ; 0
.hv_w8_loop:
    movu                 m1, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    movu                 m2, [srcq+strideq*0]
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5 ; 1
    pmaddubsw            m2, m5 ; 2
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    mova                 m0, m2
    psubw                m2, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+16*0], m3
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea                 r3d, [hq+(7<<8)]
    mov                 r5d, 256
    jmp .hv_w16_start
.hv_w64:
    lea                 r3d, [hq+(3<<8)]
    mov                 r5d, 128
    jmp .hv_w16_start
.hv_w32:
    lea                 r3d, [hq+(1<<8)]
    mov                 r5d, 64
    jmp .hv_w16_start
.hv_w16:
    xor                 r3d, r3d
    mov                 r5d, 32
.hv_w16_start:
    mov                  r6, srcq
%if ARCH_X86_64
 %if WIN64
    PUSH                 r7
 %endif
    mov                  r7, tmpq
%endif
.hv_w16_hloop:
    movu                 m0, [srcq+strideq*0+8*0]
    movu                 m1, [srcq+strideq*0+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5 ; 0a
    pmaddubsw            m1, m5 ; 0b
.hv_w16_vloop:
    movu                 m2, [srcq+strideq*1+8*0]
    pshufb               m2, m4
    pmaddubsw            m2, m5 ; 1a
    psubw                m3, m2, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    mova        [tmpq+16*0], m3
    movu                 m3, [srcq+strideq*1+8*1]
    lea                srcq, [srcq+strideq*2]
    pshufb               m3, m4
    pmaddubsw            m3, m5 ; 1b
    psubw                m0, m3, m1
    pmulhrsw             m0, m6
    paddw                m0, m1
    mova        [tmpq+16*1], m0
    add                tmpq, r5
    movu                 m0, [srcq+strideq*0+8*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5 ; 2a
    psubw                m1, m0, m2
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova        [tmpq+16*0], m1
    movu                 m1, [srcq+strideq*0+8*1]
    pshufb               m1, m4
    pmaddubsw            m1, m5 ; 2b
    psubw                m2, m1, m3
    pmulhrsw             m2, m6
    paddw                m2, m3
    mova        [tmpq+16*1], m2
    add                tmpq, r5
    sub                  hd, 2
    jg .hv_w16_vloop
    movzx                hd, r3b
%if ARCH_X86_64
    add                  r6, 16
    add                  r7, 2*16
    mov                srcq, r6
    mov                tmpq, r7
%else
    mov                tmpq, tmpm
    add                  r6, 16
    add                tmpq, 2*16
    mov                srcq, r6
    mov                tmpm, tmpq
%endif
    sub                 r3d, 1<<8
    jg .hv_w16_hloop
%if WIN64
    POP                  r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
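; each FILTER_* constant packs two byte offsets into subpel_filters (15 rows
; per filter set): the 8-tap set for the type in the high half and the 4-tap
; set used for w/h <= 4 in the low half; sharp has no 4-tap variant, so it
; falls back to the regular one (annotation)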
%if %0 == 5 ; the last filter variant omits the jump and falls through
1442    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1443%endif
1444%endmacro
1445
1446%if ARCH_X86_32
1447DECLARE_REG_TMP 1, 2
1448%elif WIN64
1449DECLARE_REG_TMP 4, 5
1450%else
1451DECLARE_REG_TMP 7, 8
1452%endif
1453
1454%if ARCH_X86_32
1455 %define base_reg r1
1456 %define base base_reg-put_ssse3
1457%else
1458 %define base_reg r8
1459 %define base 0
1460%endif
1461
1462%define PUT_8TAP_FN FN put_8tap,
1463PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
1464PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
1465PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
1466PUT_8TAP_FN regular,        REGULAR, REGULAR
1467
1468cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
1469    imul                mxd, mxm, 0x010101
1470    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
1471%if ARCH_X86_64
1472    imul                myd, mym, 0x010101
1473    add                 myd, t1d ; 8tap_v, my, 4tap_v
1474%else
1475    imul                ssd, mym, 0x010101
1476    add                 ssd, t1d ; 8tap_v, my, 4tap_v
1477    mov                srcq, srcm
1478%endif
1479    mov                  wd, wm
1480    movifnidn            hd, hm
1481    LEA            base_reg, put_ssse3
1482    test                mxd, 0xf00
1483    jnz .h
1484%if ARCH_X86_32
1485    test                ssd, 0xf00
1486%else
1487    test                myd, 0xf00
1488%endif
1489    jnz .v
1490.put:
1491    tzcnt                wd, wd
1492    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
1493    movifnidn           ssq, ssmp
1494    add                  wq, base_reg
1495    movifnidn           dsq, dsmp
1496%if WIN64
1497    pop                  r8
1498%endif
1499    lea                  r6, [ssq*3]
1500    jmp                  wq
1501.h:
1502%if ARCH_X86_32
1503    test                ssd, 0xf00
1504%else
1505    test                myd, 0xf00
1506%endif
1507    jnz .hv
1508    movifnidn           ssq, ssmp
1509    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
1510    cmp                  wd, 4
1511    jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4
1512    WIN64_SPILL_XMM      11
1513%if ARCH_X86_64
1514    mova                 m8, [base+subpel_h_shufD]
1515    mova                 m9, [base+subpel_h_shufE]
1516    mova                m10, [base+subpel_h_shufF]
1517%endif
1518    shr                 mxd, 16
1519    sub                srcq, 2
1520    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
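    ; +1 skips the first coefficient: this 6-tap path uses taps 1-6 of the
    ; 8-tap filter row (the regular/smooth filters are zero at both ends),
    ; which punpcklwd+pshufd below pair up as (c1,c2), (c3,c4), (c5,c6)
    ; (annotation)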
1521    punpcklwd            m7, m7
1522    pshufd               m4, m7, q0000
1523    pshufd               m6, m7, q1111
1524    pshufd               m7, m7, q2222
1525    sub                  wd, 16
1526    jge .h_w16
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
%if ARCH_X86_32
    pshufb               %2, %1, [base+subpel_h_shufD]
    pshufb               %3, %1, [base+subpel_h_shufE]
    pshufb               %1, [base+subpel_h_shufF]
%else
    pshufb               %2, %1, m8
    pshufb               %3, %1, m9
    pshufb               %1, m10
%endif
    pmaddubsw            %2, m4
    pmaddubsw            %3, m6
    pmaddubsw            %1, m7
    paddw                %2, m5
    paddw                %2, %3
    paddw                %1, %2
    psraw                %1, 6
%endmacro
%if ARCH_X86_32
    mov                  r4, dsm
%endif
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PUT_6TAP_H           m0, m2, m3
    PUT_6TAP_H           m1, m2, m3
    packuswb             m0, m1
%if ARCH_X86_32
    movq        [dstq+r4*0], m0
    movhps      [dstq+r4*1], m0
    lea                dstq, [dstq+r4*2]
%else
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    add                srcq, wq
    add                dstq, wq
    neg                  wq
.h_w16_loop_v:
    mov                  r6, wq
.h_w16_loop_h:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_6TAP_H           m0, m2, m3
    PUT_6TAP_H           m1, m2, m3
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 16
    jle .h_w16_loop_h
    add                srcq, ssq
    add                dstq, dsmp
    dec                  hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
    %define             dsq  r4
    %define              m8  [base+pw_512]
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    mov                 ssq, ssm
    punpcklwd            m7, m7
    pshufd               m5, m7, q0000
    mov                  r6, ssq
    pshufd               m6, m7, q1111
    neg                  r6
    pshufd               m7, m7, q2222
    cmp                  wd, 4
    jge .v_w4
%else
    WIN64_SPILL_XMM       9, 12
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    mova                 m8, [base+pw_512]
    punpcklwd            m7, m7
    pshufd               m5, m7, q0000
    mov                 nsq, ssq
    pshufd               m6, m7, q1111
    neg                 nsq
    pshufd               m7, m7, q2222
    cmp                  wd, 4
    je .v_w4
    jg .v_w8
%endif
.v_w2:
%if ARCH_X86_32
    mov                 dsq, dsm
    movd                 m1, [srcq+r6 *2]
    movd                 m3, [srcq+r6 *1]
%else
    movd                 m1, [srcq+nsq*2]
    movd                 m3, [srcq+nsq*1]
%endif
    movd                 m2, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m1, m3     ; 0 1
    punpcklwd            m3, m2     ; 1 2
    punpcklwd            m2, m4     ; 2 3
    punpcklwd            m4, m0     ; 3 4
    punpcklbw            m1, m3     ; 01 12
    punpcklbw            m2, m4     ; 23 34
.v_w2_loop:
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m4, m1, m5 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m6     ; a1 b1
    paddw                m4, m2
    punpcklwd            m2, m0, m3 ; 4 5
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m3, m0     ; 5 6
    punpcklbw            m2, m3     ; 45 56
    pmaddubsw            m3, m2, m7 ; a2 b2
    paddw                m4, m3
    pmulhrsw             m4, m8
    packuswb             m4, m4
    movd                r6d, m4
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    shl                  wd, 14
    lea                srcq, [srcq+r6*2]
    lea                 r6d, [hq+wq-(1<<16)]
    mov                srcm, srcq
    mov                 dsq, dsm
.v_w4_loop0:
    movd                 m1, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m1, [srcq+nsq*2]
    movd                 m3, [srcq+nsq*1]
%endif
    movd                 m2, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m3     ; 0 1
    punpckldq            m3, m2     ; 1 2
    punpckldq            m2, m4     ; 2 3
    punpckldq            m4, m0     ; 3 4
    punpcklbw            m1, m3     ; 01 12
    punpcklbw            m2, m4     ; 23 34
.v_w4_loop:
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m4, m1, m5 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m6     ; a1 b1
    paddw                m4, m2
    punpckldq            m2, m0, m3 ; 4 5
    movd                 m0, [srcq+ssq*0]
    punpckldq            m3, m0     ; 5 6
    punpcklbw            m2, m3     ; 45 56
    pmaddubsw            m3, m2, m7 ; a2 b2
    paddw                m4, m3
    pmulhrsw             m4, m8
    packuswb             m4, m4
    movd       [dstq+dsq*0], m4
    psrlq                m4, 32
    movd       [dstq+dsq*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov                srcq, srcm
    mov                dstq, dstm
    movzx                hd, r6w
    add                srcq, 4
    add                dstq, 4
    mov                srcm, srcq
    mov                dstm, dstq
    sub                 r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
    WIN64_PUSH_XMM       12
    shl                  wd, 5
    lea                 r6d, [hq+wq-256]
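    ; r6d packs both loop counters: bits 8+ count the remaining 8-pixel
    ; column strips (wd was shifted left by 5) and the low byte restores
    ; the row count via movzx hd, r6b after each strip.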
.v_w8_loop0:
    movq                 m1, [srcq+nsq*2]
    movq                 m2, [srcq+nsq*1]
    lea                  r4, [srcq+ssq*2]
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    mov                  r7, dstq
    movq                 m0, [r4  +ssq*0]
    punpcklbw            m1, m2     ; 01
    punpcklbw            m2, m3     ; 12
    punpcklbw            m3, m4     ; 23
    punpcklbw            m4, m0     ; 34
.v_w8_loop:
    pmaddubsw           m10, m1, m5 ; a0
    mova                 m1, m3
    pmaddubsw           m11, m2, m5 ; b0
    mova                 m2, m4
    pmaddubsw            m3, m6     ; a1
    pmaddubsw            m4, m6     ; b1
    paddw               m10, m3
    paddw               m11, m4
    movq                 m4, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    punpcklbw            m3, m0, m4 ; 45
    movq                 m0, [r4+ssq*0]
    punpcklbw            m4, m0     ; 56
    pmaddubsw            m9, m3, m7 ; a2
    paddw               m10, m9
    pmaddubsw            m9, m4, m7 ; b2
    paddw               m11, m9
    pmulhrsw            m10, m8
    pmulhrsw            m11, m8
    packuswb            m10, m11
    movq         [r7+dsq*0], m10
    movhps       [r7+dsq*1], m10
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8]
%if ARCH_X86_32
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    mov                 ssq, ssmp
    ALLOC_STACK   -mmsize*4
    %define              m8  [rsp+mmsize*0]
    %define              m9  [rsp+mmsize*1]
    %define             m10  [rsp+mmsize*2]
    punpcklbw            m0, m0
    sub                srcq, ssq
    psraw                m0, 8 ; sign-extend
    sub                srcq, ssq
    pshufd               m2, m0, q0000
    mova                 m8, m2
    pshufd               m2, m0, q1111
    mova                 m9, m2
    pshufd               m2, m0, q2222
    mova                m10, m2
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM      11, 14
    mov                 nsq, ssq
    punpcklbw            m0, m0
    neg                 nsq
    psraw                m0, 8 ; sign-extend
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
%endif
    cmp                  wd, 4
    je .hv_w4
.hv_w2:
    mova                 m5, [base+subpel_h_shuf4]
    mova                 m6, [base+pw_34]
    pshufd               m7, m1, q0000
%if ARCH_X86_32
    movq                 m2, [srcq+ssq*0]
    movhps               m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov                 dsq, [rstk+stack_offset+gprsize*2]
%else
    movq                 m2, [srcq+nsq*2]
    movhps               m2, [srcq+nsq*1] ; 0 1
%endif
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1] ; 2 3
    lea                srcq, [srcq+ssq*2]
    movq                 m0, [srcq+ssq*0] ; 4
    REPX  {pshufb    x, m5}, m2, m1, m0
    REPX  {pmaddubsw x, m7}, m2, m1, m0
    phaddw               m2, m1
    phaddw               m0, m0
    paddw                m2, m6
    paddw                m0, m6
    psraw                m2, 2            ; 0 1 2 3
    psraw                m0, 2
    palignr              m0, m2, 4        ; 1 2 3 4
    punpcklwd            m1, m2, m0       ; 01 12
    punpckhwd            m2, m0           ; 23 34
.hv_w2_loop:
    movq                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m3, [srcq+ssq*0] ; 5 6
    pshufb               m3, m5
    pmaddubsw            m3, m7
    pmaddwd              m4, m8, m1       ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m9           ; a1 b1
    phaddw               m3, m3
    paddw                m3, m6
    psraw                m3, 2
    paddd                m4, m2
    palignr              m2, m3, m0, 12   ; 4 5
    mova                 m0, m3
    punpcklwd            m2, m3           ; 45 56
    pmaddwd              m3, m10, m2      ; a2 b2
    paddd                m4, m3
    psrad                m4, 10
    packssdw             m4, m5
    packuswb             m4, m4
    movd                r6d, m4
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
%if ARCH_X86_32
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov                 dsq, [rstk+stack_offset+gprsize*2]
    %define             m11  [base+pw_34]
    %define             m12  [base+subpel_h_shufA]
    %define             m13  [rsp+mmsize*3]
    pshufd               m1, m1, q0000
    mova                m13, m1
%else
    WIN64_PUSH_XMM       14
    movq                 m3, [srcq+nsq*2]
    movq                 m4, [srcq+nsq*1]
    pshufd              m13, m1, q0000
    mova                m12, [base+subpel_h_shufA]
    mova                m11, [base+pw_34]
%endif
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m2, [srcq+ssq*0]
%if ARCH_X86_32
    mova                 m5, m12
    mova                 m6, m13
    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
    mova                 m5, m11
    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
%else
    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
%endif
    phaddw               m3, m0      ; 0 2
    phaddw               m4, m1      ; 1 3
    phaddw               m0, m2      ; 2 4
%if ARCH_X86_32
    REPX     {paddw x, m5 }, m3, m4, m0
%else
    REPX     {paddw x, m11}, m3, m4, m0
%endif
    REPX     {psraw x, 2  }, m3, m4, m0
    punpcklwd            m1, m3, m4  ; 01
    punpckhwd            m3, m4      ; 23
    punpcklwd            m2, m4, m0  ; 12
    punpckhwd            m4, m0      ; 34
.hv_w4_loop:
    movq                 m7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m6, [srcq+ssq*0]
    pshufb               m7, m12
    pshufb               m6, m12
    pmaddubsw            m7, m13
    pmaddubsw            m6, m13
    pmaddwd              m5, m8, m1  ; a0
    mova                 m1, m3
    phaddw               m7, m6      ; 5 6
    pmaddwd              m6, m8, m2  ; b0
    mova                 m2, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    paddw                m7, m11
    psraw                m7, 2
    paddd                m5, m3
    paddd                m6, m4
    shufpd               m4, m0, m7, 0x01 ; 4 5
    mova                 m0, m7
    punpcklwd            m3, m4, m7  ; 45
    punpckhwd            m4, m7      ; 56
    pmaddwd              m7, m10, m3 ; a2
    paddd                m5, m7
    pmaddwd              m7, m10, m4 ; b2
    paddd                m6, m7
    psrad                m5, 10
    psrad                m6, 10
    packssdw             m5, m6
    packuswb             m5, m5
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr                 mxd, 16
    sub                srcq, 2
%if ARCH_X86_32
    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
    shl                  wd, 13
    mov                 ssq, ssm
    lea                 r6d, [hq+wq-(1<<16)]
%assign regs_used 5
    ALLOC_STACK  -mmsize*16
%assign regs_used 7
    mov                 dsq, [rstk+stack_offset+gprsize*2]
    sub                srcq, ssq
    sub                srcq, ssq
%if STACK_ALIGNMENT < 16
    %define            srcm  [esp+mmsize*15+gprsize*0]
    %define            dstm  [esp+mmsize*15+gprsize*1]
    mov                dstm, dstq
%endif
    mov                srcm, srcq
%else
    ALLOC_STACK        16*6, 16
    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+myq*8]
    mov                 nsq, ssq
    shl                  wd, 13
    neg                 nsq
    lea                 r6d, [hq+wq-(1<<16)]
%endif
    mova                 m7, [base+pw_34]
    punpcklwd            m0, m0
    punpcklbw            m1, m1
    psraw                m1, 8 ; sign-extend
    pshufd               m2, m0, q0000
    mova         [rsp+16*0], m2
    pshufd               m2, m0, q1111
    mova         [rsp+16*1], m2
    pshufd               m0, m0, q2222
    mova         [rsp+16*2], m0
    pshufd               m2, m1, q0000
    mova         [rsp+16*3], m2
    pshufd               m2, m1, q1111
    mova         [rsp+16*4], m2
    pshufd               m1, m1, q2222
    mova         [rsp+16*5], m1
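; HV_H_6TAP mirrors PUT_6TAP_H but keeps 16-bit intermediates for the
; vertical pass: only shufD and shufF are loaded, the middle (shufE)
; pairing is rebuilt from them with shufps q2121, and the result is only
; rounded down by 2 bits (psraw 2); the psrad 10 after the vertical
; pmaddwd stage completes the scaling.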
%macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
                     [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
    pshufb               %2, %1, %4
    pshufb               %1, %5
    pmaddubsw            %3, %2, %6
    shufps               %2, %1, q2121
    pmaddubsw            %1, %8
    pmaddubsw            %2, %7
    paddw                %3, m7
    paddw                %1, %3
    paddw                %1, %2
    psraw                %1, 2
%endmacro
.hv_w8_loop0:
    mova                 m2, [base+subpel_h_shufD]
    mova                 m3, [base+subpel_h_shufF]
    mova                 m4, [rsp+16*0]
%if ARCH_X86_32
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    HV_H_6TAP            m0, m5, m6, m2, m3, m4
    HV_H_6TAP            m1, m5, m6, m2, m3, m4
    movu                 m5, [srcq+ssq*0]
    punpcklwd            m6, m0, m1   ; 01
    punpckhwd            m0, m1
    mova        [rsp+16* 6], m6
    mova        [rsp+16* 7], m0
    HV_H_6TAP            m5, m0, m6, m2, m3, m4
    movu                 m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklwd            m6, m1, m5   ; 12
    punpckhwd            m1, m5
    mova        [rsp+16* 8], m6
    mova        [rsp+16* 9], m1
    HV_H_6TAP            m0, m1, m6, m2, m3, m4
    movu                 m1, [srcq+ssq*0]
    punpcklwd            m6, m5, m0   ; 23
    punpckhwd            m5, m0
    mova        [rsp+16*10], m6
    mova        [rsp+16*11], m5
    HV_H_6TAP            m1, m5, m6, m2, m3, m4
    mova        [rsp+16*14], m1
    punpcklwd            m6, m0, m1   ; 34
    punpckhwd            m0, m1
    mova        [rsp+16*12], m6
    mova        [rsp+16*13], m0
.hv_w8_loop:
    mova                 m3, [rsp+16* 3]
    pmaddwd              m0, m3, [rsp+16* 6] ; a0
    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
    pmaddwd              m1, m3, [rsp+16* 8] ; b0
    pmaddwd              m3, [rsp+16* 9]     ; b0'
    mova                 m6, [rsp+16* 4]
    mova                 m4, [rsp+16*10]
    mova                 m5, [rsp+16*11]
    mova        [rsp+16* 6], m4
    pmaddwd              m4, m6       ; a1
    mova        [rsp+16* 7], m5
    pmaddwd              m5, m6       ; a1'
    paddd                m0, m4
    mova                 m4, [rsp+16*12]
    paddd                m2, m5
    mova                 m5, [rsp+16*13]
    mova        [rsp+16* 8], m4
    pmaddwd              m4, m6       ; b1
    mova        [rsp+16* 9], m5
    pmaddwd              m5, m6       ; b1'
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    paddd                m1, m4
    paddd                m3, m5
    HV_H_6TAP            m6, m4, m5
    mova                 m5, [rsp+16*14]
    punpcklwd            m4, m5, m6   ; 45
    punpckhwd            m5, m6
    mova        [rsp+16*10], m4
    mova        [rsp+16*11], m5
    pmaddwd              m4, [rsp+16*5] ; a2
    pmaddwd              m5, [rsp+16*5] ; a2'
    paddd                m0, m4
    movu                 m4, [srcq+ssq*0]
    paddd                m2, m5
    psrad                m0, 10
    psrad                m2, 10
    packssdw             m0, m2
    HV_H_6TAP            m4, m2, m5
    mova                 m2, [rsp+16*5]
    punpcklwd            m5, m6, m4   ; 56
    mova        [rsp+16*14], m4
    punpckhwd            m6, m4
    mova        [rsp+16*12], m5
    pmaddwd              m5, m2       ; b2
    mova        [rsp+16*13], m6
    pmaddwd              m6, m2       ; b2'
    paddd                m1, m5
    paddd                m3, m6
    psrad                m1, 10
    psrad                m3, 10
    packssdw             m1, m3
    packuswb             m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    mov                srcq, srcm
    mov                dstq, dstm
    movzx                hd, r6w
    add                srcq, 8
    add                dstq, 8
    mov                srcm, srcq
    mov                dstm, dstq
%else
    movu                 m9, [srcq+nsq*2]
    movu                m11, [srcq+nsq*1]
    lea                  r4, [srcq+ssq*2]
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    mov                  r7, dstq
    movu                 m6, [r4  +ssq*0]
    mova                 m5, [rsp+16*1]
    mova                 m8, [rsp+16*2]
    HV_H_6TAP            m9, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP           m11, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP           m13, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP           m15, m0, m1, m2, m3, m4, m5, m8
    HV_H_6TAP            m6, m0, m1, m2, m3, m4, m5, m8
    punpcklwd            m8, m9, m11  ; 01
    punpckhwd            m9, m11
    punpcklwd           m10, m11, m13 ; 12
    punpckhwd           m11, m13
    punpcklwd           m12, m13, m15 ; 23
    punpckhwd           m13, m15
    punpcklwd           m14, m15, m6  ; 34
    punpckhwd           m15, m6
.hv_w8_loop:
    mova                 m3, [rsp+16*3]
    mova                 m4, [rsp+16*4]
    pmaddwd              m0, m8, m3  ; a0
    mova                 m8, m12
    pmaddwd              m2, m9, m3  ; a0'
    mova                 m9, m13
    pmaddwd              m1, m10, m3 ; b0
    mova                m10, m14
    pmaddwd              m3, m11     ; b0'
    mova                m11, m15
    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
    paddd                m0, m12
    paddd                m2, m13
    paddd                m1, m14
    paddd                m3, m15
    movu                m15, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    HV_H_6TAP           m15, m4, m5
    punpcklwd           m12, m6, m15
    punpckhwd           m13, m6, m15
    movu                 m6, [r4+ssq*0]
    HV_H_6TAP            m6, m4, m5
    mova                 m4, [rsp+16*5]
    punpcklwd           m14, m15, m6
    punpckhwd           m15, m6
    pmaddwd              m5, m12, m4  ; a2
    paddd                m0, m5
    pmaddwd              m5, m13, m4  ; a2'
    paddd                m2, m5
    pmaddwd              m5, m14, m4  ; b2
    paddd                m1, m5
    pmaddwd              m4, m15      ; b2'
    paddd                m3, m4
    REPX      {psrad x, 10}, m0, m2, m1, m3
    packssdw             m0, m2
    packssdw             m1, m3
    packuswb             m0, m1
    movq         [r7+dsq*0], m0
    movhps       [r7+dsq*1], m0
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
%endif
    sub                 r6d, 1<<16
    jg .hv_w8_loop0
    RET

PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_8bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP

cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
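; Same dispatch skeleton as put_6tap_8bpc above, but with four vertical
; coefficient pairs; "ss3" names the 3*stride register (lea ss3q,
; [ssq*3]) needed to reach the extra rows of an 8-tap filter.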
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul                ssd, mym, 0x010101
    add                 ssd, t1d ; 8tap_v, my, 4tap_v
    mov                srcq, srcm
%endif
    mov                  wd, wm
    movifnidn            hd, hm
    LEA            base_reg, put_ssse3
    test                mxd, 0xf00
    jnz .h
%if ARCH_X86_32
    test                ssd, 0xf00
%else
    test                myd, 0xf00
%endif
    jnz .v
    tzcnt                wd, wd
    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
    movifnidn           ssq, ssmp
    add                  wq, base_reg
    movifnidn           dsq, dsmp
%if WIN64
    pop                  r8
%endif
    lea                  r6, [ssq*3]
    jmp                  wq
.h_w2:
    mova                 m3, [base+subpel_h_shuf4]
    movifnidn           dsq, dsmp
.h_w2_loop:
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m3
    pmaddubsw            m0, m4
    phaddw               m0, m0
    paddw                m0, m5 ; pw34
    psraw                m0, 6
    packuswb             m0, m0
    movd                r6d, m0
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    movd                 m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    dec                srcq
    pshufd               m4, m4, q0000
    cmp                  wd, 4
    jl .h_w2
    mova                 m3, [base+subpel_h_shufA]
    movifnidn           dsq, dsmp
.h_w4_loop:
    movq                 m0, [srcq+ssq*0] ; 1
    movq                 m1, [srcq+ssq*1] ; 2
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m3 ; subpel_h_shufA
    pshufb               m1, m3 ; subpel_h_shufA
    pmaddubsw            m0, m4 ; subpel_filters
    pmaddubsw            m1, m4 ; subpel_filters
    phaddw               m0, m1
    paddw                m0, m5 ; pw34
    psraw                m0, 6
    packuswb             m0, m0
    movd       [dstq+dsq*0], m0
    psrlq                m0, 32
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h:
%if ARCH_X86_32
    test                ssd, 0xf00
%else
    test                myd, 0xf00
%endif
    jnz .hv
    movifnidn           ssq, ssmp
    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
    cmp                  wd, 4
    jle .h_w4
    WIN64_SPILL_XMM      12
%if ARCH_X86_64
    mova                m10, [base+subpel_h_shufA]
    mova                m11, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%endif
    shr                 mxd, 16
    sub                srcq, 3
    movq                 m7, [base_reg+mxq*8+subpel_filters-put_ssse3]
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    sub                  wd, 16
    jge .h_w16
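; PUT_8TAP_H: shufA/B/C slide 4-byte windows across the row; m6 holds
; taps 0-3 and m7 taps 4-7 as replicated pairs. The shufB window serves
; double duty, providing taps 4-7 of pixels 0-3 (B4) and taps 0-3 of
; pixels 4-7 (B0), so A0+B4 and C4+B0 complete the two halves before
; phaddw merges the pairwise sums, with the same pw_34 bias and psraw 6
; as the 6-tap path.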
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
 %if ARCH_X86_32
    pshufb              %2, %1, [base+subpel_h_shufB]
    pshufb              %3, %1, [base+subpel_h_shufC]
    pshufb              %1,     [base+subpel_h_shufA]
 %else
    pshufb              %2, %1, m11; subpel_h_shufB
    pshufb              %3, %1, m9 ; subpel_h_shufC
    pshufb              %1, m10    ; subpel_h_shufA
 %endif
    pmaddubsw           %4, %2, m6 ; subpel +0 B0
    pmaddubsw           %2, m7     ; subpel +4 B4
    pmaddubsw           %3, m7     ; C4
    pmaddubsw           %1, m6     ; A0
    paddw               %3, %4     ; C4+B0
    paddw               %1, %2     ; A0+B4
    phaddw              %1, %3
    paddw               %1, m5     ; pw34
    psraw               %1, 6
%endmacro
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1
%if ARCH_X86_32
    movq             [dstq], m0
    add                dstq, dsm
    movhps           [dstq], m0
    add                dstq, dsm
%else
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    add                srcq, wq
    add                dstq, wq
    neg                  wq
.h_w16_loop_v:
    mov                  r6, wq
.h_w16_loop_h:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 16
    jle .h_w16_loop_h
    add                srcq, ssq
    add                dstq, dsmp
    dec                  hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    punpcklwd            m0, m0
    mova                 m7, [base+pw_512]
%if ARCH_X86_32
 %define            subpel0  [rsp+mmsize*0]
 %define            subpel1  [rsp+mmsize*1]
 %define            subpel2  [rsp+mmsize*2]
 %define            subpel3  [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
    ALLOC_STACK       -16*4
%assign regs_used 7
    pshufd               m1, m0, q0000
    mova            subpel0, m1
    pshufd               m1, m0, q1111
    mova            subpel1, m1
    pshufd               m1, m0, q2222
    mova            subpel2, m1
    pshufd               m1, m0, q3333
    mova            subpel3, m1
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    lea                 ssq, [ssq*3]
    sub                srcq, ssq
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    mov                 dsq, [rstk+stack_offset+gprsize*2]
    cmp                  wd, 2
    jne .v_w4
%else
 %define            subpel0  m8
 %define            subpel1  m9
 %define            subpel2  m10
 %define            subpel3  m11
    lea                ss3q, [ssq*3]
    pshufd               m8, m0, q0000
    sub                srcq, ss3q
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    cmp                  wd, 4
    je .v_w4
    jg .v_w8
%endif
.v_w2:
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpcklwd            m1, m0           ; 0 1
    punpcklwd            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m2, m5           ; 2 3
    punpcklwd            m5, m3           ; 3 4
    punpcklwd            m3, m4           ; 4 5
    punpcklwd            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w2_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0     ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, subpel1         ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2         ; a2 b2
    paddw                m5, m3
    punpcklwd            m3, m0, m4          ; 6 7
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m4, m0              ; 7 8
    punpcklbw            m3, m4              ; 67 78
    pmaddubsw            m4, m3, subpel3     ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    packuswb             m5, m5
    movd                r6d, m5
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    shl                  wd, 14
%if STACK_ALIGNMENT < 16
 %define               dstm [rsp+mmsize*4+gprsize]
    mov                dstm, dstq
%endif
    lea                 r6d, [hq+wq-(1<<16)]
    mov                  r4, srcq
.v_w4_loop0:
%endif
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpckldq            m1, m0           ; 0 1
    punpckldq            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpckldq            m2, m5           ; 2 3
    punpckldq            m5, m3           ; 3 4
    punpckldq            m3, m4           ; 4 5
    punpckldq            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w4_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, subpel1      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2      ; a2 b2
    paddw                m5, m3
    punpckldq            m3, m0, m4       ; 6 7 _ _
    movd                 m0, [srcq+ssq*0]
    punpckldq            m4, m0           ; 7 8 _ _
    punpcklbw            m3, m4           ; 67 78
    pmaddubsw            m4, m3, subpel3  ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    packuswb             m5, m5
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 4
    movzx                hd, r6w
    add                dstq, 4
    mov                srcq, r4
    mov                dstm, dstq
    sub                 r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
    shl                  wd, 5
    lea                 r6d, [hq+wq-256]
.v_w8_loop0:
    movq                 m1, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*1]
    lea                  r4, [srcq+ss3q]
    movq                 m3, [srcq+ssq*2]
    movq                 m4, [r4  +ssq*0]
    mov                  r7, dstq
    movq                 m5, [r4  +ssq*1]
    movq                 m6, [r4  +ssq*2]
    add                  r4, ss3q
    movq                 m0, [r4  +ssq*0]
    punpcklbw            m1, m2 ; 01
    punpcklbw            m2, m3 ; 12
    punpcklbw            m3, m4 ; 23
    punpcklbw            m4, m5 ; 34
    punpcklbw            m5, m6 ; 45
    punpcklbw            m6, m0 ; 56
.v_w8_loop:
    movq                m13, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m14, m1, subpel0 ; a0
    mova                 m1, m3
    pmaddubsw           m15, m2, subpel0 ; b0
    mova                 m2, m4
    pmaddubsw            m3, subpel1 ; a1
    mova                m12, m0
    pmaddubsw            m4, subpel1 ; b1
    movq                 m0, [r4+ssq*0]
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    pmaddubsw            m5, subpel2 ; a2
    mova                 m4, m6
    pmaddubsw            m6, subpel2 ; b2
    punpcklbw           m12, m13     ; 67
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    pmaddubsw           m12, subpel3 ; a3
    paddw               m15, m6
    mova                 m6, m13
    pmaddubsw           m13, subpel3 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
    packuswb            m14, m15
    movq         [r7+dsq*0], m14
    movhps       [r7+dsq*1], m14
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    lea                  r6, [ssq*3]
    sub                srcq, r6
 %define           base_reg  r6
    mov                  r6, r1 ; use as new base
 %assign regs_used 2
    ALLOC_STACK  -mmsize*14
 %assign regs_used 7
    mov                 dsq, [rstk+stack_offset+gprsize*2]
 %define           subpelv0  [rsp+mmsize*0]
 %define           subpelv1  [rsp+mmsize*1]
 %define           subpelv2  [rsp+mmsize*2]
 %define           subpelv3  [rsp+mmsize*3]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd               m6, m0, q0000
    mova           subpelv0, m6
    pshufd               m6, m0, q1111
    mova           subpelv1, m6
    pshufd               m6, m0, q2222
    mova           subpelv2, m6
    pshufd               m6, m0, q3333
    mova           subpelv3, m6
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK   mmsize*14, 14
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
 %define           subpelv0  m10
 %define           subpelv1  m11
 %define           subpelv2  m12
 %define           subpelv3  m13
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    mova                 m8, [base+pw_8192]
    mova                 m9, [base+pd_512]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    pshufd               m7, m1, q0000
    cmp                  wd, 4
    je .hv_w4
.hv_w2:
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m2, [srcq+ssq*0]     ; 0
    movhps               m2, [srcq+ssq*1]     ; 0 _ 1
%if ARCH_X86_32
 %define           w8192reg  [base+pw_8192]
 %define            d512reg  [base+pd_512]
    lea                srcq, [srcq+ssq*2]
    movq                 m0, [srcq+ssq*0]     ; 2
    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
    lea                srcq, [srcq+ssq*2]
%else
 %define           w8192reg  m8
 %define            d512reg  m9
    movq                 m0, [srcq+ssq*2]     ; 2
    add                srcq, ss3q
    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
%endif
    pshufb               m2, m6 ; 0 ~ 1 ~
    pshufb               m0, m6 ; 2 ~ 3 ~
    pmaddubsw            m2, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m2, m0 ; 0 1 2 3
    pmulhrsw             m2, w8192reg
%if ARCH_X86_32
    movq                 m3, [srcq+ssq*0]     ; 4
    movhps               m3, [srcq+ssq*1]     ; 4 _ 5
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m3, [srcq+ssq*1]     ; 4
    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
    add                srcq, ss3q
%endif
    movq                 m0, [srcq+ssq*0]     ; 6
    pshufb               m3, m6 ; 4 ~ 5 ~
    pshufb               m0, m6 ; 6 ~
    pmaddubsw            m3, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m3, m0 ; 4 5 6 _
    pmulhrsw             m3, w8192reg
    palignr              m4, m3, m2, 4; V        1 2 3 4
    punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
    punpckhwd            m2, m4       ; V 23 34    2 3 3 4
    pshufd               m0, m3, q2121; V          5 6 5 6
    punpcklwd            m3, m0       ; V 45 56    4 5 5 6
.hv_w2_loop:
    movq                 m4, [srcq+ssq*1] ; V 7
    lea                srcq, [srcq+ssq*2] ; V
    movhps               m4, [srcq+ssq*0] ; V 7 8
    pshufb               m4, m6
    pmaddubsw            m4, m7
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2       ; V
    pmaddwd              m2, subpelv1 ; V a1 b1
    paddd                m5, m2       ; V
    mova                 m2, m3       ; V
    pmaddwd              m3, subpelv2 ; a2 b2
    phaddw               m4, m4
    pmulhrsw             m4, w8192reg
    paddd                m5, m3       ; V
    palignr              m3, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m3, m0           ; V 67 78
    pmaddwd              m4, m3, subpelv3 ; V a3 b3
    paddd                m5, d512reg
    paddd                m5, m4
    psrad                m5, 10
    packssdw             m5, m5
    packuswb             m5, m5
    movd                r4d, m5
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%macro SAVELINE_W4 3
    mova     [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
%macro RESTORELINE_W4 3
    mova     %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
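; The 4-tap-H/8-tap-V path below processes each 4-pixel column as two
; interleaved pipelines ("low" and "high", produced by the two halves of
; subpel_h_shuf4); the hv4_line_* stack slots hold one pipeline's row
; history while the other one is being processed.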
%if ARCH_X86_32
 %define           w8192reg  [base+pw_8192]
 %define            d512reg  [base+pd_512]
%else
 %define           w8192reg  m8
 %define            d512reg  m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
    movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*4]
%endif
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    SAVELINE_W4          m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    ;
    ; lower shuffle
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
    add                srcq, ssq
%else
    movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
    add                srcq, ss3q
%endif
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    SAVELINE_W4          m3, 3, 0
    ; upper shuffle
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    ;process high
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    ;process low
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd              m5, m1, subpelv0 ; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m5, 10
    SAVELINE_W4          m0, 0, 0
    SAVELINE_W4          m1, 1, 0
    SAVELINE_W4          m2, 2, 0
    SAVELINE_W4          m3, 3, 0
    SAVELINE_W4          m5, 5, 0
    ;process high
    RESTORELINE_W4       m0, 0, 1
    RESTORELINE_W4       m1, 1, 1
    RESTORELINE_W4       m2, 2, 1
    RESTORELINE_W4       m3, 3, 1
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4+16]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    lea                srcq, [srcq+ssq*2]
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m4, m5, 10
    RESTORELINE_W4       m5, 5, 0
    packssdw             m5, m4 ; d -> w
    packuswb             m5, m5 ; w -> b
    pshuflw              m5, m5, q3120
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    RESTORELINE_W4       m0, 0, 0
    RESTORELINE_W4       m1, 1, 0
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
%macro SAVELINE_W8 2
    mova     [rsp+hv8_line_%1*mmsize], %2
%endmacro
%macro RESTORELINE_W8 2
    mova     %2, [rsp+hv8_line_%1*mmsize]
%endmacro
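; The hv8_line_* slots keep horizontally filtered rows alive across
; .hv_w8_loop iterations; on x86_32, where only 8 XMM registers exist,
; the coefficients are spilled to the stack as well.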
    shr                 mxd, 16
    sub                srcq, 3
%if ARCH_X86_32
 %define           base_reg  r1
 %define           subpelh0  [rsp+mmsize*5]
 %define           subpelh1  [rsp+mmsize*6]
 %define           subpelv0  [rsp+mmsize*7]
 %define           subpelv1  [rsp+mmsize*8]
 %define           subpelv2  [rsp+mmsize*9]
 %define           subpelv3  [rsp+mmsize*10]
 %define             accuv0  [rsp+mmsize*11]
 %define             accuv1  [rsp+mmsize*12]
    movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    ALLOC_STACK  -mmsize*13
%if STACK_ALIGNMENT < 16
 %define               dstm  [rsp+mmsize*13+gprsize*1]
 %define                dsm  [rsp+mmsize*13+gprsize*2]
    mov                  r6, [rstk+stack_offset+gprsize*2]
    mov                 dsm, r6
%endif
    pshufd               m0, m1, q0000
    pshufd               m1, m1, q1111
    punpcklbw            m5, m5
    psraw                m5, 8 ; sign-extend
    pshufd               m2, m5, q0000
    pshufd               m3, m5, q1111
    pshufd               m4, m5, q2222
    pshufd               m5, m5, q3333
    mova           subpelh0, m0
    mova           subpelh1, m1
    mova           subpelv0, m2
    mova           subpelv1, m3
    mova           subpelv2, m4
    mova           subpelv3, m5
    lea                  r6, [ssq*3]
    mov                dstm, dstq
    sub                srcq, r6
%else
    ALLOC_STACK        16*5, 16
 %define           subpelh0  m10
 %define           subpelh1  m11
 %define           subpelv0  m12
 %define           subpelv1  m13
 %define           subpelv2  m14
 %define           subpelv3  m15
 %define             accuv0  m8
 %define             accuv1  m9
    movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd         subpelh0, m0, q0000
    pshufd         subpelh1, m0, q1111
    punpcklbw            m1, m1
    psraw                m1, 8 ; sign-extend
    pshufd         subpelv0, m1, q0000
    pshufd         subpelv1, m1, q1111
    pshufd         subpelv2, m1, q2222
    pshufd         subpelv3, m1, q3333
    lea                ss3q, [ssq*3]
    mov                  r7, dstq
    sub                srcq, ss3q
%endif
    shl                  wd, 14
    lea                 r6d, [hq+wq-(1<<16)]
    mov                  r4, srcq
.hv_w8_loop0:
    movu                 m4, [srcq+ssq*0] ; 0 = _ _
    movu                 m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
%endif
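; HV_H_W8: one horizontal 8-tap row at word precision. Three overlapping
; byte shuffles (shufA/B/C) feed pmaddubsw so taps 0-3 and 4-7 are summed
; separately, then phaddw folds the partial sums into one output row.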
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
 %if ARCH_X86_32
    pshufb               %3, %1, [base+subpel_h_shufB]
    pshufb               %4, %1, [base+subpel_h_shufC]
    pshufb               %1,     [base+subpel_h_shufA]
 %else
    pshufb               %3, %1, %6  ; subpel_h_shufB
    pshufb               %4, %1, %7  ; subpel_h_shufC
    pshufb               %1, %5      ; subpel_h_shufA
 %endif
    pmaddubsw            %2, %3, subpelh0 ; subpel +0 C0
    pmaddubsw            %4, subpelh1; subpel +4 B4
    pmaddubsw            %3, subpelh1; C4
    pmaddubsw            %1, subpelh0; A0
    paddw                %2, %4      ; C0+B4
    paddw                %1, %3      ; A0+C4
    phaddw               %1, %2
%endmacro
%if ARCH_X86_64
    mova                 m7, [base+subpel_h_shufA]
    mova                 m8, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%endif
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
%if ARCH_X86_32
    movu                 m6, [srcq+ssq*0] ; 2 = _ _
    movu                 m0, [srcq+ssq*1] ; 3 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m6, [srcq+ssq*2] ; 2 = _ _
    add                srcq, ss3q
    movu                 m0, [srcq+ssq*0] ; 3 = _ _
%endif
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m4, m7 ; H pw_8192
    pmulhrsw             m5, m7 ; H pw_8192
    pmulhrsw             m6, m7 ; H pw_8192
    pmulhrsw             m0, m7 ; H pw_8192
    punpcklwd            m1, m4, m5  ; 0 1 ~
    punpcklwd            m2, m5, m6  ; 1 2 ~
    punpcklwd            m3, m6, m0  ; 2 3 ~
    SAVELINE_W8           1, m1
    SAVELINE_W8           2, m2
    SAVELINE_W8           3, m3
    mova                 m7, [base+subpel_h_shufA]
%if ARCH_X86_32
    movu                 m4, [srcq+ssq*0]       ; 4 = _ _
    movu                 m5, [srcq+ssq*1]       ; 5 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
    add                srcq, ss3q
%endif
    movu                 m6, [srcq+ssq*0]       ; 6 = _ _
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
    punpcklwd            m4, m0, m1  ; 3 4 ~
    punpcklwd            m5, m1, m2  ; 4 5 ~
    punpcklwd            m6, m2, m3  ; 5 6 ~
    SAVELINE_W8           6, m3
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
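; Rows 0-6 are now filtered and paired (01 12 .. 56). Each loop iteration
; below fetches rows 7 and 8, filters them horizontally, and finishes two
; output rows (a and b) by adding the last vertical tap to the accumulators.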
.hv_w8_loop:
    ; m8 accu for V a
    ; m9 accu for V b
    SAVELINE_W8           1, m3
    SAVELINE_W8           2, m4
    SAVELINE_W8           3, m5
    SAVELINE_W8           4, m6
%if ARCH_X86_32
    pmaddwd              m0, m1, subpelv0 ; a0
    pmaddwd              m7, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd                m0, m3
    paddd                m7, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd                m0, m5
    paddd                m7, m6
    mova                 m5, [base+pd_512]
    paddd                m0, m5 ;   pd_512
    paddd                m7, m5 ;   pd_512
    mova             accuv0, m0
    mova             accuv1, m7
%else
    pmaddwd              m8, m1, subpelv0 ; a0
    pmaddwd              m9, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd                m8, m3
    paddd                m9, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd                m8, m5
    paddd                m9, m6
    mova                 m7, [base+pd_512]
    paddd                m8, m7 ;   pd_512
    paddd                m9, m7 ;   pd_512
    mova                 m7, [base+subpel_h_shufB]
    mova                 m6, [base+subpel_h_shufC]
    mova                 m5, [base+subpel_h_shufA]
%endif
    movu                 m0, [srcq+ssq*1] ; 7
    movu                 m4, [srcq+ssq*2] ; 8
    lea                srcq, [srcq+ssq*2]
    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
    mova                 m5, [base+pw_8192]
    pmulhrsw             m0, m5 ; H pw_8192
    pmulhrsw             m4, m5 ; H pw_8192
    RESTORELINE_W8        6, m6
    punpcklwd            m5, m6, m0  ; 6 7 ~
    punpcklwd            m6, m0, m4  ; 7 8 ~
    pmaddwd              m1, m5, subpelv3 ; a3
    paddd                m2, m1, accuv0
    pmaddwd              m1, m6, subpelv3 ; b3
    paddd                m1, accuv1 ; H + V
    psrad                m2, 10
    psrad                m1, 10
    packssdw             m2, m1  ; d -> w
    packuswb             m2, m1 ; w -> b
    movd       [dstq+dsq*0], m2
    psrlq                m2, 32
%if ARCH_X86_32
    add                dstq, dsm
    movd       [dstq+dsq*0], m2
    add                dstq, dsm
%else
    movd       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jle .hv_w8_outer
    SAVELINE_W8           6, m4
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
    RESTORELINE_W8        4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 4
    movzx                hd, r6w
    add                dstq, 4
    mov                srcq, r4
    mov                dstm, dstq
%else
    add                  r4, 4
    add                  r7, 4
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%endif
    sub                 r6d, 1<<16
    jg .hv_w8_loop0
    RET

%if ARCH_X86_32
 DECLARE_REG_TMP 1, 2
%elif WIN64
 DECLARE_REG_TMP 6, 4
%else
 DECLARE_REG_TMP 6, 7
%endif

%if ARCH_X86_32
 %define base_reg r2
 %define base base_reg-prep_ssse3
%else
 %define base_reg r7
 %define base 0
%endif

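; Each PREP_8TAP_FN entry point loads its (horizontal, vertical) filter
; selectors into t0/t1 and falls through to the shared body. The non-SHARP
; combinations route to prep_6tap_8bpc: their coefficient tables have zero
; outer taps, so a 6-tap kernel suffices.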
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR

cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    mov                  wd, wm
    movifnidn          srcd, srcm
    movifnidn            hd, hm
    LEA            base_reg, prep_ssse3
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
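; No subpel filtering requested: dispatch through prep_ssse3_table to the
; width-specific block-copy routine that converts pixels into the
; intermediate format.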
.prep:
    tzcnt                wd, wd
    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
    pxor                 m4, m4
    add                  wq, base_reg
    movifnidn           ssq, ssmp
    lea                  r6, [ssq*3]
%if WIN64
    pop                  r8
    pop                  r7
%endif
    jmp                  wq
.h:
    test                myd, 0xf00
    jnz .hv
%if ARCH_X86_32
 %define ssq r6
    mov                 ssq, ssmp
%endif
    cmp                  wd, 4
    jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4
    WIN64_SPILL_XMM      11
    mova                 m5, [base+pw_8192]
%if ARCH_X86_64
    mova                 m8, [base+subpel_h_shufD]
    mova                 m9, [base+subpel_h_shufE]
    mova                m10, [base+subpel_h_shufF]
%endif
    shr                 mxd, 16
    sub                srcq, 2
    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
    punpcklwd            m7, m7
    pshufd               m4, m7, q0000
    pshufd               m6, m7, q1111
    pshufd               m7, m7, q2222
    sub                  wd, 16
    jge .h_w16
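; PREP_6TAP_H: 8 output pixels per call. shufD/E/F build the three
; interleaved tap pairs for pmaddubsw, and pmulhrsw with pw_8192 performs
; the rounding shift into the intermediate domain.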
%macro PREP_6TAP_H 3 ; dst/src, tmp[1-2]
%if ARCH_X86_32
    pshufb               %2, %1, [base+subpel_h_shufD]
    pshufb               %3, %1, [base+subpel_h_shufE]
    pshufb               %1, [base+subpel_h_shufF]
%else
    pshufb               %2, %1, m8
    pshufb               %3, %1, m9
    pshufb               %1, m10
%endif
    pmaddubsw            %2, m4
    pmaddubsw            %3, m6
    pmaddubsw            %1, m7
    paddw                %2, %3
    paddw                %1, %2
    pmulhrsw             %1, m5
%endmacro
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PREP_6TAP_H          m0, m2, m3
    PREP_6TAP_H          m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 32
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    add                srcq, wq
    neg                  wq
.h_w16_loop_v:
    mov                  r5, wq
.h_w16_loop_h:
    movu                 m0, [srcq+r5+8*0]
    movu                 m1, [srcq+r5+8*1]
    PREP_6TAP_H          m0, m2, m3
    PREP_6TAP_H          m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 32
    add                  r5, 16
    jle .h_w16_loop_h
    add                srcq, ssq
    dec                  hd
    jg .h_w16_loop_v
    RET
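; Vertical-only 6-tap path. Coefficients are expanded into byte pairs so
; each pmaddubsw multiplies an interleaved pair of adjacent rows by one
; coefficient pair.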
.v:
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    WIN64_SPILL_XMM       9, 12
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
    punpcklwd            m7, m7
    pshufd               m5, m7, q0000
    pshufd               m6, m7, q1111
    pshufd               m7, m7, q2222
%if ARCH_X86_32
    %define              m8  [base+pw_8192]
    mov                 ssq, ssm
    sub                srcq, ssq
    sub                srcq, ssq
%else
    mova                 m8, [base+pw_8192]
    mov                 nsq, ssq
    neg                 nsq
    cmp                  wd, 4
    jg .v_w8
%endif
.v_w4:
%if ARCH_X86_32
    lea                 r5d, [wq-4]
    shl                 r5d, 14
    add                 r5d, hd
    mov                srcm, srcq
.v_w4_loop0:
    movd                 m1, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m1, [srcq+nsq*2]
    movd                 m3, [srcq+nsq*1]
%endif
    movd                 m2, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m3     ; 0 1
    punpckldq            m3, m2     ; 1 2
    punpckldq            m2, m4     ; 2 3
    punpckldq            m4, m0     ; 3 4
    punpcklbw            m1, m3     ; 01 12
    punpcklbw            m2, m4     ; 23 34
.v_w4_loop:
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m4, m1, m5 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m6     ; a1 b1
    paddw                m4, m2
    punpckldq            m2, m0, m3 ; 4 5
    movd                 m0, [srcq+ssq*0]
    punpckldq            m3, m0     ; 5 6
    punpcklbw            m2, m3     ; 45 56
    pmaddubsw            m3, m2, m7 ; a2 b2
    paddw                m4, m3
    pmulhrsw             m4, m8
%if ARCH_X86_32
    movq        [tmpq+wq*0], m4
    movhps      [tmpq+wq*2], m4
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w4_loop
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 4
    add                tmpq, 8
    mov                srcm, srcq
    mov                tmpm, tmpq
    sub                 r5d, 1<<16
    jg .v_w4_loop0
%else
    mova             [tmpq], m4
    add                tmpq, 16
    sub                  hd, 2
    jg .v_w4_loop
%endif
    RET
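; x86_64 only: same vertical 6-tap, 8 pixels per row, walking the frame in
; 8-column strips with two output rows produced per iteration.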
%if ARCH_X86_64
.v_w8:
    WIN64_PUSH_XMM       12
    lea                 r6d, [wq*4-32]
    lea                 r6d, [r6*8+hq]
.v_w8_loop0:
    movq                 m1, [srcq+nsq*2]
    movq                 m2, [srcq+nsq*1]
    lea                  r5, [srcq+ssq*2]
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    mov                  r8, tmpq
    movq                 m0, [r5  +ssq*0]
    punpcklbw            m1, m2     ; 01
    punpcklbw            m2, m3     ; 12
    punpcklbw            m3, m4     ; 23
    punpcklbw            m4, m0     ; 34
.v_w8_loop:
    pmaddubsw           m10, m1, m5 ; a0
    mova                 m1, m3
    pmaddubsw           m11, m2, m5 ; b0
    mova                 m2, m4
    pmaddubsw            m3, m6     ; a1
    pmaddubsw            m4, m6     ; b1
    paddw               m10, m3
    paddw               m11, m4
    movq                 m4, [r5+ssq*1]
    lea                  r5, [r5+ssq*2]
    punpcklbw            m3, m0, m4 ; 45
    movq                 m0, [r5+ssq*0]
    punpcklbw            m4, m0     ; 56
    pmaddubsw            m9, m3, m7 ; a2
    paddw               m10, m9
    pmaddubsw            m9, m4, m7 ; b2
    paddw               m11, m9
    pmulhrsw            m10, m8
    pmulhrsw            m11, m8
    mova          [r8+wq*0], m10
    mova          [r8+wq*2], m11
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w8_loop
    add                srcq, 8
    add                tmpq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
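; Combined 6-tap H+V paths: the horizontal stage emits word-precision rows,
; and the vertical stage accumulates them with pmaddwd, adding pd_32 before
; the >>6 shift of the second rounding stage.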
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8]
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
%if ARCH_X86_32
    mov                 ssq, ssmp
%assign regs_used 6
    ALLOC_STACK   -mmsize*4
%assign regs_used 7
    %define              m8  [rsp+mmsize*0]
    %define              m9  [rsp+mmsize*1]
    %define             m10  [rsp+mmsize*2]
    punpcklbw            m0, m0
    sub                srcq, ssq
    psraw                m0, 8 ; sign-extend
    sub                srcq, ssq
    pshufd               m2, m0, q0000
    mova                 m8, m2
    pshufd               m2, m0, q1111
    mova                 m9, m2
    pshufd               m2, m0, q2222
    mova                m10, m2
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    %define             m11  [base+pw_8192]
    %define             m12  [base+subpel_h_shufA]
    %define             m13  [rsp+mmsize*3]
    %define             m14  [base+pd_32]
    pshufd               m1, m1, q0000
    mova                m13, m1
%else
    WIN64_SPILL_XMM      15
    mov                 nsq, ssq
    punpcklbw            m0, m0
    neg                 nsq
    psraw                m0, 8 ; sign-extend
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    movq                 m3, [srcq+nsq*2]
    movq                 m4, [srcq+nsq*1]
    pshufd              m13, m1, q0000
    mova                m12, [base+subpel_h_shufA]
    mova                m11, [base+pw_8192]
    mova                m14, [base+pd_32]
%endif
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m2, [srcq+ssq*0]
%if ARCH_X86_32
    mova                 m5, m12
    mova                 m6, m13
    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
    mova                 m5, m11
    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
%else
    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
%endif
    phaddw               m3, m0      ; 0 2
    phaddw               m4, m1      ; 1 3
    phaddw               m0, m2      ; 2 4
%if ARCH_X86_32
    REPX  {pmulhrsw x, m5 }, m3, m4, m0
%else
    REPX  {pmulhrsw x, m11}, m3, m4, m0
%endif
    punpcklwd            m1, m3, m4  ; 01
    punpckhwd            m3, m4      ; 23
    punpcklwd            m2, m4, m0  ; 12
    punpckhwd            m4, m0      ; 34
.hv_w4_loop:
    movq                 m7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m6, [srcq+ssq*0]
    pshufb               m7, m12
    pshufb               m6, m12
    pmaddubsw            m7, m13
    pmaddubsw            m6, m13
    pmaddwd              m5, m8, m1  ; a0
    mova                 m1, m3
    phaddw               m7, m6      ; 5 6
    pmaddwd              m6, m8, m2  ; b0
    mova                 m2, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    pmulhrsw             m7, m11
    paddd                m5, m14
    paddd                m6, m14
    paddd                m5, m3
    paddd                m6, m4
    shufpd               m4, m0, m7, 0x01 ; 4 5
    mova                 m0, m7
    punpcklwd            m3, m4, m7  ; 45
    punpckhwd            m4, m7      ; 56
    pmaddwd              m7, m10, m3 ; a2
    paddd                m5, m7
    pmaddwd              m7, m10, m4 ; b2
    paddd                m6, m7
    psrad                m5, 6
    psrad                m6, 6
    packssdw             m5, m6
    mova             [tmpq], m5
    add                tmpq, 16
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    RESET_STACK_STATE
    shr                 mxd, 16
    sub                srcq, 2
    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
%if ARCH_X86_32
    mov                 ssq, ssm
%assign regs_used 6
    ALLOC_STACK  -mmsize*16
%assign regs_used 7
    sub                srcq, ssq
    sub                srcq, ssq
%if STACK_ALIGNMENT < 16
    %define            srcm  [esp+mmsize*15+gprsize*0]
    %define            tmpm  [esp+mmsize*15+gprsize*1]
    mov                tmpm, tmpq
%endif
    mov                srcm, srcq
%else
    ALLOC_STACK        16*6, 16
    mov                 nsq, ssq
    neg                 nsq
%endif
    mova                 m7, [base+pw_8192]
    lea                 r5d, [wq-8]
    punpcklwd            m0, m0
    shl                 r5d, 13
    punpcklbw            m1, m1
    add                 r5d, hd
    psraw                m1, 8 ; sign-extend
    pshufd               m2, m0, q0000
    mova         [rsp+16*0], m2
    pshufd               m2, m0, q1111
    mova         [rsp+16*1], m2
    pshufd               m0, m0, q2222
    mova         [rsp+16*2], m0
    pshufd               m2, m1, q0000
    mova         [rsp+16*3], m2
    pshufd               m2, m1, q1111
    mova         [rsp+16*4], m2
    pshufd               m1, m1, q2222
    mova         [rsp+16*5], m1
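; PREP_HV_H_6TAP is a leaner horizontal helper: it loads only shufD and
; shufF and derives the middle tap pair with shufps q2121, trading one
; shuffle-table load for a register shuffle.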
%macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
                          [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
    pshufb               %2, %1, %4
    pshufb               %1, %5
    pmaddubsw            %3, %2, %6
    shufps               %2, %1, q2121
    pmaddubsw            %1, %8
    pmaddubsw            %2, %7
    paddw                %1, %3
    paddw                %1, %2
    pmulhrsw             %1, m7
%endmacro
.hv_w8_loop0:
    mova                 m2, [base+subpel_h_shufD]
    mova                 m3, [base+subpel_h_shufF]
    mova                 m4, [rsp+16*0]
%if ARCH_X86_32
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PREP_HV_H_6TAP       m0, m5, m6, m2, m3, m4
    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
    movu                 m5, [srcq+ssq*0]
    punpcklwd            m6, m0, m1   ; 01
    punpckhwd            m0, m1
    mova        [rsp+16* 6], m6
    mova        [rsp+16* 7], m0
    PREP_HV_H_6TAP       m5, m0, m6, m2, m3, m4
    movu                 m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklwd            m6, m1, m5   ; 12
    punpckhwd            m1, m5
    mova        [rsp+16* 8], m6
    mova        [rsp+16* 9], m1
    PREP_HV_H_6TAP       m0, m1, m6, m2, m3, m4
    movu                 m1, [srcq+ssq*0]
    punpcklwd            m6, m5, m0   ; 23
    punpckhwd            m5, m0
    mova        [rsp+16*10], m6
    mova        [rsp+16*11], m5
    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
    mova        [rsp+16*14], m1
    punpcklwd            m6, m0, m1   ; 34
    punpckhwd            m0, m1
    mova        [rsp+16*12], m6
    mova        [rsp+16*13], m0
.hv_w8_loop:
    mova                 m3, [rsp+16* 3]
    pmaddwd              m0, m3, [rsp+16* 6] ; a0
    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
    pmaddwd              m1, m3, [rsp+16* 8] ; b0
    pmaddwd              m3, [rsp+16* 9]     ; b0'
    mova                 m6, [rsp+16* 4]
    mova                 m4, [rsp+16*10]
    mova                 m5, [rsp+16*11]
    mova        [rsp+16* 6], m4
    pmaddwd              m4, m6       ; a1
    mova        [rsp+16* 7], m5
    pmaddwd              m5, m6       ; a1'
    paddd                m0, m4
    mova                 m4, [rsp+16*12]
    paddd                m2, m5
    mova                 m5, [rsp+16*13]
    mova        [rsp+16* 8], m4
    pmaddwd              m4, m6       ; b1
    mova        [rsp+16* 9], m5
    pmaddwd              m5, m6       ; b1'
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    paddd                m1, m4
    paddd                m3, m5
    PREP_HV_H_6TAP       m6, m4, m5
    mova                 m4, [base+pd_32]
    mova                 m5, [rsp+16*14]
    REPX      {paddd x, m4}, m0, m2, m1, m3
    punpcklwd            m4, m5, m6   ; 45
    punpckhwd            m5, m6
    mova        [rsp+16*10], m4
    mova        [rsp+16*11], m5
    pmaddwd              m4, [rsp+16*5] ; a2
    pmaddwd              m5, [rsp+16*5] ; a2'
    paddd                m0, m4
    movu                 m4, [srcq+ssq*0]
    paddd                m2, m5
    psrad                m0, 6
    psrad                m2, 6
    packssdw             m0, m2
    PREP_HV_H_6TAP       m4, m2, m5
    mova                 m2, [rsp+16*5]
    punpcklwd            m5, m6, m4   ; 56
    mova        [rsp+16*14], m4
    punpckhwd            m6, m4
    mova        [rsp+16*12], m5
    pmaddwd              m5, m2       ; b2
    mova        [rsp+16*13], m6
    pmaddwd              m6, m2       ; b2'
    paddd                m1, m5
    paddd                m3, m6
    psrad                m1, 6
    psrad                m3, 6
    packssdw             m1, m3
    mova        [tmpq+wq*0], m0
    mova        [tmpq+wq*2], m1
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .hv_w8_loop
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 8
    add                tmpq, 16
    mov                srcm, srcq
    mov                tmpm, tmpq
%else
    movu                 m9, [srcq+nsq*2]
    movu                m11, [srcq+nsq*1]
    lea                  r6, [srcq+ssq*2]
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    mov                  r8, tmpq
    movu                 m6, [r6  +ssq*0]
    mova                 m5, [rsp+16*1]
    mova                 m8, [rsp+16*2]
    PREP_HV_H_6TAP       m9, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m11, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m13, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m15, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP       m6, m0, m1, m2, m3, m4, m5, m8
    punpcklwd            m8, m9, m11  ; 01
    punpckhwd            m9, m11
    punpcklwd           m10, m11, m13 ; 12
    punpckhwd           m11, m13
    punpcklwd           m12, m13, m15 ; 23
    punpckhwd           m13, m15
    punpcklwd           m14, m15, m6  ; 34
    punpckhwd           m15, m6
.hv_w8_loop:
    mova                 m3, [rsp+16*3]
    mova                 m4, [rsp+16*4]
    mova                 m5, [base+pd_32]
    pmaddwd              m0, m8, m3  ; a0
    mova                 m8, m12
    pmaddwd              m2, m9, m3  ; a0'
    mova                 m9, m13
    pmaddwd              m1, m10, m3 ; b0
    mova                m10, m14
    pmaddwd              m3, m11     ; b0'
    mova                m11, m15
    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
    REPX    {paddd   x, m5}, m0, m2, m1, m3
    paddd                m0, m12
    paddd                m2, m13
    paddd                m1, m14
    paddd                m3, m15
    movu                m15, [r6+ssq*1]
    lea                  r6, [r6+ssq*2]
    PREP_HV_H_6TAP      m15, m4, m5
    punpcklwd           m12, m6, m15
    punpckhwd           m13, m6, m15
    movu                 m6, [r6+ssq*0]
    PREP_HV_H_6TAP       m6, m4, m5
    mova                 m4, [rsp+16*5]
    punpcklwd           m14, m15, m6
    punpckhwd           m15, m6
    pmaddwd              m5, m12, m4  ; a2
    paddd                m0, m5
    pmaddwd              m5, m13, m4  ; a2'
    paddd                m2, m5
    pmaddwd              m5, m14, m4  ; b2
    paddd                m1, m5
    pmaddwd              m4, m15      ; b2'
    paddd                m3, m4
    REPX       {psrad x, 6}, m0, m2, m1, m3
    packssdw             m0, m2
    packssdw             m1, m3
    mova          [r8+wq*0], m0
    mova          [r8+wq*2], m1
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .hv_w8_loop
    add                srcq, 8
    add                tmpq, 16
    movzx                hd, r5b
%endif
    sub                 r5d, 1<<16
    jg .hv_w8_loop0
    RET

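; Filter combinations involving SHARP need all eight taps, so they get
; their own entry points into the full 8-tap prep below.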
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP

cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    mov                  wd, wm
    movifnidn          srcd, srcm
    movifnidn            hd, hm
    LEA            base_reg, prep_ssse3
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_8bpc %+ SUFFIX).prep
.v:
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mova                 m2, [base+pw_512]
    mova                 m7, [base+pw_8192]
    punpcklwd            m0, m0
%if ARCH_X86_32
 %define            subpel0  [rsp+mmsize*0]
 %define            subpel1  [rsp+mmsize*1]
 %define            subpel2  [rsp+mmsize*2]
 %define            subpel3  [rsp+mmsize*3]
%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
    ALLOC_STACK   -mmsize*4
%assign regs_used 7
    mov             strideq, [rstk+stack_offset+gprsize*3]
    pshufd               m1, m0, q0000
    mova            subpel0, m1
    pshufd               m1, m0, q1111
    mova            subpel1, m1
    lea                  r5, [strideq*3]
    pshufd               m1, m0, q2222
    mova            subpel2, m1
    pshufd               m1, m0, q3333
    mova            subpel3, m1
    sub                srcq, r5
%else
 %define            subpel0  m8
 %define            subpel1  m9
 %define            subpel2  m10
 %define            subpel3  m11
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    lea            stride3q, [strideq*3]
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    sub                srcq, stride3q
    cmp                  wd, 8
    jns .v_w8
%endif
.v_w4:
%if ARCH_X86_32
 %if STACK_ALIGNMENT < mmsize
  %define srcm [esp+stack_size+gprsize*1]
  %define tmpm [esp+stack_size+gprsize*2]
 %endif
    mov                tmpm, tmpq
    mov                srcm, srcq
    lea                 r5d, [wq - 4] ; horizontal loop
    shl                 r5d, (16 - 2)  ; (wq / 4) << 16
    mov                 r5w, hw
.v_w4_loop0:
%endif
    movd                 m1, [srcq+strideq*0]
    movd                 m0, [srcq+strideq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+strideq*2]
    movd                 m2, [srcq+strideq*0]
    movd                 m4, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    movd                 m3, [srcq+strideq*0]
    movd                 m5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
%else
    movd                 m2, [srcq+strideq*2]
    add                srcq, stride3q
    movd                 m4, [srcq+strideq*0]
    movd                 m3, [srcq+strideq*1]
    movd                 m5, [srcq+strideq*2]
    add                srcq, stride3q
%endif
    punpckldq            m1, m0 ; 0 1
    punpckldq            m0, m2 ; 1 2
    punpcklbw            m1, m0 ; 01 12
    movd                 m0, [srcq+strideq*0]
    punpckldq            m2, m4 ; 2 3
    punpckldq            m4, m3 ; 3 4
    punpckldq            m3, m5 ; 4 5
    punpckldq            m5, m0 ; 5 6
    punpcklbw            m2, m4 ; 23 34
    punpcklbw            m3, m5 ; 45 56
.v_w4_loop:
    mova                 m5, m1
    pmaddubsw            m5, subpel0      ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, subpel1      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2      ; a2 b2
    movd                 m4, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    paddw                m5, m3
    punpckldq            m3, m0, m4       ; 6 7 _ _
    movd                 m0, [srcq+strideq*0]
    punpckldq            m4, m0           ; 7 8 _ _
    punpcklbw            m3, m4           ; 67 78
    mova                 m4, m3
    pmaddubsw            m4, subpel3      ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    movq        [tmpq+wq*0], m5
    movhps      [tmpq+wq*2], m5
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 4
    add                tmpq, 8
    mov                srcm, srcq
    mov                tmpm, tmpq
    sub                 r5d, 1<<16 ; horizontal--
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
    lea                 r6d, [wq*8-64]
    mov                  r5, srcq
    mov                  r8, tmpq
    lea                 r6d, [hq+r6*4]
.v_w8_loop0:
    movq                 m1, [srcq+strideq*0]
    movq                 m2, [srcq+strideq*1]
    movq                 m3, [srcq+strideq*2]
    add                srcq, stride3q
    movq                 m4, [srcq+strideq*0]
    movq                 m5, [srcq+strideq*1]
    movq                 m6, [srcq+strideq*2]
    add                srcq, stride3q
    movq                 m0, [srcq+strideq*0]
    punpcklbw            m1, m2 ; 01
    punpcklbw            m2, m3 ; 12
    punpcklbw            m3, m4 ; 23
    punpcklbw            m4, m5 ; 34
    punpcklbw            m5, m6 ; 45
    punpcklbw            m6, m0 ; 56
.v_w8_loop:
    movq                m13, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmaddubsw           m14, m1, subpel0 ; a0
    pmaddubsw           m15, m2, subpel0 ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddubsw            m3, subpel1 ; a1
    pmaddubsw            m4, subpel1 ; b1
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddubsw            m5, subpel2 ; a2
    pmaddubsw            m6, subpel2 ; b2
    punpcklbw           m12, m0, m13 ; 67
    movq                 m0, [srcq+strideq*0]
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    pmaddubsw           m12, subpel3 ; a3
    paddw               m15, m6
    mova                 m6, m13
    pmaddubsw           m13, subpel3 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
    movu        [tmpq+wq*0], m14
    movu        [tmpq+wq*2], m15
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w8_loop
    add                  r5, 8
    add                  r8, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r8
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
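; 4-wide horizontal prep uses the 4-tap center of the kernel: one shuffle
; per row, pmaddubsw, and phaddw to collapse adjacent pairs, processing
; four rows per iteration.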
.h_w4:
    WIN64_SPILL_XMM       7
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
    mova                 m5, [base+subpel_h_shufA]
    mova                 m6, [base+pw_8192]
    movifnidn            r2, stridemp
    pshufd               m4, m4, q0000
    lea                  r3, [r2*3]
.h_w4_loop:
    movq                 m0, [srcq+r2*0]
    movq                 m1, [srcq+r2*1]
    movq                 m2, [srcq+r2*2]
    movq                 m3, [srcq+r3  ]
    lea                srcq, [srcq+r2*4]
    REPX  {pshufb    x, m5}, m0, m1, m2, m3
    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
    phaddw               m0, m1
    phaddw               m2, m3
    pmulhrsw             m0, m6
    pmulhrsw             m2, m6
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m2
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h:
    test                myd, 0xf00
    jnz .hv
    cmp                  wd, 4
    je .h_w4
    WIN64_SPILL_XMM      12
%if ARCH_X86_32
 %define strideq r6
    mov             strideq, stridem
%endif
    tzcnt                wd, wd
%if ARCH_X86_64
    mova                m10, [base+subpel_h_shufA]
    mova                m11, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%else
    %define             m10  [base+subpel_h_shufA]
    %define             m11  [base+subpel_h_shufB]
    %define              m9  [base+subpel_h_shufC]
%endif
    shr                 mxd, 16
    sub                srcq, 3
    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
    movq                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    mova                 m7, [base+pw_8192]
    pshufd               m5, m6, q0000
    pshufd               m6, m6, q1111
    add                  wq, base_reg
    jmp                  wq
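; PREP_8TAP_H mirrors the put-side horizontal filter: shufA/B/C produce
; three overlapping 4-byte windows, pmaddubsw accumulates taps 0-3 and 4-7,
; and phaddw merges the halves ahead of the pw_8192 rounding.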
%macro PREP_8TAP_H 2 ; dst, src_memloc
    movu                m%1, [%2]
    pshufb               m2, m%1, m11 ; subpel_h_shufB
    pshufb               m3, m%1, m9  ; subpel_h_shufC
    pshufb              m%1, m10      ; subpel_h_shufA
    mova                 m4, m2
    pmaddubsw            m4, m5       ; subpel +0 B0
    pmaddubsw            m2, m6       ; subpel +4 B4
    pmaddubsw            m3, m6       ; subpel +4 C4
    pmaddubsw           m%1, m5       ; subpel +0 A0
    paddw                m3, m4
    paddw               m%1, m2
    phaddw              m%1, m3
    pmulhrsw            m%1, m7
%endmacro
.h_w8:
    PREP_8TAP_H           0, srcq+strideq*0
    PREP_8TAP_H           1, srcq+strideq*1
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    lea                srcq, [srcq+strideq*2]
    add                tmpq, 32
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    mov                  r3, -16*1
    jmp .h_start
.h_w32:
    mov                  r3, -16*2
    jmp .h_start
.h_w64:
    mov                  r3, -16*4
    jmp .h_start
.h_w128:
    mov                  r3, -16*8
.h_start:
    sub                srcq, r3
    mov                  r5, r3
.h_loop:
    PREP_8TAP_H           0, srcq+r3+8*0
    PREP_8TAP_H           1, srcq+r3+8*1
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 32
    add                  r3, 16
    jl .h_loop
    add                srcq, strideq
    mov                  r3, r5
    dec                  hd
    jg .h_loop
    RET
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
    and                 mxd, 0x7f
    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
    mov                 mxd, myd
    shr                 myd, 16
    and                 mxd, 0x7f
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mov             strideq, stridem
 %assign regs_used 6
    ALLOC_STACK  -mmsize*14
 %assign regs_used 7
    lea                  r5, [strideq*3+1]
    sub                srcq, r5
 %define           subpelv0  [rsp+mmsize*0]
 %define           subpelv1  [rsp+mmsize*1]
 %define           subpelv2  [rsp+mmsize*2]
 %define           subpelv3  [rsp+mmsize*3]
    punpcklbw            m0, m0
    psraw                m0, 8
    pshufd               m6, m0, q0000
    mova           subpelv0, m6
    pshufd               m6, m0, q1111
    mova           subpelv1, m6
    pshufd               m6, m0, q2222
    mova           subpelv2, m6
    pshufd               m6, m0, q3333
    mova           subpelv3, m6
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    ALLOC_STACK   mmsize*14, 14
    lea            stride3q, [strideq*3]
    sub                srcq, stride3q
    dec                srcq
 %define           subpelv0  m10
 %define           subpelv1  m11
 %define           subpelv2  m12
 %define           subpelv3  m13
    punpcklbw            m0, m0
    psraw                m0, 8
    mova                 m8, [base+pw_8192]
    mova                 m9, [base+pd_32]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    pshufd               m7, m1, q0000
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
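; hv4_line_* name the stack slots that ping-pong row history between the
; two interleaved 4-pixel phases ("low"/"high" shuffles) of the hv_w4 loop.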
4231%if ARCH_X86_32
4232    %define        w8192reg  [base+pw_8192]
4233    %define          d32reg  [base+pd_32]
4234%else
4235    %define        w8192reg  m8
4236    %define          d32reg  m9
4237%endif
4238    ; lower shuffle 0 1 2 3 4
4239    mova                 m6, [base+subpel_h_shuf4]
4240    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
4241    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
4242%if ARCH_X86_32
4243    lea                srcq, [srcq+strideq*2]
4244    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
4245    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
4246    lea                srcq, [srcq+strideq*2]
4247%else
4248    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
4249    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
4250    lea                srcq, [srcq+strideq*4]
4251%endif
4252    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
4253    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
4254    pmaddubsw            m2, m7                 ;H subpel_filters
4255    pmaddubsw            m0, m7                 ;H subpel_filters
4256    phaddw               m2, m0
4257    pmulhrsw             m2, w8192reg
4258    SAVELINE_W4          m2, 2, 0
4259    ; upper shuffle 2 3 4 5 6
4260    mova                 m6, [base+subpel_h_shuf4+16]
4261    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
4262    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
4263    pmaddubsw            m2, m7                 ;H subpel_filters
4264    pmaddubsw            m0, m7                 ;H subpel_filters
4265    phaddw               m2, m0                 ;H 0 1 2 3
4266    pmulhrsw             m2, w8192reg
4267    ; lower shuffle
4268    mova                 m6, [base+subpel_h_shuf4]
4269    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
4270    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
4271%if ARCH_X86_32
4272    lea                srcq, [srcq+strideq*2]
4273    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
4274    add                srcq, strideq
4275%else
4276    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
4277    add                srcq, stride3q
4278%endif
4279    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
4280    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
4281    pmaddubsw            m3, m7                 ;H subpel_filters
4282    pmaddubsw            m0, m7                 ;H subpel_filters
4283    phaddw               m3, m0                 ;H 4 5 6 7
4284    pmulhrsw             m3, w8192reg
4285    SAVELINE_W4          m3, 3, 0
4286    ; upper shuffle
4287    mova                 m6, [base+subpel_h_shuf4+16]
4288    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
4289    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
4290    pmaddubsw            m3, m7                 ;H subpel_filters
4291    pmaddubsw            m0, m7                 ;H subpel_filters
4292    phaddw               m3, m0                 ;H 4 5 6 7
4293    pmulhrsw             m3, w8192reg
4294    ;process high
4295    palignr              m4, m3, m2, 4;V 1 2 3 4
4296    punpcklwd            m1, m2, m4  ; V 01 12
4297    punpckhwd            m2, m4      ; V 23 34
4298    pshufd               m0, m3, q2121;V 5 6 5 6
4299    punpcklwd            m3, m0      ; V 45 56
4300    SAVELINE_W4          m0, 0, 1
4301    SAVELINE_W4          m1, 1, 1
4302    SAVELINE_W4          m2, 2, 1
4303    SAVELINE_W4          m3, 3, 1
4304    ;process low
4305    RESTORELINE_W4       m2, 2, 0
4306    RESTORELINE_W4       m3, 3, 0
4307    palignr              m4, m3, m2, 4;V 1 2 3 4
4308    punpcklwd            m1, m2, m4  ; V 01 12
4309    punpckhwd            m2, m4      ; V 23 34
4310    pshufd               m0, m3, q2121;V 5 6 5 6
4311    punpcklwd            m3, m0      ; V 45 56
4312.hv_w4_loop:
4313    ;process low
4314    pmaddwd              m5, m1, subpelv0 ; V a0 b0
4315    mova                 m1, m2
4316    pmaddwd              m2, subpelv1; V a1 b1
4317    paddd                m5, m2
4318    mova                 m2, m3
4319    pmaddwd              m3, subpelv2; V a2 b2
4320    paddd                m5, m3
4321    mova                 m6, [base+subpel_h_shuf4]
4322    movq                 m4, [srcq+strideq*0] ; 7
4323    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
4324    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
4325    pmaddubsw            m4, m7               ; H subpel_filters
4326    phaddw               m4, m4               ; H                7878
4327    pmulhrsw             m4, w8192reg
4328    palignr              m3, m4, m0, 12       ;                  6787
4329    mova                 m0, m4
4330    punpcklwd            m3, m4      ; 67 78
4331    pmaddwd              m4, m3, subpelv3; a3 b3
4332    paddd                m5, d32reg ; pd_32
4333    paddd                m5, m4
4334    psrad                m5, 6
4335    SAVELINE_W4          m0, 0, 0
4336    SAVELINE_W4          m1, 1, 0
4337    SAVELINE_W4          m2, 2, 0
4338    SAVELINE_W4          m3, 3, 0
4339    SAVELINE_W4          m5, 5, 0
4340    ;process high
4341    RESTORELINE_W4       m0, 0, 1
4342    RESTORELINE_W4       m1, 1, 1
4343    RESTORELINE_W4       m2, 2, 1
4344    RESTORELINE_W4       m3, 3, 1
4345    pmaddwd              m5, m1, subpelv0; V a0 b0
4346    mova                 m1, m2
4347    pmaddwd              m2, subpelv1; V a1 b1
4348    paddd                m5, m2
4349    mova                 m2, m3
4350    pmaddwd              m3, subpelv2; V a2 b2
4351    paddd                m5, m3
4352    mova                 m6, [base+subpel_h_shuf4+16]
4353    movq                 m4, [srcq+strideq*0] ; 7
4354    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
4355    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
4356    pmaddubsw            m4, m7               ; H subpel_filters
4357    phaddw               m4, m4               ; H                7878
4358    pmulhrsw             m4, w8192reg
4359    palignr              m3, m4, m0, 12       ;                  6787
4360    mova                 m0, m4
4361    punpcklwd            m3, m4      ; 67 78
4362    pmaddwd              m4, m3, subpelv3; a3 b3
4363    paddd                m5, d32reg ; pd_32
4364    paddd                m5, m4
4365    psrad                m4, m5, 6
4366    RESTORELINE_W4       m5, 5, 0
4367    packssdw             m5, m4
4368    pshufd               m5, m5, q3120
4369    movu             [tmpq], m5
4370    lea                srcq, [srcq+strideq*2]
4371    add                tmpq, 16
4372    sub                  hd, 2
4373    SAVELINE_W4          m0, 0, 1
4374    SAVELINE_W4          m1, 1, 1
4375    SAVELINE_W4          m2, 2, 1
4376    SAVELINE_W4          m3, 3, 1
4377    RESTORELINE_W4       m0, 0, 0
4378    RESTORELINE_W4       m1, 1, 0
4379    RESTORELINE_W4       m2, 2, 0
4380    RESTORELINE_W4       m3, 3, 0
4381    jg .hv_w4_loop
4382    RET
4383%undef subpelv0
4384%undef subpelv1
4385%undef subpelv2
4386%undef subpelv3
4387.hv_w8:
4388    RESET_STACK_STATE
4389%define hv8_line_1 0
4390%define hv8_line_2 1
4391%define hv8_line_3 2
4392%define hv8_line_4 3
4393%define hv8_line_6 4
4394    shr                 mxd, 16
4395%if ARCH_X86_32
4396 %define           subpelh0  [rsp+mmsize*5]
4397 %define           subpelh1  [rsp+mmsize*6]
4398 %define           subpelv0  [rsp+mmsize*7]
4399 %define           subpelv1  [rsp+mmsize*8]
4400 %define           subpelv2  [rsp+mmsize*9]
4401 %define           subpelv3  [rsp+mmsize*10]
4402 %define             accuv0  [rsp+mmsize*11]
4403 %define             accuv1  [rsp+mmsize*12]
4404    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
4405    mov                 mxd, myd
4406    shr                 myd, 16
4407    and                 mxd, 0x7f
4408    cmp                  hd, 6
4409    cmovs               myd, mxd
4410    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
4411    mov             strideq, stridem
4412 %assign regs_used 6
4413    ALLOC_STACK  -mmsize*14
4414 %assign regs_used 7
4415 %if STACK_ALIGNMENT < mmsize
4416  %define              tmpm  [rsp+mmsize*13+gprsize*1]
4417  %define              srcm  [rsp+mmsize*13+gprsize*2]
4418  %define           stridem  [rsp+mmsize*13+gprsize*3]
4419    mov                tmpm, tmpq
4420    mov             stridem, strideq
4421 %endif
4422    pshufd               m0, m1, q0000
4423    pshufd               m1, m1, q1111
4424    punpcklbw            m5, m5
4425    psraw                m5, 8
4426    pshufd               m2, m5, q0000
4427    pshufd               m3, m5, q1111
4428    pshufd               m4, m5, q2222
4429    pshufd               m5, m5, q3333
4430    mova           subpelh0, m0
4431    mova           subpelh1, m1
4432    mova           subpelv0, m2
4433    mova           subpelv1, m3
4434    mova           subpelv2, m4
4435    mova           subpelv3, m5
4436    lea                  r5, [strideq*3+3]
4437    sub                srcq, r5
4438    mov                srcm, srcq
4439%else
4440    ALLOC_STACK    mmsize*5, 16
4441 %define           subpelh0  m10
4442 %define           subpelh1  m11
4443 %define           subpelv0  m12
4444 %define           subpelv1  m13
4445 %define           subpelv2  m14
4446 %define           subpelv3  m15
4447 %define             accuv0  m8
4448 %define             accuv1  m9
4449    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
4450    movzx               mxd, myb
4451    shr                 myd, 16
4452    cmp                  hd, 6
4453    cmovs               myd, mxd
4454    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
4455    pshufd         subpelh0, m0, q0000
4456    pshufd         subpelh1, m0, q1111
4457    punpcklbw            m1, m1
4458    psraw                m1, 8
4459    pshufd         subpelv0, m1, q0000
4460    pshufd         subpelv1, m1, q1111
4461    pshufd         subpelv2, m1, q2222
4462    pshufd         subpelv3, m1, q3333
4463    lea            stride3q, [strideq*3]
4464    sub                srcq, 3
4465    sub                srcq, stride3q
4466    mov                  r6, srcq
4467    mov                  r8, tmpq
4468%endif
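; pack both loop counters into r5d: bits 16+ count the additional 4-pixel
; column tiles ((w-4)<<14 adds 1<<16 per extra 4 columns), bits 0-15 hold h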
4469    lea                 r5d, [wq-4]
4470    shl                 r5d, 14
4471    add                 r5d, hd
4472.hv_w8_loop0:
4473%if ARCH_X86_64
4474    mova                 m7, [base+subpel_h_shufA]
4475    mova                 m8, [base+subpel_h_shufB]
4476    mova                 m9, [base+subpel_h_shufC]
4477    %define           shufA  m7
4478    %define           shufB  m8
4479    %define           shufC  m9
4480%else
4481    %define           shufA  [base+subpel_h_shufA]
4482    %define           shufB  [base+subpel_h_shufB]
4483    %define           shufC  [base+subpel_h_shufC]
4484%endif
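; horizontal 8-tap pass for the hv path: the 8-tap filter is applied as two
; 4-tap pmaddubsw halves on overlapping pixel windows (shufA/B/C), and the
; partial sums are combined with paddw/phaddw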
4485%macro PREP_8TAP_HV 2 ; dst, src_memloc (clobbers m1-m3)
4486    movu                 %1, [%2]
4487    pshufb               m2, %1, shufB
4488    pshufb               m3, %1, shufC
4489    pshufb               %1, shufA
4490    mova                 m1, m2
4491    pmaddubsw            m1, subpelh0 ; subpel +0 C0
4492    pmaddubsw            m3, subpelh1 ; subpel +4 B4
4493    pmaddubsw            m2, subpelh1 ; C4
4494    pmaddubsw            %1, subpelh0 ; A0
4495    paddw                m1, m3       ; C0+B4
4496    paddw                %1, m2       ; A0+C4
4497    phaddw               %1, m1
4498%endmacro
4499    PREP_8TAP_HV         m4, srcq+strideq*0
4500    PREP_8TAP_HV         m5, srcq+strideq*1
4501%if ARCH_X86_64
4502    PREP_8TAP_HV         m6, srcq+strideq*2
4503    add                srcq, stride3q
4504    PREP_8TAP_HV         m0, srcq+strideq*0
4505%else
4506    lea                srcq, [srcq+strideq*2]
4507    PREP_8TAP_HV         m6, srcq+strideq*0
4508    PREP_8TAP_HV         m0, srcq+strideq*1
4509    lea                srcq, [srcq+strideq*2]
4510%endif
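; pmulhrsw with pw_8192 computes (x*8192*2 + (1<<15)) >> 16 == (x + 2) >> 2,
; a rounding shift by 2 that scales the horizontal output to the 4
; fractional bits of precision used by the vertical pass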
4511    mova                 m7, [base+pw_8192]
4512    REPX   {pmulhrsw x, m7}, m4, m5, m6, m0
4513    punpcklwd            m1, m4, m5 ; 01
4514    punpcklwd            m2, m5, m6 ; 12
4515    punpcklwd            m3, m6, m0 ; 23
4516    SAVELINE_W8           1, m1
4517    SAVELINE_W8           2, m2
4518    SAVELINE_W8           3, m3
4519    mova                 m7, [base+subpel_h_shufA]
4520%if ARCH_X86_64
4521    PREP_8TAP_HV         m4, srcq+strideq*1
4522    PREP_8TAP_HV         m5, srcq+strideq*2
4523    add                srcq, stride3q
4524    PREP_8TAP_HV         m6, srcq+strideq*0
4525%else
4526    PREP_8TAP_HV         m4, srcq+strideq*0
4527    PREP_8TAP_HV         m5, srcq+strideq*1
4528    lea                srcq, [srcq+strideq*2]
4529    PREP_8TAP_HV         m6, srcq+strideq*0
4530%endif
4531    mova                 m3, [base+pw_8192]
4532    pmulhrsw             m1, m3, m4
4533    pmulhrsw             m2, m3, m5
4534    pmulhrsw             m3, m6
4535    punpcklwd            m4, m0, m1 ; 34
4536    punpcklwd            m5, m1, m2 ; 45
4537    punpcklwd            m6, m2, m3 ; 56
4538    SAVELINE_W8           6, m3
4539    RESTORELINE_W8        1, m1
4540    RESTORELINE_W8        2, m2
4541    RESTORELINE_W8        3, m3
4542.hv_w8_loop:
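; vertical pass: each iteration yields two output rows (a and b) as pmaddwd
; dot products of interleaved line pairs against subpelv0-3, rounded with
; pd_32 and shifted right by 6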
4543    SAVELINE_W8           1, m3
4544    SAVELINE_W8           2, m4
4545    SAVELINE_W8           3, m5
4546    SAVELINE_W8           4, m6
4547%if ARCH_X86_32
4548    pmaddwd              m0, m1, subpelv0 ; a0
4549    pmaddwd              m7, m2, subpelv0 ; b0
4550    pmaddwd              m3, subpelv1     ; a1
4551    pmaddwd              m4, subpelv1     ; b1
4552    paddd                m0, m3
4553    paddd                m7, m4
4554    pmaddwd              m5, subpelv2     ; a2
4555    pmaddwd              m6, subpelv2     ; b2
4556    paddd                m0, m5
4557    paddd                m7, m6
4558    mova                 m5, [base+pd_32]
4559    paddd                m0, m5
4560    paddd                m7, m5
4561    mova             accuv0, m0
4562    mova             accuv1, m7
4563%else
4564    pmaddwd          accuv0, m1, subpelv0 ; a0
4565    pmaddwd          accuv1, m2, subpelv0 ; b0
4566    pmaddwd              m3, subpelv1     ; a1
4567    pmaddwd              m4, subpelv1     ; b1
4568    paddd            accuv0, m3
4569    paddd            accuv1, m4
4570    pmaddwd              m5, subpelv2     ; a2
4571    pmaddwd              m6, subpelv2     ; b2
4572    paddd            accuv0, m5
4573    paddd            accuv1, m6
4574    mova                 m7, [base+pd_32]
4575    paddd            accuv0, m7
4576    paddd            accuv1, m7
4577    mova                 m7, [base+subpel_h_shufB]
4578    mova                 m6, [base+subpel_h_shufC]
4579    mova                 m5, [base+subpel_h_shufA]
4580    %define           shufA  m5
4581    %define           shufB  m7
4582    %define           shufC  m6
4583%endif
4584    PREP_8TAP_HV         m0, srcq+strideq*1
4585    lea                srcq, [srcq+strideq*2]
4586    PREP_8TAP_HV         m4, srcq+strideq*0
4587    mova                 m5, [base+pw_8192]
4588    pmulhrsw             m0, m5
4589    pmulhrsw             m4, m5
4590    RESTORELINE_W8        6, m6
4591    punpcklwd            m5, m6, m0 ; 67
4592    punpcklwd            m6, m0, m4 ; 78
4593    pmaddwd              m1, m5, subpelv3 ; a3
4594    paddd                m2, m1, accuv0
4595    pmaddwd              m1, m6, subpelv3 ; b3
4596    paddd                m1, accuv1
4597    psrad                m2, 6
4598    psrad                m1, 6
4599    packssdw             m2, m1
4600    movq        [tmpq+wq*0], m2
4601    movhps      [tmpq+wq*2], m2
4602    lea                tmpq, [tmpq+wq*4]
4603    sub                  hd, 2
4604    jle .hv_w8_outer
4605    SAVELINE_W8           6, m4
4606    RESTORELINE_W8        1, m1
4607    RESTORELINE_W8        2, m2
4608    RESTORELINE_W8        3, m3
4609    RESTORELINE_W8        4, m4
4610    jmp .hv_w8_loop
4611.hv_w8_outer:
4612%if ARCH_X86_32
4613    mov                srcq, srcm
4614    mov                tmpq, tmpm
4615    movzx                hd, r5w
4616    add                srcq, 4
4617    add                tmpq, 8
4618    mov                srcm, srcq
4619    mov                tmpm, tmpq
4620%else
4621    add                  r6, 4
4622    add                  r8, 8
4623    movzx                hd, r5b
4624    mov                srcq, r6
4625    mov                tmpq, r8
4626%endif
4627    sub                 r5d, 1<<16
4628    jg .hv_w8_loop0
4629    RET
4630
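; Helpers for the scaled 8-tap functions below. prep_8tap_scaled takes one
; pointer argument less than put_8tap_scaled, so shared code is written
; against put's register layout and, when assembling prep, every rN is
; temporarily remapped to r(N-1) (MCT_8TAP_SCALED_REMAP_REGS_TO_PREV) and
; shifted back around returns (..._REMAP_REGS_TO_DEFAULT).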
4631%macro movifprep 2
4632 %if isprep
4633    mov %1, %2
4634 %endif
4635%endmacro
4636
4637%macro SAVE_REG 1
4638 %xdefine r%1_save  r%1
4639 %xdefine r%1q_save r%1q
4640 %xdefine r%1d_save r%1d
4641 %if ARCH_X86_32
4642  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
4643 %endif
4644%endmacro
4645
4646%macro LOAD_REG 1
4647 %xdefine r%1  r%1_save
4648 %xdefine r%1q r%1q_save
4649 %xdefine r%1d r%1d_save
4650 %if ARCH_X86_32
4651  %define r%1m r%1m_save
4652 %endif
4653 %undef r%1d_save
4654 %undef r%1q_save
4655 %undef r%1_save
4656%endmacro
4657
4658%macro REMAP_REG 2-3
4659 %xdefine r%1  r%2
4660 %xdefine r%1q r%2q
4661 %xdefine r%1d r%2d
4662 %if ARCH_X86_32
4663  %if %3 == 0
4664   %xdefine r%1m r%2m
4665  %else
4666   %define r%1m [rstk+stack_offset+(%1+1)*4]
4667  %endif
4668 %endif
4669%endmacro
4670
4671%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
4672 %if isprep
4673  %if ARCH_X86_64
4674   SAVE_REG 14
4675   %assign %%i 14
4676   %rep 14
4677    %assign %%j %%i-1
4678    REMAP_REG %%i, %%j
4679    %assign %%i %%i-1
4680   %endrep
4681  %else
4682   SAVE_REG 5
4683   %assign %%i 5
4684   %rep 5
4685    %assign %%j %%i-1
4686    REMAP_REG %%i, %%j, 0
4687    %assign %%i %%i-1
4688   %endrep
4689  %endif
4690 %endif
4691%endmacro
4692
4693%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
4694 %if isprep
4695  %assign %%i 1
4696  %if ARCH_X86_64
4697   %rep 13
4698    %assign %%j %%i+1
4699    REMAP_REG %%i, %%j
4700    %assign %%i %%i+1
4701   %endrep
4702   LOAD_REG 14
4703  %else
4704   %rep 4
4705    %assign %%j %%i+1
4706    REMAP_REG %%i, %%j, 1
4707    %assign %%i %%i+1
4708   %endrep
4709   LOAD_REG 5
4710  %endif
4711 %endif
4712%endmacro
4713
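; restore the default register mapping so that RET unwinds correctly, then
; (unless %1 is 0) re-apply the prep remapping for the code that follows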
4714%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
4715    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4716    RET
4717 %if %1
4718    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4719 %endif
4720%endmacro
4721
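; horizontal pass for the scaled path (x86-64 variant): r4/r6/r7/r9/r10/
; r11/r13/rX hold per-column byte offsets for the dx-scaled x positions;
; two source rows are gathered, filtered with the per-column 8-tap weights
; in m%9-m%12, and rounded via pmulhrsw with m12 (pw_8192)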
4722%if ARCH_X86_64
4723 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
4724    SWAP                m%2, m%5
4725    movq                m%1, [srcq+ r4]
4726    movq                m%2, [srcq+ r6]
4727    movhps              m%1, [srcq+ r7]
4728    movhps              m%2, [srcq+ r9]
4729    movq                m%3, [srcq+r10]
4730    movq                m%4, [srcq+r11]
4731    movhps              m%3, [srcq+r13]
4732    movhps              m%4, [srcq+ rX]
4733    add                srcq, ssq
4734    movq                m%5, [srcq+ r4]
4735    movq                m%6, [srcq+ r6]
4736    movhps              m%5, [srcq+ r7]
4737    movhps              m%6, [srcq+ r9]
4738    movq                m%7, [srcq+r10]
4739    movq                m%8, [srcq+r11]
4740    movhps              m%7, [srcq+r13]
4741    movhps              m%8, [srcq+ rX]
4742    add                srcq, ssq
4743    pmaddubsw           m%1, m%9
4744    pmaddubsw           m%5, m%9
4745    pmaddubsw           m%2, m%10
4746    pmaddubsw           m%6, m%10
4747    pmaddubsw           m%3, m%11
4748    pmaddubsw           m%7, m%11
4749    pmaddubsw           m%4, m%12
4750    pmaddubsw           m%8, m%12
4751    phaddw              m%1, m%2
4752    phaddw              m%5, m%6
4753    phaddw              m%3, m%4
4754    phaddw              m%7, m%8
4755    phaddw              m%1, m%3
4756    phaddw              m%5, m%7
4757    pmulhrsw            m%1, m12
4758    pmulhrsw            m%5, m12
4759    SWAP                m%2, m%5
4760 %endmacro
4761%else
4762 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
4763  %if %3 == 1
4764    mov                  r0, [esp+ 0]
4765    mov                  rX, [esp+ 8]
4766    mov                  r4, [esp+ 4]
4767    mov                  r5, [esp+12]
4768  %endif
4769    movq                 m0, [srcq+r0]
4770    movq                 m1, [srcq+rX]
4771    movhps               m0, [srcq+r4]
4772    movhps               m1, [srcq+r5]
4773    add                srcq, ssq
4774    movq                 m4, [srcq+r0]
4775    movq                 m5, [srcq+rX]
4776    movhps               m4, [srcq+r4]
4777    movhps               m5, [srcq+r5]
4778    mov                  r0, [esp+16]
4779    mov                  rX, [esp+24]
4780    mov                  r4, [esp+20]
4781    mov                  r5, [esp+28]
4782    sub                srcq, ssq
4783    movq                 m2, [srcq+r0]
4784    movq                 m3, [srcq+rX]
4785    movhps               m2, [srcq+r4]
4786    movhps               m3, [srcq+r5]
4787    add                srcq, ssq
4788    movq                 m6, [srcq+r0]
4789    movq                 m7, [srcq+rX]
4790    movhps               m6, [srcq+r4]
4791    movhps               m7, [srcq+r5]
4792    add                srcq, ssq
4793    pmaddubsw            m0, [esp+%1+ 0]
4794    pmaddubsw            m4, [esp+%1+ 0]
4795    pmaddubsw            m1, [esp+%1+16]
4796    pmaddubsw            m5, [esp+%1+16]
4797    pmaddubsw            m2, [esp+%1+32]
4798    pmaddubsw            m6, [esp+%1+32]
4799    pmaddubsw            m3, [esp+%1+48]
4800    pmaddubsw            m7, [esp+%1+48]
4801    phaddw               m0, m1
4802    phaddw               m4, m5
4803    phaddw               m2, m3
4804    phaddw               m6, m7
4805    phaddw               m0, m2
4806    phaddw               m4, m6
4807    pmulhrsw             m0, m12
4808    pmulhrsw             m4, m12
4809  %if %2 != 0
4810    mova        [esp+%2+ 0], m0
4811    mova        [esp+%2+16], m4
4812  %endif
4813 %endmacro
4814%endif
4815
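; MC_8TAP_SCALED instantiates put_8tap_scaled_8bpc or prep_8tap_scaled_8bpc.
; x/y positions advance in 1/1024-pel steps (dx/dy); put rounds the final
; accumulator by >>10 (bias pd_512), prep by >>6 (bias pd_32)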
4816%macro MC_8TAP_SCALED 1
4817%ifidn %1, put
4818 %assign isprep 0
4819 %if ARCH_X86_64
4820  %if required_stack_alignment <= STACK_ALIGNMENT
4821cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
4822  %else
4823cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
4824  %endif
4825 %else ; ARCH_X86_32
4826  %if required_stack_alignment <= STACK_ALIGNMENT
4827cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
4828  %else
4829cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
4830  %endif
4831 %endif
4832 %xdefine base_reg r12
4833 %define rndshift 10
4834%else ; prep
4835 %assign isprep 1
4836 %if ARCH_X86_64
4837  %if required_stack_alignment <= STACK_ALIGNMENT
4838cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
4839   %xdefine tmp_stridem r14q
4840  %else
4841cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
4842   %define tmp_stridem qword [rsp+0x138]
4843  %endif
4844  %xdefine base_reg r11
4845 %else ; ARCH_X86_32
4846  %if required_stack_alignment <= STACK_ALIGNMENT
4847cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
4848  %else
4849cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
4850  %endif
4851  %define tmp_stridem dword [esp+0x138]
4852 %endif
4853 %define rndshift 6
4854%endif
4855%if ARCH_X86_32
4856    mov         [esp+0x1f0], t0d
4857    mov         [esp+0x1f4], t1d
4858 %if !isprep && required_stack_alignment > STACK_ALIGNMENT
4859    mov                dstd, dstm
4860    mov                 dsd, dsm
4861    mov                srcd, srcm
4862    mov                 ssd, ssm
4863    mov                  hd, hm
4864    mov                  r4, mxm
4865  %define r0m  [esp+0x200]
4866  %define dsm  [esp+0x204]
4867  %define dsmp dsm
4868  %define r1m  dsm
4869  %define r2m  [esp+0x208]
4870  %define ssm  [esp+0x20c]
4871  %define r3m  ssm
4872  %define hm   [esp+0x210]
4873  %define mxm  [esp+0x214]
4874    mov                 r0m, dstd
4875    mov                 dsm, dsd
4876    mov                 r2m, srcd
4877    mov                 ssm, ssd
4878    mov                  hm, hd
4879    mov                  r0, mym
4880    mov                  r1, dxm
4881    mov                  r2, dym
4882  %define mym [esp+0x218]
4883  %define dxm [esp+0x09c]
4884  %define dym [esp+0x21c]
4885    mov                 mxm, r4
4886    mov                 mym, r0
4887    mov                 dxm, r1
4888    mov                 dym, r2
4889    tzcnt                wd, wm
4890 %endif
4891 %if isprep && required_stack_alignment > STACK_ALIGNMENT
4892  %xdefine base_reg r5
4893 %else
4894  %xdefine base_reg r6
4895 %endif
4896    mov                 ssd, ssm
4897%endif
4898    LEA            base_reg, %1_8tap_scaled_8bpc_ssse3
4899%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
4900%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
4901    tzcnt                wd, wm
4902%endif
4903%if ARCH_X86_32
4904 %define m8  m0
4905 %define m9  m1
4906 %define m14 m4
4907 %define m15 m3
4908%endif
4909    movd                 m8, dxm
4910    movd                m14, mxm
4911    pshufd               m8, m8, q0000
4912    pshufd              m14, m14, q0000
4913%if isprep && UNIX64
4914    mov                 r5d, t0d
4915 DECLARE_REG_TMP 5, 7
4916%endif
4917%if ARCH_X86_64
4918    mov                 dyd, dym
4919%endif
4920%ifidn %1, put
4921 %if WIN64
4922    mov                 r8d, hm
4923  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
4924  %define hm r5m
4925  %define dxm r8m
4926 %elif ARCH_X86_64
4927  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
4928  %define hm r6m
4929 %endif
4930 %if ARCH_X86_64
4931  %if required_stack_alignment > STACK_ALIGNMENT
4932   %define dsm [rsp+0x138]
4933   %define rX r1
4934   %define rXd r1d
4935  %else
4936   %define dsm dsq
4937   %define rX r14
4938   %define rXd r14d
4939  %endif
4940 %else
4941  %define rX r1
4942 %endif
4943%else ; prep
4944 %if WIN64
4945    mov                 r7d, hm
4946  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
4947  %define hm r4m
4948  %define dxm r7m
4949 %elif ARCH_X86_64
4950  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
4951  %define hm [rsp+0x94]
4952 %endif
4953 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4954 %if ARCH_X86_64
4955  %define rX r14
4956  %define rXd r14d
4957 %else
4958  %define rX r3
4959 %endif
4960%endif
4961%if ARCH_X86_64
4962    mova                m10, [base+pd_0x3ff]
4963    mova                m12, [base+pw_8192]
4964 %ifidn %1, put
4965    mova                m13, [base+pd_512]
4966 %else
4967    mova                m13, [base+pd_32]
4968 %endif
4969%else
4970 %define m10 [base+pd_0x3ff]
4971 %define m12 [base+pw_8192]
4972 %ifidn %1, put
4973  %define m13 [base+pd_512]
4974 %else
4975  %define m13 [base+pd_32]
4976 %endif
4977%endif
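; shared constants: m10 (pd_0x3ff) masks the 10-bit fractional position,
; m12 (pw_8192) is the horizontal rounding multiplier, and m13 holds the
; final rounding bias (pd_512 for put, pd_32 for prep)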
4978    pxor                 m9, m9
4979%if ARCH_X86_64
4980    lea                ss3q, [ssq*3]
4981    movzx               r7d, t1b
4982    shr                 t1d, 16
4983    cmp                  hd, 6
4984    cmovs               t1d, r7d
4985    sub                srcq, ss3q
4986%else
4987 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4988    mov                  r1, [esp+0x1f4]
4989    lea                  r0, [ssq*3]
4990    movzx                r2, r1b
4991    shr                  r1, 16
4992    cmp            dword hm, 6
4993    cmovs                r1, r2
4994    mov         [esp+0x1f4], r1
4995    mov                  r1, r1m
4996    mov                  r2, r2m
4997    sub                srcq, r0
4998 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4999 %define ss3q r0
5000 %define myd r4
5001 %define dyd dword dym
5002 %define hd  dword hm
5003%endif
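; dy == 1024 steps exactly one source row per output row and dy == 2048
; exactly two, so these cases take specialized paths with a constant
; vertical filter phase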
5004    cmp                 dyd, 1024
5005    je .dy1
5006    cmp                 dyd, 2048
5007    je .dy2
5008    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
5009    add                  wq, base_reg
5010    jmp                  wq
5011%ifidn %1, put
5012.w2:
5013 %if ARCH_X86_64
5014    mov                 myd, mym
5015    movzx               t0d, t0b
5016    dec                srcq
5017    movd                m15, t0d
5018 %else
5019    movzx                r4, byte [esp+0x1f0]
5020    dec                srcq
5021    movd                m15, r4
5022 %endif
5023    punpckldq            m9, m8
5024    SWAP                 m8, m9
5025    paddd               m14, m8 ; mx+dx*[0-1]
5026 %if ARCH_X86_64
5027    mova                m11, [base+pd_0x4000]
5028 %else
5029  %define m11 [base+pd_0x4000]
5030 %endif
5031    pshufd              m15, m15, q0000
5032    pand                 m8, m14, m10
5033    psrld                m8, 6
5034    paddd               m15, m8
5035    movd                r4d, m15
5036    psrldq              m15, 4
5037 %if ARCH_X86_64
5038    movd                r6d, m15
5039 %else
5040    movd                r3d, m15
5041 %endif
5042    mova                 m5, [base+bdct_lb_dw]
5043    mova                 m6, [base+subpel_s_shuf2]
5044    movd                m15, [base+subpel_filters+r4*8+2]
5045 %if ARCH_X86_64
5046    movd                 m7, [base+subpel_filters+r6*8+2]
5047 %else
5048    movd                 m7, [base+subpel_filters+r3*8+2]
5049 %endif
5050    pxor                 m9, m9
5051    pcmpeqd              m8, m9
5052    psrld               m14, 10
5053 %if ARCH_X86_32
5054    mov                  r3, r3m
5055    pshufb              m14, m5
5056    paddb               m14, m6
5057    mova        [rsp+0x180], m14
5058    SWAP                 m5, m0
5059    SWAP                 m6, m3
5060  %define m8  m5
5061  %define m15 m6
5062 %endif
5063    movq                 m0, [srcq+ssq*0]
5064    movq                 m2, [srcq+ssq*2]
5065    movhps               m0, [srcq+ssq*1]
5066    movhps               m2, [srcq+ss3q ]
5067    lea                srcq, [srcq+ssq*4]
5068 %if ARCH_X86_64
5069    pshufb              m14, m5
5070    paddb               m14, m6
5071 %endif
5072    movq                 m1, [srcq+ssq*0]
5073    movq                 m3, [srcq+ssq*2]
5074    movhps               m1, [srcq+ssq*1]
5075    movhps               m3, [srcq+ss3q ]
5076    lea                srcq, [srcq+ssq*4]
5077    punpckldq           m15, m7
5078    punpcklqdq          m15, m15
5079 %if ARCH_X86_64
5080    pand                m11, m8
5081    pandn                m8, m15
5082    SWAP                m15, m8
5083    por                 m15, m11
5084 %else
5085    pand                 m7, m8, m11
5086    pandn                m8, m15
5087  %define m8  m6
5088  %define m15 m5
5089    por                 m15, m7
5090    mova        [rsp+0x190], m15
5091 %endif
5092    pshufb               m0, m14
5093    pshufb               m2, m14
5094    pshufb               m1, m14
5095    pshufb               m3, m14
5096    pmaddubsw            m0, m15
5097    pmaddubsw            m2, m15
5098    pmaddubsw            m1, m15
5099    pmaddubsw            m3, m15
5100    phaddw               m0, m2
5101    phaddw               m1, m3
5102    pmulhrsw             m0, m12       ; 0 1 2 3
5103    pmulhrsw             m1, m12       ; 4 5 6 7
5104    palignr              m2, m1, m0, 4 ; 1 2 3 4
5105    punpcklwd            m3, m0, m2    ; 01 12
5106    punpckhwd            m0, m2        ; 23 34
5107    pshufd               m5, m1, q0321 ; 5 6 7 _
5108    punpcklwd            m2, m1, m5    ; 45 56
5109    punpckhwd            m4, m1, m5    ; 67 __
5110 %if ARCH_X86_32
5111    mov                 myd, mym
5112    mov                  r0, r0m
5113    mova        [rsp+0x1a0], m3
5114    mova        [rsp+0x1b0], m0
5115    mova        [rsp+0x1c0], m2
5116    mova        [rsp+0x1d0], m4
5117 %endif
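; vertical taps are applied with pmaddwd on interleaved row pairs: each
; dword lane accumulates coef[n]*row[n] + coef[n+1]*row[n+1] for one output
; pixel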
5118.w2_loop:
5119    and                 myd, 0x3ff
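; 64<<24 is the identity filter (a single +64 tap), kept when the 10-bit
; fractional y position is zero; otherwise cmovnz loads the 8-tap filter
; for the current phase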
5120 %if ARCH_X86_64
5121    mov                 r6d, 64 << 24
5122    mov                 r4d, myd
5123    shr                 r4d, 6
5124    lea                 r4d, [t1+r4]
5125    cmovnz              r6q, [base+subpel_filters+r4*8]
5126    movq                m11, r6q
5127    punpcklbw           m11, m11
5128    psraw               m11, 8
5129    pshufd               m8, m11, q0000
5130    pshufd               m9, m11, q1111
5131    pshufd              m10, m11, q2222
5132    pshufd              m11, m11, q3333
5133    pmaddwd              m5, m3, m8
5134    pmaddwd              m6, m0, m9
5135    pmaddwd              m7, m2, m10
5136    pmaddwd              m8, m4, m11
5137    paddd                m5, m6
5138    paddd                m7, m8
5139 %else
5140    mov                 mym, myd
5141    mov                  r1, [esp+0x1f4]
5142    xor                  r3, r3
5143    shr                  r4, 6
5144    lea                  r1, [r1+r4]
5145    mov                  r4, 64 << 24
5146    cmovnz               r4, [base+subpel_filters+r1*8+0]
5147    cmovnz               r3, [base+subpel_filters+r1*8+4]
5148    movd                 m7, r4
5149    movd                 m6, r3
5150    punpckldq            m7, m6
5151    punpcklbw            m7, m7
5152    psraw                m7, 8
5153    pshufd               m5, m7, q0000
5154    pshufd               m6, m7, q1111
5155    pmaddwd              m3, m5
5156    pmaddwd              m0, m6
5157    pshufd               m5, m7, q2222
5158    pshufd               m7, m7, q3333
5159    pmaddwd              m2, m5
5160    pmaddwd              m4, m7
5161    paddd                m3, m0
5162    paddd                m2, m4
5163    SWAP                 m5, m3
5164    SWAP                 m7, m2
5165 %endif
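; m13 is pd_512 for put: (sum + 512) >> 10 removes the 4 fractional bits
; from the horizontal pass plus the 6-bit vertical filter scale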
5166    paddd                m5, m13
5167    paddd                m5, m7
5168    psrad                m5, 10
5169    packssdw             m5, m5
5170    packuswb             m5, m5
5171 %if ARCH_X86_64
5172    pextrw              r6d, m5, 0
5173    mov              [dstq], r6w
5174    add                dstq, dsq
5175    dec                  hd
5176    jz .ret
5177    add                 myd, dyd
5178 %else
5179    pextrw              r3d, m5, 0
5180    mov              [dstq], r3w
5181    add                dstq, dsm
5182    dec                  hd
5183    jz .ret
5184    mov                 myd, mym
5185    add                 myd, dym
5186 %endif
5187    test                myd, ~0x3ff
5188 %if ARCH_X86_32
5189    SWAP                 m3, m5
5190    SWAP                 m2, m7
5191    mova                 m3, [rsp+0x1a0]
5192    mova                 m0, [rsp+0x1b0]
5193    mova                 m2, [rsp+0x1c0]
5194    mova                 m4, [rsp+0x1d0]
5195  %define m14 [esp+0x180]
5196  %define m15 [esp+0x190]
5197 %endif
5198    jz .w2_loop
5199 %if ARCH_X86_32
5200    mov                  r3, r3m
5201 %endif
5202    movq                 m5, [srcq]
5203    test                myd, 0x400
5204    jz .w2_skip_line
5205    add                srcq, ssq
5206    shufps               m3, m0, q1032      ; 01 12
5207    shufps               m0, m2, q1032      ; 23 34
5208    shufps               m2, m4, q1032      ; 45 56
5209    pshufb               m5, m14
5210    pmaddubsw            m5, m15
5211    phaddw               m5, m5
5212    pmulhrsw             m5, m12
5213    palignr              m4, m5, m1, 12
5214    punpcklqdq           m1, m4, m4         ; 6 7 6 7
5215    punpcklwd            m4, m1, m5         ; 67 __
5216 %if ARCH_X86_32
5217    mova        [rsp+0x1a0], m3
5218    mova        [rsp+0x1b0], m0
5219    mova        [rsp+0x1c0], m2
5220    mova        [rsp+0x1d0], m4
5221 %endif
5222    jmp .w2_loop
5223.w2_skip_line:
5224    movhps               m5, [srcq+ssq*1]
5225    lea                srcq, [srcq+ssq*2]
5226    mova                 m3, m0             ; 01 12
5227    mova                 m0, m2             ; 23 34
5228    pshufb               m5, m14
5229    pmaddubsw            m5, m15
5230    phaddw               m5, m5
5231    pmulhrsw             m5, m12            ; 6 7 6 7
5232    palignr              m4, m5, m1, 8      ; 4 5 6 7
5233    pshufd               m5, m4, q0321      ; 5 6 7 _
5234    mova                 m1, m4
5235    punpcklwd            m2, m4, m5         ; 45 56
5236    punpckhwd            m4, m5             ; 67 __
5237 %if ARCH_X86_32
5238    mova        [rsp+0x1a0], m3
5239    mova        [rsp+0x1b0], m0
5240    mova        [rsp+0x1c0], m2
5241    mova        [rsp+0x1d0], m4
5242 %endif
5243    jmp .w2_loop
5244%endif
5245INIT_XMM ssse3
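; .w4: four output columns with independent x phases; mx+dx*[0-3] selects a
; filter per column, loaded as the middle 4 taps (subpel_filters+2) since
; narrow widths use 4-tap horizontal filters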
5246.w4:
5247%if ARCH_X86_64
5248    mov                 myd, mym
5249    movzx               t0d, t0b
5250    dec                srcq
5251    movd                m15, t0d
5252%else
5253 %define m8  m0
5254 %xdefine m14 m4
5255 %define m15 m3
5256    movzx                r4, byte [esp+0x1f0]
5257    dec                srcq
5258    movd                m15, r4
5259%endif
5260    pmaddwd              m8, [base+rescale_mul]
5261%if ARCH_X86_64
5262    mova                m11, [base+pd_0x4000]
5263%else
5264  %define m11 [base+pd_0x4000]
5265%endif
5266    pshufd              m15, m15, q0000
5267    paddd               m14, m8 ; mx+dx*[0-3]
5268    pand                 m0, m14, m10
5269    psrld                m0, 6
5270    paddd               m15, m0
5271    psrldq               m7, m15, 8
5272%if ARCH_X86_64
5273    movd                r4d, m15
5274    movd               r11d, m7
5275    psrldq              m15, 4
5276    psrldq               m7, 4
5277    movd                r6d, m15
5278    movd               r13d, m7
5279    movd                m15, [base+subpel_filters+ r4*8+2]
5280    movd                 m2, [base+subpel_filters+r11*8+2]
5281    movd                 m3, [base+subpel_filters+ r6*8+2]
5282    movd                 m4, [base+subpel_filters+r13*8+2]
5283%else
5284    movd                 r0, m15
5285    movd                 rX, m7
5286    psrldq              m15, 4
5287    psrldq               m7, 4
5288    movd                 r4, m15
5289    movd                 r5, m7
5290    movd                 m1, [base+subpel_filters+r0*8+2]
5291    movd                 m2, [base+subpel_filters+rX*8+2]
5292    movd                 m3, [base+subpel_filters+r4*8+2]
5293    movd                 m7, [base+subpel_filters+r5*8+2]
5294    movifprep            r3, r3m
5295    SWAP                 m4, m7
5296 %define m15 m1
5297%endif
5298    mova                 m5, [base+bdct_lb_dw]
5299    movq                 m6, [base+subpel_s_shuf2]
5300    psrld               m14, 10
5301    punpckldq           m15, m3
5302    punpckldq            m2, m4
5303    punpcklqdq          m15, m2
5304    punpcklqdq           m6, m6
5305    pshufb              m14, m5
5306    paddb               m14, m6
5307%if ARCH_X86_64
5308    pcmpeqd              m0, m9
5309    pand                m11, m0
5310%else
5311    mova        [esp+0x180], m14
5312    SWAP                 m7, m4
5313    pxor                 m3, m3
5314    pcmpeqd              m0, m3
5315    pand                 m2, m11, m0
5316 %define m11 m2
5317%endif
5318    pandn                m0, m15
5319%if ARCH_X86_64
5320    SWAP                m15, m0
5321%else
5322 %define m15 m0
5323%endif
5324    por                 m15, m11
5325%if ARCH_X86_64
5326    movu                 m7, [srcq+ssq*0]
5327    movu                 m9, [srcq+ssq*1]
5328    movu                 m8, [srcq+ssq*2]
5329    movu                m10, [srcq+ss3q ]
5330    lea                srcq, [srcq+ssq*4]
5331    movu                 m2, [srcq+ssq*0]
5332    movu                 m4, [srcq+ssq*1]
5333    movu                 m3, [srcq+ssq*2]
5334    movu                 m5, [srcq+ss3q ]
5335    lea                srcq, [srcq+ssq*4]
5336    pshufb               m7, m14
5337    pshufb               m9, m14
5338    pshufb               m8, m14
5339    pshufb              m10, m14
5340    pshufb               m2, m14
5341    pshufb               m4, m14
5342    pshufb               m3, m14
5343    pshufb               m5, m14
5344    pmaddubsw            m7, m15
5345    pmaddubsw            m9, m15
5346    pmaddubsw            m8, m15
5347    pmaddubsw           m10, m15
5348    pmaddubsw            m2, m15
5349    pmaddubsw            m4, m15
5350    pmaddubsw            m3, m15
5351    pmaddubsw            m5, m15
5352    phaddw               m7, m9
5353    phaddw               m8, m10
5354    phaddw               m9, m2, m4
5355    phaddw               m3, m5
5356    pmulhrsw             m7, m12            ; 0 1
5357    pmulhrsw             m8, m12            ; 2 3
5358    pmulhrsw             m9, m12            ; 4 5
5359    pmulhrsw             m3, m12            ; 6 7
5360    shufps               m4, m7, m8, q1032  ; 1 2
5361    shufps               m5, m8, m9, q1032  ; 3 4
5362    shufps               m6, m9, m3, q1032  ; 5 6
5363    psrldq              m11, m3, 8          ; 7 _
5364    punpcklwd            m0, m7, m4 ; 01
5365    punpckhwd            m7, m4     ; 12
5366    punpcklwd            m1, m8, m5 ; 23
5367    punpckhwd            m8, m5     ; 34
5368    punpcklwd            m2, m9, m6 ; 45
5369    punpckhwd            m9, m6     ; 56
5370    punpcklwd            m3, m11    ; 67
5371    mova         [rsp+0x00], m7
5372    mova         [rsp+0x10], m8
5373    mova         [rsp+0x20], m9
5374%else
5375    mova        [esp+0x190], m15
5376    lea                ss3q, [ssq*3]
5377    movu                 m2, [srcq+ssq*0]
5378    movu                 m3, [srcq+ssq*1]
5379    movu                 m7, [srcq+ssq*2]
5380    movu                 m6, [srcq+ss3q ]
5381    lea                srcq, [srcq+ssq*4]
5382    pshufb               m2, m14
5383    pshufb               m3, m14
5384    pshufb               m7, m14
5385    pshufb               m6, m14
5386    pmaddubsw            m2, m15
5387    pmaddubsw            m3, m15
5388    pmaddubsw            m7, m15
5389    pmaddubsw            m6, m15
5390    phaddw               m2, m3
5391    phaddw               m7, m6
5392    movu                 m1, [srcq+ssq*0]
5393    movu                 m5, [srcq+ssq*1]
5394    movu                 m3, [srcq+ssq*2]
5395    movu                 m6, [srcq+ss3q ]
5396    lea                srcq, [srcq+ssq*4]
5397    pshufb               m1, m14
5398    pshufb               m5, m14
5399    pshufb               m3, m14
5400    pshufb               m6, m14
5401    pmaddubsw            m1, m15
5402    pmaddubsw            m5, m15
5403    pmaddubsw            m3, m15
5404    pmaddubsw            m6, m15
5405    phaddw               m1, m5
5406    phaddw               m3, m6
5407    pmulhrsw             m2, m12
5408    pmulhrsw             m7, m12
5409    pmulhrsw             m1, m12
5410    pmulhrsw             m3, m12
5411    shufps               m4, m2, m7, q1032  ; 1 2
5412    shufps               m5, m7, m1, q1032  ; 3 4
5413    shufps               m6, m1, m3, q1032  ; 5 6
5414    psrldq               m0, m3, 8          ; 7 _
5415    mova        [esp+0x1a0], m0
5416 %define m11 [esp+0x1a0]
5417    punpcklwd            m0, m2, m4      ; 01
5418    punpckhwd            m2, m4          ; 12
5419    punpcklwd            m4, m7, m5      ; 23
5420    punpckhwd            m7, m5          ; 34
5421    punpcklwd            m5, m1, m6      ; 45
5422    punpckhwd            m1, m6          ; 56
5423    punpcklwd            m3, [esp+0x1a0] ; 67
5424    mov                 myd, mym
5425    mov                  r0, r0m
5426    mova        [esp+0x1b0], m0 ; 01
5427    mova        [esp+0x1c0], m4 ; 23
5428    mova        [esp+0x1d0], m5 ; 45
5429    mova        [esp+0x1e0], m3 ; 67
5430    mova         [rsp+0x00], m2 ; 12
5431    mova         [rsp+0x10], m7 ; 34
5432    mova         [rsp+0x20], m1 ; 56
5433    SWAP                 m1, m4
5434    SWAP                 m2, m5
5435%endif
5436.w4_loop:
5437    and                 myd, 0x3ff
5438%if ARCH_X86_64
5439    mov                 r6d, 64 << 24
5440    mov                 r4d, myd
5441    shr                 r4d, 6
5442    lea                 r4d, [t1+r4]
5443    cmovnz              r6q, [base+subpel_filters+r4*8]
5444    movq                m10, r6q
5445    punpcklbw           m10, m10
5446    psraw               m10, 8
5447    pshufd               m7, m10, q0000
5448    pshufd               m8, m10, q1111
5449    pshufd               m9, m10, q2222
5450    pshufd              m10, m10, q3333
5451    pmaddwd              m4, m0, m7
5452    pmaddwd              m5, m1, m8
5453    pmaddwd              m6, m2, m9
5454    pmaddwd              m7, m3, m10
5455    paddd                m4, m5
5456    paddd                m6, m7
5457    paddd                m4, m13
5458    paddd                m4, m6
5459%else
5460    mov                 mym, myd
5461    mov                  r5, [esp+0x1f4]
5462    xor                  r3, r3
5463    shr                  r4, 6
5464    lea                  r5, [r5+r4]
5465    mov                  r4, 64 << 24
5466    cmovnz               r4, [base+subpel_filters+r5*8+0]
5467    cmovnz               r3, [base+subpel_filters+r5*8+4]
5468    movd                 m7, r4
5469    movd                 m6, r3
5470    punpckldq            m7, m6
5471    punpcklbw            m7, m7
5472    psraw                m7, 8
5473    pshufd               m4, m7, q0000
5474    pshufd               m5, m7, q1111
5475    pshufd               m6, m7, q2222
5476    pshufd               m7, m7, q3333
5477    pmaddwd              m0, m4
5478    pmaddwd              m1, m5
5479    pmaddwd              m2, m6
5480    pmaddwd              m3, m7
5481    paddd                m0, m1
5482    paddd                m2, m3
5483    paddd                m0, m13
5484    paddd                m0, m2
5485    SWAP                 m4, m0
5486%endif
5487    psrad                m4, rndshift
5488    packssdw             m4, m4
5489%ifidn %1, put
5490    packuswb             m4, m4
5491    movd             [dstq], m4
5492    add                dstq, dsmp
5493%else
5494    movq             [tmpq], m4
5495    add                tmpq, 8
5496%endif
5497    dec                  hd
5498    jz .ret
5499%if ARCH_X86_64
5500    add                 myd, dyd
5501    test                myd, ~0x3ff
5502    jz .w4_loop
5503%else
5504    SWAP                 m0, m4
5505    mov                 myd, mym
5506    mov                  r3, r3m
5507    add                 myd, dym
5508    test                myd, ~0x3ff
5509    jnz .w4_next_line
5510    mova                 m0, [esp+0x1b0]
5511    mova                 m1, [esp+0x1c0]
5512    mova                 m2, [esp+0x1d0]
5513    mova                 m3, [esp+0x1e0]
5514    jmp .w4_loop
5515.w4_next_line:
5516  %define m14 [esp+0x180]
5517  %define m15 [esp+0x190]
5518%endif
5519    movu                 m4, [srcq]
5520    test                myd, 0x400
5521    jz .w4_skip_line
5522%if ARCH_X86_64
5523    mova                 m0, [rsp+0x00]
5524    mova         [rsp+0x00], m1
5525    mova                 m1, [rsp+0x10]
5526    mova         [rsp+0x10], m2
5527    mova                 m2, [rsp+0x20]
5528    mova         [rsp+0x20], m3
5529%else
5530    mova                 m5, [esp+0x1c0]
5531    mova                 m0, [rsp+0x000]
5532    mova         [rsp+0x00], m5
5533    mova        [esp+0x1b0], m0
5534    mova                 m6, [esp+0x1d0]
5535    mova                 m1, [rsp+0x010]
5536    mova         [rsp+0x10], m6
5537    mova        [esp+0x1c0], m1
5538    mova                 m7, [esp+0x1e0]
5539    mova                 m2, [rsp+0x020]
5540    mova         [rsp+0x20], m7
5541    mova        [esp+0x1d0], m2
5542%endif
5543    pshufb               m4, m14
5544    pmaddubsw            m4, m15
5545    phaddw               m4, m4
5546    pmulhrsw             m4, m12
5547    punpcklwd            m3, m11, m4
5548%if ARCH_X86_32
5549    mova        [esp+0x1e0], m3
5550%endif
5551    mova                m11, m4
5552    add                srcq, ssq
5553    jmp .w4_loop
5554.w4_skip_line:
5555%if ARCH_X86_32
5556    mova                 m0, [esp+0x1c0]
5557    mova                 m1, [esp+0x1d0]
5558    mova                 m2, [esp+0x1e0]
5559%endif
5560    movu                 m5, [srcq+ssq*1]
5561    lea                srcq, [srcq+ssq*2]
5562    mova                 m6, [rsp+0x10]
5563    mova                 m7, [rsp+0x20]
5564    pshufb               m4, m14
5565    pshufb               m5, m14
5566    pmaddubsw            m4, m15
5567    pmaddubsw            m5, m15
5568    phaddw               m4, m5
5569    pmulhrsw             m4, m12
5570    punpcklwd            m5, m11, m4
5571    mova         [rsp+0x00], m6
5572    mova         [rsp+0x10], m7
5573    mova         [rsp+0x20], m5
5574%if ARCH_X86_64
5575    psrldq              m11, m4, 8
5576    mova                 m0, m1
5577    mova                 m1, m2
5578    mova                 m2, m3
5579    punpcklwd            m3, m4, m11
5580%else
5581    psrldq               m6, m4, 8
5582    punpcklwd            m3, m4, m6
5583    mova        [esp+0x1a0], m6
5584    mova        [esp+0x1b0], m0
5585    mova        [esp+0x1c0], m1
5586    mova        [esp+0x1d0], m2
5587    mova        [esp+0x1e0], m3
5588%endif
5589    jmp .w4_loop
5590INIT_XMM ssse3
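; w >= 8 is processed in 8-pixel-wide column tiles: [rsp+0x90] holds the
; tile count (w/8) and tmp_stridem the prep output row stride in bytes (2*w)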
5591.w8:
5592    mov    dword [rsp+0x90], 1
5593    movifprep   tmp_stridem, 16
5594    jmp .w_start
5595.w16:
5596    mov    dword [rsp+0x90], 2
5597    movifprep   tmp_stridem, 32
5598    jmp .w_start
5599.w32:
5600    mov    dword [rsp+0x90], 4
5601    movifprep   tmp_stridem, 64
5602    jmp .w_start
5603.w64:
5604    mov    dword [rsp+0x90], 8
5605    movifprep   tmp_stridem, 128
5606    jmp .w_start
5607.w128:
5608    mov    dword [rsp+0x90], 16
5609    movifprep   tmp_stridem, 256
5610.w_start:
5611%ifidn %1, put
5612    movifnidn           dsm, dsq
5613%endif
5614%if ARCH_X86_64
5615    shr                 t0d, 16
5616    movd                m15, t0d
5617%else
5618 %define m8  m0
5619 %xdefine m14 m4
5620 %define m15 m3
5621 %if isprep
5622  %define ssq ssm
5623 %endif
5624    mov                  r4, [esp+0x1f0]
5625    shr                  r4, 16
5626    movd                m15, r4
5627    mov                  r0, r0m
5628    mov                 myd, mym
5629%endif
5630    sub                srcq, 3
5631    pslld                m7, m8, 2 ; dx*4
5632    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
5633    pshufd              m15, m15, q0000
5634    paddd               m14, m8 ; mx+dx*[0-3]
5635    mova        [rsp+0x100], m7
5636    mova        [rsp+0x120], m15
5637    mov         [rsp+0x098], srcq
5638    mov         [rsp+0x130], r0q ; dstq / tmpq
5639%if ARCH_X86_64 && UNIX64
5640    mov                  hm, hd
5641%elif ARCH_X86_32
5642    mov                  r5, hm
5643    mov         [esp+0x094], myd
5644    mov         [esp+0x134], r5
5645%endif
5646    jmp .hloop
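; per-tile epilogue: count down the tiles, step dst/tmp right by 8 pixels,
; reload the saved src base and x positions, then rerun the row loop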
5647.hloop_prep:
5648    dec   dword [rsp+0x090]
5649    jz .ret
5650%if ARCH_X86_64
5651    add   qword [rsp+0x130], 8*(isprep+1)
5652    mov                  hd, hm
5653%else
5654    add   dword [esp+0x130], 8*(isprep+1)
5655    mov                 myd, [esp+0x094]
5656    mov                  r5, [esp+0x134]
5657    mov                  r0, [esp+0x130]
5658%endif
5659    mova                 m7, [rsp+0x100]
5660    mova                m14, [rsp+0x110]
5661%if ARCH_X86_64
5662    mova                m10, [base+pd_0x3ff]
5663%endif
5664    mova                m15, [rsp+0x120]
5665    pxor                 m9, m9
5666    mov                srcq, [rsp+0x098]
5667%if ARCH_X86_64
5668    mov                 r0q, [rsp+0x130] ; dstq / tmpq
5669%else
5670    mov                 mym, myd
5671    mov                  hm, r5
5672    mov                 r0m, r0
5673    mov                  r3, r3m
5674%endif
5675    paddd               m14, m7
5676.hloop:
5677%if ARCH_X86_64
5678    mova                m11, [base+pq_0x40000000]
5679%else
5680 %define m11 [base+pq_0x40000000]
5681%endif
5682    psrld                m2, m14, 10
5683    mova              [rsp], m2
5684    pand                 m6, m14, m10
5685    psrld                m6, 6
5686    paddd                m5, m15, m6
5687    pcmpeqd              m6, m9
5688    psrldq               m2, m5, 8
5689%if ARCH_X86_64
5690    movd                r4d, m5
5691    movd                r6d, m2
5692    psrldq               m5, 4
5693    psrldq               m2, 4
5694    movd                r7d, m5
5695    movd                r9d, m2
5696    movq                 m0, [base+subpel_filters+r4*8]
5697    movq                 m1, [base+subpel_filters+r6*8]
5698    movhps               m0, [base+subpel_filters+r7*8]
5699    movhps               m1, [base+subpel_filters+r9*8]
5700%else
5701    movd                 r0, m5
5702    movd                 rX, m2
5703    psrldq               m5, 4
5704    psrldq               m2, 4
5705    movd                 r4, m5
5706    movd                 r5, m2
5707    movq                 m0, [base+subpel_filters+r0*8]
5708    movq                 m1, [base+subpel_filters+rX*8]
5709    movhps               m0, [base+subpel_filters+r4*8]
5710    movhps               m1, [base+subpel_filters+r5*8]
5711    pxor                 m2, m2
5712 %define m9 m2
5713%endif
5714    paddd               m14, m7 ; mx+dx*[4-7]
5715    pand                 m5, m14, m10
5716    psrld                m5, 6
5717    paddd               m15, m5
5718    pcmpeqd              m5, m9
5719    mova        [rsp+0x110], m14
5720    psrldq               m4, m15, 8
5721%if ARCH_X86_64
5722    movd               r10d, m15
5723    movd               r11d, m4
5724    psrldq              m15, 4
5725    psrldq               m4, 4
5726    movd               r13d, m15
5727    movd                rXd, m4
5728    movq                 m2, [base+subpel_filters+r10*8]
5729    movq                 m3, [base+subpel_filters+r11*8]
5730    movhps               m2, [base+subpel_filters+r13*8]
5731    movhps               m3, [base+subpel_filters+ rX*8]
5732    psrld               m14, 10
5733    psrldq               m4, m14, 8
5734    movd               r10d, m14
5735    movd               r11d, m4
5736    psrldq              m14, 4
5737    psrldq               m4, 4
5738    movd               r13d, m14
5739    movd                rXd, m4
5740    mov                 r4d, [rsp+ 0]
5741    mov                 r6d, [rsp+ 8]
5742    mov                 r7d, [rsp+ 4]
5743    mov                 r9d, [rsp+12]
5744    pshufd               m4, m6, q1100
5745    pshufd               m6, m6, q3322
5746    pshufd              m14, m5, q1100
5747    pshufd               m5, m5, q3322
5748    pand                 m7, m11, m4
5749    pand                 m8, m11, m6
5750    pand                m15, m11, m14
5751    pand                m11, m11, m5
5752    pandn                m4, m0
5753    pandn                m6, m1
5754    pandn               m14, m2
5755    pandn                m5, m3
5756    por                  m7, m4
5757    por                  m8, m6
5758    por                 m15, m14
5759    por                 m11, m5
5760    mova         [rsp+0x10], m7
5761    mova         [rsp+0x20], m8
5762    mova         [rsp+0x30], m15
5763    mova         [rsp+0x40], m11
5764    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
5765    mova         [rsp+0x50], m1
5766    mova         [rsp+0x60], m2
5767    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
5768    mova         [rsp+0x70], m3
5769    mova         [rsp+0x80], m4
5770    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
5771    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
5772    SWAP                 m7, m0
5773    SWAP                 m8, m14
5774    mova                 m1, [rsp+0x50]
5775    mova                 m2, [rsp+0x60]
5776    mova                 m3, [rsp+0x70]
5777    mova                 m9, [rsp+0x80]
5778    mov                 myd, mym
5779    mov                 dyd, dym
5780    punpcklwd            m4, m5, m6 ; 45a
5781    punpckhwd            m5, m6     ; 45b
5782    punpcklwd            m6, m7, m8 ; 67a
5783    punpckhwd            m7, m8     ; 67b
5784    punpcklwd            m0, m1, m2 ; 01a
5785    punpckhwd            m1, m2     ; 01b
5786    punpcklwd            m2, m3, m9 ; 23a
5787    punpckhwd            m3, m9     ; 23b
5788    mova         [rsp+0x50], m4
5789    mova         [rsp+0x60], m5
5790    mova         [rsp+0x70], m6
5791    mova         [rsp+0x80], m7
5792    SWAP                m14, m8
5793.vloop:
5794    and                 myd, 0x3ff
5795    mov                 r6d, 64 << 24
5796    mov                 r4d, myd
5797    shr                 r4d, 6
5798    lea                 r4d, [t1+r4]
5799    cmovnz              r6q, [base+subpel_filters+r4*8]
5800    movq                m11, r6q
5801    punpcklbw           m11, m11
5802    psraw               m11, 8
5803    pshufd               m5, m11, q0000
5804    pshufd               m7, m11, q1111
5805    pshufd              m10, m11, q2222
5806    pshufd              m11, m11, q3333
5807    pmaddwd              m4, m5, m0
5808    pmaddwd              m5, m5, m1
5809    pmaddwd              m6, m7, m2
5810    pmaddwd              m7, m7, m3
5811    paddd                m4, m13
5812    paddd                m5, m13
5813    paddd                m4, m6
5814    paddd                m5, m7
5815    pmaddwd              m6, [rsp+0x50], m10
5816    pmaddwd              m7, [rsp+0x60], m10
5817    pmaddwd              m8, [rsp+0x70], m11
5818    pmaddwd              m9, [rsp+0x80], m11
5819    paddd                m4, m6
5820    paddd                m5, m7
5821    paddd                m4, m8
5822    paddd                m5, m9
5823%else
5824    movd                 r0, m15
5825    movd                 rX, m4
5826    psrldq              m15, 4
5827    psrldq               m4, 4
5828    movd                 r4, m15
5829    movd                 r5, m4
5830    mova                m14, [esp+0x110]
5831    movq                 m2, [base+subpel_filters+r0*8]
5832    movq                 m3, [base+subpel_filters+rX*8]
5833    movhps               m2, [base+subpel_filters+r4*8]
5834    movhps               m3, [base+subpel_filters+r5*8]
5835    psrld               m14, 10
5836    mova           [esp+16], m14
5837    mov                  r0, [esp+ 0]
5838    mov                  rX, [esp+ 8]
5839    mov                  r4, [esp+ 4]
5840    mov                  r5, [esp+12]
5841    mova         [esp+0x20], m0
5842    mova         [esp+0x30], m1
5843    mova         [esp+0x40], m2
5844    mova         [esp+0x50], m3
5845    pshufd               m4, m6, q1100
5846    pshufd               m6, m6, q3322
5847    pshufd               m7, m5, q1100
5848    pshufd               m5, m5, q3322
5849    pand                 m0, m11, m4
5850    pand                 m1, m11, m6
5851    pand                 m2, m11, m7
5852    pand                 m3, m11, m5
5853    pandn                m4, [esp+0x20]
5854    pandn                m6, [esp+0x30]
5855    pandn                m7, [esp+0x40]
5856    pandn                m5, [esp+0x50]
5857    por                  m0, m4
5858    por                  m1, m6
5859    por                  m2, m7
5860    por                  m3, m5
5861    mova         [esp+0x20], m0
5862    mova         [esp+0x30], m1
5863    mova         [esp+0x40], m2
5864    mova         [esp+0x50], m3
5865    MC_8TAP_SCALED_H   0x20, 0x140, 0 ; 0-1
5866    MC_8TAP_SCALED_H   0x20, 0x160    ; 2-3
5867    MC_8TAP_SCALED_H   0x20, 0x180    ; 4-5
5868    MC_8TAP_SCALED_H   0x20, 0x1a0    ; 6-7
5869    mova                 m5, [esp+0x180]
5870    mova                 m6, [esp+0x190]
5871    mova                 m7, [esp+0x1a0]
5872    mova                 m0, [esp+0x1b0]
5873    mov                 myd, mym
5874    punpcklwd            m4, m5, m6      ; 45a
5875    punpckhwd            m5, m6          ; 45b
5876    punpcklwd            m6, m7, m0      ; 67a
5877    punpckhwd            m7, m0          ; 67b
5878    mova        [esp+0x180], m4
5879    mova        [esp+0x190], m5
5880    mova        [esp+0x1a0], m6
5881    mova        [esp+0x1b0], m7
5882    mova                 m1, [esp+0x140]
5883    mova                 m2, [esp+0x150]
5884    mova                 m3, [esp+0x160]
5885    mova                 m4, [esp+0x170]
5886    punpcklwd            m0, m1, m2      ; 01a
5887    punpckhwd            m1, m2          ; 01b
5888    punpcklwd            m2, m3, m4      ; 23a
5889    punpckhwd            m3, m4          ; 23b
5890    mova        [esp+0x140], m0
5891    mova        [esp+0x150], m1
5892    mova        [esp+0x160], m2
5893    mova        [esp+0x170], m3
5894.vloop:
5895    mov                  r0, r0m
5896    mov                  r5, [esp+0x1f4]
5897    and                 myd, 0x3ff
5898    mov                 mym, myd
5899    xor                  r3, r3
5900    shr                  r4, 6
5901    lea                  r5, [r5+r4]
5902    mov                  r4, 64 << 24
5903    cmovnz               r4, [base+subpel_filters+r5*8+0]
5904    cmovnz               r3, [base+subpel_filters+r5*8+4]
5905    movd                 m7, r4
5906    movd                 m6, r3
5907    punpckldq            m7, m6
5908    punpcklbw            m7, m7
5909    psraw                m7, 8
5910    pshufd               m4, m7, q0000
5911    pshufd               m5, m7, q1111
5912    pmaddwd              m0, m4
5913    pmaddwd              m1, m4
5914    pmaddwd              m2, m5
5915    pmaddwd              m3, m5
5916    pshufd               m6, m7, q2222
5917    pshufd               m7, m7, q3333
5918    paddd                m0, m2
5919    paddd                m1, m3
5920    pmaddwd              m2, [esp+0x180], m6
5921    pmaddwd              m3, [esp+0x190], m6
5922    pmaddwd              m4, [esp+0x1a0], m7
5923    pmaddwd              m5, [esp+0x1b0], m7
5924    paddd                m0, m2
5925    paddd                m1, m3
5926    paddd                m0, m13
5927    paddd                m1, m13
5928    paddd                m4, m0
5929    paddd                m5, m1
5930%endif
5931    psrad                m4, rndshift
5932    psrad                m5, rndshift
5933    packssdw             m4, m5
5934%ifidn %1, put
5935    packuswb             m4, m4
5936    movq             [dstq], m4
5937    add                dstq, dsm
5938%else
5939    mova             [tmpq], m4
5940    add                tmpq, tmp_stridem
5941%endif
5942    dec                  hd
5943    jz .hloop_prep
5944%if ARCH_X86_64
5945    add                 myd, dyd
5946    test                myd, ~0x3ff
5947    jz .vloop
5948    test                myd, 0x400
5949    mov         [rsp+0x140], myd
5950    mov                 r4d, [rsp+ 0]
5951    mov                 r6d, [rsp+ 8]
5952    mov                 r7d, [rsp+ 4]
5953    mov                 r9d, [rsp+12]
5954    jz .skip_line
5955    mova                m14, [base+unpckw]
5956    movq                 m6, [srcq+r10]
5957    movq                 m7, [srcq+r11]
5958    movhps               m6, [srcq+r13]
    movhps               m7, [srcq+ rX]
    movq                 m4, [srcq+ r4]
    movq                 m5, [srcq+ r6]
    movhps               m4, [srcq+ r7]
    movhps               m5, [srcq+ r9]
    add                srcq, ssq
    mov                 myd, [rsp+0x140]
    mov                 dyd, dym
    pshufd               m9, m14, q1032
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m9                 ; 3a 2a
    pshufb               m3, m9                 ; 3b 2b
    pmaddubsw            m6, [rsp+0x30]
    pmaddubsw            m7, [rsp+0x40]
    pmaddubsw            m4, [rsp+0x10]
    pmaddubsw            m5, [rsp+0x20]
    phaddw               m6, m7
    phaddw               m4, m5
    phaddw               m4, m6
    pmulhrsw             m4, m12
    pshufb               m5, [rsp+0x50], m14    ; 4a 5a
    pshufb               m6, [rsp+0x60], m14    ; 4b 5b
    pshufb               m7, [rsp+0x70], m9     ; 7a 6a
    pshufb               m8, [rsp+0x80], m9     ; 7b 6b
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    punpckhwd            m5, m7 ; 56a
    punpckhwd            m6, m8 ; 56b
    punpcklwd            m7, m4 ; 78a
    punpckhqdq           m4, m4
    punpcklwd            m8, m4 ; 78b
    mova         [rsp+0x50], m5
    mova         [rsp+0x60], m6
    mova         [rsp+0x70], m7
    mova         [rsp+0x80], m8
    jmp .vloop
.skip_line:
    mova                 m0, [rsp+0x10]
    mova                 m1, [rsp+0x20]
    mova                m14, [rsp+0x30]
    mova                m15, [rsp+0x40]
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
    mov                 myd, [rsp+0x140]
    mov                 dyd, dym
    mova                 m0, m2         ; 01a
    mova                 m1, m3         ; 01b
    mova                 m2, [rsp+0x50] ; 23a
    mova                 m3, [rsp+0x60] ; 23b
    mova                 m5, [rsp+0x70] ; 45a
    mova                 m6, [rsp+0x80] ; 45b
    punpcklwd            m7, m4, m8     ; 67a
    punpckhwd            m4, m8         ; 67b
    mova         [rsp+0x50], m5
    mova         [rsp+0x60], m6
    mova         [rsp+0x70], m7
    mova         [rsp+0x80], m4
%else
    mov                 r0m, r0
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    mov                 mym, myd
    jnz .next_line
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    mova                 m2, [esp+0x160]
    mova                 m3, [esp+0x170]
    jmp .vloop
.next_line:
    test                myd, 0x400
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    jz .skip_line
    mova                 m6, [base+unpckw]
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    mova                 m7, [esp+0x180]
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    pshufb               m0, m6         ; 0a 1a
    pshufb               m1, m6         ; 0b 1b
    pshufb               m7, m6         ; 4a 5a
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    movq                 m3, [srcq+r0]
    movq                 m2, [srcq+rX]
    movhps               m3, [srcq+r4]
    movhps               m2, [srcq+r5]
    add                srcq, ssq
    pmaddubsw            m4, [esp+0x20]
    pmaddubsw            m5, [esp+0x30]
    pmaddubsw            m3, [esp+0x40]
    pmaddubsw            m2, [esp+0x50]
    phaddw               m4, m5
    phaddw               m3, m2
    mova                 m5, [esp+0x190]
    mova                 m2, [esp+0x160]
    phaddw               m4, m3
    mova                 m3, [esp+0x170]
    pmulhrsw             m4, m12        ; 8a 8b
    mov                 myd, mym
    pshufb               m5, m6         ; 4b 5b
    pshufd               m6, m6, q1032
    pshufb               m2, m6         ; 3a 2a
    pshufb               m3, m6         ; 3b 2b
    punpckhwd            m0, m2         ; 12a
    punpckhwd            m1, m3         ; 12b
    mova        [esp+0x140], m0
    mova        [esp+0x150], m1
    mova                 m0, [esp+0x1a0]
    mova                 m1, [esp+0x1b0]
    punpcklwd            m2, m7         ; 34a
    punpcklwd            m3, m5         ; 34b
    mova        [esp+0x160], m2
    mova        [esp+0x170], m3
    pshufb               m0, m6         ; 7a 6a
    pshufb               m1, m6         ; 7b 6b
    punpckhwd            m7, m0         ; 56a
    punpckhwd            m5, m1         ; 56b
    punpcklwd            m0, m4
    punpckhqdq           m4, m4
    punpcklwd            m1, m4
    mova        [esp+0x180], m7
    mova        [esp+0x190], m5
    mova        [esp+0x1a0], m0
    mova        [esp+0x1b0], m1
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H   0x20, 0x1c0, 0
    mov                 myd, mym
    mova                 m0, [esp+0x160]
    mova                 m1, [esp+0x170]
    mova                 m2, [esp+0x180]
    mova                 m3, [esp+0x190]
    mova         [esp+0x140], m0
    mova         [esp+0x150], m1
    mova                 m4, [esp+0x1a0]
    mova                 m5, [esp+0x1b0]
    mova        [esp+0x160], m2
    mova        [esp+0x170], m3
    mova                 m6, [esp+0x1c0]
    mova                 m7, [esp+0x1d0]
    mova        [esp+0x180], m4
    mova        [esp+0x190], m5
    punpcklwd            m4, m6, m7
    punpckhwd            m6, m7
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m6
%endif
    jmp .vloop
INIT_XMM ssse3
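; .dy1: specialized vertical path taken for a unit vertical step (one source
; row per output row); the 8-tap window then slides down a single line, so
; only one new horizontally filtered row is needed per output row.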
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
.dy1_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx                r5, byte [esp+0x1f0]
    dec                srcd
    movd                m15, r5
 %endif
    punpckldq            m9, m8
    SWAP                 m8, m9
    paddd               m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    psrldq              m15, 4
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_dw]
    mova                 m6, [base+subpel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m9, m9
    pcmpeqd              m8, m9
    psrld               m14, 10
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova         [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq                 m0, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*2]
    movhps               m0, [srcq+ssq*1]
    movhps               m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movq                m10, r4q
 %else
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
  %define m10 m4
    movd                m10, r4
    movd                 m3, r3
    mov                  r3, r3m
    punpckldq           m10, m3
 %endif
    movq                 m1, [srcq+ssq*0]
    movq                 m3, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*1]
    add                srcq, ss3q
    punpcklbw           m10, m10
    psraw               m10, 8
    punpckldq           m15, m7
    punpcklqdq          m15, m15
 %if ARCH_X86_64
    pand                m11, m8
 %else
    pand                 m7, m11, m8
  %define m11 m7
 %endif
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
 %if ARCH_X86_64
    pshufd               m8, m10, q0000
    pshufd               m9, m10, q1111
    pshufd              m11, m10, q3333
    pshufd              m10, m10, q2222
 %else
    mova         [esp+0x10], m15
  %define m15 [esp+0x10]
    mov                  r0, r0m
    pshufd               m5, m4, q0000
    pshufd               m6, m4, q1111
    pshufd               m7, m4, q2222
    pshufd               m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova                 m8, m5
    mova                 m9, m6
    mova                m10, m7
    mova                m11, m4
 %endif
    pshufb               m0, m14
    pshufb               m2, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pmaddubsw            m0, m15
    pmaddubsw            m2, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12
    pmulhrsw             m1, m12
    palignr              m2, m1, m0, 4
    pshufd               m4, m1, q2121
    punpcklwd            m3, m0, m2     ; 01 12
    punpckhwd            m0, m2         ; 23 34
    punpcklwd            m2, m1, m4     ; 45 56
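; w2 inner loop: the filtered rows are kept word-interleaved as 01 12 /
; 23 34 / 45 56 pairs; each iteration filters two new source rows, appends
; the 67 78 pair and sums the four pmaddwd products for two output rows.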
.dy1_w2_loop:
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m3, m8
    pmaddwd              m6, m0, m9
    pmaddwd              m7, m2, m10
    mova                 m3, m0
    mova                 m0, m2
    paddd                m5, m13
    paddd                m6, m7
    pshufb               m1, m14
    pmaddubsw            m1, m15
    phaddw               m1, m1
    pmulhrsw             m1, m12
    palignr              m7, m1, m4, 12
    punpcklwd            m2, m7, m1     ; 67 78
    pmaddwd              m7, m2, m11
    mova                 m4, m1
    paddd                m5, m6
    paddd                m5, m7
    psrad                m5, rndshift
    packssdw             m5, m5
    packuswb             m5, m5
    movd                r4d, m5
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
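; .dy1 w4: gathers a distinct 4-tap horizontal filter per output pixel from
; subpel_filters, prefilters rows 0-6, then enters the vertical loop below.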
.dy1_w4:
%if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx                r4, byte [esp+0x1f0]
    dec                srcq
    movd                m15, r4
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    psrldq               m7, m15, 8
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                r6d, m15
    movd               r13d, m7
    movd                m15, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+r11*8+2]
    movd                 m3, [base+subpel_filters+ r6*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
%else
    movd                 r1, m15
    movd                 r3, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                 r4, m15
    movd                 r5, m7
 %define m15 m5
    SWAP                 m4, m7
    movd                m15, [base+subpel_filters+r1*8+2]
    movd                 m2, [base+subpel_filters+r3*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m4, [base+subpel_filters+r5*8+2]
    mov                 myd, mym
    mov                  rX, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  rX, [rX+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+rX*8+0]
    cmovnz               r5, [base+subpel_filters+rX*8+4]
    mov                  r3, r3m
 %if isprep
    lea                ss3q, [ssq*3]
 %endif
%endif
    punpckldq           m15, m3
    punpckldq            m2, m4
    punpcklqdq          m15, m2
    movq                 m6, [base+subpel_s_shuf2]
%if ARCH_X86_64
    pcmpeqd              m8, m9
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpcklqdq           m6, m6
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m7, [srcq+ssq*2]
    add                srcq, ss3q
    pand                m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
    paddb               m14, m6
    movq                m10, r4q
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb               m5, m14
    pshufb               m7, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    pmaddubsw            m7, m15
    phaddw               m0, m1
    phaddw               m2, m3
    phaddw               m4, m5
    phaddw               m6, m7, m7
    pmulhrsw             m0, m12    ; 0 1
    pmulhrsw             m2, m12    ; 2 3
    pmulhrsw             m4, m12    ; 4 5
    pmulhrsw             m6, m12    ; 6 _
    shufps               m1, m0, m2, q1032  ; 1 2
    shufps               m3, m2, m4, q1032  ; 3 4
    shufps               m5, m4, m6, q1032  ; 5 6
    punpcklwd            m7, m0, m1 ; 01
    punpckhwd            m0, m1     ; 12
    punpcklwd            m8, m2, m3 ; 23
    punpckhwd            m2, m3     ; 34
    punpcklwd            m9, m4, m5 ; 45
    punpckhwd            m4, m5     ; 56
%else
    pxor                 m3, m3
    pcmpeqd              m8, m3
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
    pandn                m8, m15
    SWAP                 m5, m0
    por                 m15, m7
    paddb               m14, m6
    movu                 m0, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m0, m14
    pshufb               m7, m14
    pshufb               m6, m14
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    mova         [esp+0x00], m14
    mova         [esp+0x10], m15
    pmaddubsw            m0, m15
    pmaddubsw            m7, m15
    pmaddubsw            m6, m15
    phaddw               m1, m2
    movu                 m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    mov                  r0, r0m
    phaddw               m3, m0
    pshufb               m2, m14
    pmaddubsw            m2, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw               m7, m6
    phaddw               m2, m2
    movd                 m6, r4
    movd                 m0, r5
    punpckldq            m6, m0
    punpcklbw            m6, m6
    psraw                m6, 8
    mova         [esp+0x20], m6
    pmulhrsw             m1, m12 ; 0 1
    pmulhrsw             m3, m12 ; 2 3
    pmulhrsw             m7, m12 ; 4 5
    pmulhrsw             m2, m12 ; 6 _
    shufps               m0, m1, m3, q1032  ; 1 2
    shufps               m4, m3, m7, q1032  ; 3 4
    shufps               m5, m7, m2, q1032  ; 5 6
    punpcklwd            m6, m1, m0 ; 01
    punpckhwd            m1, m0     ; 12
    mova         [esp+0x30], m1
    punpcklwd            m1, m3, m4 ; 23
    punpckhwd            m3, m4     ; 34
    mova         [esp+0x40], m3
    punpcklwd            m3, m7, m5 ; 45
    punpckhwd            m7, m5     ; 56
    mova         [esp+0x50], m7
    mova         [esp+0x60], m2
    mova                 m0, [esp+0x20]
 %xdefine m8 m1
 %xdefine m9 m3
 %xdefine m10 m0
    SWAP                 m7, m6
    SWAP                 m1, m4
    SWAP                 m3, m2
%endif
    pshufd               m1, m10, q0000
    pshufd               m3, m10, q1111
    pshufd               m5, m10, q2222
    pshufd              m10, m10, q3333
%if ARCH_X86_64
    mova         [rsp+0x00], m8
    mova         [rsp+0x10], m2
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m4
%else
    mova         [esp+0x70], m8
    mova         [esp+0x80], m9
    mova         [esp+0x90], m1
    mova         [esp+0xa0], m3
    mova         [esp+0xb0], m5
    mova         [esp+0xc0], m10
 %ifidn %1, put
    mov                 dsd, dsm
 %endif
 %define m11 m6
%endif
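; w4 vertical loop: accumulates taps over the 01/23/45/67 (row a) and
; 12/34/56/78 (row b) word pairs with pmaddwd, filters two new source rows
; per iteration, then rounds, packs and stores two rows (4 pixels for put,
; 4 intermediate words for prep).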
.dy1_w4_loop:
%if ARCH_X86_64
    movu                m11, [srcq+ssq*0]
    pmaddwd              m7, m1
    pmaddwd              m8, m3
    pmaddwd              m0, m1
    pmaddwd              m2, m3
    pmaddwd              m9, m5
    pmaddwd              m4, m5
    paddd                m7, m8
    paddd                m0, m2
    movu                 m8, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              m11, m14
    pmaddubsw           m11, m15
    paddd                m7, m13
    paddd                m0, m13
    paddd                m7, m9
    paddd                m0, m4
    pshufb               m8, m14
    pmaddubsw            m8, m15
    phaddw              m11, m8
    mova                 m8, [rsp+0x20]
    pmulhrsw            m11, m12
    punpcklwd            m9, m6, m11    ; 67
    psrldq               m6, m11, 8
    punpcklwd            m4, m11, m6    ; 78
    pmaddwd              m2, m9, m10
    pmaddwd             m11, m4, m10
    paddd                m7, m2
    mova                 m2, [rsp+0x30]
    paddd                m0, m11
%else
    SWAP                 m7, m6
    SWAP                 m1, m4
    SWAP                 m3, m2
    movu                 m5, [srcq+ssq*0]
    mova                 m0, [esp+0x30]
    mova                 m2, [esp+0x40]
    mova                 m4, [esp+0x50]
    pmaddwd              m6, [esp+0x90]
    pmaddwd              m1, [esp+0xa0]
    pmaddwd              m0, [esp+0x90]
    pmaddwd              m2, [esp+0xa0]
    pmaddwd              m3, [esp+0xb0]
    pmaddwd              m4, [esp+0xb0]
    paddd                m6, m1
    paddd                m0, m2
    movu                 m7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m5, m14
    pmaddubsw            m5, m15
    paddd                m6, m13
    paddd                m0, m13
    paddd                m6, m3
    paddd                m0, m4
    pshufb               m7, m14
    pmaddubsw            m7, m15
    phaddw               m5, m7
    mova                 m7, [rsp+0x80]
    pmulhrsw             m5, m12
    punpcklwd            m3, [esp+0x60], m5 ; 67
    psrldq               m1, m5, 8
    punpcklwd            m4, m5, m1         ; 78
    pmaddwd              m2, m3, [esp+0xc0]
    pmaddwd              m5, m4, [esp+0xc0]
    mova         [esp+0x60], m1
    paddd                m6, m2
    mova                 m2, [esp+0x50]
    paddd                m0, m5
    SWAP                 m7, m6
%endif
    psrad                m7, rndshift
    psrad                m0, rndshift
    packssdw             m7, m0
%if ARCH_X86_64
    mova                 m0, [rsp+0x10]
%else
    mova                 m0, [esp+0x40]
%define m11 m5
%endif
%ifidn %1, put
    packuswb             m7, m7
    psrldq              m11, m7, 4
    movd       [dstq+dsq*0], m7
    movd       [dstq+dsq*1], m11
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m7
    add                tmpq, 16
%endif
    sub                  hd, 2
    jz .ret
%if ARCH_X86_64
    mova                 m7, [rsp+0x00]
    mova         [rsp+0x00], m8
    mova         [rsp+0x10], m2
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m4
%else
    mova                 m7, [esp+0x70] ; 01
    mova                 m1, [esp+0x80] ; 23
    mova                 m2, [esp+0x50] ; 34
    mova         [esp+0x30], m0
    mova         [esp+0x70], m1
    mova         [esp+0x40], m2
    mova         [esp+0x80], m3
    mova         [esp+0x50], m4
%endif
    jmp .dy1_w4_loop
INIT_XMM ssse3
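; Widths >= 8 share .dy1_w_start and are processed in 8-pixel columns:
; [rsp+0x90] holds the column count (w/8) and tmp_stridem the prep output
; stride in bytes.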
.dy1_w8:
    mov    dword [rsp+0x90], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov    dword [rsp+0x90], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov    dword [rsp+0x90], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov    dword [rsp+0x90], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov    dword [rsp+0x90], 16
    movifprep   tmp_stridem, 256
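; Common w8-w128 setup: the horizontal filter index comes from the upper half
; of t0, the vertical 8-tap filter is fetched via my>>6 (64 << 24 keeps an
; identity filter when the fractional part is zero), its coefficients are
; sign-extended to words, and per-column state is spilled to the stack.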
.dy1_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define m8   m0
 %define m9   m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    sub                srcq, 3
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
    punpcklbw            m3, m3
    psraw                m3, 8
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    punpcklbw            m5, m5
    psraw                m5, 8
    SWAP                 m3, m5
%endif
    mova        [rsp+0x100], m7
    mova        [rsp+0x120], m15
    mov         [rsp+0x098], srcq
    mov         [rsp+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova        [rsp+0x140], m0
    mova        [rsp+0x150], m1
    mova        [rsp+0x160], m2
    mova        [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    SWAP                  m5, m3
    mov                   r5, hm
    mov          [esp+0x134], r5
%endif
    jmp .dy1_hloop
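; Column epilogue: stop after the last 8-pixel column, otherwise advance the
; dst/tmp pointer by 8 output samples, restore the saved height, source
; pointer and filter state, and step mx forward by dx*4 for the new column.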
.dy1_hloop_prep:
    dec   dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add   qword [rsp+0x130], 8*(isprep+1)
    mov                  hd, hm
%else
    add   dword [rsp+0x130], 8*(isprep+1)
    mov                  r5, [esp+0x134]
    mov                  r0, [esp+0x130]
%endif
    mova                 m7, [rsp+0x100]
    mova                m14, [rsp+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
%else
 %define m10 [base+pd_0x3ff]
%endif
    mova                m15, [rsp+0x120]
    mov                srcq, [rsp+0x098]
%if ARCH_X86_64
    mov                 r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
.dy1_hloop:
    pxor                 m9, m9
%if ARCH_X86_64
    mova                m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld                m2, m14, 10
    mova              [rsp], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m9
    psrldq               m2, m5, 8
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
    pxor                 m2, m2
 %define m9 m2
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pcmpeqd              m5, m9
    mova        [rsp+0x110], m14
    psrldq               m4, m15, 8
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    psrldq               m4, m14, 8
    movd               r10d, m14
    movd               r11d, m4
    psrldq              m14, 4
    psrldq               m4, 4
    movd               r13d, m14
    movd                rXd, m4
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m8, m11, m4
    pand                 m9, m11, m6
    pand                m15, m11, m7
    pand                m11, m11, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn                m7, m2
    pandn                m5, m3
    por                  m8, m4
    por                  m9, m6
    por                 m15, m7
    por                 m11, m5
    mova         [rsp+0x10], m8
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m15
    mova         [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova         [rsp+0x50], m1
    mova         [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova         [rsp+0x70], m3
    mova         [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP                 m7, m0
    SWAP                 m8, m14
    mova                 m1, [rsp+0x50]
    mova                 m2, [rsp+0x60]
    mova                 m3, [rsp+0x70]
    mova                m15, [rsp+0x80]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    SWAP                m14, m8
    mova                 m8, [rsp+0x140]
    mova                 m9, [rsp+0x150]
    mova                m10, [rsp+0x160]
    mova                m11, [rsp+0x170]
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m15; 23a
    punpckhwd            m3, m15    ; 23b
    mova         [rsp+0x50], m4
    mova         [rsp+0x60], m5
    mova         [rsp+0x70], m6
    mova         [rsp+0x80], m7
    mova                m14, [base+unpckw]
%else
    movd                 r0, m15
    movd                 rX, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [esp+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [esp+16], m14
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova         [esp+0x20], m0
    mova         [esp+0x30], m1
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m11, m4
    pand                 m1, m11, m6
    pand                 m2, m11, m7
    pand                 m3, m11, m5
    pandn                m4, [esp+0x20]
    pandn                m6, [esp+0x30]
    pandn                m7, [esp+0x40]
    pandn                m5, [esp+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    mova        [esp+0x20], m0
    mova        [esp+0x30], m1
    mova        [esp+0x40], m2
    mova        [esp+0x50], m3
    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
    mova                 m5, [esp+0x1a0]
    mova                 m6, [esp+0x1b0]
    mova                 m7, [esp+0x1c0]
    mova                 m0, [esp+0x1d0]
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m6
    mova        [esp+0x1d0], m7
    mova                 m1, [esp+0x060]
    mova                 m2, [esp+0x070]
    mova                 m3, [esp+0x180]
    mova                 m4, [esp+0x190]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova        [esp+0x060], m0
    mova        [esp+0x070], m1
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
 %define m8  [esp+0x140]
 %define m9  [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
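; Vertical loop: the 01/23/45/67 row pairs (a and b halves of the 8-pixel
; column) are multiplied by coefficient pairs m8-m11, biased with m13,
; shifted and packed; after storing, one new row is filtered and the
; interleaved window is shifted down a line.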
.dy1_vloop:
%if ARCH_X86_32
    mov                  r0, r0m
%endif
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    pmaddwd              m7, m3, m9
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x50], m10
    pmaddwd              m7, [rsp+0x60], m10
%else
    pmaddwd              m6, [rsp+0x1a0], m10
    pmaddwd              m7, [rsp+0x1b0], m10
%endif
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x70], m11
    pmaddwd              m7, [rsp+0x80], m11
%else
    pmaddwd              m6, [rsp+0x1c0], m11
    pmaddwd              m7, [rsp+0x1d0], m11
%endif
    paddd                m4, m6
    paddd                m5, m7
    psrad                m4, rndshift
    psrad                m5, rndshift
    packssdw             m4, m5
%ifidn %1, put
    packuswb             m4, m4
    movq             [dstq], m4
    add                dstq, dsm
%else
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov                 r0m, r0
%endif
    dec                  hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    movq                 m4, [srcq+ r4]
    movq                 m5, [srcq+ r6]
    movhps               m4, [srcq+ r7]
    movhps               m5, [srcq+ r9]
    movq                 m6, [srcq+r10]
    movq                 m7, [srcq+r11]
    movhps               m6, [srcq+r13]
    movhps               m7, [srcq+ rX]
    add                srcq, ssq
    pshufd              m15, m14, q1032
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m15                ; 3a 2a
    pshufb               m3, m15                ; 3b 2b
    pmaddubsw            m4, [rsp+0x10]
    pmaddubsw            m5, [rsp+0x20]
    pmaddubsw            m6, [rsp+0x30]
    pmaddubsw            m7, [rsp+0x40]
    phaddw               m4, m5
    phaddw               m6, m7
    phaddw               m4, m6
    pmulhrsw             m4, m12
    pshufb               m5, [rsp+0x70], m15    ; 7a 6a
    pshufb               m7, [rsp+0x80], m15    ; 7b 6b
    pshufb               m6, [rsp+0x50], m14    ; 4a 5a
    pshufb              m15, [rsp+0x60], m14    ; 4b 5b
    punpckhwd            m0, m2  ; 12a
    punpckhwd            m1, m3  ; 12b
    punpcklwd            m2, m6  ; 34a
    punpcklwd            m3, m15 ; 34b
    punpckhwd            m6, m5  ; 56a
    punpckhwd           m15, m7  ; 56b
    punpcklwd            m5, m4  ; 78a
    psrldq               m4, 8
    punpcklwd            m7, m4  ; 78b
    mova         [rsp+0x50], m6
    mova         [rsp+0x60], m15
    mova         [rsp+0x70], m5
    mova         [rsp+0x80], m7
%else
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova                 m6, [base+unpckw]
    mova                 m0, [esp+0x060]
    mova                 m1, [esp+0x070]
    mova                 m7, [esp+0x1a0]
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    pshufb               m0, m6         ; 0a 1a
    pshufb               m1, m6         ; 0b 1b
    pshufb               m7, m6         ; 4a 5a
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    movq                 m3, [srcq+r0]
    movq                 m2, [srcq+rX]
    movhps               m3, [srcq+r4]
    movhps               m2, [srcq+r5]
    add                srcq, ssq
    pmaddubsw            m4, [esp+0x20]
    pmaddubsw            m5, [esp+0x30]
    pmaddubsw            m3, [esp+0x40]
    pmaddubsw            m2, [esp+0x50]
    phaddw               m4, m5
    phaddw               m3, m2
    mova                 m5, [esp+0x1b0]
    mova                 m2, [esp+0x180]
    phaddw               m4, m3
    mova                 m3, [esp+0x190]
    pmulhrsw             m4, m12        ; 8a 8b
    pshufb               m5, m6         ; 4b 5b
    pshufd               m6, m6, q1032
    pshufb               m2, m6         ; 3a 2a
    pshufb               m3, m6         ; 3b 2b
    punpckhwd            m0, m2         ; 12a
    punpckhwd            m1, m3         ; 12b
    mova         [esp+0x60], m0
    mova         [esp+0x70], m1
    mova                 m0, [esp+0x1c0]
    mova                 m1, [esp+0x1d0]
    punpcklwd            m2, m7         ; 34a
    punpcklwd            m3, m5         ; 34b
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
    pshufb               m0, m6         ; 7a 6a
    pshufb               m1, m6         ; 7b 6b
    punpckhwd            m7, m0         ; 56a
    punpckhwd            m5, m1         ; 56b
    punpcklwd            m0, m4
    punpckhqdq           m4, m4
    punpcklwd            m1, m4
    mova        [esp+0x1a0], m7
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m0
    mova        [esp+0x1d0], m1
    mova                 m0, [esp+0x60]
    mova                 m1, [esp+0x70]
%endif
    jmp .dy1_vloop
INIT_XMM ssse3
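; .dy2: vertical path for a step of exactly two source rows per output row
; (2:1 vertical downscale); every output line consumes two freshly filtered
; source rows.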
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
.dy2_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
 %else
  %define m10 [base+pd_0x3ff]
  %define m11 [base+pd_0x4000]
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx                r5, byte [esp+0x1f0]
    dec                srcd
    movd                m15, r5
 %endif
    punpckldq            m9, m8
    SWAP                 m8, m9
    paddd               m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    psrldq              m15, 4
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_dw]
    mova                 m6, [base+subpel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m9, m9
    pcmpeqd              m8, m9
    psrld               m14, 10
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova         [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    movhps               m0, [srcq+ssq*2]
    movhps               m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movq                m10, r4q
 %else
    mov                 myd, mym
    mov                  r3, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r3, r3m
  %define m10 m4
    movd                m10, r4
    movd                 m3, r5
    punpckldq           m10, m3
 %endif
    movq                 m3, [srcq+ssq*0]
    movhps               m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw           m10, m10
    psraw               m10, 8
    punpckldq           m15, m7
    punpcklqdq          m15, m15
 %if ARCH_X86_64
    pand                m11, m8
 %else
    pand                 m7, m11, m8
  %define m11 m7
 %endif
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
 %if ARCH_X86_64
    pshufd               m8, m10, q0000
    pshufd               m9, m10, q1111
    pshufd              m11, m10, q3333
    pshufd              m10, m10, q2222
 %else
    mova         [esp+0x10], m15
  %define m15 [esp+0x10]
    mov                  r5, r0m
  %define dstq r5
    mov                 dsd, dsm
    pshufd               m5, m4, q0000
    pshufd               m6, m4, q1111
    pshufd               m7, m4, q2222
    pshufd               m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova                 m8, m5
    mova                 m9, m6
    mova                m10, m7
    mova                m11, m4
 %endif
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    pslldq               m2, m3, 8
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12            ; 0 2 _ 4
    pmulhrsw             m1, m12            ; 1 3 _ 5
    pshufd               m2, m0, q3110      ; 0 2 2 4
    pshufd               m1, m1, q3110      ; 1 3 3 5
    punpcklwd            m3, m2, m1         ; 01 23
    punpckhwd            m2, m1             ; 23 45
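; w2 dy2 loop: four new source rows are filtered per iteration (two output
; rows at a vertical step of 2), yielding the 45 67 and 67 89 word pairs
; for the last two pmaddwd stages.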
.dy2_w2_loop:
    movq                 m6, [srcq+ssq*0]
    movq                 m7, [srcq+ssq*1]
    movhps               m6, [srcq+ssq*2]
    movhps               m7, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m4, m3, m8
    pmaddwd              m5, m2, m9
    pshufb               m6, m14
    pshufb               m7, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    phaddw               m6, m7
    pmulhrsw             m6, m12
    psrldq               m7, m6, 8
    palignr              m6, m0, 8
    palignr              m7, m1, 8
    mova                 m0, m6
    mova                 m1, m7
    pshufd               m6, m6, q3221
    pshufd               m7, m7, q3221
    punpcklwd            m3, m6, m7       ; 45 67
    punpckhwd            m2, m6, m7       ; 67 89
    pmaddwd              m6, m3, m10
    pmaddwd              m7, m2, m11
    paddd                m4, m5
    paddd                m4, m13
    paddd                m6, m7
    paddd                m4, m6
    psrad                m4, rndshift
    packssdw             m4, m4
    packuswb             m4, m4
    movd                r4d, m4
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
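; .dy2 w4: same per-pixel horizontal filter gathering as .dy1_w4; the
; vertical window advances two source rows per output row.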
.dy2_w4:
%if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %define dstq r0
 %if isprep
  %define ssq r3
 %endif
    movzx                r4, byte [esp+0x1f0]
    dec                srcq
    movd                m15, r4
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    psrldq               m7, m15, 8
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                r6d, m15
    movd               r13d, m7
    movd                m15, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+r11*8+2]
    movd                 m3, [base+subpel_filters+ r6*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
    movq                 m6, [base+subpel_s_shuf2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
%else
    movd                 r1, m15
    movd                 r3, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                 r4, m15
    movd                 r5, m7
 %define m15 m5
    SWAP                 m4, m7
    movd                m15, [base+subpel_filters+r1*8+2]
    movd                 m2, [base+subpel_filters+r3*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m4, [base+subpel_filters+r5*8+2]
    movq                 m6, [base+subpel_s_shuf2]
    mov                 myd, mym
    mov                  r3, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r3, r3m
 %if isprep
    lea                ss3q, [ssq*3]
 %endif
%endif
    punpckldq           m15, m3
    punpckldq            m2, m4
    punpcklqdq          m15, m2
%if ARCH_X86_64
    pcmpeqd              m8, m9
    psrld               m14, 10
    movu                 m0, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*2]
    movu                 m1, [srcq+ssq*1]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpcklqdq           m6, m6
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pand                m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
    paddb               m14, m6
    movq                m11, r4q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufb               m0, m14
    pshufb               m2, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb               m5, m14
    pmaddubsw            m0, m15
    pmaddubsw            m2, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    phaddw               m0, m2
    phaddw               m1, m3
    phaddw               m4, m5
    pmulhrsw             m0, m12    ; 0 2
    pmulhrsw             m1, m12    ; 1 3
    pmulhrsw             m4, m12    ; 4 5
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
%else
    pxor                 m3, m3
    pcmpeqd              m8, m3
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ssq*1]
    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m7
    paddb               m14, m6
    movu                 m0, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m0, m14
    pshufb               m7, m14
    pshufb               m6, m14
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    mova         [esp+0x00], m14
    mova         [esp+0x10], m15
    pmaddubsw            m0, m15
    pmaddubsw            m7, m15
    pmaddubsw            m6, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw               m1, m2
    phaddw               m3, m0
    phaddw               m7, m6
 %ifidn %1, put
    mov                 dsd, dsm
  %define dstq r5
 %else
  %define tmpq r5
 %endif
    movd                 m6, r4
    movd                 m0, r5
    punpckldq            m6, m0
    punpcklbw            m6, m6
    psraw                m6, 8
    mov                  r5, r0m
    pmulhrsw             m1, m12 ; 0 2
    pmulhrsw             m3, m12 ; 1 3
    pmulhrsw             m7, m12 ; 4 5
    SWAP                 m0, m1, m3
    SWAP                 m4, m7
    pshufd               m2, m6, q0000
    pshufd               m3, m6, q1111
    pshufd               m7, m6, q2222
    pshufd               m6, m6, q3333
    mova         [esp+0x30], m2
    mova         [esp+0x40], m3
    mova         [esp+0x50], m7
    mova         [esp+0x60], m6
 %define m8  [esp+0x30]
 %define m9  [esp+0x40]
 %define m10 [esp+0x50]
 %define m11 [esp+0x60]
%endif
    psrldq               m5, m4, 8  ; 5 _
    punpckhwd            m2, m0, m1 ; 23
    punpcklwd            m0, m1     ; 01
    punpcklwd            m4, m5     ; 45
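; w4 dy2 loop: a0-a3 and b0-b3 are the four vertical taps of the two output
; rows; four new source rows per iteration supply the 67 and 89 pairs.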
.dy2_w4_loop:
    pmaddwd              m0, m8         ; a0
    pmaddwd              m5, m2, m8     ; b0
    pmaddwd              m2, m9         ; a1
    pmaddwd              m7, m4, m9     ; b1
    pmaddwd              m3, m4, m10    ; a2
    paddd                m0, m13
    paddd                m5, m13
    paddd                m0, m2
    paddd                m5, m7
    paddd                m0, m3
    movu                 m6, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    movu                 m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb               m6, m14
    pshufb               m7, m14
    pshufb               m3, m14
    pshufb               m1, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    pmaddubsw            m3, m15
    pmaddubsw            m1, m15
    phaddw               m6, m7
    phaddw               m3, m1
    pmulhrsw             m6, m12    ; 6 7
    pmulhrsw             m3, m12    ; 8 9
    psrldq               m7, m6, 8
    psrldq               m1, m3, 8
    punpcklwd            m6, m7     ; 67
    punpcklwd            m3, m1     ; 89
    mova                 m2, m6
    pmaddwd              m1, m6, m10    ; b2
    pmaddwd              m6, m11        ; a3
    pmaddwd              m7, m3, m11    ; b3
    paddd                m5, m1
    paddd                m0, m6
    paddd                m5, m7
    psrad                m0, rndshift
    psrad                m5, rndshift
    packssdw             m0, m5
%ifidn %1, put
    packuswb             m0, m0
    psrldq               m1, m0, 4
    movd       [dstq+dsq*0], m0
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m0
    add                tmpq, 16
%endif
    mova                 m0, m4
    mova                 m4, m3
    sub                  hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
INIT_XMM ssse3
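; Widths >= 8 again loop over 8-pixel columns via .dy2_w_start, with the
; column count in [rsp+0x90] and the prep stride in tmp_stridem.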
.dy2_w8:
    mov    dword [rsp+0x90], 1
    movifprep   tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov    dword [rsp+0x90], 2
    movifprep   tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov    dword [rsp+0x90], 4
    movifprep   tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov    dword [rsp+0x90], 8
    movifprep   tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov    dword [rsp+0x90], 16
    movifprep   tmp_stridem, 256
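; Common w8-w128 setup, mirroring .dy1_w_start: extract the horizontal
; filter index, select the vertical filter (identity when the fractional
; position is zero) and spill the per-column state to the stack.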
.dy2_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8   m0
 %define m9   m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define tmpq r0
  %define ssq ssm
 %else
  %define dstq r0
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    sub                srcq, 3
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
    punpcklbw            m3, m3
    psraw                m3, 8
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    punpcklbw            m5, m5
    psraw                m5, 8
    SWAP                 m3, m5
%endif
    mova        [rsp+0x100], m7
    mova        [rsp+0x120], m15
    mov         [rsp+0x098], srcq
    mov         [rsp+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova        [rsp+0x140], m0
    mova        [rsp+0x150], m1
    mova        [rsp+0x160], m2
    mova        [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    SWAP                  m5, m3
    mov                   r5, hm
    mov          [esp+0x134], r5
%endif
    jmp .dy2_hloop
7632.dy2_hloop_prep:
7633    dec   dword [rsp+0x090]
7634    jz .ret
7635%if ARCH_X86_64
7636    add   qword [rsp+0x130], 8*(isprep+1)
7637    mov                  hd, hm
7638%else
7639    add   dword [rsp+0x130], 8*(isprep+1)
7640    mov                  r5, [esp+0x134]
7641    mov                  r0, [esp+0x130]
7642%endif
7643    mova                 m7, [rsp+0x100]
7644    mova                m14, [rsp+0x110]
7645%if ARCH_X86_64
7646    mova                m10, [base+pd_0x3ff]
7647%else
7648 %define m10 [base+pd_0x3ff]
7649%endif
7650    mova                m15, [rsp+0x120]
7651    mov                srcq, [rsp+0x098]
7652%if ARCH_X86_64
7653    mov                 r0q, [rsp+0x130] ; dstq / tmpq
7654%else
7655    mov                  hm, r5
7656    mov                 r0m, r0
7657    mov                  r3, r3m
7658%endif
7659    paddd               m14, m7
7660.dy2_hloop:
7661    pxor                 m9, m9
7662%if ARCH_X86_64
7663    mova                m11, [base+pq_0x40000000]
7664%else
7665 %define m11 [base+pq_0x40000000]
7666%endif
7667    psrld                m2, m14, 10
7668    mova              [rsp], m2
7669    pand                 m6, m14, m10
7670    psrld                m6, 6
7671    paddd                m5, m15, m6
7672    pcmpeqd              m6, m9
7673    psrldq               m2, m5, 8
7674%if ARCH_X86_64
7675    movd                r4d, m5
7676    movd                r6d, m2
7677    psrldq               m5, 4
7678    psrldq               m2, 4
7679    movd                r7d, m5
7680    movd                r9d, m2
7681    movq                 m0, [base+subpel_filters+r4*8]
7682    movq                 m1, [base+subpel_filters+r6*8]
7683    movhps               m0, [base+subpel_filters+r7*8]
7684    movhps               m1, [base+subpel_filters+r9*8]
7685%else
7686    movd                 r0, m5
7687    movd                 rX, m2
7688    psrldq               m5, 4
7689    psrldq               m2, 4
7690    movd                 r4, m5
7691    movd                 r5, m2
7692    movq                 m0, [base+subpel_filters+r0*8]
7693    movq                 m1, [base+subpel_filters+rX*8]
7694    movhps               m0, [base+subpel_filters+r4*8]
7695    movhps               m1, [base+subpel_filters+r5*8]
7696    pxor                 m2, m2
7697 %define m9 m2
7698%endif
7699    paddd               m14, m7 ; mx+dx*[4-7]
7700    pand                 m5, m14, m10
7701    psrld                m5, 6
7702    paddd               m15, m5
7703    pcmpeqd              m5, m9
7704    mova        [rsp+0x110], m14
7705    psrldq               m4, m15, 8
7706%if ARCH_X86_64
7707    movd               r10d, m15
7708    movd               r11d, m4
7709    psrldq              m15, 4
7710    psrldq               m4, 4
7711    movd               r13d, m15
7712    movd                rXd, m4
7713    movq                 m2, [base+subpel_filters+r10*8]
7714    movq                 m3, [base+subpel_filters+r11*8]
7715    movhps               m2, [base+subpel_filters+r13*8]
7716    movhps               m3, [base+subpel_filters+ rX*8]
7717    psrld               m14, 10
7718    psrldq               m4, m14, 8
7719    movd               r10d, m14
7720    movd               r11d, m4
7721    psrldq              m14, 4
7722    psrldq               m4, 4
7723    movd               r13d, m14
7724    movd                rXd, m4
7725    mov                 r4d, [rsp+ 0]
7726    mov                 r6d, [rsp+ 8]
7727    mov                 r7d, [rsp+ 4]
7728    mov                 r9d, [rsp+12]
7729    pshufd               m4, m6, q1100
7730    pshufd               m6, m6, q3322
7731    pshufd               m7, m5, q1100
7732    pshufd               m5, m5, q3322
7733    pand                 m8, m11, m4
7734    pand                 m9, m11, m6
7735    pand                m15, m11, m7
7736    pand                m11, m11, m5
7737    pandn                m4, m0
7738    pandn                m6, m1
7739    pandn                m7, m2
7740    pandn                m5, m3
7741    por                  m8, m4
7742    por                  m9, m6
7743    por                 m15, m7
7744    por                 m11, m5
7745    mova         [rsp+0x10], m8
7746    mova         [rsp+0x20], m9
7747    mova         [rsp+0x30], m15
7748    mova         [rsp+0x40], m11
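; (Reading of the pand/pandn/por block above: lanes whose 4-bit filter index
; ((pos & 0x3ff) >> 6) compares equal to zero take pq_0x40000000, a
; pass-through filter with a single 0x40 tap, instead of the looked-up
; subpel coefficients.)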
7749    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
7750    mova         [rsp+0x50], m1
7751    mova         [rsp+0x60], m2
7752    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
7753    mova         [rsp+0x70], m3
7754    mova         [rsp+0x80], m4
7755    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
7756    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
7757    SWAP                 m7, m0
7758    SWAP                 m8, m14
7759    mova                 m1, [rsp+0x50]
7760    mova                 m2, [rsp+0x60]
7761    mova                 m3, [rsp+0x70]
7762    mova                m15, [rsp+0x80]
7763    punpcklwd            m4, m5, m6 ; 45a
7764    punpckhwd            m5, m6     ; 45b
7765    punpcklwd            m6, m7, m8 ; 67a
7766    punpckhwd            m7, m8     ; 67b
7767    SWAP                m14, m8
7768    mova                 m8, [rsp+0x140]
7769    mova                 m9, [rsp+0x150]
7770    mova                m10, [rsp+0x160]
7771    mova                m11, [rsp+0x170]
7772    punpcklwd            m0, m1, m2 ; 01a
7773    punpckhwd            m1, m2     ; 01b
7774    punpcklwd            m2, m3, m15; 23a
7775    punpckhwd            m3, m15    ; 23b
7776    mova         [rsp+0x50], m4
7777    mova         [rsp+0x60], m5
7778    mova         [rsp+0x70], m6
7779    mova         [rsp+0x80], m7
7780%else
7781    movd                 r0, m15
7782    movd                 rX, m4
7783    psrldq              m15, 4
7784    psrldq               m4, 4
7785    movd                 r4, m15
7786    movd                 r5, m4
7787    mova                m14, [esp+0x110]
7788    movq                 m2, [base+subpel_filters+r0*8]
7789    movq                 m3, [base+subpel_filters+rX*8]
7790    movhps               m2, [base+subpel_filters+r4*8]
7791    movhps               m3, [base+subpel_filters+r5*8]
7792    psrld               m14, 10
7793    mova           [esp+16], m14
7794    mov                  r0, [esp+ 0]
7795    mov                  rX, [esp+ 8]
7796    mov                  r4, [esp+ 4]
7797    mov                  r5, [esp+12]
7798    mova         [esp+0x20], m0
7799    mova         [esp+0x30], m1
7800    mova         [esp+0x40], m2
7801    mova         [esp+0x50], m3
7802    pshufd               m4, m6, q1100
7803    pshufd               m6, m6, q3322
7804    pshufd               m7, m5, q1100
7805    pshufd               m5, m5, q3322
7806    pand                 m0, m11, m4
7807    pand                 m1, m11, m6
7808    pand                 m2, m11, m7
7809    pand                 m3, m11, m5
7810    pandn                m4, [esp+0x20]
7811    pandn                m6, [esp+0x30]
7812    pandn                m7, [esp+0x40]
7813    pandn                m5, [esp+0x50]
7814    por                  m0, m4
7815    por                  m1, m6
7816    por                  m2, m7
7817    por                  m3, m5
7818    mova        [esp+0x20], m0
7819    mova        [esp+0x30], m1
7820    mova        [esp+0x40], m2
7821    mova        [esp+0x50], m3
7822    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
7823    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
7824    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
7825    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
7826    mova                 m5, [esp+0x1a0]
7827    mova                 m6, [esp+0x1b0]
7828    mova                 m7, [esp+0x1c0]
7829    mova                 m0, [esp+0x1d0]
7830    punpcklwd            m4, m5, m6      ; 45a
7831    punpckhwd            m5, m6          ; 45b
7832    punpcklwd            m6, m7, m0      ; 67a
7833    punpckhwd            m7, m0          ; 67b
7834    mova        [esp+0x1a0], m4
7835    mova        [esp+0x1b0], m5
7836    mova        [esp+0x1c0], m6
7837    mova        [esp+0x1d0], m7
7838    mova                 m1, [esp+0x060]
7839    mova                 m2, [esp+0x070]
7840    mova                 m3, [esp+0x180]
7841    mova                 m4, [esp+0x190]
7842    punpcklwd            m0, m1, m2      ; 01a
7843    punpckhwd            m1, m2          ; 01b
7844    punpcklwd            m2, m3, m4      ; 23a
7845    punpckhwd            m3, m4          ; 23b
7846    mova        [esp+0x180], m2
7847    mova        [esp+0x190], m3
7848 %define m8  [esp+0x140]
7849 %define m9  [esp+0x150]
7850 %define m10 [esp+0x160]
7851 %define m11 [esp+0x170]
7852%endif
7853.dy2_vloop:
7854%if ARCH_X86_32
7855    mov                  r0, r0m
7856%endif
7857    pmaddwd              m4, m0, m8
7858    pmaddwd              m5, m1, m8
7859    pmaddwd              m6, m2, m9
7860    pmaddwd              m7, m3, m9
7861    paddd                m4, m13
7862    paddd                m5, m13
7863    paddd                m4, m6
7864    paddd                m5, m7
7865%if ARCH_X86_64
7866    pmaddwd              m6, [rsp+0x50], m10
7867    pmaddwd              m7, [rsp+0x60], m10
7868%else
7869    pmaddwd              m6, [esp+0x1a0], m10
7870    pmaddwd              m7, [esp+0x1b0], m10
7871%endif
7872    paddd                m4, m6
7873    paddd                m5, m7
7874%if ARCH_X86_64
7875    pmaddwd              m6, [rsp+0x70], m11
7876    pmaddwd              m7, [rsp+0x80], m11
7877%else
7878    pmaddwd              m6, [esp+0x1c0], m11
7879    pmaddwd              m7, [esp+0x1d0], m11
7880%endif
7881    paddd                m4, m6
7882    paddd                m5, m7
7883    psrad                m4, rndshift
7884    psrad                m5, rndshift
7885    packssdw             m4, m5
7886%ifidn %1, put
7887    packuswb             m4, m4
7888    movq             [dstq], m4
7889    add                dstq, dsm
7890%else
7891    mova             [tmpq], m4
7892    add                tmpq, tmp_stridem
7893%endif
7894%if ARCH_X86_32
7895    mov                 r0m, r0
7896%endif
7897    dec                  hd
7898    jz .dy2_hloop_prep
7899%if ARCH_X86_64
7900    mova                 m8, [rsp+0x10]
7901    mova                 m9, [rsp+0x20]
7902    mova                m10, [rsp+0x30]
7903    mova                m11, [rsp+0x40]
7904    mova                 m0, m2             ; 01a
7905    mova                 m1, m3             ; 01b
7906    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
7907    mova                 m3, [rsp+0x50] ; 23a
7908    mova                 m4, [rsp+0x60] ; 23b
7909    mova                 m5, [rsp+0x70] ; 45a
7910    mova                 m7, [rsp+0x80] ; 45b
7911    mova                 m8, [rsp+0x140]
7912    mova                 m9, [rsp+0x150]
7913    mova                m10, [rsp+0x160]
7914    mova                m11, [rsp+0x170]
7915    punpcklwd           m14, m2, m6     ; 67a
7916    punpckhwd            m2, m6         ; 67b
7917    mova         [rsp+0x50], m5
7918    mova         [rsp+0x60], m7
7919    mova         [rsp+0x70], m14
7920    mova         [rsp+0x80], m2
7921    mova                 m2, m3
7922    mova                 m3, m4
7923%else
7924    MC_8TAP_SCALED_H   0x20, 0
7925    punpcklwd            m6, m0, m4
7926    punpckhwd            m7, m0, m4
7927    mova                 m0, [esp+0x180] ; 01a
7928    mova                 m1, [esp+0x190] ; 01b
7929    mova                 m2, [rsp+0x1a0]  ; 23a
7930    mova                 m3, [esp+0x1b0]  ; 23b
7931    mova                 m4, [esp+0x1c0]  ; 45a
7932    mova                 m5, [esp+0x1d0]  ; 45b
7933    mova        [esp+0x180], m2
7934    mova        [esp+0x190], m3
7935    mova        [esp+0x1a0], m4
7936    mova        [esp+0x1b0], m5
7937    mova        [esp+0x1c0], m6          ; 67a
7938    mova        [esp+0x1d0], m7          ; 67b
7939%endif
7940    jmp .dy2_vloop
7941.ret:
7942    MC_8TAP_SCALED_RET 0
7943%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
7944 %define r0m [rstk+stack_offset+ 4]
7945 %define r1m [rstk+stack_offset+ 8]
7946 %define r2m [rstk+stack_offset+12]
7947 %define r3m [rstk+stack_offset+16]
7948%endif
7949%undef isprep
7950%endmacro
7951
7952%macro BILIN_SCALED_FN 1
7953cglobal %1_bilin_scaled_8bpc
7954    mov                 t0d, (5*15 << 16) | 5*15
7955    mov                 t1d, (5*15 << 16) | 5*15
7956    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
7957%endmacro
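; The filter index 5*15 is, by my reading of the table layout, the row offset
; of the bilinear kernels in subpel_filters: five sets of fifteen 8-tap
; filters precede them, so pointing both the horizontal and vertical filter
; fields at 5*15 makes the 8tap_scaled code run with bilinear coefficients.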
7958
7959%if WIN64
7960DECLARE_REG_TMP 6, 5
7961%elif ARCH_X86_64
7962DECLARE_REG_TMP 6, 8
7963%else
7964DECLARE_REG_TMP 1, 2
7965%endif
7966%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
7967BILIN_SCALED_FN put
7968PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
7969PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
7970PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
7971PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
7972PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
7973PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
7974PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
7975PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
7976PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
7977MC_8TAP_SCALED put
7978
7979%if WIN64
7980DECLARE_REG_TMP 5, 4
7981%elif ARCH_X86_64
7982DECLARE_REG_TMP 6, 7
7983%else
7984DECLARE_REG_TMP 1, 2
7985%endif
7986%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
7987BILIN_SCALED_FN prep
7988PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
7989PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
7990PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
7991PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
7992PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
7993PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
7994PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
7995PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
7996PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
7997MC_8TAP_SCALED prep
7998
7999%if ARCH_X86_32
8000 %macro SAVE_ALPHA_BETA 0
8001    mov              alpham, alphad
8002    mov               betam, betad
8003 %endmacro
8004
8005 %macro SAVE_DELTA_GAMMA 0
8006    mov              deltam, deltad
8007    mov              gammam, gammad
8008 %endmacro
8009
8010 %macro LOAD_ALPHA_BETA_MX 0
8011    mov                 mym, myd
8012    mov              alphad, alpham
8013    mov               betad, betam
8014    mov                 mxd, mxm
8015 %endmacro
8016
8017 %macro LOAD_DELTA_GAMMA_MY 0
8018    mov                 mxm, mxd
8019    mov              deltad, deltam
8020    mov              gammad, gammam
8021    mov                 myd, mym
8022 %endmacro
8023
8024 %define PIC_reg r2
8025 %define PIC_base_offset $$
8026 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
8027%else
8028 %define SAVE_ALPHA_BETA
8029 %define SAVE_DELTA_GAMMA
8030 %define PIC_sym(sym) sym
8031%endif
8032
8033%if ARCH_X86_32
8034 %if STACK_ALIGNMENT < required_stack_alignment
8035  %assign copy_args 8*4
8036 %else
8037  %assign copy_args 0
8038 %endif
8039%endif
8040
8041%macro RELOC_ARGS 0
8042 %if copy_args
8043    mov                  r0, r0m
8044    mov                  r1, r1m
8045    mov                  r2, r2m
8046    mov                  r3, r3m
8047    mov                  r5, r5m
8048    mov                dstm, r0
8049    mov                 dsm, r1
8050    mov                srcm, r2
8051    mov                 ssm, r3
8052    mov                 mxm, r5
8053    mov                  r0, r6m
8054    mov                 mym, r0
8055 %endif
8056%endmacro
8057
8058%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
8059 %if cpuflag(sse4)
8060    pblendw              %1, %2, 0xAA
8061 %else
8062    pand                 %2, m10
8063    por                  %1, %2
8064 %endif
8065%endmacro
8066
8067%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
8068 %if ARCH_X86_32
8069  %define m8  m4
8070  %define m9  m5
8071  %define m14 m6
8072  %define m15 m7
8073  %define m11 m7
8074 %endif
8075 %if ARCH_X86_32
8076    pxor                m11, m11
8077 %endif
8078    lea               tmp1d, [myq+deltaq*4]
8079    lea               tmp2d, [myq+deltaq*1]
8080    shr                 myd, 10
8081    shr               tmp1d, 10
8082    movq                 m2, [filterq+myq  *8] ; a
8083    movq                 m8, [filterq+tmp1q*8] ; e
8084    lea               tmp1d, [tmp2q+deltaq*4]
8085    lea                 myd, [tmp2q+deltaq*1]
8086    shr               tmp2d, 10
8087    shr               tmp1d, 10
8088    movq                 m3, [filterq+tmp2q*8] ; b
8089    movq                 m0, [filterq+tmp1q*8] ; f
8090    punpcklwd            m2, m3
8091    punpcklwd            m8, m0
8092    lea               tmp1d, [myq+deltaq*4]
8093    lea               tmp2d, [myq+deltaq*1]
8094    shr                 myd, 10
8095    shr               tmp1d, 10
8096    movq                 m0, [filterq+myq  *8] ; c
8097    movq                 m9, [filterq+tmp1q*8] ; g
8098    lea               tmp1d, [tmp2q+deltaq*4]
8099    lea                 myd, [tmp2q+gammaq]       ; my += gamma
8100    shr               tmp2d, 10
8101    shr               tmp1d, 10
8102    movq                 m3, [filterq+tmp2q*8] ; d
8103    movq                 m1, [filterq+tmp1q*8] ; h
8104    punpcklwd            m0, m3
8105    punpcklwd            m9, m1
8106    punpckldq            m1, m2, m0
8107    punpckhdq            m2, m0
8108    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
8109    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
8110    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
8111    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
8112    pmaddwd              m0, %3
8113    pmaddwd              m3, %5
8114    pmaddwd              m1, %7
8115    pmaddwd             m14, %9
8116    paddd                m0, m3
8117    paddd                m1, m14
8118    paddd                m0, m1
8119    mova                 %1, m0
8120 %if ARCH_X86_64
8121    SWAP                 m3, m14
8122 %endif
8123    punpckldq            m0, m8, m9
8124    punpckhdq            m8, m9
8125    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
8126    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
8127    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
8128    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
8129    pmaddwd              m1, %4
8130    pmaddwd             m14, %6
8131    pmaddwd              m2, %8
8132    pmaddwd             m15, %10
8133    paddd                m1, m14
8134    paddd                m2, m15
8135    paddd                m1, m2
8136    mova                 %2, m1
8137 %if ARCH_X86_64
8138    SWAP                m14, m3
8139 %endif
8140%endmacro
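; Reading of WARP_V above: the eight per-column vertical filters a..h are
; fetched at my + k*delta for k = 0..7, interleaved pairwise, and applied as
; four pmaddwd stages per output half, accumulating 32-bit sums into %1/%2;
; "my += gamma" then steps to the next row (gamma was pre-adjusted by
; -3*delta in .main to cancel the walk done while gathering).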
8141
8142%if ARCH_X86_64
8143 %define counterd r4d
8144%else
8145 %if copy_args == 0
8146  %define counterd dword r4m
8147 %else
8148  %define counterd dword [esp+stack_size-4*7]
8149 %endif
8150%endif
8151
8152%macro WARP_AFFINE_8X8 0
8153%if ARCH_X86_64
8154cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
8155%else
8156cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
8157 %if copy_args
8158  %define tmpm [esp+stack_size-4*1]
8159  %define tsm  [esp+stack_size-4*2]
8160 %endif
8161%endif
8162    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
8163.loop:
8164%if ARCH_X86_32
8165 %define m12 m4
8166 %define m13 m5
8167 %define m14 m6
8168 %define m15 m7
8169    mova                m12, [esp+0xC0]
8170    mova                m13, [esp+0xD0]
8171    mova                m14, [esp+0xE0]
8172    mova                m15, [esp+0xF0]
8173%endif
8174    psrad               m12, 13
8175    psrad               m13, 13
8176    psrad               m14, 13
8177    psrad               m15, 13
8178    packssdw            m12, m13
8179    packssdw            m14, m15
8180    mova                m13, [PIC_sym(pw_8192)]
8181    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
8182    pmulhrsw            m14, m13
8183    mova       [tmpq+tsq*0], m12
8184    mova       [tmpq+tsq*2], m14
8185    dec            counterd
8186    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
8187%if ARCH_X86_32
8188    mov                tmpm, tmpd
8189    mov                  r0, [esp+0x100]
8190    mov                  r1, [esp+0x104]
8191%endif
8192    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
8193    lea                tmpq, [tmpq+tsq*4]
8194    jmp .loop
8195
8196%if ARCH_X86_64
8197cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
8198                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
8199                              filter, tmp1, delta, my, gamma
8200%else
8201cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
8202                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
8203                              filter, tmp1, delta, my, gamma
8204 %define alphaq     r0
8205 %define alphad     r0
8206 %define alpham     [esp+gprsize+0x100]
8207 %define betaq      r1
8208 %define betad      r1
8209 %define betam      [esp+gprsize+0x104]
8210 %define deltaq     r0
8211 %define deltad     r0
8212 %define deltam     [esp+gprsize+0x108]
8213 %define gammaq     r1
8214 %define gammad     r1
8215 %define gammam     [esp+gprsize+0x10C]
8216 %define filterq    r3
8217 %define tmp1q      r4
8218 %define tmp1d      r4
8219 %define tmp1m      [esp+gprsize+0x110]
8220 %define myq        r5
8221 %define myd        r5
8222 %define mym        r6m
8223 %if copy_args
8224  %define dstm [esp+stack_size-4*1]
8225  %define dsm  [esp+stack_size-4*2]
8226  %define srcm [esp+stack_size-4*3]
8227  %define ssm  [esp+stack_size-4*4]
8228  %define mxm  [esp+stack_size-4*5]
8229  %define mym  [esp+stack_size-4*6]
8230 %endif
8231%endif
8232    call .main
8233    jmp .start
8234.loop:
8235%if ARCH_X86_32
8236    mov                dstm, dstd
8237    mov              alphad, [esp+0x100]
8238    mov               betad, [esp+0x104]
8239%endif
8240    call .main2
8241    lea                dstq, [dstq+dsq*2]
8242.start:
8243%if notcpuflag(sse4)
8244  %define roundval pw_8192
8245 %if ARCH_X86_64
8246    mova                m10, [PIC_sym(roundval)]
8247 %else
8248  %define m10 [PIC_sym(roundval)]
8249 %endif
8250%endif
8251%if ARCH_X86_32
8252 %define m12 m5
8253 %define m13 m6
8254    mova                m12, [esp+0xC0]
8255    mova                m13, [esp+0xD0]
8256%endif
8257%if cpuflag(sse4)
8258 %if ARCH_X86_32
8259  %define m11 m4
8260    pxor                m11, m11
8261 %endif
8262    psrad               m12, 18
8263    psrad               m13, 18
8264    packusdw            m12, m13
8265    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
8266%else
8267    psrad               m12, 17
8268    psrad               m13, 17
8269    packssdw            m12, m13
8270    pmulhrsw            m12, m10
8271%endif
8272%if ARCH_X86_32
8273 %define m14 m6
8274 %define m15 m7
8275    mova                m14, [esp+0xE0]
8276    mova                m15, [esp+0xF0]
8277%endif
8278%if cpuflag(sse4)
8279    psrad               m14, 18
8280    psrad               m15, 18
8281    packusdw            m14, m15
8282    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
8283%else
8284    psrad               m14, 17
8285    psrad               m15, 17
8286    packssdw            m14, m15
8287    pmulhrsw            m14, m10
8288%endif
8289    packuswb            m12, m14
8290    movq       [dstq+dsq*0], m12
8291    movhps     [dstq+dsq*1], m12
8292    dec            counterd
8293    jg .loop
8294.end:
8295    RET
8296ALIGN function_align
8297.main:
8298%assign stack_offset stack_offset+gprsize
8299%if ARCH_X86_32
8300 %assign stack_size stack_size+4
8301 %if copy_args
8302  %assign stack_offset stack_offset-4
8303 %endif
8304    RELOC_ARGS
8305    LEA             PIC_reg, $$
8306 %define PIC_mem [esp+gprsize+0x114]
8307    mov               abcdd, abcdm
8308 %if copy_args == 0
8309    mov                 ssd, ssm
8310    mov                 mxd, mxm
8311 %endif
8312    mov             PIC_mem, PIC_reg
8313    mov                srcd, srcm
8314%endif
8315    movsx            deltad, word [abcdq+2*2]
8316    movsx            gammad, word [abcdq+2*3]
8317    lea               tmp1d, [deltaq*3]
8318    sub              gammad, tmp1d    ; gamma -= delta*3
8319    SAVE_DELTA_GAMMA
8320%if ARCH_X86_32
8321    mov               abcdd, abcdm
8322%endif
8323    movsx            alphad, word [abcdq+2*0]
8324    movsx             betad, word [abcdq+2*1]
8325    lea               tmp1q, [ssq*3+3]
8326    add                 mxd, 512+(64<<10)
8327    lea               tmp2d, [alphaq*3]
8328    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
8329%if ARCH_X86_32
8330    mov                srcm, srcd
8331    mov             PIC_reg, PIC_mem
8332%endif
8333    sub               betad, tmp2d    ; beta -= alpha*3
8334    lea             filterq, [PIC_sym(mc_warp_filter2)]
8335%if ARCH_X86_64
8336    mov                 myd, r6m
8337    pxor                m11, m11
8338%endif
8339    call .h
8340    psrld                m2, m0, 16
8341    psrld                m3, m1, 16
8342%if ARCH_X86_32
8343    mova [esp+gprsize+0x10], m3
8344%endif
8345    call .h
8346    psrld                m4, m0, 16
8347    psrld                m5, m1, 16
8348%if ARCH_X86_32
8349    mova [esp+gprsize+0x20], m4
8350    mova [esp+gprsize+0x30], m5
8351%endif
8352    call .h
8353%if ARCH_X86_64
8354 %define blendmask [rsp+gprsize+0x80]
8355%else
8356    mova                 m3, [esp+gprsize+0x10]
8357 %define blendmask [esp+gprsize+0x120]
8358 %define m10 m7
8359%endif
8360    pcmpeqd             m10, m10
8361    pslld               m10, 16
8362    mova          blendmask, m10
8363    BLENDHWDW            m2, m0 ; 0
8364    BLENDHWDW            m3, m1 ; 2
8365    mova [rsp+gprsize+0x00], m2
8366    mova [rsp+gprsize+0x10], m3
8367    call .h
8368%if ARCH_X86_32
8369    mova                 m4, [esp+gprsize+0x20]
8370    mova                 m5, [esp+gprsize+0x30]
8371%endif
8372    mova                m10, blendmask
8373    BLENDHWDW            m4, m0 ; 1
8374    BLENDHWDW            m5, m1 ; 3
8375    mova [rsp+gprsize+0x20], m4
8376    mova [rsp+gprsize+0x30], m5
8377    call .h
8378%if ARCH_X86_32
8379    mova                 m3, [esp+gprsize+0x10]
8380 %define m10 m5
8381%endif
8382    psrld                m6, m2, 16
8383    psrld                m7, m3, 16
8384    mova                m10, blendmask
8385    BLENDHWDW            m6, m0 ; 2
8386    BLENDHWDW            m7, m1 ; 4
8387    mova [rsp+gprsize+0x40], m6
8388    mova [rsp+gprsize+0x50], m7
8389    call .h
8390%if ARCH_X86_32
8391    mova                m4, [esp+gprsize+0x20]
8392    mova                m5, [esp+gprsize+0x30]
8393%endif
8394    psrld               m2, m4, 16
8395    psrld               m3, m5, 16
8396    mova                m10, blendmask
8397    BLENDHWDW           m2, m0 ; 3
8398    BLENDHWDW           m3, m1 ; 5
8399    mova [rsp+gprsize+0x60], m2
8400    mova [rsp+gprsize+0x70], m3
8401    call .h
8402%if ARCH_X86_32
8403    mova                 m6, [esp+gprsize+0x40]
8404    mova                 m7, [esp+gprsize+0x50]
8405 %define m10 m7
8406%endif
8407    psrld                m4, m6, 16
8408    psrld                m5, m7, 16
8409    mova                m10, blendmask
8410    BLENDHWDW            m4, m0 ; 4
8411    BLENDHWDW            m5, m1 ; 6
8412%if ARCH_X86_64
8413    add                 myd, 512+(64<<10)
8414    mova                 m6, m2
8415    mova                 m7, m3
8416%else
8417    mova [esp+gprsize+0x80], m4
8418    mova [esp+gprsize+0x90], m5
8419    add           dword mym, 512+(64<<10)
8420%endif
8421    mov            counterd, 4
8422    SAVE_ALPHA_BETA
8423.main2:
8424    call .h
8425%if ARCH_X86_32
8426    mova                 m6, [esp+gprsize+0x60]
8427    mova                 m7, [esp+gprsize+0x70]
8428 %define m10 m5
8429%endif
8430    psrld                m6, 16
8431    psrld                m7, 16
8432    mova                m10, blendmask
8433    BLENDHWDW            m6, m0 ; 5
8434    BLENDHWDW            m7, m1 ; 7
8435%if ARCH_X86_64
8436    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
8437                                  m4, m5, \
8438                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
8439                                  m6, m7
8440%else
8441    mova [esp+gprsize+0xA0], m6
8442    mova [esp+gprsize+0xB0], m7
8443    LOAD_DELTA_GAMMA_MY
8444    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
8445           [esp+gprsize+0x00], [esp+gprsize+0x10], \
8446           [esp+gprsize+0x80], [esp+gprsize+0x90], \
8447           [esp+gprsize+0x20], [esp+gprsize+0x30], \
8448           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
8449    LOAD_ALPHA_BETA_MX
8450%endif
8451    call .h
8452    mova                 m2, [rsp+gprsize+0x40]
8453    mova                 m3, [rsp+gprsize+0x50]
8454%if ARCH_X86_32
8455    mova                 m4, [rsp+gprsize+0x80]
8456    mova                 m5, [rsp+gprsize+0x90]
8457 %define m10 m7
8458%endif
8459    mova [rsp+gprsize+0x00], m2
8460    mova [rsp+gprsize+0x10], m3
8461    mova [rsp+gprsize+0x40], m4
8462    mova [rsp+gprsize+0x50], m5
8463    psrld                m4, 16
8464    psrld                m5, 16
8465    mova                m10, blendmask
8466    BLENDHWDW            m4, m0 ; 6
8467    BLENDHWDW            m5, m1 ; 8
8468%if ARCH_X86_64
8469    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
8470                                  m6, m7, \
8471                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
8472                                  m4, m5
8473%else
8474    mova [esp+gprsize+0x80], m4
8475    mova [esp+gprsize+0x90], m5
8476    LOAD_DELTA_GAMMA_MY
8477    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
8478           [esp+gprsize+0x20], [esp+gprsize+0x30], \
8479           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
8480           [esp+gprsize+0x00], [esp+gprsize+0x10], \
8481           [esp+gprsize+0x80], [esp+gprsize+0x90]
8482    mov                 mym, myd
8483    mov                dstd, dstm
8484    mov                 dsd, dsm
8485    mov                 mxd, mxm
8486%endif
8487    mova                 m2, [rsp+gprsize+0x60]
8488    mova                 m3, [rsp+gprsize+0x70]
8489%if ARCH_X86_32
8490    mova                 m6, [esp+gprsize+0xA0]
8491    mova                 m7, [esp+gprsize+0xB0]
8492%endif
8493    mova [rsp+gprsize+0x20], m2
8494    mova [rsp+gprsize+0x30], m3
8495    mova [rsp+gprsize+0x60], m6
8496    mova [rsp+gprsize+0x70], m7
8497    ret
8498ALIGN function_align
8499.h:
8500%if ARCH_X86_32
8501 %define m8  m3
8502 %define m9  m4
8503 %define m10 m5
8504 %define m14 m6
8505 %define m15 m7
8506%endif
8507    lea               tmp1d, [mxq+alphaq*4]
8508    lea               tmp2d, [mxq+alphaq*1]
8509%if ARCH_X86_32
8510 %assign stack_offset stack_offset+4
8511 %assign stack_size stack_size+4
8512 %define PIC_mem [esp+gprsize*2+0x114]
8513    mov             PIC_mem, PIC_reg
8514    mov                srcd, srcm
8515%endif
8516    movu                m10, [srcq]
8517%if ARCH_X86_32
8518    add                srcd, ssm
8519    mov                srcm, srcd
8520    mov             PIC_reg, PIC_mem
8521%else
8522    add                srcq, ssq
8523%endif
8524    shr                 mxd, 10
8525    shr               tmp1d, 10
8526    movq                 m1, [filterq+mxq  *8]  ; 0 X
8527    movq                 m8, [filterq+tmp1q*8]  ; 4 X
8528    lea               tmp1d, [tmp2q+alphaq*4]
8529    lea                 mxd, [tmp2q+alphaq*1]
8530    shr               tmp2d, 10
8531    shr               tmp1d, 10
8532    movhps               m1, [filterq+tmp2q*8]  ; 0 1
8533    movhps               m8, [filterq+tmp1q*8]  ; 4 5
8534    lea               tmp1d, [mxq+alphaq*4]
8535    lea               tmp2d, [mxq+alphaq*1]
8536    shr                 mxd, 10
8537    shr               tmp1d, 10
8538    movq                m14, [filterq+mxq  *8]  ; 2 X
8539    movq                 m9, [filterq+tmp1q*8]  ; 6 X
8540    lea               tmp1d, [tmp2q+alphaq*4]
8541    lea                 mxd, [tmp2q+betaq]  ; mx += beta
8542    shr               tmp2d, 10
8543    shr               tmp1d, 10
8544    movhps              m14, [filterq+tmp2q*8]  ; 2 3
8545    movhps               m9, [filterq+tmp1q*8]  ; 6 7
8546    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
8547    pmaddubsw            m0, m1
8548    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
8549    pmaddubsw            m1, m8
8550    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
8551    pmaddubsw           m15, m14
8552    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
8553    pmaddubsw           m10, m9
8554    phaddw               m0, m15
8555    phaddw               m1, m10
8556    mova                m14, [PIC_sym(pw_8192)]
8557    mova                 m9, [PIC_sym(pd_32768)]
8558    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
8559    pmaddwd              m1, m14
8560    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
8561    paddd                m1, m9
8562    ret
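; The horizontal kernel above leaves the rounded 14-bit result in the upper
; 16 bits of each dword (see the two comments before ret); callers extract
; it with psrld 16 and merge adjacent rows' results with BLENDHWDW.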
8563%endmacro
8564
8565%if WIN64
8566DECLARE_REG_TMP 6, 4
8567%else
8568DECLARE_REG_TMP 6, 7
8569%endif
8570
8571%macro BIDIR_FN 1 ; op
8572    %1                    0
8573    lea            stride3q, [strideq*3]
8574    jmp                  wq
8575.w4_loop:
8576    %1_INC_PTR            2
8577    %1                    0
8578    lea                dstq, [dstq+strideq*4]
8579.w4: ; 4 pixels wide, 4 rows per iteration
8580    movd   [dstq          ], m0      ; store dw[0]
8581    pshuflw              m1, m0, q1032 ; move dw[1] into the low dword
8582    movd   [dstq+strideq*1], m1      ; store dw[1]
8583    punpckhqdq           m0, m0      ; move dw[3,2] down into dw[1,0]
8584    movd   [dstq+strideq*2], m0      ; store dw[2]
8585    psrlq                m0, 32      ; shift dw[3] down into dw[0]
8586    movd   [dstq+stride3q ], m0      ; store dw[3]
8587    sub                  hd, 4
8588    jg .w4_loop
8589    RET
8590.w8_loop:
8591    %1_INC_PTR            2
8592    %1                    0
8593    lea                dstq, [dstq+strideq*2]
8594.w8:
8595    movq   [dstq          ], m0
8596    movhps [dstq+strideq*1], m0
8597    sub                  hd, 2
8598    jg .w8_loop
8599    RET
8600.w16_loop:
8601    %1_INC_PTR            2
8602    %1                    0
8603    lea                dstq, [dstq+strideq]
8604.w16:
8605    mova   [dstq          ], m0
8606    dec                  hd
8607    jg .w16_loop
8608    RET
8609.w32_loop:
8610    %1_INC_PTR            4
8611    %1                    0
8612    lea                dstq, [dstq+strideq]
8613.w32:
8614    mova   [dstq          ], m0
8615    %1                    2
8616    mova   [dstq + 16     ], m0
8617    dec                  hd
8618    jg .w32_loop
8619    RET
8620.w64_loop:
8621    %1_INC_PTR            8
8622    %1                    0
8623    add                dstq, strideq
8624.w64:
8625    %assign i 0
8626    %rep 4
8627    mova   [dstq + i*16   ], m0
8628    %assign i i+1
8629    %if i < 4
8630    %1                    2*i
8631    %endif
8632    %endrep
8633    dec                  hd
8634    jg .w64_loop
8635    RET
8636.w128_loop:
8637    %1_INC_PTR            16
8638    %1                    0
8639    add                dstq, strideq
8640.w128:
8641    %assign i 0
8642    %rep 8
8643    mova   [dstq + i*16   ], m0
8644    %assign i i+1
8645    %if i < 8
8646    %1                    2*i
8647    %endif
8648    %endrep
8649    dec                  hd
8650    jg .w128_loop
8651    RET
8652%endmacro
8653
8654%macro AVG 1 ; src_offset
8655    ; averages the int16 intermediates from tmp1/tmp2 and packs them to uint8 pixels
8656    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
8657    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 coefs from tmp2
8658    mova                 m1, [tmp1q+(%1+1)*mmsize]
8659    paddw                m1, [tmp2q+(%1+1)*mmsize]
8660    pmulhrsw             m0, m2 ; round with m2 = pw_1024, i.e. (x + 16) >> 5
8661    pmulhrsw             m1, m2
8662    packuswb             m0, m1 ; pack the 16 words to 8-bit with unsigned saturation
8663%endmacro
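; Worked rounding example (assuming the usual 8bpc convention that the int16
; intermediates are pixels scaled by 1 << 4): pmulhrsw with pw_1024 computes
; (x + 16) >> 5, so for tmp1 = tmp2 = 2176 (pixel 136 << 4):
; (2176 + 2176 + 16) >> 5 = 136.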
8664
8665%macro AVG_INC_PTR 1
8666    add               tmp1q, %1*mmsize
8667    add               tmp2q, %1*mmsize
8668%endmacro
8669
8670cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
8671    LEA                  r6, avg_ssse3_table
8672    tzcnt                wd, wm ; trailing zero count = log2(w)
8673    movifnidn            hd, hm ; load h from the stack unless it is already in a register
8674    movsxd               wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width
8675    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024 rounding constant for pmulhrsw: (x + 16) >> 5
8676    add                  wq, r6
8677    BIDIR_FN            AVG
8678
8679%macro W_AVG 1 ; src_offset
8680    ; (a * weight + b * (16 - weight) + 128) >> 8
8681    ; = ((a - b) * weight + (b << 4) + 128) >> 8
8682    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
8683    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
8684    mova                 m2, [tmp1q+(%1+0)*mmsize]
8685    mova                 m0, m2
8686    psubw                m2, [tmp2q+(%1+0)*mmsize]
8687    mova                 m3, [tmp1q+(%1+1)*mmsize]
8688    mova                 m1, m3
8689    psubw                m3, [tmp2q+(%1+1)*mmsize]
8690    pmulhw               m2, m4
8691    pmulhw               m3, m4
8692    paddw                m0, m2
8693    paddw                m1, m3
8694    pmulhrsw             m0, m5
8695    pmulhrsw             m1, m5
8696    packuswb             m0, m1
8697%endmacro
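; Worked example under the same pixel << 4 scaling assumption: a = 2176
; (136 << 4), b = 1600 (100 << 4), weight = 12:
; (2176*12 + 1600*4 + 128) >> 8 = 127, matching (136*12 + 100*4 + 8) >> 4
; computed directly on pixels.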
8698
8699%define W_AVG_INC_PTR AVG_INC_PTR
8700
8701cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
8702    LEA                  r6, w_avg_ssse3_table
8703    tzcnt                wd, wm
8704    movd                 m4, r6m
8705    movifnidn            hd, hm
8706    pxor                 m0, m0
8707    movsxd               wq, dword [r6+wq*4]
8708    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
8709    pshufb               m4, m0
8710    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
8711    add                  wq, r6
8712    cmp           dword r6m, 7
8713    jg .weight_gt7
8714    mov                  r6, tmp1q
8715    psubw                m0, m4
8716    mov               tmp1q, tmp2q
8717    mova                 m4, m0 ; -weight
8718    mov               tmp2q, r6
8719.weight_gt7:
8720    BIDIR_FN          W_AVG
8721
8722%macro MASK 1 ; src_offset
8723    ; (a * m + b * (64 - m) + 512) >> 10
8724    ; = ((a - b) * m + (b << 6) + 512) >> 10
8725    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
8726    mova                 m3,     [maskq+(%1+0)*(mmsize/2)]
8727    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
8728    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
8729    mova                 m6, m3      ; m
8730    psubb                m3, m4, m6  ; -m
8731    paddw                m1, m1     ; (b - a) << 1
8732    paddb                m3, m3     ; -m << 1
8733    punpcklbw            m2, m4, m3 ; -m << 9 (the -m << 1 bytes land in the high byte, a further << 8)
8734    pmulhw               m1, m2     ; (-m * (b - a)) << 10
8735    paddw                m0, m1     ; + b
8736    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
8737    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
8738    paddw                m2, m2  ; (b - a) << 1
8739    mova                 m6, m3  ; (-m << 1)
8740    punpckhbw            m3, m4, m6 ; (-m << 9)
8741    pmulhw               m2, m3 ; (-m * (b - a)) << 10
8742    paddw                m1, m2 ; + b
8743    pmulhrsw             m0, m5 ; round
8744    pmulhrsw             m1, m5 ; round
8745    packuswb             m0, m1 ; pack the 16 words to 8-bit
8746%endmacro
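; Worked example: a = 1600 (100 << 4), b = 3200 (200 << 4), m = 16:
; (1600*16 + 3200*48 + 512) >> 10 = 175, i.e. 1/4 of pixel a blended with
; 3/4 of pixel b.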
8747
8748%macro MASK_INC_PTR 1
8749    add               maskq, %1*mmsize/2
8750    add               tmp1q, %1*mmsize
8751    add               tmp2q, %1*mmsize
8752%endmacro
8753
8754%if ARCH_X86_64
8755cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
8756    movifnidn            hd, hm
8757%else
8758cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
8759%define hd dword r5m
8760%endif
8761%define base r6-mask_ssse3_table
8762    LEA                  r6, mask_ssse3_table
8763    tzcnt                wd, wm
8764    movsxd               wq, dword [r6+wq*4]
8765    pxor                 m4, m4
8766    mova                 m5, [base+pw_2048]
8767    add                  wq, r6
8768    mov               maskq, r6m
8769    BIDIR_FN           MASK
8770%undef hd
8771
8772%macro W_MASK_420_END 1-*
8773%rep %0
8774    call .main
8775    paddw                m2, [maskq+16*%1]
8776    mova      [maskq+16*%1], m2
8777    mova [dstq+strideq*1+16*(2*%1+0)], m0
8778    call .main
8779    psubw                m3, m7, m2
8780    psubw                m1, m7, [maskq+16*%1]
8781    psubw                m3, [dstq+strideq*1+16*(2*%1+1)]
8782    psrlw                m1, 2
8783    psrlw                m3, 2
8784    packuswb             m1, m3
8785    mova      [maskq+16*%1], m1
8786    mova [dstq+strideq*1+16*(2*%1+1)], m0
8787    %rotate 1
8788%endrep
8789%endmacro
8790
8791%if UNIX64
8792DECLARE_REG_TMP 7
8793%else
8794DECLARE_REG_TMP 5
8795%endif
8796
8797cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
8798%define base t0-w_mask_420_ssse3_table
8799    LEA                  t0, w_mask_420_ssse3_table
8800    tzcnt                wd, wm
8801    mov                 r6d, r7m ; sign
8802    sub               tmp2q, tmp1q
8803    movsxd               wq, [t0+wq*4]
8804    mova                 m6, [base+pw_2048]
8805    movddup              m7, [base+wm_420_sign+r6*8] ; 258 - sign
8806    add                  wq, t0
8807%if ARCH_X86_64
8808    mova                 m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
8809    movifnidn            hd, hm
8810%else
8811    %define              m8  [base+pw_6903]
8812    %define              hd  dword hm
8813%endif
8814    mov               maskq, maskmp
8815    call .main
8816    jmp                  wq
8817.w4_loop:
8818    call .main
8819    add               maskq, 4
8820    lea                dstq, [dstq+strideq*2]
8821.w4:
8822    pshufd               m3, m2, q2020
8823    pshufd               m2, m2, q3131
8824    psubw                m1, m7, m3
8825    psubw                m1, m2
8826    psrlw                m1, 2
8827    packuswb             m1, m1
8828    movd            [maskq], m1
8829    movd   [dstq+strideq*0], m0
8830    pshuflw              m1, m0, q1032
8831    movd   [dstq+strideq*1], m1
8832    punpckhqdq           m0, m0
8833    lea                dstq, [dstq+strideq*2]
8834    movd   [dstq+strideq*0], m0
8835    pshuflw              m1, m0, q1032
8836    movd   [dstq+strideq*1], m1
8837    sub                  hd, 4
8838    jg .w4_loop
8839    RET
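; My derivation of the .w4 mask math above: .main leaves horizontal pair
; sums of 64-m, i.e. 128 - (m0 + m1), per word, so with m7 = 258 - sign the
; two psubw and the psrlw 2 evaluate to (m0 + m1 + m2 + m3 + 2 - sign) >> 2
; per stored mask byte, the 2x2-subsampled 420 mask.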
8840.w8_loop:
8841    call .main
8842    add               maskq, 4
8843    lea                dstq, [dstq+strideq*2]
8844.w8:
8845    movhlps              m3, m2
8846    psubw                m1, m7, m2
8847    psubw                m1, m3
8848    psrlw                m1, 2
8849    packuswb             m1, m1
8850    movd            [maskq], m1
8851    movq   [dstq+strideq*0], m0
8852    movhps [dstq+strideq*1], m0
8853    sub                  hd, 2
8854    jg .w8_loop
8855    RET
8856.w16_loop:
8857    call .main
8858    add               maskq, 8
8859    lea                dstq, [dstq+strideq*2]
8860.w16:
8861    mova   [dstq+strideq*1], m2
8862    mova   [dstq+strideq*0], m0
8863    call .main
8864    psubw                m1, m7, [dstq+strideq*1]
8865    psubw                m1, m2
8866    psrlw                m1, 2
8867    packuswb             m1, m1
8868    movq            [maskq], m1
8869    mova   [dstq+strideq*1], m0
8870    sub                  hd, 2
8871    jg .w16_loop
8872    RET
8873.w32_loop:
8874    call .main
8875    add               maskq, 16
8876    lea                dstq, [dstq+strideq*2]
8877.w32:
8878    mova            [maskq], m2
8879    mova [dstq+strideq*0+16*0], m0
8880    call .main
8881    mova [dstq+strideq*1+16*1], m2
8882    mova [dstq+strideq*0+16*1], m0
8883    W_MASK_420_END        0
8884    sub                  hd, 2
8885    jg .w32_loop
8886    RET
8887.w64_loop:
8888    call .main
8889    add               maskq, 16*2
8890    lea                dstq, [dstq+strideq*2]
8891.w64:
8892    mova       [maskq+16*0], m2
8893    mova [dstq+strideq*0+16*0], m0
8894    call .main
8895    mova [dstq+strideq*1+16*1], m2
8896    mova [dstq+strideq*0+16*1], m0
8897    call .main
8898    mova       [maskq+16*1], m2
8899    mova [dstq+strideq*0+16*2], m0
8900    call .main
8901    mova [dstq+strideq*1+16*3], m2
8902    mova [dstq+strideq*0+16*3], m0
8903    W_MASK_420_END        0, 1
8904    sub                  hd, 2
8905    jg .w64_loop
8906    RET
8907.w128_loop:
8908    call .main
8909    add               maskq, 16*4
8910    lea                dstq, [dstq+strideq*2]
8911.w128:
8912    mova       [maskq+16*0], m2
8913    mova [dstq+strideq*0+16*0], m0
8914    call .main
8915    mova [dstq+strideq*1+16*1], m2
8916    mova [dstq+strideq*0+16*1], m0
8917    call .main
8918    mova       [maskq+16*1], m2
8919    mova [dstq+strideq*0+16*2], m0
8920    call .main
8921    mova [dstq+strideq*1+16*3], m2
8922    mova [dstq+strideq*0+16*3], m0
8923    call .main
8924    mova       [maskq+16*2], m2
8925    mova [dstq+strideq*0+16*4], m0
8926    call .main
8927    mova [dstq+strideq*1+16*5], m2
8928    mova [dstq+strideq*0+16*5], m0
8929    call .main
8930    mova       [maskq+16*3], m2
8931    mova [dstq+strideq*0+16*6], m0
8932    call .main
8933    mova [dstq+strideq*1+16*7], m2
8934    mova [dstq+strideq*0+16*7], m0
8935    W_MASK_420_END        0, 1, 2, 3
8936    sub                  hd, 2
8937    jg .w128_loop
8938    RET
8939ALIGN function_align
8940.main:
8941    mova                 m0, [tmp1q      +16*0]
8942    mova                 m3, [tmp1q+tmp2q+16*0]
8943    mova                 m1, [tmp1q      +16*1]
8944    mova                 m4, [tmp1q+tmp2q+16*1]
8945    add               tmp1q, 16*2
8946    psubw                m3, m0
8947    psubw                m4, m1
8948    pabsw                m5, m3
8949    psubusw              m2, m8, m5
8950    psrlw                m2, 8 ; 64 - m
8951    psllw                m5, m2, 10
8952    pmulhw               m3, m5
8953    pabsw                m5, m4
8954    paddw                m0, m3
8955    psubusw              m3, m8, m5
8956    psrlw                m3, 8
8957    phaddw               m2, m3
8958    psllw                m3, 10
8959    pmulhw               m4, m3
8960    paddw                m1, m4
8961    pmulhrsw             m0, m6
8962    pmulhrsw             m1, m6
8963    packuswb             m0, m1
8964    ret
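; Derivation of the mask in .main above (pw_6903 = (26 << 8) + 247): the
; psubusw/psrlw pair computes 64 - m = max(6903 - |tmp1 - tmp2|, 0) >> 8,
; so m rises from 38 for similar intermediates towards 64 for very different
; ones, and the blend is effectively (tmp1*m + tmp2*(64-m) + 512) >> 10
; built from the pmulhw and pmulhrsw steps.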
8965
8966%macro W_MASK_422_BACKUP 1 ; mask_offset
8967%if ARCH_X86_64
8968    mova                m10, m2
8969%else
8970    mova      [maskq+16*%1], m2
8971%endif
8972%endmacro
8973
8974%macro W_MASK_422_END 1 ; mask_offset
8975%if ARCH_X86_64
8976    packuswb            m10, m2
8977    psubb                m1, m7, m10
8978    pavgb                m1, m9
8979%else
8980    mova                 m3, [maskq+16*%1]
8981    packuswb             m3, m2
8982    pxor                 m2, m2
8983    psubb                m1, m7, m3
8984    pavgb                m1, m2
8985%endif
8986    mova      [maskq+16*%1], m1
8987%endmacro
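; 422 mask path, as I read it: .main's phaddw leaves 128 - (m0 + m1) per
; word; packing those to bytes and subtracting from m7 = 128 - sign gives
; m0 + m1 - sign, and pavgb against zero rounds that to
; (m0 + m1 + 1 - sign) >> 1, the horizontally subsampled mask.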
8988
8989cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
8990%define base t0-w_mask_422_ssse3_table
8991    LEA                  t0, w_mask_422_ssse3_table
8992    tzcnt                wd, wm
8993    mov                 r6d, r7m ; sign
8994    sub               tmp2q, tmp1q
8995    movsxd               wq, [t0+wq*4]
8996    mova                 m6, [base+pw_2048]
8997    movddup              m7, [base+wm_422_sign+r6*8] ; 128 - sign
8998    add                  wq, t0
8999%if ARCH_X86_64
9000    mova                 m8, [base+pw_6903]
9001    pxor                 m9, m9
9002    movifnidn            hd, hm
9003%else
9004    add                  t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
9005    %define              hd  dword hm
9006%endif
9007    mov               maskq, maskmp
9008    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9009    jmp                  wq
9010.w4_loop:
9011    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9012    add               maskq, 8
9013    lea                dstq, [dstq+strideq*2]
9014.w4:
9015    packuswb             m2, m2
9016    psubb                m1, m7, m2
9017%if ARCH_X86_64
9018    pavgb                m1, m9
9019%else
9020    pxor                 m2, m2
9021    pavgb                m1, m2
9022%endif
9023    movq            [maskq], m1
9024    movd   [dstq+strideq*0], m0
9025    pshuflw              m1, m0, q1032
9026    movd   [dstq+strideq*1], m1
9027    punpckhqdq           m0, m0
9028    lea                dstq, [dstq+strideq*2]
9029    movd   [dstq+strideq*0], m0
9030    pshuflw              m1, m0, q1032
9031    movd   [dstq+strideq*1], m1
9032    sub                  hd, 4
9033    jg .w4_loop
9034    RET
9035.w8_loop:
9036    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9037    add               maskq, 16
9038    lea                dstq, [dstq+strideq*2]
9039.w8:
9040    W_MASK_422_BACKUP     0
9041    movq   [dstq+strideq*0], m0
9042    movhps [dstq+strideq*1], m0
9043    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9044    lea                dstq, [dstq+strideq*2]
9045    W_MASK_422_END        0
9046    movq   [dstq+strideq*0], m0
9047    movhps [dstq+strideq*1], m0
9048    sub                  hd, 4
9049    jg .w8_loop
9050    RET
9051.w16_loop:
9052    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9053    add               maskq, 16
9054    lea                dstq, [dstq+strideq*2]
9055.w16:
9056    W_MASK_422_BACKUP     0
9057    mova   [dstq+strideq*0], m0
9058    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9059    W_MASK_422_END        0
9060    mova   [dstq+strideq*1], m0
9061    sub                  hd, 2
9062    jg .w16_loop
9063    RET
9064.w32_loop:
9065    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9066    add               maskq, 16
9067    add                dstq, strideq
9068.w32:
9069    W_MASK_422_BACKUP     0
9070    mova        [dstq+16*0], m0
9071    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9072    W_MASK_422_END        0
9073    mova        [dstq+16*1], m0
9074    dec                  hd
9075    jg .w32_loop
9076    RET
9077.w64_loop:
9078    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9079    add               maskq, 16*2
9080    add                dstq, strideq
9081.w64:
9082    W_MASK_422_BACKUP     0
9083    mova        [dstq+16*0], m0
9084    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9085    W_MASK_422_END        0
9086    mova        [dstq+16*1], m0
9087    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9088    W_MASK_422_BACKUP     1
9089    mova        [dstq+16*2], m0
9090    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9091    W_MASK_422_END        1
9092    mova        [dstq+16*3], m0
9093    dec                  hd
9094    jg .w64_loop
9095    RET
9096.w128_loop:
9097    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9098    add               maskq, 16*4
9099    add                dstq, strideq
9100.w128:
9101    W_MASK_422_BACKUP     0
9102    mova        [dstq+16*0], m0
9103    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9104    W_MASK_422_END        0
9105    mova        [dstq+16*1], m0
9106    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9107    W_MASK_422_BACKUP     1
9108    mova        [dstq+16*2], m0
9109    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9110    W_MASK_422_END        1
9111    mova        [dstq+16*3], m0
9112    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9113    W_MASK_422_BACKUP     2
9114    mova        [dstq+16*4], m0
9115    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9116    W_MASK_422_END        2
9117    mova        [dstq+16*5], m0
9118    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9119    W_MASK_422_BACKUP     3
9120    mova        [dstq+16*6], m0
9121    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9122    W_MASK_422_END        3
9123    mova        [dstq+16*7], m0
9124    dec                  hd
9125    jg .w128_loop
9126    RET
9127
9128cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
9129%define base t0-w_mask_444_ssse3_table
9130    LEA                  t0, w_mask_444_ssse3_table
9131    tzcnt                wd, wm
9132    mov               maskq, maskmp
9133    sub               tmp2q, tmp1q
9134    movsxd               wq, [t0+wq*4]
9135    mova                 m6, [base+pw_6903]
9136    mova                 m7, [base+pw_2048]
9137    add                  wq, t0
9138%if ARCH_X86_64
9139    mova                 m8, [base+pb_64]
9140    movifnidn            hd, hm
9141%else
9142    %define              m8  [base+pb_64]
9143    %define              hd  dword hm
9144%endif
9145    call .main
9146    jmp                  wq
9147.w4_loop:
9148    call .main
9149    lea                dstq, [dstq+strideq*2]
9150.w4:
9151    movd   [dstq+strideq*0], m0
9152    pshuflw              m1, m0, q1032
9153    movd   [dstq+strideq*1], m1
9154    punpckhqdq           m0, m0
9155    lea                dstq, [dstq+strideq*2]
9156    movd   [dstq+strideq*0], m0
9157    pshuflw              m1, m0, q1032
9158    movd   [dstq+strideq*1], m1
9159    sub                  hd, 4
9160    jg .w4_loop
9161    RET
9162.w8_loop:
9163    call .main
9164    lea                dstq, [dstq+strideq*2]
9165.w8:
9166    movq   [dstq+strideq*0], m0
9167    movhps [dstq+strideq*1], m0
9168    sub                  hd, 2
9169    jg .w8_loop
9170    RET
9171.w16_loop:
9172    call .main
9173    lea                dstq, [dstq+strideq*2]
9174.w16:
9175    mova   [dstq+strideq*0], m0
9176    call .main
9177    mova   [dstq+strideq*1], m0
9178    sub                  hd, 2
9179    jg .w16_loop
9180    RET
9181.w32_loop:
9182    call .main
9183    add                dstq, strideq
9184.w32:
9185    mova        [dstq+16*0], m0
9186    call .main
9187    mova        [dstq+16*1], m0
9188    dec                  hd
9189    jg .w32_loop
9190    RET
9191.w64_loop:
9192    call .main
9193    add                dstq, strideq
9194.w64:
9195    mova        [dstq+16*0], m0
9196    call .main
9197    mova        [dstq+16*1], m0
9198    call .main
9199    mova        [dstq+16*2], m0
9200    call .main
9201    mova        [dstq+16*3], m0
9202    dec                  hd
9203    jg .w64_loop
9204    RET
9205.w128_loop:
9206    call .main
9207    add                dstq, strideq
9208.w128:
9209    mova        [dstq+16*0], m0
9210    call .main
9211    mova        [dstq+16*1], m0
9212    call .main
9213    mova        [dstq+16*2], m0
9214    call .main
9215    mova        [dstq+16*3], m0
9216    call .main
9217    mova        [dstq+16*4], m0
9218    call .main
9219    mova        [dstq+16*5], m0
9220    call .main
9221    mova        [dstq+16*6], m0
9222    call .main
9223    mova        [dstq+16*7], m0
9224    dec                  hd
9225    jg .w128_loop
9226    RET
9227ALIGN function_align
9228.main:
9229    mova                 m0, [tmp1q      +16*0]
9230    mova                 m3, [tmp1q+tmp2q+16*0]
9231    mova                 m1, [tmp1q      +16*1]
9232    mova                 m4, [tmp1q+tmp2q+16*1]
9233    add               tmp1q, 16*2
9234    psubw                m3, m0
9235    psubw                m4, m1
9236    pabsw                m5, m3
9237    psubusw              m2, m6, m5
9238    psrlw                m2, 8 ; 64 - m
9239    psllw                m5, m2, 10 ; (64-m) << 10
9240    pmulhw               m3, m5     ; (t2-t1) * (64-m) >> 6
9241    pabsw                m5, m4
9242    paddw                m0, m3
9243    psubusw              m3, m6, m5
9244    psrlw                m3, 8
9245    packuswb             m2, m3
9246    psllw                m3, 10
9247    pmulhw               m4, m3
9248    psubb                m3, m8, m2 ; 64 - (64-m) = m
9249    paddw                m1, m4
9250    pmulhrsw             m0, m7
9251    pmulhrsw             m1, m7
9252    mova            [maskq], m3
9253    add               maskq, 16
9254    packuswb             m0, m1
9255    ret
9256
9257%macro BLEND_64M 4; a, b, mask1, mask2
9258    punpcklbw            m0, %1, %2; {b;a}[7..0]
9259    punpckhbw            %1, %2    ; {b;a}[15..8]
9260    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
9261    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
9262    pmulhrsw             m0, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
9263    pmulhrsw             %1, m5    ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16
9264    packuswb             m0, %1    ; {blendpx}[15..0] u8
9265%endmacro
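; Per pixel, BLEND_64M evaluates (a*(64-m) + b*m + 32) >> 6: pmaddubsw forms
; a*(64-m) + b*m from the interleaved {64-m;m} byte pairs, and pmulhrsw with
; pw_512 performs the rounding shift, since
;   pmulhrsw(x, 512) = (x*512 + (1 << 14)) >> 15 = (x + 32) >> 6
; A minimal scalar C sketch of the same operation (illustrative only):
;
;   static inline uint8_t blend_px(int a, int b, int m) { // m in [0, 64]
;       return (a * (64 - m) + b * m + 32) >> 6;
;   }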
9266
9267%macro BLEND 2; a, b
9268    psubb                m3, m4, m0 ; m3 = (64 - m)
9269    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
9270    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
9271    BLEND_64M            %1, %2, m2, m3
9272%endmacro
9273
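; blend() applies a free-form per-pixel mask; roughly, in C (a sketch in the
; spirit of dav1d's C fallbacks, reusing blend_px() from the note above):
;
;   static void blend_sketch(uint8_t *dst, ptrdiff_t ds, const uint8_t *tmp,
;                            int w, int h, const uint8_t *mask)
;   {
;       do {
;           for (int x = 0; x < w; x++)
;               dst[x] = blend_px(dst[x], tmp[x], mask[x]);
;           dst += ds, tmp += w, mask += w;
;       } while (--h);
;   }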
9274cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
9275%define base r6-blend_ssse3_table
9276    LEA                  r6, blend_ssse3_table
9277    tzcnt                wd, wm
9278    movifnidn            hd, hm
9279    movifnidn         maskq, maskmp
9280    movsxd               wq, dword [r6+wq*4]
9281    mova                 m4, [base+pb_64]
9282    mova                 m5, [base+pw_512]
9283    add                  wq, r6
9284    lea                  r6, [dsq*3]
9285    jmp                  wq
9286.w4:
9287    movq                 m0, [maskq]; m
9288    movd                 m1, [dstq+dsq*0] ; a
9289    movd                 m6, [dstq+dsq*1]
9290    punpckldq            m1, m6
9291    movq                 m6, [tmpq] ; b
9292    psubb                m3, m4, m0 ; m3 = (64 - m)
9293    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
9294    punpcklbw            m1, m6    ; {b;a}[7..0]
9295    pmaddubsw            m1, m2    ; {b*m[0] + (64-m[0])*a}[7..0] u16
9296    pmulhrsw             m1, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
9297    packuswb             m1, m0    ; {blendpx}[7..0] u8
9298    movd       [dstq+dsq*0], m1
9299    psrlq                m1, 32
9300    movd       [dstq+dsq*1], m1
9301    add               maskq, 8
9302    add                tmpq, 8
9303    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
9304    sub                  hd, 2
9305    jg .w4
9306    RET
9307.w8:
9308    mova                 m0, [maskq]; m
9309    movq                 m1, [dstq+dsq*0] ; a
9310    movhps               m1, [dstq+dsq*1]
9311    mova                 m6, [tmpq] ; b
9312    BLEND                m1, m6
9313    movq       [dstq+dsq*0], m0
9314    movhps     [dstq+dsq*1], m0
9315    add               maskq, 16
9316    add                tmpq, 16
9317    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
9318    sub                  hd, 2
9319    jg .w8
9320    RET
9321.w16:
9322    mova                 m0, [maskq]; m
9323    mova                 m1, [dstq] ; a
9324    mova                 m6, [tmpq] ; b
9325    BLEND                m1, m6
9326    mova             [dstq], m0
9327    add               maskq, 16
9328    add                tmpq, 16
9329    add                dstq, dsq ; dst_stride
9330    dec                  hd
9331    jg .w16
9332    RET
9333.w32:
9334    %assign i 0
9335    %rep 2
9336    mova                 m0, [maskq+16*i]; m
9337    mova                 m1, [dstq+16*i] ; a
9338    mova                 m6, [tmpq+16*i] ; b
9339    BLEND                m1, m6
9340    mova        [dstq+i*16], m0
9341    %assign i i+1
9342    %endrep
9343    add               maskq, 32
9344    add                tmpq, 32
9345    add                dstq, dsq ; dst_stride
9346    dec                  hd
9347    jg .w32
9348    RET
9349
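; blend_v blends with a fixed per-column OBMC mask; the table only provides
; nonzero masks for the left 3/4 of the columns, so the rightmost columns are
; left untouched (hence .w32 below only writes 24 of 32 pixels per row).
; A C sketch (illustrative; the asm reads the interleaved {64-m;m} table):
;
;   static void blend_v_sketch(uint8_t *dst, ptrdiff_t ds, const uint8_t *tmp,
;                              int w, int h, const uint8_t *obmc /* w masks */)
;   {
;       do {
;           for (int x = 0; x < (w * 3) >> 2; x++)
;               dst[x] = blend_px(dst[x], tmp[x], obmc[x]);
;           dst += ds, tmp += w;
;       } while (--h);
;   }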
9350cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
9351%define base r5-blend_v_ssse3_table
9352    LEA                  r5, blend_v_ssse3_table
9353    tzcnt                wd, wm
9354    movifnidn            hd, hm
9355    movsxd               wq, dword [r5+wq*4]
9356    mova                 m5, [base+pw_512]
9357    add                  wq, r5
9358    add               maskq, obmc_masks-blend_v_ssse3_table
9359    jmp                  wq
9360.w2:
9361    movd                 m3, [maskq+4]
9362    punpckldq            m3, m3
9363    ; the 2 mask pairs cover 4 pixels (2 rows of 2)
9364.w2_loop:
9365    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
9366    pinsrw               m1, [dstq+dsq*1], 1
9367    movd                 m2, [tmpq] ; b
9368    punpcklbw            m0, m1, m2; {b;a}[7..0]
9369    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[7..0] u16
9370    pmulhrsw             m0, m5    ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
9371    packuswb             m0, m1    ; {blendpx}[3..0] u8
9372    movd                r3d, m0
9373    mov        [dstq+dsq*0], r3w
9374    shr                 r3d, 16
9375    mov        [dstq+dsq*1], r3w
9376    add                tmpq, 2*2
9377    lea                dstq, [dstq + dsq * 2]
9378    sub                  hd, 2
9379    jg .w2_loop
9380    RET
9381.w4:
9382    movddup              m3, [maskq+8]
9383    ; the 4 mask pairs cover 8 pixels (2 rows of 4)
9384.w4_loop:
9385    movd                 m1, [dstq+dsq*0] ; a
9386    movd                 m2, [dstq+dsq*1]
9387    punpckldq            m1, m2
9388    movq                 m2, [tmpq] ; b
9389    punpcklbw            m1, m2    ; {b;a}[7..0]
9390    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
9391    pmulhrsw             m1, m5    ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
9392    packuswb             m1, m1    ; {blendpx}[7..0] u8
9393    movd             [dstq], m1
9394    psrlq                m1, 32
9395    movd       [dstq+dsq*1], m1
9396    add                tmpq, 2*4
9397    lea                dstq, [dstq+dsq*2]
9398    sub                  hd, 2
9399    jg .w4_loop
9400    RET
9401.w8:
9402    mova                 m3, [maskq+16]
9403    ; the 8 mask pairs cover 16 pixels (2 rows of 8)
9404.w8_loop:
9405    movq                 m1, [dstq+dsq*0] ; a
9406    movhps               m1, [dstq+dsq*1]
9407    mova                 m2, [tmpq]; b
9408    BLEND_64M            m1, m2, m3, m3
9409    movq       [dstq+dsq*0], m0
9410    movhps     [dstq+dsq*1], m0
9411    add                tmpq, 16
9412    lea                dstq, [dstq+dsq*2]
9413    sub                  hd, 2
9414    jg .w8_loop
9415    RET
9416.w16:
9417    ; the 16 mask pairs (32 bytes) cover one 16-pixel row
9418    mova                 m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
9419    mova                 m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
9420.w16_loop:
9421    mova                 m1, [dstq] ; a
9422    mova                 m2, [tmpq] ; b
9423    BLEND_64M            m1, m2, m3, m4
9424    mova             [dstq], m0
9425    add                tmpq, 16
9426    add                dstq, dsq
9427    dec                  hd
9428    jg .w16_loop
9429    RET
9430.w32:
9431%if WIN64
9432    mova            [rsp+8], xmm6
9433%endif
9434    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
9435    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
9436    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
9437    ; mask pairs exist for only the first 24 of 32 columns; the last 8 have m=0, so dst is left untouched there
9438.w32_loop:
9439    mova                 m1, [dstq+16*0] ; a
9440    mova                 m2, [tmpq+16*0] ; b
9441    BLEND_64M            m1, m2, m3, m4
9442    movq                 m1, [dstq+16*1] ; a
9443    punpcklbw            m1, [tmpq+16*1] ; b
9444    pmaddubsw            m1, m6
9445    pmulhrsw             m1, m5
9446    packuswb             m1, m1
9447    mova        [dstq+16*0], m0
9448    movq        [dstq+16*1], m1
9449    add                tmpq, 32
9450    add                dstq, dsq
9451    dec                  hd
9452    jg .w32_loop
9453%if WIN64
9454    mova               xmm6, [rsp+8]
9455%endif
9456    RET
9457
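; blend_h is the row-wise counterpart: the mask is constant across each row,
; and only the top h*3/4 rows are blended. A C sketch (illustrative; obmc
; points at the mask run for height h, as set up by the prologue below):
;
;   static void blend_h_sketch(uint8_t *dst, ptrdiff_t ds, const uint8_t *tmp,
;                              int w, int h, const uint8_t *obmc)
;   {
;       int y = 0;
;       const int hb = (h * 3) >> 2;
;       do {
;           const int m = obmc[y];
;           for (int x = 0; x < w; x++)
;               dst[x] = blend_px(dst[x], tmp[x], m);
;           dst += ds, tmp += w;
;       } while (++y < hb);
;   }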
9458cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
9459%define base t0-blend_h_ssse3_table
9460%if ARCH_X86_32
9461    ; we need to keep the PIC pointer for .w4, so reload wd from the stack instead
9462    DECLARE_REG_TMP 6
9463%else
9464    DECLARE_REG_TMP 5
9465    mov                 r6d, wd
9466%endif
9467    LEA                  t0, blend_h_ssse3_table
9468    tzcnt                wd, wm
9469    mov                  hd, hm
9470    movsxd               wq, dword [t0+wq*4]
9471    mova                 m5, [base+pw_512]
9472    add                  wq, t0
9473    lea               maskq, [base+obmc_masks+hq*2] ; mask pairs for height h
9474    lea                  hd, [hq*3]
9475    shr                  hd, 2 ; h * 3/4 rows are blended
9476    lea               maskq, [maskq+hq*2] ; point past the last blended row's pair
9477    neg                  hq ; count rows from -(h*3/4) up to 0
9478    jmp                  wq
9479.w2:
9480    movd                 m0, [dstq+dsq*0]
9481    pinsrw               m0, [dstq+dsq*1], 1
9482    movd                 m2, [maskq+hq*2]
9483    movd                 m1, [tmpq]
9484    punpcklwd            m2, m2
9485    punpcklbw            m0, m1
9486    pmaddubsw            m0, m2
9487    pmulhrsw             m0, m5
9488    packuswb             m0, m0
9489    movd                r3d, m0
9490    mov        [dstq+dsq*0], r3w
9491    shr                 r3d, 16
9492    mov        [dstq+dsq*1], r3w
9493    lea                dstq, [dstq+dsq*2]
9494    add                tmpq, 2*2
9495    add                  hq, 2
9496    jl .w2
9497    RET
9498.w4:
9499%if ARCH_X86_32
9500    mova                 m3, [base+blend_shuf]
9501%else
9502    mova                 m3, [blend_shuf]
9503%endif
9504.w4_loop:
9505    movd                 m0, [dstq+dsq*0]
9506    movd                 m2, [dstq+dsq*1]
9507    punpckldq            m0, m2 ; a
9508    movq                 m1, [tmpq] ; b
9509    movq                 m2, [maskq+hq*2] ; m
9510    pshufb               m2, m3
9511    punpcklbw            m0, m1
9512    pmaddubsw            m0, m2
9513    pmulhrsw             m0, m5
9514    packuswb             m0, m0
9515    movd       [dstq+dsq*0], m0
9516    psrlq                m0, 32
9517    movd       [dstq+dsq*1], m0
9518    lea                dstq, [dstq+dsq*2]
9519    add                tmpq, 4*2
9520    add                  hq, 2
9521    jl .w4_loop
9522    RET
9523.w8:
9524    movd                 m4, [maskq+hq*2]
9525    punpcklwd            m4, m4
9526    pshufd               m3, m4, q0000
9527    pshufd               m4, m4, q1111
9528    movq                 m1, [dstq+dsq*0] ; a
9529    movhps               m1, [dstq+dsq*1]
9530    mova                 m2, [tmpq]
9531    BLEND_64M            m1, m2, m3, m4
9532    movq       [dstq+dsq*0], m0
9533    movhps     [dstq+dsq*1], m0
9534    lea                dstq, [dstq+dsq*2]
9535    add                tmpq, 8*2
9536    add                  hq, 2
9537    jl .w8
9538    RET
9539; w16/w32/w64/w128
9540.w16:
9541%if ARCH_X86_32
9542    mov                 r6d, wm
9543%endif
9544    sub                 dsq, r6 ; ds -= w: the inner loop advances dstq by w each row
9545.w16_loop0:
9546    movd                 m3, [maskq+hq*2]
9547    pshuflw              m3, m3, q0000
9548    punpcklqdq           m3, m3
9549    mov                  wd, r6d
9550.w16_loop:
9551    mova                 m1, [dstq] ; a
9552    mova                 m2, [tmpq] ; b
9553    BLEND_64M            m1, m2, m3, m3
9554    mova             [dstq], m0
9555    add                dstq, 16
9556    add                tmpq, 16
9557    sub                  wd, 16
9558    jg .w16_loop
9559    add                dstq, dsq
9560    inc                  hq
9561    jl .w16_loop0
9562    RET
9563
9564; emu_edge args:
9565; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
9566; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
9567; const pixel *ref, const ptrdiff_t ref_stride
9568;
9569; bw, bh: total size of the block to fill
9570; iw, ih: size of the copied source block -> fill bottom/right beyond it
9571; x, y:   offset of the copied block within bw/bh -> fill top/left before it
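;
; In outline (C-style; a hedged summary of the steps below, not dav1d's exact
; C fallback):
;
;   src += iclip(y, 0, ih - 1) * src_stride + iclip(x, 0, iw - 1);
;   top_ext    = iclip(-y,          0, bh - 1);
;   bottom_ext = iclip(y + bh - ih, 0, bh - 1);
;   left_ext   = iclip(-x,          0, bw - 1);
;   right_ext  = iclip(x + bw - iw, 0, bw - 1);
;   // copy the (bw-left-right) x (bh-top-bottom) center, replicating each
;   // row's first/last pixel into the left/right margins (v_loop below),
;   // then replicate the first/last filled row into the top/bottom margins.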
9572cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
9573                                  y, dst, dstride, src, sstride, \
9574                                  bottomext, rightext, blk
9575    ; we assume that the buffer (stride) is larger than the width, so we can
9576    ; safely overwrite a few bytes past the end of each row
9577    pxor                 m1, m1
9578
9579%if ARCH_X86_64
9580 %define reg_zero       r12q
9581 %define reg_tmp        r10
9582 %define reg_src        srcq
9583 %define reg_bottomext  bottomextq
9584 %define reg_rightext   rightextq
9585 %define reg_blkm       r9m
9586%else
9587 %define reg_zero       r6
9588 %define reg_tmp        r0
9589 %define reg_src        r1
9590 %define reg_bottomext  r0
9591 %define reg_rightext   r1
9592 %define reg_blkm       r2m
9593%endif
9594    ;
9595    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
9596    xor            reg_zero, reg_zero
9597    lea             reg_tmp, [ihq-1]
9598    cmp                  yq, ihq
9599    cmovs           reg_tmp, yq
9600    test                 yq, yq
9601    cmovs           reg_tmp, reg_zero
9602%if ARCH_X86_64
9603    imul            reg_tmp, sstrideq
9604    add                srcq, reg_tmp
9605%else
9606    imul            reg_tmp, sstridem
9607    mov             reg_src, srcm
9608    add             reg_src, reg_tmp
9609%endif
9610    ;
9611    ; ref += iclip(x, 0, iw - 1)
9612    lea             reg_tmp, [iwq-1]
9613    cmp                  xq, iwq
9614    cmovs           reg_tmp, xq
9615    test                 xq, xq
9616    cmovs           reg_tmp, reg_zero
9617    add             reg_src, reg_tmp
9618%if ARCH_X86_32
9619    mov                srcm, reg_src
9620%endif
9621    ;
9622    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
9623%if ARCH_X86_32
9624    mov                  r1, r1m ; restore bh
9625%endif
9626    lea       reg_bottomext, [yq+bhq]
9627    sub       reg_bottomext, ihq
9628    lea                  r3, [bhq-1]
9629    cmovs     reg_bottomext, reg_zero
9630    ;
9631
9632    DEFINE_ARGS bw, bh, iw, ih, x, \
9633                topext, dst, dstride, src, sstride, \
9634                bottomext, rightext, blk
9635
9636    ; top_ext = iclip(-y, 0, bh - 1)
9637    neg             topextq
9638    cmovs           topextq, reg_zero
9639    cmp       reg_bottomext, bhq
9640    cmovns    reg_bottomext, r3
9641    cmp             topextq, bhq
9642    cmovg           topextq, r3
9643 %if ARCH_X86_32
9644    mov                 r4m, reg_bottomext
9645    ;
9646    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
9647    mov                  r0, r0m ; restore bw
9648 %endif
9649    lea        reg_rightext, [xq+bwq]
9650    sub        reg_rightext, iwq
9651    lea                  r2, [bwq-1]
9652    cmovs      reg_rightext, reg_zero
9653
9654    DEFINE_ARGS bw, bh, iw, ih, leftext, \
9655                topext, dst, dstride, src, sstride, \
9656                bottomext, rightext, blk
9657
9658    ; left_ext = iclip(-x, 0, bw - 1)
9659    neg            leftextq
9660    cmovs          leftextq, reg_zero
9661    cmp        reg_rightext, bwq
9662    cmovns     reg_rightext, r2
9663 %if ARCH_X86_32
9664    mov                 r3m, r1
9665 %endif
9666    cmp            leftextq, bwq
9667    cmovns         leftextq, r2
9668
9669%undef reg_zero
9670%undef reg_tmp
9671%undef reg_src
9672%undef reg_bottomext
9673%undef reg_rightext
9674
9675    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
9676                topext, dst, dstride, src, sstride, \
9677                bottomext, rightext, blk
9678
9679    ; center_h = bh - top_ext - bottom_ext
9680%if ARCH_X86_64
9681    lea                  r3, [bottomextq+topextq]
9682    sub            centerhq, r3
9683%else
9684    mov                  r1, centerhm ; restore r1
9685    sub            centerhq, topextq
9686    sub            centerhq, r4m
9687    mov                 r1m, centerhq
9688%endif
9689    ;
9690    ; blk += top_ext * PXSTRIDE(dst_stride)
9691    mov                  r2, topextq
9692%if ARCH_X86_64
9693    imul                 r2, dstrideq
9694%else
9695    mov                  r6, r6m ; restore dstq
9696    imul                 r2, dstridem
9697%endif
9698    add                dstq, r2
9699    mov            reg_blkm, dstq ; save pointer for ext
9700    ;
9701    ; center_w = bw - left_ext - right_ext
9702    mov            centerwq, bwq
9703%if ARCH_X86_64
9704    lea                  r3, [rightextq+leftextq]
9705    sub            centerwq, r3
9706%else
9707    sub            centerwq, r3m
9708    sub            centerwq, leftextq
9709%endif
9710
9711; v_loop macro: fill the center rows, optionally extending the left/right edges
9712%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
9713  %if ARCH_X86_64
9714    %define reg_tmp        r12
9715  %else
9716    %define reg_tmp        r0
9717  %endif
9718.v_loop_%3:
9719  %if ARCH_X86_32
9720    mov                  r0, r0m
9721    mov                  r1, r1m
9722  %endif
9723%if %1
9724    ; left extension
9725  %if ARCH_X86_64
9726    movd                 m0, [srcq]
9727  %else
9728    mov                  r3, srcm
9729    movd                 m0, [r3]
9730  %endif
9731    pshufb               m0, m1
9732    xor                  r3, r3
9733.left_loop_%3:
9734    mova          [dstq+r3], m0
9735    add                  r3, mmsize
9736    cmp                  r3, leftextq
9737    jl .left_loop_%3
9738    ; body
9739    lea             reg_tmp, [dstq+leftextq]
9740%endif
9741    xor                  r3, r3
9742.body_loop_%3:
9743  %if ARCH_X86_64
9744    movu                 m0, [srcq+r3]
9745  %else
9746    mov                  r1, srcm
9747    movu                 m0, [r1+r3]
9748  %endif
9749%if %1
9750    movu       [reg_tmp+r3], m0
9751%else
9752    movu          [dstq+r3], m0
9753%endif
9754    add                  r3, mmsize
9755    cmp                  r3, centerwq
9756    jl .body_loop_%3
9757%if %2
9758    ; right extension
9759%if %1
9760    add             reg_tmp, centerwq
9761%else
9762    lea             reg_tmp, [dstq+centerwq]
9763%endif
9764  %if ARCH_X86_64
9765    movd                 m0, [srcq+centerwq-1]
9766  %else
9767    mov                  r3, srcm
9768    movd                 m0, [r3+centerwq-1]
9769  %endif
9770    pshufb               m0, m1
9771    xor                  r3, r3
9772.right_loop_%3:
9773    movu       [reg_tmp+r3], m0
9774    add                  r3, mmsize
9775  %if ARCH_X86_64
9776    cmp                  r3, rightextq
9777  %else
9778    cmp                  r3, r3m
9779  %endif
9780    jl .right_loop_%3
9781%endif
9782  %if ARCH_X86_64
9783    add                dstq, dstrideq
9784    add                srcq, sstrideq
9785    dec            centerhq
9786    jg .v_loop_%3
9787  %else
9788    add                dstq, dstridem
9789    mov                  r0, sstridem
9790    add                srcm, r0
9791    sub       dword centerhm, 1
9792    jg .v_loop_%3
9793    mov                  r0, r0m ; restore r0
9794  %endif
9795%endmacro ; v_loop
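; In C terms each v_loop specialization performs, per center row (a rough
; sketch; the asm rounds every run up to 16-byte stores, relying on the
; overwrite allowance noted above):
;
;   for (int y = 0; y < center_h; y++) {
;       memset(dst, src[0], left_ext);               // splat first pixel
;       memcpy(dst + left_ext, src, center_w);       // copy the body
;       memset(dst + left_ext + center_w,            // splat last pixel
;              src[center_w - 1], right_ext);
;       dst += dst_stride, src += src_stride;
;   }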
9796
9797    test           leftextq, leftextq
9798    jnz .need_left_ext
9799 %if ARCH_X86_64
9800    test          rightextq, rightextq
9801    jnz .need_right_ext
9802 %else
9803    cmp            leftextq, r3m ; leftextq is 0 here, so this tests rightextq (r3m)
9804    jne .need_right_ext
9805 %endif
9806    v_loop                0, 0, 0
9807    jmp .body_done
9808
9809    ; left and/or right extensions
9810.need_left_ext:
9811 %if ARCH_X86_64
9812    test          rightextq, rightextq
9813 %else
9814    mov                  r3, r3m
9815    test                 r3, r3
9816 %endif
9817    jnz .need_left_right_ext
9818    v_loop                1, 0, 1
9819    jmp .body_done
9820
9821.need_left_right_ext:
9822    v_loop                1, 1, 2
9823    jmp .body_done
9824
9825.need_right_ext:
9826    v_loop                0, 1, 3
9827
9828.body_done:
9829; r0: bw
9830; r1: x loop counter
9831; r4: y loop counter
9832; r5: topextq
9833; r6: dstq
9834; r7: dstrideq
9835; r8: srcq
9836%if ARCH_X86_64
9837 %define reg_dstride    dstrideq
9838%else
9839 %define reg_dstride    r2
9840%endif
9841    ;
9842    ; bottom edge extension
9843 %if ARCH_X86_64
9844    test         bottomextq, bottomextq
9845    jz .top
9846 %else
9847    xor                  r1, r1
9848    cmp                  r1, r4m
9849    je .top
9850 %endif
9851    ;
9852 %if ARCH_X86_64
9853    mov                srcq, dstq
9854    sub                srcq, dstrideq
9855    xor                  r1, r1
9856 %else
9857    mov                  r3, dstq
9858    mov         reg_dstride, dstridem
9859    sub                  r3, reg_dstride
9860    mov                srcm, r3
9861 %endif
9862    ;
9863.bottom_x_loop:
9864 %if ARCH_X86_64
9865    mova                 m0, [srcq+r1]
9866    lea                  r3, [dstq+r1]
9867    mov                  r4, bottomextq
9868 %else
9869    mov                  r3, srcm
9870    mova                 m0, [r3+r1]
9871    lea                  r3, [dstq+r1]
9872    mov                  r4, r4m
9873 %endif
9874    ;
9875.bottom_y_loop:
9876    mova               [r3], m0
9877    add                  r3, reg_dstride
9878    dec                  r4
9879    jg .bottom_y_loop
9880    add                  r1, mmsize
9881    cmp                  r1, bwq
9882    jl .bottom_x_loop
9883
9884.top:
9885    ; top edge extension
9886    test            topextq, topextq
9887    jz .end
9888%if ARCH_X86_64
9889    mov                srcq, reg_blkm
9890%else
9891    mov                  r3, reg_blkm
9892    mov         reg_dstride, dstridem
9893%endif
9894    mov                dstq, dstm
9895    xor                  r1, r1
9896    ;
9897.top_x_loop:
9898%if ARCH_X86_64
9899    mova                 m0, [srcq+r1]
9900%else
9901    mov                  r3, reg_blkm
9902    mova                 m0, [r3+r1]
9903%endif
9904    lea                  r3, [dstq+r1]
9905    mov                  r4, topextq
9906    ;
9907.top_y_loop:
9908    mova               [r3], m0
9909    add                  r3, reg_dstride
9910    dec                  r4
9911    jg .top_y_loop
9912    add                  r1, mmsize
9913    cmp                  r1, bwq
9914    jl .top_x_loop
9915
9916.end:
9917    RET
9918
9919%undef reg_dstride
9920%undef reg_blkm
9921%undef reg_tmp
9922
9923cextern resize_filter
9924
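; SCRATCH m%1 -> m%2: on x86-32 (only 8 XMM registers) spill the register to
; stack slot %3 and alias the new name to that memory; on x86-64 it is just a
; register rename via SWAP.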
9925%macro SCRATCH 3
9926%if ARCH_X86_32
9927    mova [rsp+%3*mmsize], m%1
9928%define m%2 [rsp+%3*mmsize]
9929%else
9930    SWAP             %1, %2
9931%endif
9932%endmacro
9933
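; Per output pixel the horizontal scaler computes, in rough C (a sketch;
; src_px() is a hypothetical clipped 8-tap load around mx >> 14; the exact
; tap alignment is handled by the mx0 bias and resize_shuf logic below):
;
;   int mx = mx0; // the asm pre-biases mx0 by 4<<14 to align the tap window
;   for (int x = 0; x < dst_w; x++, mx += dx) {
;       const int8_t *F = resize_filter[(mx >> 8) & 63]; // 64 phases, 8 taps
;       int px = 0;
;       for (int k = 0; k < 8; k++)
;           px -= F[k] * src_px(src, src_w, mx, k); // taps stored negated,
;       dst[x] = iclip((px + 64) >> 7, 0, 255);     // hence pw_m256 below
;   }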
9934%if ARCH_X86_64
9935cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
9936                                dst_w, h, src_w, dx, mx0
9937%elif STACK_ALIGNMENT >= 16
9938cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
9939                                      dst_w, h, src_w, dx, mx0
9940%else
9941cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
9942                                      dst_w, h, src_w, dx, mx0
9943%endif
9944    movifnidn          dstq, dstmp
9945    movifnidn          srcq, srcmp
9946%if STACK_ALIGNMENT >= 16
9947    movifnidn        dst_wd, dst_wm
9948%endif
9949%if ARCH_X86_64
9950    movifnidn            hd, hm
9951%endif
9952    sub          dword mx0m, 4<<14
9953    sub        dword src_wm, 8
9954    movd                 m7, dxm
9955    movd                 m6, mx0m
9956    movd                 m5, src_wm
9957    pshufd               m7, m7, q0000
9958    pshufd               m6, m6, q0000
9959    pshufd               m5, m5, q0000
9960
9961%if ARCH_X86_64
9962    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
9963    LEA                  r7, $$
9964%define base r7-$$
9965%else
9966    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
9967%define hd dword r5m
9968%if STACK_ALIGNMENT >= 16
9969    LEA                  r6, $$
9970%define base r6-$$
9971%else
9972    LEA                  r4, $$
9973%define base r4-$$
9974%endif
9975%endif
9976
9977%if ARCH_X86_64
9978    mova                m10, [base+pw_m256]
9979    mova                 m9, [base+pd_63]
9980    mova                 m8, [base+pb_8x0_8x8]
9981%else
9982%define m10 [base+pw_m256]
9983%define m9  [base+pd_63]
9984%define m8  [base+pb_8x0_8x8]
9985%endif
9986    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
9987    pslld                m7, 2                      ; dx*4
9988    pslld                m5, 14
9989    paddd                m6, m4                     ; mx+[0..3]*dx
9990    SCRATCH               7, 13, 0
9991    SCRATCH               6, 12, 1
9992    SCRATCH               5, 11, 2
9993
9994    ; m10 = pmulhrsw constant for x=(x+64)>>7
9995    ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = (src_w-8)<<14, m9 = 0x3f, m8 = pb_8x0_8x8
9996
9997.loop_y:
9998    xor                  xd, xd
9999    mova                 m0, m12                    ; per-line working version of mx
10000
10001.loop_x:
10002    pxor                 m1, m1
10003    pcmpgtd              m1, m0
10004    pandn                m1, m0
10005    psrad                m2, m0, 8                  ; filter offset (unmasked)
10006    pcmpgtd              m3, m11, m1
10007    pand                 m1, m3
10008    pandn                m3, m11
10009    por                  m1, m3
10010    psubd                m3, m0, m1                 ; pshufb offset
10011    psrad                m1, 14                     ; clipped src_x offset
10012    psrad                m3, 14                     ; pshufb edge_emu offset
10013    pand                 m2, m9                     ; filter offset (masked)
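    ; per lane, the above computes (scalar model):
    ;   mx_c = iclip(mx, 0, (src_w - 8) << 14) ; clipped against m11
    ;   m1   = mx_c >> 14          ; clipped source offset for the loads below
    ;   m3   = (mx - mx_c) >> 14   ; nonzero only where mx was clipped, used
    ;                              ; to pick a resize_shuf edge variant
    ;   m2   = (mx >> 8) & 63      ; filter phase, indexes resize_filter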
10014
10015    ; load source pixels
10016%if ARCH_X86_64
10017    movd                r8d, m1
10018    pshuflw              m1, m1, q3232
10019    movd                r9d, m1
10020    punpckhqdq           m1, m1
10021    movd               r10d, m1
10022    psrlq                m1, 32
10023    movd               r11d, m1
10024    movq                 m4, [srcq+r8]
10025    movq                 m5, [srcq+r10]
10026    movhps               m4, [srcq+r9]
10027    movhps               m5, [srcq+r11]
10028%else
10029    movd                r3d, m1
10030    pshufd               m1, m1, q3312
10031    movd                r1d, m1
10032    pshuflw              m1, m1, q3232
10033    movq                 m4, [srcq+r3]
10034    movq                 m5, [srcq+r1]
10035    movd                r3d, m1
10036    punpckhqdq           m1, m1
10037    movd                r1d, m1
10038    movhps               m4, [srcq+r3]
10039    movhps               m5, [srcq+r1]
10040%endif
10041
10042    ; if no emulation is required, we don't need to shuffle or emulate edges
10043    ; this also saves 2 quasi-vpgatherdqs
10044    pxor                 m6, m6
10045    pcmpeqb              m6, m3
10046%if ARCH_X86_64
10047    pmovmskb            r8d, m6
10048    cmp                 r8d, 0xffff
10049%else
10050    pmovmskb            r3d, m6
10051    cmp                 r3d, 0xffff
10052%endif
10053    je .filter
10054
10055%if ARCH_X86_64
10056    movd                r8d, m3
10057    pshuflw              m3, m3, q3232
10058    movd                r9d, m3
10059    punpckhqdq           m3, m3
10060    movd               r10d, m3
10061    psrlq                m3, 32
10062    movd               r11d, m3
10063    movsxd               r8, r8d
10064    movsxd               r9, r9d
10065    movsxd              r10, r10d
10066    movsxd              r11, r11d
10067    movq                 m6, [base+resize_shuf+4+r8]
10068    movq                 m7, [base+resize_shuf+4+r10]
10069    movhps               m6, [base+resize_shuf+4+r9]
10070    movhps               m7, [base+resize_shuf+4+r11]
10071%else
10072    movd                r3d, m3
10073    pshufd               m3, m3, q3312
10074    movd                r1d, m3
10075    pshuflw              m3, m3, q3232
10076    movq                 m6, [base+resize_shuf+4+r3]
10077    movq                 m7, [base+resize_shuf+4+r1]
10078    movd                r3d, m3
10079    punpckhqdq           m3, m3
10080    movd                r1d, m3
10081    movhps               m6, [base+resize_shuf+4+r3]
10082    movhps               m7, [base+resize_shuf+4+r1]
10083%endif
10084
10085    paddb                m6, m8
10086    paddb                m7, m8
10087    pshufb               m4, m6
10088    pshufb               m5, m7
10089
10090.filter:
10091%if ARCH_X86_64
10092    movd                r8d, m2
10093    pshuflw              m2, m2, q3232
10094    movd                r9d, m2
10095    punpckhqdq           m2, m2
10096    movd               r10d, m2
10097    psrlq                m2, 32
10098    movd               r11d, m2
10099    movq                 m6, [base+resize_filter+r8*8]
10100    movq                 m7, [base+resize_filter+r10*8]
10101    movhps               m6, [base+resize_filter+r9*8]
10102    movhps               m7, [base+resize_filter+r11*8]
10103%else
10104    movd                r3d, m2
10105    pshufd               m2, m2, q3312
10106    movd                r1d, m2
10107    pshuflw              m2, m2, q3232
10108    movq                 m6, [base+resize_filter+r3*8]
10109    movq                 m7, [base+resize_filter+r1*8]
10110    movd                r3d, m2
10111    punpckhqdq           m2, m2
10112    movd                r1d, m2
10113    movhps               m6, [base+resize_filter+r3*8]
10114    movhps               m7, [base+resize_filter+r1*8]
10115%endif
10116
10117    pmaddubsw            m4, m6
10118    pmaddubsw            m5, m7
10119    phaddw               m4, m5
10120    phaddsw              m4, m4
10121    pmulhrsw             m4, m10                    ; x=(x+64)>>7
10122    packuswb             m4, m4
10123    movd          [dstq+xq], m4
10124
10125    paddd                m0, m13
10126    add                  xd, 4
10127%if STACK_ALIGNMENT >= 16
10128    cmp                  xd, dst_wd
10129%else
10130    cmp                  xd, dst_wm
10131%endif
10132    jl .loop_x
10133
10134    add                dstq, dst_stridemp
10135    add                srcq, src_stridemp
10136    dec                  hd
10137    jg .loop_y
10138    RET
10139
10140INIT_XMM ssse3
10141WARP_AFFINE_8X8
10142
10143INIT_XMM sse4
10144WARP_AFFINE_8X8
10145