xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Worker
33*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_WEIGHT_TABLE 1-*
34*c0909341SAndroid Build Coastguard Worker    %rep %0
35*c0909341SAndroid Build Coastguard Worker        db %1-128, 127-%1
36*c0909341SAndroid Build Coastguard Worker        %rotate 1
37*c0909341SAndroid Build Coastguard Worker    %endrep
38*c0909341SAndroid Build Coastguard Worker%endmacro
39*c0909341SAndroid Build Coastguard Worker
40*c0909341SAndroid Build Coastguard Workersmooth_weights: SMOOTH_WEIGHT_TABLE         \
41*c0909341SAndroid Build Coastguard Worker      0,   0, 255, 128, 255, 149,  85,  64, \
42*c0909341SAndroid Build Coastguard Worker    255, 197, 146, 105,  73,  50,  37,  32, \
43*c0909341SAndroid Build Coastguard Worker    255, 225, 196, 170, 145, 123, 102,  84, \
44*c0909341SAndroid Build Coastguard Worker     68,  54,  43,  33,  26,  20,  17,  16, \
45*c0909341SAndroid Build Coastguard Worker    255, 240, 225, 210, 196, 182, 169, 157, \
46*c0909341SAndroid Build Coastguard Worker    145, 133, 122, 111, 101,  92,  83,  74, \
47*c0909341SAndroid Build Coastguard Worker     66,  59,  52,  45,  39,  34,  29,  25, \
48*c0909341SAndroid Build Coastguard Worker     21,  17,  14,  12,  10,   9,   8,   8, \
49*c0909341SAndroid Build Coastguard Worker    255, 248, 240, 233, 225, 218, 210, 203, \
50*c0909341SAndroid Build Coastguard Worker    196, 189, 182, 176, 169, 163, 156, 150, \
51*c0909341SAndroid Build Coastguard Worker    144, 138, 133, 127, 121, 116, 111, 106, \
52*c0909341SAndroid Build Coastguard Worker    101,  96,  91,  86,  82,  77,  73,  69, \
53*c0909341SAndroid Build Coastguard Worker     65,  61,  57,  54,  50,  47,  44,  41, \
54*c0909341SAndroid Build Coastguard Worker     38,  35,  32,  29,  27,  25,  22,  20, \
55*c0909341SAndroid Build Coastguard Worker     18,  16,  15,  13,  12,  10,   9,   8, \
56*c0909341SAndroid Build Coastguard Worker      7,   6,   6,   5,   5,   4,   4,   4
57*c0909341SAndroid Build Coastguard Worker
58*c0909341SAndroid Build Coastguard Worker; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
59*c0909341SAndroid Build Coastguard Workerfilter_taps:  db 10,  0,  0,  0,  2, 10,  0,  0,  1,  1, 10,  0,  1,  1,  2, 10
60*c0909341SAndroid Build Coastguard Worker              db  6,  0,  0,  0,  2,  6,  0,  0,  2,  2,  6,  0,  1,  2,  2,  6
61*c0909341SAndroid Build Coastguard Worker              db  0, 12, -6,  0,  0,  9, -5,  0,  0,  7, -3,  0,  0,  5, -3,  0
62*c0909341SAndroid Build Coastguard Worker              db 12,  2, -4,  0,  9,  2, -3,  0,  7,  2, -3,  0,  5,  3, -3,  0
63*c0909341SAndroid Build Coastguard Worker              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
64*c0909341SAndroid Build Coastguard Worker              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
65*c0909341SAndroid Build Coastguard Worker              db  0, 10,-10,  0,  0,  6, -6,  0,  0,  4, -4,  0,  0,  2, -2,  0
66*c0909341SAndroid Build Coastguard Worker              db 10,  0,-10,  0,  6,  0, -6,  0,  4,  0, -4,  0,  2,  0, -2,  0
67*c0909341SAndroid Build Coastguard Worker              db  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8
68*c0909341SAndroid Build Coastguard Worker              db  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4
69*c0909341SAndroid Build Coastguard Worker              db  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0
70*c0909341SAndroid Build Coastguard Worker              db 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0
71*c0909341SAndroid Build Coastguard Worker              db  8,  0,  0,  0,  3,  8,  0,  0,  2,  3,  8,  0,  1,  2,  3,  8
72*c0909341SAndroid Build Coastguard Worker              db  4,  0,  0,  0,  3,  4,  0,  0,  2,  3,  4,  0,  2,  2,  3,  4
73*c0909341SAndroid Build Coastguard Worker              db  0, 10, -2,  0,  0,  6, -1,  0,  0,  4, -1,  0,  0,  2,  0,  0
74*c0909341SAndroid Build Coastguard Worker              db 10,  3, -1,  0,  6,  4, -1,  0,  4,  4, -1,  0,  3,  3, -1,  0
75*c0909341SAndroid Build Coastguard Worker              db 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14
76*c0909341SAndroid Build Coastguard Worker              db 12,  0,  0,  0,  1, 12,  0,  0,  0,  0, 12,  0,  0,  0,  1, 12
77*c0909341SAndroid Build Coastguard Worker              db  0, 14,-12,  0,  0, 12,-10,  0,  0, 11, -9,  0,  0, 10, -8,  0
78*c0909341SAndroid Build Coastguard Worker              db 14,  0,-10,  0, 12,  0, -9,  0, 11,  1, -8,  0,  9,  1, -7,  0
79*c0909341SAndroid Build Coastguard Workerfilter_perm:  db  0,  1,  2,  3, 24, 25, 26, 27,  4,  5,  6,  7, 28, 29, 30, 31
80*c0909341SAndroid Build Coastguard Worker              db 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,131
81*c0909341SAndroid Build Coastguard Worker              db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
82*c0909341SAndroid Build Coastguard Worker              db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
83*c0909341SAndroid Build Coastguard Workerfilter_end:   dd  2,  3, 16, 17, -1, -1, 20, 21,  0,  6, 24, 30,  1,  7, 25, 31
84*c0909341SAndroid Build Coastguard Workersmooth_shuf:  db  7,  7,  7,  7,  0,  1,  0,  1,  3,  3,  3,  3,  8,  9,  8,  9
85*c0909341SAndroid Build Coastguard Worker              db  5,  5,  5,  5,  4,  5,  4,  5,  1,  1,  1,  1, 12, 13, 12, 13
86*c0909341SAndroid Build Coastguard Worker              db  6,  6,  6,  6,  2,  3,  2,  3,  2,  2,  2,  2, 10, 11, 10, 11
87*c0909341SAndroid Build Coastguard Worker              db  4,  4,  4,  4,  6,  7,  6,  7,  0,  0,  0,  0, 14, 15, 14, 15
88*c0909341SAndroid Build Coastguard Workersmooth_endA:  db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
89*c0909341SAndroid Build Coastguard Worker              db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
90*c0909341SAndroid Build Coastguard Worker              db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
91*c0909341SAndroid Build Coastguard Worker              db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
92*c0909341SAndroid Build Coastguard Workersmooth_endB:  db  1,  3,  5,  7,  9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
93*c0909341SAndroid Build Coastguard Worker              db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
94*c0909341SAndroid Build Coastguard Worker              db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
95*c0909341SAndroid Build Coastguard Worker              db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
96*c0909341SAndroid Build Coastguard Workeripred_h_shuf: db  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4
97*c0909341SAndroid Build Coastguard Worker              db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
98*c0909341SAndroid Build Coastguard Workerpal_unpack:   db  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
99*c0909341SAndroid Build Coastguard Workerpal_perm:     db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
100*c0909341SAndroid Build Coastguard Workerpb_63to0:     db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
101*c0909341SAndroid Build Coastguard Worker              db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
102*c0909341SAndroid Build Coastguard Worker              db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
103*c0909341SAndroid Build Coastguard Worker              db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
104*c0909341SAndroid Build Coastguard Workerz_frac_table: db 64,  0, 62,  2, 60,  4, 58,  6, 56,  8, 54, 10, 52, 12, 50, 14
105*c0909341SAndroid Build Coastguard Worker              db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
106*c0909341SAndroid Build Coastguard Worker              db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
107*c0909341SAndroid Build Coastguard Worker              db 16, 48, 14, 50, 12, 52, 10, 54,  8, 56,  6, 58,  4, 60,  2, 62
108*c0909341SAndroid Build Coastguard Workerz_filter_s1:  db -1, -1, -1,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6
109*c0909341SAndroid Build Coastguard Worker              db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
110*c0909341SAndroid Build Coastguard Worker              db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
111*c0909341SAndroid Build Coastguard Worker              db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
112*c0909341SAndroid Build Coastguard Workerz_filter_s5:  db 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
113*c0909341SAndroid Build Coastguard Worker              db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
114*c0909341SAndroid Build Coastguard Worker              db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
115*c0909341SAndroid Build Coastguard Worker              db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
116*c0909341SAndroid Build Coastguard Workerz_filter_s3:  db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
117*c0909341SAndroid Build Coastguard Workerz_filter_s2:  db  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
118*c0909341SAndroid Build Coastguard Workerz_filter_s4:  db  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8
119*c0909341SAndroid Build Coastguard Workerz_xpos_bc:    db 17, 17, 17, 17, 33, 33, 33, 33,  9,  9,  9,  9,  9,  9,  9,  9
120*c0909341SAndroid Build Coastguard Workerz_filter4_s1: db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
121*c0909341SAndroid Build Coastguard Worker              db  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8
122*c0909341SAndroid Build Coastguard Workerz_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
123*c0909341SAndroid Build Coastguard Workerz_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
124*c0909341SAndroid Build Coastguard Workerz_xpos_off2a: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
125*c0909341SAndroid Build Coastguard Worker              db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
126*c0909341SAndroid Build Coastguard Worker              db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
127*c0909341SAndroid Build Coastguard Worker              db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
128*c0909341SAndroid Build Coastguard Workerz_xpos_off2b: db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
129*c0909341SAndroid Build Coastguard Worker              db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
130*c0909341SAndroid Build Coastguard Worker              db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
131*c0909341SAndroid Build Coastguard Worker              db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
132*c0909341SAndroid Build Coastguard Workerz_xpos_mul:   dw  4,  4,  4,  4,  8,  8,  4,  4, 12, 12,  8,  8, 16, 16,  8,  8
133*c0909341SAndroid Build Coastguard Worker              dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
134*c0909341SAndroid Build Coastguard Workerz_ypos_off1:  db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
135*c0909341SAndroid Build Coastguard Worker              db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
136*c0909341SAndroid Build Coastguard Worker              db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
137*c0909341SAndroid Build Coastguard Worker              db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
138*c0909341SAndroid Build Coastguard Workerz_ypos_off2:  db 64, 65, 64, 65,  0,  0,  0,  0, 64, 65, 64, 65,  0,  0,  0,  0
139*c0909341SAndroid Build Coastguard Worker              db 65, 66, 65, 66,  1,  1,  1,  1, 65, 66, 65, 66,  1,  1,  1,  1
140*c0909341SAndroid Build Coastguard Worker              db 66, 67, 66, 67,  2,  2,  2,  2, 66, 67, 66, 67,  2,  2,  2,  2
141*c0909341SAndroid Build Coastguard Worker              db 67, 68, 67, 68,  3,  3,  3,  3, 67, 68, 67, 68,  3,  3,  3,  3
142*c0909341SAndroid Build Coastguard Workerz_ypos_off3:  db  1,  2,  1,  2,  1,  1,  1,  1,  3,  4,  3,  4,  1,  1,  1,  1
143*c0909341SAndroid Build Coastguard Worker              db  5,  6,  5,  6,  3,  3,  3,  3,  7,  8,  7,  8,  3,  3,  3,  3
144*c0909341SAndroid Build Coastguard Worker              db  9, 10,  9, 10,  5,  5,  5,  5, 11, 12, 11, 12,  5,  5,  5,  5
145*c0909341SAndroid Build Coastguard Worker              db 13, 14, 13, 14,  7,  7,  7,  7, 15, 16, 15, 16,  7,  7,  7,  7
146*c0909341SAndroid Build Coastguard Workerz_ypos_mul1a: dw  1,  2,  3,  4,  5,  6,  7,  8, 17, 18, 19, 20, 21, 22, 23, 24
147*c0909341SAndroid Build Coastguard Worker              dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
148*c0909341SAndroid Build Coastguard Workerz_ypos_mul1b: dw  9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
149*c0909341SAndroid Build Coastguard Worker              dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
150*c0909341SAndroid Build Coastguard Workerz_ypos_mul2a: dw  1*512,  2*512,  3*512,  4*512,  5*512,  6*512,  7*512,  8*512
151*c0909341SAndroid Build Coastguard Worker              dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
152*c0909341SAndroid Build Coastguard Worker              dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
153*c0909341SAndroid Build Coastguard Worker              dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
154*c0909341SAndroid Build Coastguard Workerz_ypos_mul2b: dw  9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
155*c0909341SAndroid Build Coastguard Worker              dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
156*c0909341SAndroid Build Coastguard Worker              dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
157*c0909341SAndroid Build Coastguard Worker              dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
158*c0909341SAndroid Build Coastguard Workerz_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
159*c0909341SAndroid Build Coastguard Workerz_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
160*c0909341SAndroid Build Coastguard Workerz3_upsample:  db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
161*c0909341SAndroid Build Coastguard Worker              db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8
162*c0909341SAndroid Build Coastguard Workerz_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
163*c0909341SAndroid Build Coastguard Worker              db 39, 39, 47, 47, 47, 79, 79, 79
164*c0909341SAndroid Build Coastguard Workerz_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
165*c0909341SAndroid Build Coastguard Worker              db 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16,  0
166*c0909341SAndroid Build Coastguard Worker              db  0, 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16
167*c0909341SAndroid Build Coastguard Worker
168*c0909341SAndroid Build Coastguard Workerpb_8_56_0_0:  db  8, 56,  0,  0
169*c0909341SAndroid Build Coastguard Workerpb_m4_36:     times 2 db -4, 36
170*c0909341SAndroid Build Coastguard Workerpb_127_m127:  times 2 db 127, -127
171*c0909341SAndroid Build Coastguard Workerpb_8:         times 4 db 8
172*c0909341SAndroid Build Coastguard Workerpb_15:        times 4 db 15
173*c0909341SAndroid Build Coastguard Workerpb_16:        times 4 db 16
174*c0909341SAndroid Build Coastguard Workerpb_31:        times 4 db 31
175*c0909341SAndroid Build Coastguard Workerpb_63:        times 4 db 63
176*c0909341SAndroid Build Coastguard Workerpb_90:        times 4 db 90
177*c0909341SAndroid Build Coastguard Workerpb_128:       times 4 db 128
178*c0909341SAndroid Build Coastguard Workerpw_128:       times 2 dw 128
179*c0909341SAndroid Build Coastguard Workerpw_255:       times 2 dw 255
180*c0909341SAndroid Build Coastguard Workerpw_512:       times 2 dw 512
181*c0909341SAndroid Build Coastguard Worker
182*c0909341SAndroid Build Coastguard Worker%define pb_1  (ipred_h_shuf+24)
183*c0909341SAndroid Build Coastguard Worker%define pb_2  (ipred_h_shuf+20)
184*c0909341SAndroid Build Coastguard Worker%define pb_3  (ipred_h_shuf+16)
185*c0909341SAndroid Build Coastguard Worker%define pb_4  (smooth_shuf +48)
186*c0909341SAndroid Build Coastguard Worker%define pb_7  (ipred_h_shuf+ 0)
187*c0909341SAndroid Build Coastguard Worker%define pb_9  (z_xpos_bc   + 8)
188*c0909341SAndroid Build Coastguard Worker%define pb_17 (z_xpos_bc   + 0)
189*c0909341SAndroid Build Coastguard Worker%define pb_33 (z_xpos_bc   + 4)
190*c0909341SAndroid Build Coastguard Worker%define pd_8  (filter_taps+128)
191*c0909341SAndroid Build Coastguard Worker
192*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 3-*
193*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*4)
194*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_%2)
195*c0909341SAndroid Build Coastguard Worker    %%table:
196*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
197*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .%3 - (%%table - 2*4)
198*c0909341SAndroid Build Coastguard Worker        %rotate 1
199*c0909341SAndroid Build Coastguard Worker    %endrep
200*c0909341SAndroid Build Coastguard Worker%endmacro
201*c0909341SAndroid Build Coastguard Worker
202*c0909341SAndroid Build Coastguard Worker%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
203*c0909341SAndroid Build Coastguard Worker
204*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_h_8bpc,          avx512icl, w4, w8, w16, w32, w64
205*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_paeth_8bpc,      avx512icl, w4, w8, w16, w32, w64
206*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_8bpc,     avx512icl, w4, w8, w16, w32, w64
207*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_v_8bpc,   avx512icl, w4, w8, w16, w32, w64
208*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_h_8bpc,   avx512icl, w4, w8, w16, w32, w64
209*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z1_8bpc,         avx512icl, w4, w8, w16, w32, w64
210*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z2_8bpc,         avx512icl, w4, w8, w16, w32, w64
211*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z3_8bpc,         avx512icl, w4, w8, w16, w32, w64
212*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_8bpc,         avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
213*c0909341SAndroid Build Coastguard Worker                                       s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
214*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_left_8bpc,    avx512icl, h4, h8, h16, h32, h64
215*c0909341SAndroid Build Coastguard Worker
216*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative
217*c0909341SAndroid Build Coastguard Workercextern pb_0to63
218*c0909341SAndroid Build Coastguard Worker
219*c0909341SAndroid Build Coastguard WorkerSECTION .text
220*c0909341SAndroid Build Coastguard Worker
221*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
222*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
223*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_8bpc_avx512icl_table]
224*c0909341SAndroid Build Coastguard Worker    movd                xm0, wm
225*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
226*c0909341SAndroid Build Coastguard Worker    inc                 tlq
227*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
228*c0909341SAndroid Build Coastguard Worker    movu                ym1, [tlq]
229*c0909341SAndroid Build Coastguard Worker    movd               xmm3, wd
230*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+wq*4]
231*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
232*c0909341SAndroid Build Coastguard Worker    psrld               xm0, 1
233*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym2
234*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
235*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
236*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
237*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
238*c0909341SAndroid Build Coastguard Worker    jmp                  r6
239*c0909341SAndroid Build Coastguard Worker
240*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
241*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_8bpc_avx512icl_table]
242*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
243*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
244*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
245*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
246*c0909341SAndroid Build Coastguard Worker    movd                xm0, hm
247*c0909341SAndroid Build Coastguard Worker    movu                ym1, [tlq]
248*c0909341SAndroid Build Coastguard Worker    movd               xmm3, r6d
249*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
250*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
251*c0909341SAndroid Build Coastguard Worker    psrld               xm0, 1
252*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym2
253*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
254*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
255*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
256*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
257*c0909341SAndroid Build Coastguard Worker    jmp                  r6
258*c0909341SAndroid Build Coastguard Worker.h64:
259*c0909341SAndroid Build Coastguard Worker    movu                ym1, [tlq+32] ; unaligned when jumping here from dc_top
260*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym2
261*c0909341SAndroid Build Coastguard Worker.h32:
262*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
263*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
264*c0909341SAndroid Build Coastguard Worker.h16:
265*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
266*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
267*c0909341SAndroid Build Coastguard Worker.h8:
268*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
269*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
270*c0909341SAndroid Build Coastguard Worker.h4:
271*c0909341SAndroid Build Coastguard Worker    vpsrlvd             xm0, xmm3
272*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
273*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
274*c0909341SAndroid Build Coastguard Worker    jmp                  wq
275*c0909341SAndroid Build Coastguard Worker
276*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
277*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
278*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
279*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
280*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq+hq]
281*c0909341SAndroid Build Coastguard Worker    movd                xm0, r5d
282*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, r5d
283*c0909341SAndroid Build Coastguard Worker    movd               xmm4, r5d
284*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_8bpc_avx512icl_table]
285*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
286*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
287*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4+5*4]
288*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
289*c0909341SAndroid Build Coastguard Worker    psrld               xm0, 1
290*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
291*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
292*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
293*c0909341SAndroid Build Coastguard Worker    jmp                  r6
294*c0909341SAndroid Build Coastguard Worker.h4:
295*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [tlq-4]
296*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
297*c0909341SAndroid Build Coastguard Worker    jmp                  wq
298*c0909341SAndroid Build Coastguard Worker.w4:
299*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [tlq+1]
300*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
301*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
302*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
303*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, xm0, 3
304*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
305*c0909341SAndroid Build Coastguard Worker.w4_mul:
306*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm1, xm0, xm0
307*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
308*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x55563334
309*c0909341SAndroid Build Coastguard Worker    paddd              xmm1, xm0
310*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
311*c0909341SAndroid Build Coastguard Worker    psrlq              xmm0, xmm1, 32
312*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
313*c0909341SAndroid Build Coastguard Worker    movd               xmm1, r6d
314*c0909341SAndroid Build Coastguard Worker    psrld              xmm0, 2
315*c0909341SAndroid Build Coastguard Worker    pmulhuw            xmm0, xmm1
316*c0909341SAndroid Build Coastguard Worker.w4_end:
317*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xmm0
318*c0909341SAndroid Build Coastguard Worker.s4:
319*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
320*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm0
321*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
322*c0909341SAndroid Build Coastguard Worker    movd   [dstq+stride3q ], xm0
323*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
324*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
325*c0909341SAndroid Build Coastguard Worker    jg .s4
326*c0909341SAndroid Build Coastguard Worker    RET
327*c0909341SAndroid Build Coastguard Worker.h8:
328*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tlq-8]
329*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
330*c0909341SAndroid Build Coastguard Worker    jmp                  wq
331*c0909341SAndroid Build Coastguard Worker.w8:
332*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tlq+1]
333*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
334*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
335*c0909341SAndroid Build Coastguard Worker    paddd              xmm2, xm2, xm0
336*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm0, xmm2, xmm2
337*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm2
338*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, xmm0, 32
339*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
340*c0909341SAndroid Build Coastguard Worker    vpsrlvd            xmm0, xmm4
341*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
342*c0909341SAndroid Build Coastguard Worker    je .w8_end
343*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
344*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
345*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
346*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d
347*c0909341SAndroid Build Coastguard Worker    movd               xmm1, r6d
348*c0909341SAndroid Build Coastguard Worker    pmulhuw            xmm0, xmm1
349*c0909341SAndroid Build Coastguard Worker.w8_end:
350*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xmm0
351*c0909341SAndroid Build Coastguard Worker.s8:
352*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
353*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
354*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
355*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm0
356*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
357*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
358*c0909341SAndroid Build Coastguard Worker    jg .s8
359*c0909341SAndroid Build Coastguard Worker    RET
360*c0909341SAndroid Build Coastguard Worker.h16:
361*c0909341SAndroid Build Coastguard Worker    mova               xmm1, [tlq-16]
362*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
363*c0909341SAndroid Build Coastguard Worker    jmp                  wq
364*c0909341SAndroid Build Coastguard Worker.w16:
365*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [tlq+1]
366*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
367*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xmm1, xm3
368*c0909341SAndroid Build Coastguard Worker    paddd              xmm2, xm2, xm0
369*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm0, xmm2, xmm2
370*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm2
371*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, xmm0, 32
372*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
373*c0909341SAndroid Build Coastguard Worker    vpsrlvd            xmm0, xmm4
374*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
375*c0909341SAndroid Build Coastguard Worker    je .w16_end
376*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
377*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
378*c0909341SAndroid Build Coastguard Worker    test                 hb, 8|32
379*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
380*c0909341SAndroid Build Coastguard Worker    movd               xmm1, r6d
381*c0909341SAndroid Build Coastguard Worker    pmulhuw            xmm0, xmm1
382*c0909341SAndroid Build Coastguard Worker.w16_end:
383*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xmm0
384*c0909341SAndroid Build Coastguard Worker.s16:
385*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm0
386*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm0
387*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm0
388*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], xm0
389*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
390*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
391*c0909341SAndroid Build Coastguard Worker    jg .s16
392*c0909341SAndroid Build Coastguard Worker    RET
393*c0909341SAndroid Build Coastguard Worker.h32:
394*c0909341SAndroid Build Coastguard Worker    mova                ym1, [tlq-32]
395*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym3
396*c0909341SAndroid Build Coastguard Worker    jmp                  wq
397*c0909341SAndroid Build Coastguard Worker.w32:
398*c0909341SAndroid Build Coastguard Worker    movu                ym1, [tlq+1]
399*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym3
400*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
401*c0909341SAndroid Build Coastguard Worker    paddd              xmm1, xm1, xm0
402*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm0, xmm1, xmm1
403*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
404*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, xmm0, 32
405*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
406*c0909341SAndroid Build Coastguard Worker    vpsrlvd            xmm0, xmm4
407*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
408*c0909341SAndroid Build Coastguard Worker    je .w32_end
409*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
410*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x33345556
411*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
412*c0909341SAndroid Build Coastguard Worker    movd               xmm1, r6d
413*c0909341SAndroid Build Coastguard Worker    pmulhuw            xmm0, xmm1
414*c0909341SAndroid Build Coastguard Worker.w32_end:
415*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym0, xmm0
416*c0909341SAndroid Build Coastguard Worker.s32:
417*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], ym0
418*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], ym0
419*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], ym0
420*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], ym0
421*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
422*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
423*c0909341SAndroid Build Coastguard Worker    jg .s32
424*c0909341SAndroid Build Coastguard Worker    RET
425*c0909341SAndroid Build Coastguard Worker.h64:
426*c0909341SAndroid Build Coastguard Worker    mova                ym1, [tlq-64]
427*c0909341SAndroid Build Coastguard Worker    mova                ym2, [tlq-32]
428*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym3
429*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym2, ym3
430*c0909341SAndroid Build Coastguard Worker    jmp                  wq
431*c0909341SAndroid Build Coastguard Worker.w64:
432*c0909341SAndroid Build Coastguard Worker    movu                ym1, [tlq+ 1]
433*c0909341SAndroid Build Coastguard Worker    movu                ym2, [tlq+33]
434*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym3
435*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym2, ym3
436*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
437*c0909341SAndroid Build Coastguard Worker    paddd              xmm1, xm1, xm0
438*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm0, xmm1, xmm1
439*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
440*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, xmm0, 32
441*c0909341SAndroid Build Coastguard Worker    paddd              xmm0, xmm1
442*c0909341SAndroid Build Coastguard Worker    vpsrlvd            xmm0, xmm4
443*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
444*c0909341SAndroid Build Coastguard Worker    je .w64_end
445*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x33345556
446*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, hd
447*c0909341SAndroid Build Coastguard Worker    movd               xmm1, r6d
448*c0909341SAndroid Build Coastguard Worker    pmulhuw            xmm0, xmm1
449*c0909341SAndroid Build Coastguard Worker.w64_end:
450*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xmm0
451*c0909341SAndroid Build Coastguard Worker.s64:
452*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
453*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
454*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
455*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m0
456*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
457*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
458*c0909341SAndroid Build Coastguard Worker    jg .s64
459*c0909341SAndroid Build Coastguard Worker    RET
460*c0909341SAndroid Build Coastguard Worker
461*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
462*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_8bpc_avx512icl_table]
463*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
464*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
465*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
466*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
467*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
468*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
469*c0909341SAndroid Build Coastguard Worker    jmp                  wq
470*c0909341SAndroid Build Coastguard Worker
471*c0909341SAndroid Build Coastguard Workercglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
472*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_8bpc_avx512icl_table]
473*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
474*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+1]
475*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
476*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
477*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
478*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
479*c0909341SAndroid Build Coastguard Worker    jmp                  wq
480*c0909341SAndroid Build Coastguard Worker
481*c0909341SAndroid Build Coastguard Workercglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
482*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_h_8bpc_avx512icl_table
483*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_h_8bpc_avx512icl_table]
484*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
485*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
486*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
487*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
488*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
489*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
490*c0909341SAndroid Build Coastguard Worker    jmp                  wq
491*c0909341SAndroid Build Coastguard Worker.w4:
492*c0909341SAndroid Build Coastguard Worker    mova               xmm1, [base+ipred_h_shuf+16]
493*c0909341SAndroid Build Coastguard Worker.w4_loop:
494*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [tlq+hq-4]
495*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm1
496*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xmm0
497*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xmm0, 1
498*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xmm0, 2
499*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xmm0, 3
500*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
501*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
502*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
503*c0909341SAndroid Build Coastguard Worker    RET
504*c0909341SAndroid Build Coastguard Worker.w8:
505*c0909341SAndroid Build Coastguard Worker    movsldup           xmm2, [base+ipred_h_shuf+16]
506*c0909341SAndroid Build Coastguard Worker    movshdup           xmm3, [base+ipred_h_shuf+16]
507*c0909341SAndroid Build Coastguard Worker.w8_loop:
508*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [tlq+hq-4]
509*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm1, xmm2
510*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xmm3
511*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xmm0
512*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xmm1
513*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xmm0
514*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xmm1
515*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
516*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
517*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
518*c0909341SAndroid Build Coastguard Worker    RET
519*c0909341SAndroid Build Coastguard Worker.w16:
520*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+smooth_shuf]
521*c0909341SAndroid Build Coastguard Worker.w16_loop:
522*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
523*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1
524*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
525*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
526*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
527*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
528*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
529*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
530*c0909341SAndroid Build Coastguard Worker    jg .w16
531*c0909341SAndroid Build Coastguard Worker    RET
532*c0909341SAndroid Build Coastguard Worker.w32:
533*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [base+pb_1]
534*c0909341SAndroid Build Coastguard Worker    vpord                m2, m3, [base+pb_2] {1to16}
535*c0909341SAndroid Build Coastguard Worker.w32_loop:
536*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq-4]
537*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m2
538*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
539*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
540*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
541*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
542*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
543*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
544*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
545*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
546*c0909341SAndroid Build Coastguard Worker    RET
547*c0909341SAndroid Build Coastguard Worker.w64:
548*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pb_3]
549*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_2]
550*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_1]
551*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
552*c0909341SAndroid Build Coastguard Worker.w64_loop:
553*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [tlq+hq-4]
554*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m4
555*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m5
556*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m6
557*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
558*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
559*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
560*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
561*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
562*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
563*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
564*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
565*c0909341SAndroid Build Coastguard Worker    RET
566*c0909341SAndroid Build Coastguard Worker
567*c0909341SAndroid Build Coastguard Worker%macro PAETH 0
568*c0909341SAndroid Build Coastguard Worker    psubusb              m1, m5, m4
569*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m4, m5
570*c0909341SAndroid Build Coastguard Worker    por                  m1, m0           ; tdiff
571*c0909341SAndroid Build Coastguard Worker    pavgb                m2, m6, m4
572*c0909341SAndroid Build Coastguard Worker    vpcmpub              k1, m1, m7, 1    ; tdiff < ldiff
573*c0909341SAndroid Build Coastguard Worker    vpblendmb        m0{k1}, m4, m6
574*c0909341SAndroid Build Coastguard Worker    vpternlogd           m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
575*c0909341SAndroid Build Coastguard Worker    psubusb              m3, m5, m2
576*c0909341SAndroid Build Coastguard Worker    psubb                m2, m4
577*c0909341SAndroid Build Coastguard Worker    psubusb              m2, m5
578*c0909341SAndroid Build Coastguard Worker    por                  m2, m3
579*c0909341SAndroid Build Coastguard Worker    pminub               m1, m7
580*c0909341SAndroid Build Coastguard Worker    paddusb              m2, m2
581*c0909341SAndroid Build Coastguard Worker    por                  m2, m4           ; min(tldiff, 255)
582*c0909341SAndroid Build Coastguard Worker    vpcmpub              k1, m2, m1, 1    ; tldiff < ldiff && tldiff < tdiff
583*c0909341SAndroid Build Coastguard Worker    vmovdqu8         m0{k1}, m5
584*c0909341SAndroid Build Coastguard Worker%endmacro
585*c0909341SAndroid Build Coastguard Worker
586*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
587*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_paeth_8bpc_avx512icl_table]
588*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
589*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, [tlq] ; topleft
590*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
591*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
592*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
593*c0909341SAndroid Build Coastguard Worker    lea                topq, [tlq+1]
594*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
595*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
596*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
597*c0909341SAndroid Build Coastguard Worker    jmp                  wq
598*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
599*c0909341SAndroid Build Coastguard Worker.w4:
600*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [topq]
601*c0909341SAndroid Build Coastguard Worker    mova                 m9, [ipred_h_shuf]
602*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
603*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
604*c0909341SAndroid Build Coastguard Worker    por                  m7, m0 ; ldiff
605*c0909341SAndroid Build Coastguard Worker.w4_loop:
606*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [tlq+hq-8]
607*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m9 ; left
608*c0909341SAndroid Build Coastguard Worker    PAETH
609*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
610*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
611*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
612*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm0, 3
613*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
614*c0909341SAndroid Build Coastguard Worker    jl .w4_ret
615*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 1
616*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
617*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
618*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
619*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
620*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm0, 3
621*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
622*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
623*c0909341SAndroid Build Coastguard Worker.w4_ret:
624*c0909341SAndroid Build Coastguard Worker    RET
625*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
626*c0909341SAndroid Build Coastguard Worker.w8:
627*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [topq]
628*c0909341SAndroid Build Coastguard Worker    movsldup             m9, [smooth_shuf]
629*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
630*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
631*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
632*c0909341SAndroid Build Coastguard Worker.w8_loop:
633*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [tlq+hq-8]
634*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m9
635*c0909341SAndroid Build Coastguard Worker    PAETH
636*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 2
637*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
638*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
639*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
640*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
641*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
642*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
643*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
644*c0909341SAndroid Build Coastguard Worker    jl .w8_ret
645*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
646*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
647*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
648*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
649*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
650*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
651*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
652*c0909341SAndroid Build Coastguard Worker.w8_ret:
653*c0909341SAndroid Build Coastguard Worker    RET
654*c0909341SAndroid Build Coastguard Worker.w16:
655*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [topq]
656*c0909341SAndroid Build Coastguard Worker    movsldup             m9, [smooth_shuf]
657*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
658*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
659*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
660*c0909341SAndroid Build Coastguard Worker.w16_loop:
661*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [tlq+hq-4]
662*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m9
663*c0909341SAndroid Build Coastguard Worker    PAETH
664*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
665*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
666*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
667*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
668*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
669*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
670*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
671*c0909341SAndroid Build Coastguard Worker    RET
672*c0909341SAndroid Build Coastguard Worker.w32:
673*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [topq]
674*c0909341SAndroid Build Coastguard Worker    mova                ym9, ym8
675*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
676*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
677*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
678*c0909341SAndroid Build Coastguard Worker.w32_loop:
679*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [tlq+hq-2]
680*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m9
681*c0909341SAndroid Build Coastguard Worker    PAETH
682*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
683*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
684*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
685*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
686*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
687*c0909341SAndroid Build Coastguard Worker    RET
688*c0909341SAndroid Build Coastguard Worker.w64:
689*c0909341SAndroid Build Coastguard Worker    movu                 m6, [topq]
690*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
691*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
692*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
693*c0909341SAndroid Build Coastguard Worker.w64_loop:
694*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m4, [tlq+hq-1]
695*c0909341SAndroid Build Coastguard Worker    PAETH
696*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
697*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
698*c0909341SAndroid Build Coastguard Worker    dec                  hd
699*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
700*c0909341SAndroid Build Coastguard Worker    RET
701*c0909341SAndroid Build Coastguard Worker
702*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
703*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_v_8bpc_avx512icl_table
704*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_smooth_v_8bpc_avx512icl_table]
705*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
706*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
707*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
708*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_127_m127]
709*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_128]
710*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [base+smooth_weights+hq*4]
711*c0909341SAndroid Build Coastguard Worker    neg                  hq
712*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m4, [tlq+hq] ; bottom
713*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
714*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
715*c0909341SAndroid Build Coastguard Worker    jmp                  wq
716*c0909341SAndroid Build Coastguard Worker.w4:
717*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+1]
718*c0909341SAndroid Build Coastguard Worker    movshdup             m5, [smooth_shuf]
719*c0909341SAndroid Build Coastguard Worker    mova                ym6, [smooth_endA]
720*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4 ; top, bottom
721*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0
722*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
723*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
724*c0909341SAndroid Build Coastguard Worker.w4_loop:
725*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [weightsq+hq*2]
726*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
727*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m0
728*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
729*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m6, m0
730*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
731*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
732*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
733*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
734*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 2
735*c0909341SAndroid Build Coastguard Worker    add                  hq, 8
736*c0909341SAndroid Build Coastguard Worker    jg .ret
737*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
738*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
739*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
740*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
741*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
742*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
743*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
744*c0909341SAndroid Build Coastguard Worker.ret:
745*c0909341SAndroid Build Coastguard Worker    RET
746*c0909341SAndroid Build Coastguard Worker.w8:
747*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [tlq+1]
748*c0909341SAndroid Build Coastguard Worker    movshdup             m5, [smooth_shuf]
749*c0909341SAndroid Build Coastguard Worker    mova                ym6, [smooth_endA]
750*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
751*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0
752*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
753*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
754*c0909341SAndroid Build Coastguard Worker.w8_loop:
755*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [weightsq+hq*2]
756*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
757*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m0
758*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
759*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m6, m0
760*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
761*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
762*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
763*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
764*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
765*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
766*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
767*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
768*c0909341SAndroid Build Coastguard Worker    RET
769*c0909341SAndroid Build Coastguard Worker.w16:
770*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [tlq+1]
771*c0909341SAndroid Build Coastguard Worker    movshdup             m6, [smooth_shuf]
772*c0909341SAndroid Build Coastguard Worker    mova                 m7, [smooth_endB]
773*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
774*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
775*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
776*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
777*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
778*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
779*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
780*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
781*c0909341SAndroid Build Coastguard Worker.w16_loop:
782*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [weightsq+hq*2]
783*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
784*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m1
785*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m1
786*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
787*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
788*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m7, m1
789*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
790*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
791*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
792*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
793*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
794*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
795*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
796*c0909341SAndroid Build Coastguard Worker    RET
797*c0909341SAndroid Build Coastguard Worker.w32:
798*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m3, [tlq+1]
799*c0909341SAndroid Build Coastguard Worker    movshdup             m6, [smooth_shuf]
800*c0909341SAndroid Build Coastguard Worker    mova                 m7, [smooth_endB]
801*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
802*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
803*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
804*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
805*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
806*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
807*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
808*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
809*c0909341SAndroid Build Coastguard Worker.w32_loop:
810*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [weightsq+hq*2]
811*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
812*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m1
813*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m1
814*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
815*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
816*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m7, m1
817*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
818*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
819*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
820*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
821*c0909341SAndroid Build Coastguard Worker    jl .w32_loop
822*c0909341SAndroid Build Coastguard Worker    RET
823*c0909341SAndroid Build Coastguard Worker.w64:
824*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+1]
825*c0909341SAndroid Build Coastguard Worker    mova                 m6, [smooth_endB]
826*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
827*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
828*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
829*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
830*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
831*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
832*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
833*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
834*c0909341SAndroid Build Coastguard Worker.w64_loop:
835*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [weightsq+hq*2]
836*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m1
837*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m1
838*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
839*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
840*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m6, m1
841*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
842*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
843*c0909341SAndroid Build Coastguard Worker    inc                  hq
844*c0909341SAndroid Build Coastguard Worker    jl .w64_loop
845*c0909341SAndroid Build Coastguard Worker    RET
846*c0909341SAndroid Build Coastguard Worker
847*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
848*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_smooth_h_8bpc_avx512icl_table
849*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_smooth_h_8bpc_avx512icl_table]
850*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
851*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
852*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m4, [tlq+r6] ; right
853*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
854*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
855*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_127_m127]
856*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_128]
857*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
858*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
859*c0909341SAndroid Build Coastguard Worker    vpmovb2m             k1, m6
860*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
861*c0909341SAndroid Build Coastguard Worker    jmp                  wq
862*c0909341SAndroid Build Coastguard Worker.w4:
863*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [smooth_shuf]
864*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [smooth_weights+4*2]
865*c0909341SAndroid Build Coastguard Worker    mova                ym8, [smooth_endA]
866*c0909341SAndroid Build Coastguard Worker.w4_loop:
867*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [tlq+hq-8]
868*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
869*c0909341SAndroid Build Coastguard Worker    vpshufb          m2{k1}, m0, m3 ; left, right
870*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5
871*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m7
872*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
873*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
874*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
875*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
876*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
877*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
878*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
879*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
880*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 2
881*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
882*c0909341SAndroid Build Coastguard Worker    jl .ret
883*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
884*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
885*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
886*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
887*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
888*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
889*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
890*c0909341SAndroid Build Coastguard Worker.ret:
891*c0909341SAndroid Build Coastguard Worker    RET
892*c0909341SAndroid Build Coastguard Worker.w8:
893*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [smooth_shuf]
894*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [smooth_weights+8*2]
895*c0909341SAndroid Build Coastguard Worker    mova                ym8, [smooth_endA]
896*c0909341SAndroid Build Coastguard Worker.w8_loop:
897*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
898*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
899*c0909341SAndroid Build Coastguard Worker    vpshufb          m2{k1}, m0, m3
900*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5
901*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m7
902*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
903*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
904*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
905*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
906*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
907*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
908*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
909*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
910*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
911*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
912*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
913*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
914*c0909341SAndroid Build Coastguard Worker    RET
915*c0909341SAndroid Build Coastguard Worker.w16:
916*c0909341SAndroid Build Coastguard Worker    movsldup             m7, [smooth_shuf]
917*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [smooth_weights+16*2]
918*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [smooth_weights+16*3]
919*c0909341SAndroid Build Coastguard Worker    mova                m10, [smooth_endB]
920*c0909341SAndroid Build Coastguard Worker.w16_loop:
921*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
922*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
923*c0909341SAndroid Build Coastguard Worker    vpshufb          m3{k1}, m0, m7
924*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m5
925*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3, m8
926*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m9
927*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
928*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
929*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
930*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
931*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m10, m1
932*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
933*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
934*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
935*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
936*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
937*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
938*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
939*c0909341SAndroid Build Coastguard Worker    RET
940*c0909341SAndroid Build Coastguard Worker.w32:
941*c0909341SAndroid Build Coastguard Worker    mova                m10, [smooth_endA]
942*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym7, [pb_1]
943*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m8, [smooth_weights+32*2]
944*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m9, [smooth_weights+32*3]
945*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m10, m10, q3120
946*c0909341SAndroid Build Coastguard Worker.w32_loop:
947*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-2]
948*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
949*c0909341SAndroid Build Coastguard Worker    vpshufb          m3{k1}, m0, m7
950*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m5
951*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3, m8
952*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m9
953*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
954*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
955*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
956*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
957*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m10, m1
958*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
959*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
960*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
961*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
962*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
963*c0909341SAndroid Build Coastguard Worker    RET
964*c0909341SAndroid Build Coastguard Worker.w64:
965*c0909341SAndroid Build Coastguard Worker    mova                 m7, [smooth_weights+64*2]
966*c0909341SAndroid Build Coastguard Worker    mova                 m8, [smooth_weights+64*3]
967*c0909341SAndroid Build Coastguard Worker    mova                 m9, [smooth_endA]
968*c0909341SAndroid Build Coastguard Worker.w64_loop:
969*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
970*c0909341SAndroid Build Coastguard Worker    vpbroadcastb     m3{k1}, [tlq+hq-1]
971*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m5
972*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3, m7
973*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3, m8
974*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
975*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
976*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
977*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
978*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m9, m1
979*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
980*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
981*c0909341SAndroid Build Coastguard Worker    dec                  hd
982*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
983*c0909341SAndroid Build Coastguard Worker    RET
984*c0909341SAndroid Build Coastguard Worker
985*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
986*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_smooth_8bpc_avx512icl_table
987*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_smooth_8bpc_avx512icl_table]
988*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
989*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
990*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
991*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m6, [tlq+r6] ; right
992*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
993*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
994*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pb_127_m127]
995*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [tlq]    ; bottom
996*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_255]
997*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
998*c0909341SAndroid Build Coastguard Worker    lea          v_weightsq, [base+smooth_weights+hq*2]
999*c0909341SAndroid Build Coastguard Worker    vpmovb2m             k1, m1
1000*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1001*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1002*c0909341SAndroid Build Coastguard Worker.w4:
1003*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [tlq+hq+1]
1004*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [smooth_shuf]
1005*c0909341SAndroid Build Coastguard Worker    movshdup             m5, [smooth_shuf]
1006*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [smooth_weights+4*2]
1007*c0909341SAndroid Build Coastguard Worker    mova               ym11, [smooth_endA]
1008*c0909341SAndroid Build Coastguard Worker
1009*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0     ; top, bottom
1010*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m8, m7
1011*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8     ;   1 * top + 256 * bottom + 255
1012*c0909341SAndroid Build Coastguard Worker    paddw               m10, m1     ; 128 * top + 129 * bottom + 255
1013*c0909341SAndroid Build Coastguard Worker.w4_loop:
1014*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [tlq+hq-8]
1015*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [v_weightsq]
1016*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 16
1017*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
1018*c0909341SAndroid Build Coastguard Worker    vpshufb          m2{k1}, m1, m4 ; left, right
1019*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m7 ; 127 * left - 127 * right
1020*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1021*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8, m0
1022*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2     ; 128 * left + 129 * right
1023*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
1024*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
1025*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1026*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m1
1027*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m11, m0
1028*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1029*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
1030*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
1031*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
1032*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 2
1033*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1034*c0909341SAndroid Build Coastguard Worker    jl .ret
1035*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1036*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
1037*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
1038*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
1039*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
1040*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1041*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1042*c0909341SAndroid Build Coastguard Worker.ret:
1043*c0909341SAndroid Build Coastguard Worker    RET
1044*c0909341SAndroid Build Coastguard Worker.w8:
1045*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [tlq+hq+1]
1046*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [smooth_shuf]
1047*c0909341SAndroid Build Coastguard Worker    movshdup             m5, [smooth_shuf]
1048*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [smooth_weights+8*2]
1049*c0909341SAndroid Build Coastguard Worker    mova               ym11, [smooth_endA]
1050*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0
1051*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m8, m7
1052*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
1053*c0909341SAndroid Build Coastguard Worker    paddw               m10, m1
1054*c0909341SAndroid Build Coastguard Worker.w8_loop:
1055*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq-4]
1056*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [v_weightsq]
1057*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 8
1058*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
1059*c0909341SAndroid Build Coastguard Worker    vpshufb          m2{k1}, m1, m4
1060*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m7
1061*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1062*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8, m0
1063*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1064*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
1065*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
1066*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1067*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m1
1068*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m11, m0
1069*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1070*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1071*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
1072*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
1073*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
1074*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1075*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1076*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
1077*c0909341SAndroid Build Coastguard Worker    RET
1078*c0909341SAndroid Build Coastguard Worker.w16:
1079*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [tlq+hq+1]
1080*c0909341SAndroid Build Coastguard Worker    movsldup             m5, [smooth_shuf]
1081*c0909341SAndroid Build Coastguard Worker    movshdup            m10, [smooth_shuf]
1082*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [smooth_weights+16*2]
1083*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m12, [smooth_weights+16*3]
1084*c0909341SAndroid Build Coastguard Worker    mova                m15, [smooth_endB]
1085*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m0
1086*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m0
1087*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m8, m7
1088*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9, m7
1089*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m8
1090*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
1091*c0909341SAndroid Build Coastguard Worker    paddw               m13, m0
1092*c0909341SAndroid Build Coastguard Worker    paddw               m14, m1
1093*c0909341SAndroid Build Coastguard Worker.w16_loop:
1094*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
1095*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [v_weightsq]
1096*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 8
1097*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
1098*c0909341SAndroid Build Coastguard Worker    vpshufb          m4{k1}, m0, m5
1099*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4, m7
1100*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
1101*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8, m1
1102*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9, m1
1103*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1104*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4, m11
1105*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m12
1106*c0909341SAndroid Build Coastguard Worker    paddw                m0, m13
1107*c0909341SAndroid Build Coastguard Worker    paddw                m1, m14
1108*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1109*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
1110*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m3
1111*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m4
1112*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m15, m1
1113*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1114*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
1115*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
1116*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
1117*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1118*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1119*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
1120*c0909341SAndroid Build Coastguard Worker    RET
1121*c0909341SAndroid Build Coastguard Worker.w32:
1122*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m9, [tlq+hq+1]
1123*c0909341SAndroid Build Coastguard Worker    movshdup            m10, [smooth_shuf]
1124*c0909341SAndroid Build Coastguard Worker    mova                m12, [smooth_weights+32*2]
1125*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym5, [pb_1]
1126*c0909341SAndroid Build Coastguard Worker    mova                m15, [smooth_endB]
1127*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m0
1128*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m0
1129*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m8, m7
1130*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9, m7
1131*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m11, m12, m12, q2020
1132*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m12, m12, q3131
1133*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m8
1134*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
1135*c0909341SAndroid Build Coastguard Worker    paddw               m13, m0
1136*c0909341SAndroid Build Coastguard Worker    paddw               m14, m1
1137*c0909341SAndroid Build Coastguard Worker.w32_loop:
1138*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-2]
1139*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [v_weightsq]
1140*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 4
1141*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
1142*c0909341SAndroid Build Coastguard Worker    vpshufb          m4{k1}, m0, m5
1143*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4, m7
1144*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
1145*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8, m1
1146*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9, m1
1147*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1148*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4, m11
1149*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m12
1150*c0909341SAndroid Build Coastguard Worker    paddw                m0, m13
1151*c0909341SAndroid Build Coastguard Worker    paddw                m1, m14
1152*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1153*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
1154*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m3
1155*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m4
1156*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m15, m1
1157*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1158*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1159*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1160*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1161*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1162*c0909341SAndroid Build Coastguard Worker    RET
1163*c0909341SAndroid Build Coastguard Worker.w64:
1164*c0909341SAndroid Build Coastguard Worker    movu                 m9, [tlq+hq+1]
1165*c0909341SAndroid Build Coastguard Worker    mova                m11, [smooth_weights+64*2]
1166*c0909341SAndroid Build Coastguard Worker    mova                 m2, [smooth_weights+64*3]
1167*c0909341SAndroid Build Coastguard Worker    mova                m14, [smooth_endB]
1168*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m0
1169*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m0
1170*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m8, m7
1171*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9, m7
1172*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m10, m11, m2, q2020
1173*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m11, m2, q3131
1174*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m8
1175*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
1176*c0909341SAndroid Build Coastguard Worker    paddw               m12, m0
1177*c0909341SAndroid Build Coastguard Worker    paddw               m13, m1
1178*c0909341SAndroid Build Coastguard Worker.w64_loop:
1179*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
1180*c0909341SAndroid Build Coastguard Worker    vpbroadcastb     m4{k1}, [tlq+hq-1]
1181*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [v_weightsq]
1182*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2
1183*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4, m7
1184*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8, m1
1185*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9, m1
1186*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1187*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4, m10
1188*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m11
1189*c0909341SAndroid Build Coastguard Worker    paddw                m0, m12
1190*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
1191*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1192*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
1193*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m3
1194*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m4
1195*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m14, m1
1196*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1197*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1198*c0909341SAndroid Build Coastguard Worker    dec                  hd
1199*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
1200*c0909341SAndroid Build Coastguard Worker    RET
1201*c0909341SAndroid Build Coastguard Worker
1202*c0909341SAndroid Build Coastguard Workercglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
1203*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
1204*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1205*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1206*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
1207*c0909341SAndroid Build Coastguard Worker    jg .w32
1208*c0909341SAndroid Build Coastguard Worker    movq               xmm3, [palq]
1209*c0909341SAndroid Build Coastguard Worker    je .w8
1210*c0909341SAndroid Build Coastguard Worker.w4:
1211*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [idxq]
1212*c0909341SAndroid Build Coastguard Worker    add                idxq, 8
1213*c0909341SAndroid Build Coastguard Worker    psrlw              xmm1, xmm0, 4
1214*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1
1215*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm3, xmm0
1216*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xmm0
1217*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xmm0, 1
1218*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xmm0, 2
1219*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xmm0, 3
1220*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1221*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1222*c0909341SAndroid Build Coastguard Worker    jg .w4
1223*c0909341SAndroid Build Coastguard Worker    RET
1224*c0909341SAndroid Build Coastguard Worker.w8:
1225*c0909341SAndroid Build Coastguard Worker    movu               xmm2, [idxq]
1226*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
1227*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xmm3, xmm2
1228*c0909341SAndroid Build Coastguard Worker    psrlw              xmm2, 4
1229*c0909341SAndroid Build Coastguard Worker    pshufb             xmm2, xmm3, xmm2
1230*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1, xmm2
1231*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm1, xmm2
1232*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xmm0
1233*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xmm0
1234*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xmm1
1235*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xmm1
1236*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1237*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1238*c0909341SAndroid Build Coastguard Worker    jg .w8
1239*c0909341SAndroid Build Coastguard Worker    RET
1240*c0909341SAndroid Build Coastguard Worker.w16:
1241*c0909341SAndroid Build Coastguard Worker    pmovzxdq             m0, [idxq]
1242*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
1243*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m3, m0
1244*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m0
1245*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1246*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
1247*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
1248*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
1249*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1250*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1251*c0909341SAndroid Build Coastguard Worker    jg .w16
1252*c0909341SAndroid Build Coastguard Worker    RET
1253*c0909341SAndroid Build Coastguard Worker.w32:
1254*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [pal_unpack+0]
1255*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [palq]
1256*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
1257*c0909341SAndroid Build Coastguard Worker    jl .w16
1258*c0909341SAndroid Build Coastguard Worker    pmovzxbd             m2, [pal_perm]
1259*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [pal_unpack+8]
1260*c0909341SAndroid Build Coastguard Worker    jg .w64
1261*c0909341SAndroid Build Coastguard Worker.w32_loop:
1262*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m2, [idxq]
1263*c0909341SAndroid Build Coastguard Worker    add                idxq, 64
1264*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m3, m1
1265*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m4, m1
1266*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m0
1267*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5, m1
1268*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1269*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1270*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
1271*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
1272*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1273*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1274*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1275*c0909341SAndroid Build Coastguard Worker    RET
1276*c0909341SAndroid Build Coastguard Worker.w64:
1277*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m2, [idxq]
1278*c0909341SAndroid Build Coastguard Worker    add                idxq, 64
1279*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m3, m1
1280*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m4, m1
1281*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m0
1282*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5, m1
1283*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
1284*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
1285*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1286*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1287*c0909341SAndroid Build Coastguard Worker    jg .w64
1288*c0909341SAndroid Build Coastguard Worker    RET
1289*c0909341SAndroid Build Coastguard Worker
1290*c0909341SAndroid Build Coastguard Worker%if WIN64
1291*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4
1292*c0909341SAndroid Build Coastguard Worker%else
1293*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8
1294*c0909341SAndroid Build Coastguard Worker%endif
1295*c0909341SAndroid Build Coastguard Worker
1296*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
1297*c0909341SAndroid Build Coastguard Worker%define base r7-z_filter_t0
1298*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]
1299*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
1300*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1301*c0909341SAndroid Build Coastguard Worker    lea                  t0, [dr_intra_derivative]
1302*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
1303*c0909341SAndroid Build Coastguard Worker    inc                 tlq
1304*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
1305*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e
1306*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
1307*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [t0+dxq]
1308*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
1309*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1310*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
1311*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+z_frac_table]
1312*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_512]
1313*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1314*c0909341SAndroid Build Coastguard Worker.w4:
1315*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
1316*c0909341SAndroid Build Coastguard Worker    pminud               m8, m9, [base+pb_7] {1to16}
1317*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [tlq]
1318*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m8
1319*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
1320*c0909341SAndroid Build Coastguard Worker    jae .w4_no_upsample
1321*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
1322*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
1323*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
1324*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1325*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xm7, [base+z_filter_s4]
1326*c0909341SAndroid Build Coastguard Worker    mova               xmm1, [tlq-1]
1327*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, [base+z_xpos_off2a]
1328*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [base+pb_m4_36]
1329*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [pb_0to63]
1330*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
1331*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xmm2
1332*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1333*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
1334*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
1335*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xmm0, xm15
1336*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
1337*c0909341SAndroid Build Coastguard Worker    punpcklbw       ym7{k1}, ym0
1338*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
1339*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
1340*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1341*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
1342*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1343*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, r3d
1344*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, angled
1345*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1346*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, xm0, [base+z_filter_wh]
1347*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
1348*c0909341SAndroid Build Coastguard Worker    kmovw               r5d, k1
1349*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1350*c0909341SAndroid Build Coastguard Worker    jz .w4_main
1351*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [tlq-1]
1352*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, [base+z_filter4_s1]
1353*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; filter_strength
1354*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym7, [z_filter_s4]
1355*c0909341SAndroid Build Coastguard Worker    pshufb              ym7, [base+z_filter_s3]
1356*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym11, [base+z_filter_k+(r5-1)*4+12*0]
1357*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, [base+z_filter_k+(r5-1)*4+12*1]
1358*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym11
1359*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym11
1360*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym7, ym12
1361*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
1362*c0909341SAndroid Build Coastguard Worker    paddw               ym7, ym0
1363*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym7, ym15
1364*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1365*c0909341SAndroid Build Coastguard Worker    je .w4_filter_end
1366*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pb_9]
1367*c0909341SAndroid Build Coastguard Worker    pminub               m8, m9
1368*c0909341SAndroid Build Coastguard Worker.w4_filter_end:
1369*c0909341SAndroid Build Coastguard Worker    paddb                m8, m8
1370*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m8, m7
1371*c0909341SAndroid Build Coastguard Worker.w4_main:
1372*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [base+z_xpos_off1a]
1373*c0909341SAndroid Build Coastguard Worker.w4_main2:
1374*c0909341SAndroid Build Coastguard Worker    movsldup             m2, [base+z_xpos_mul]
1375*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, dxd
1376*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_xpos_bc]
1377*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1378*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5      ; xpos
1379*c0909341SAndroid Build Coastguard Worker    psllw                m5, 5       ; dx*8
1380*c0909341SAndroid Build Coastguard Worker.w4_loop:
1381*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m2, 3
1382*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m3
1383*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m14 ; 64-frac, frac
1384*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m4      ; base, base+1
1385*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m0, m7  ; top[base], top[base+1]
1386*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5      ; xpos += dx
1387*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1      ; v
1388*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1389*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1390*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1391*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
1392*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
1393*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
1394*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm1, 1
1395*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1396*c0909341SAndroid Build Coastguard Worker    jl .w4_end
1397*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 2 ; top[max_base_x]
1398*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1399*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
1400*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
1401*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
1402*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
1403*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 1
1404*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1405*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1406*c0909341SAndroid Build Coastguard Worker.w4_end:
1407*c0909341SAndroid Build Coastguard Worker    RET
1408*c0909341SAndroid Build Coastguard Worker.w8_filter:
1409*c0909341SAndroid Build Coastguard Worker    mova                ym0, [base+z_filter_s1]
1410*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d
1411*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym1, [base+z_filter_s2]
1412*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [base+z_filter_s3]
1413*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [base+z_filter_s4]
1414*c0909341SAndroid Build Coastguard Worker    vpermi2b            ym0, ym7, ym2 ; al bl
1415*c0909341SAndroid Build Coastguard Worker    mova                ym5, [base+z_filter_s5]
1416*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym7, ym1 ; ah bh
1417*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym11, [base+z_filter_k+(r5-1)*4+12*0]
1418*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym7, ym3 ; cl ch
1419*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, [base+z_filter_k+(r5-1)*4+12*1]
1420*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym7, ym4 ; el dl
1421*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym13, [base+z_filter_k+(r5-1)*4+12*2]
1422*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym5, ym7 ; eh dh
1423*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym11
1424*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym11
1425*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym2, ym3, ym12
1426*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym3, ym13
1427*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym4, ym11
1428*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym5, ym11
1429*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym2
1430*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym3
1431*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym4
1432*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym5
1433*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym15
1434*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym15
1435*c0909341SAndroid Build Coastguard Worker    packuswb            ym0, ym1
1436*c0909341SAndroid Build Coastguard Worker    ret
1437*c0909341SAndroid Build Coastguard Worker.w8:
1438*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
1439*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
1440*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1441*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1442*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
1443*c0909341SAndroid Build Coastguard Worker    mova                xm1, [base+z_filter_s4]
1444*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm2, r3d
1445*c0909341SAndroid Build Coastguard Worker    mova                xm7, [tlq-1]
1446*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym7, [tlq+7], 1
1447*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [base+z_xpos_off1a]
1448*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [base+pb_m4_36]
1449*c0909341SAndroid Build Coastguard Worker    pminub              xm2, xm1
1450*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym7, ym0
1451*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, xm2, 1
1452*c0909341SAndroid Build Coastguard Worker    psrldq              ym7, 1
1453*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym7, ym1
1454*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym3
1455*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym3
1456*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [pb_0to63]
1457*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1458*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
1459*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym15
1460*c0909341SAndroid Build Coastguard Worker    packuswb            ym0, ym0
1461*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym7, ym0
1462*c0909341SAndroid Build Coastguard Worker    jmp .w8_main2
1463*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
1464*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1465*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
1466*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym0, r3d
1467*c0909341SAndroid Build Coastguard Worker    and                 r3d, 7
1468*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [tlq]
1469*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; imin(h+7, 15)
1470*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m8, r3d
1471*c0909341SAndroid Build Coastguard Worker    pminub               m8, m9
1472*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m8
1473*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1474*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1475*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled
1476*c0909341SAndroid Build Coastguard Worker    shr              angled, 8
1477*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym0, [base+z_filter_wh]
1478*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_t0+angleq*8]
1479*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym0
1480*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
1481*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1482*c0909341SAndroid Build Coastguard Worker    jz .w8_main
1483*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [tlq-4]
1484*c0909341SAndroid Build Coastguard Worker    call .w8_filter
1485*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
1486*c0909341SAndroid Build Coastguard Worker    jle .w8_filter_end
1487*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pb_17]
1488*c0909341SAndroid Build Coastguard Worker    add                 r3d, 2
1489*c0909341SAndroid Build Coastguard Worker    pminub               m8, m9
1490*c0909341SAndroid Build Coastguard Worker.w8_filter_end:
1491*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m8, m0
1492*c0909341SAndroid Build Coastguard Worker.w8_main:
1493*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [base+z_xpos_off1a]
1494*c0909341SAndroid Build Coastguard Worker.w8_main2:
1495*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [base+z_xpos_mul]
1496*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, dxd
1497*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1498*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+z_xpos_bc+8*0]
1499*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m9 ; xpos
1500*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+z_xpos_bc+8*1]
1501*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1502*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 3
1503*c0909341SAndroid Build Coastguard Worker    psllw                m9, 5 ; dx*8
1504*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1505*c0909341SAndroid Build Coastguard Worker.w8_loop:
1506*c0909341SAndroid Build Coastguard Worker    psrlw                m3, m4, 3
1507*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m5
1508*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m6
1509*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m3, m14
1510*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m8
1511*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m8
1512*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m0, m7
1513*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m1, m7
1514*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m9
1515*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m3
1516*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1517*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
1518*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1519*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1520*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1521*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1522*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1523*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1524*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1525*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
1526*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
1527*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1528*c0909341SAndroid Build Coastguard Worker    jl .w8_end
1529*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym0, m0, 1
1530*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1531*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1532*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1533*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1534*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
1535*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
1536*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1537*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1538*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1539*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
1540*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm7, m7, 3
1541*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
1542*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm7
1543*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm7
1544*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm7
1545*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r2       ], xm7
1546*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1547*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1548*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
1549*c0909341SAndroid Build Coastguard Worker.w8_end:
1550*c0909341SAndroid Build Coastguard Worker    RET
1551*c0909341SAndroid Build Coastguard Worker.w16_filter:
1552*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z_filter_s1]
1553*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d
1554*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [base+z_filter_s2]
1555*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_filter_s3]
1556*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_filter_s4]
1557*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m7, m2 ; al bl
1558*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+z_filter_s5]
1559*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7, m1 ; ah bh
1560*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+z_filter_k+(r5-1)*4+12*0]
1561*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7, m3 ; cl ch
1562*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+z_filter_k+(r5-1)*4+12*1]
1563*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7, m4 ; el dl
1564*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+z_filter_k+(r5-1)*4+12*2]
1565*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m5, m7 ; eh dh
1566*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
1567*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
1568*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m12
1569*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m13
1570*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m11
1571*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m11
1572*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1573*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1574*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
1575*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
1576*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1577*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1578*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1579*c0909341SAndroid Build Coastguard Worker    ret
1580*c0909341SAndroid Build Coastguard Worker.w16:
1581*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
1582*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
1583*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym0, r3d
1584*c0909341SAndroid Build Coastguard Worker    and                 r3d, 15
1585*c0909341SAndroid Build Coastguard Worker    movu                ym7, [tlq]
1586*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; imin(h+15, 31)
1587*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m8, r3d
1588*c0909341SAndroid Build Coastguard Worker    pminub               m8, m9
1589*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m8, m7
1590*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1591*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
1592*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled
1593*c0909341SAndroid Build Coastguard Worker    shr              angled, 8
1594*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym0, [base+z_filter_wh]
1595*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_t0+angleq*8]
1596*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym0
1597*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
1598*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1599*c0909341SAndroid Build Coastguard Worker    jz .w16_main
1600*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
1601*c0909341SAndroid Build Coastguard Worker    call .w16_filter
1602*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
1603*c0909341SAndroid Build Coastguard Worker    jle .w16_filter_end
1604*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pb_33]
1605*c0909341SAndroid Build Coastguard Worker    add                 r3d, 2
1606*c0909341SAndroid Build Coastguard Worker    pminub               m8, m9
1607*c0909341SAndroid Build Coastguard Worker.w16_filter_end:
1608*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m8, m0
1609*c0909341SAndroid Build Coastguard Worker.w16_main:
1610*c0909341SAndroid Build Coastguard Worker    movshdup             m3, [base+z_xpos_mul]
1611*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, dxd
1612*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1613*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_xpos_bc]
1614*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m8 ; xpos
1615*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [base+z_xpos_off1a]
1616*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1617*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 2
1618*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [base+z_xpos_off1b]
1619*c0909341SAndroid Build Coastguard Worker    psllw                m8, 4 ; dx*4
1620*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1621*c0909341SAndroid Build Coastguard Worker.w16_loop:
1622*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m4
1623*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m3, 3
1624*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m1, m5
1625*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m2, m14
1626*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m6
1627*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m0, m7
1628*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m1, m7
1629*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8
1630*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1631*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1632*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1633*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1634*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1635*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1636*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
1637*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
1638*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m0, 3
1639*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1640*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1641*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1642*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1643*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
1644*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm7, m7, 3
1645*c0909341SAndroid Build Coastguard Worker.w16_end_loop:
1646*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm7
1647*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm7
1648*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm7
1649*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], xm7
1650*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1651*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1652*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
1653*c0909341SAndroid Build Coastguard Worker.w16_end:
1654*c0909341SAndroid Build Coastguard Worker    RET
1655*c0909341SAndroid Build Coastguard Worker.w32_filter:
1656*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z_filter_s1]
1657*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [base+z_filter_s2]
1658*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_filter_s3]
1659*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_filter_s4]
1660*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m7, m2 ; al bl
1661*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+z_filter_s5]
1662*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7, m1 ; ah bh
1663*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
1664*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7, m3 ; cl ch
1665*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
1666*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7, m4 ; el dl
1667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
1668*c0909341SAndroid Build Coastguard Worker    vpermi2b             m5, m7, m8 ; eh dh
1669*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
1670*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
1671*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m12
1672*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m13
1673*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m11
1674*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m11
1675*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1676*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1677*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
1678*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
1679*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1680*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1681*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m0, m1
1682*c0909341SAndroid Build Coastguard Worker    ret
1683*c0909341SAndroid Build Coastguard Worker.w32:
1684*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31]
1685*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m9, r3d
1686*c0909341SAndroid Build Coastguard Worker    and                 r3d, 31
1687*c0909341SAndroid Build Coastguard Worker    pminub              m10, m9, [pb_0to63]
1688*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32 ; imin(h+31, 63)
1689*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m10, [tlq]
1690*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m8, [tlq+r3]
1691*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1692*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
1693*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
1694*c0909341SAndroid Build Coastguard Worker    call .w32_filter
1695*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1696*c0909341SAndroid Build Coastguard Worker    je .w32_h64_filter_end
1697*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m9, m7
1698*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m10, m7
1699*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
1700*c0909341SAndroid Build Coastguard Worker.w32_h64_filter_end: ; edge case for 32x64
1701*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [tlq+r3-1]
1702*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [base+pb_8_56_0_0]
1703*c0909341SAndroid Build Coastguard Worker    add                 r3d, 2
1704*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm1
1705*c0909341SAndroid Build Coastguard Worker    vptestmw             k1, xmm1, xmm1 ; 0x01
1706*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xmm0, xm15
1707*c0909341SAndroid Build Coastguard Worker    vmovdqu8         m8{k1}, m0
1708*c0909341SAndroid Build Coastguard Worker.w32_main:
1709*c0909341SAndroid Build Coastguard Worker    rorx                r2d, dxd, 30
1710*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_xpos_bc]
1711*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r2d
1712*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m5, [base+z_xpos_off2a]
1713*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1714*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [base+z_xpos_off2b]
1715*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1716*c0909341SAndroid Build Coastguard Worker    paddw                m9, m3, m3
1717*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1718*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, ym9, 1
1719*c0909341SAndroid Build Coastguard Worker.w32_loop:
1720*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m4
1721*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m3, 3
1722*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m1, m5
1723*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m2, m14
1724*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m6
1725*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m7, m8
1726*c0909341SAndroid Build Coastguard Worker    vpermi2b             m1, m7, m8
1727*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m9
1728*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1729*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1730*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1731*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1732*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1733*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1734*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1735*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1736*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1737*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1738*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1739*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1740*c0909341SAndroid Build Coastguard Worker    punpckhqdq          ym8, ym8
1741*c0909341SAndroid Build Coastguard Worker.w32_end_loop:
1742*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], ym8
1743*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], ym8
1744*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1745*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1746*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1747*c0909341SAndroid Build Coastguard Worker.w32_end:
1748*c0909341SAndroid Build Coastguard Worker    RET
1749*c0909341SAndroid Build Coastguard Worker.w64_filter:
1750*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_filter_s2]
1751*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z_filter_s1]
1752*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3      ; al bl
1753*c0909341SAndroid Build Coastguard Worker    vpermi2b             m1, m7, m2
1754*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_filter_s4]
1755*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m8, m4  ; el dl
1756*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m7, m4
1757*c0909341SAndroid Build Coastguard Worker    pminub              m10, m13, [base+z_filter_s5]
1758*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m8, m3  ; ah bh
1759*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7, m3
1760*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [base+z_filter_s3]
1761*c0909341SAndroid Build Coastguard Worker    vpermb              m10, m10, m8 ; eh dh
1762*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m4
1763*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k+4*2+12*0]
1764*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m5      ; cl ch
1765*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5
1766*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+z_filter_k+4*2+12*1]
1767*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
1768*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m8, m5
1769*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7, m5
1770*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
1771*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+z_filter_k+4*2+12*2]
1772*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
1773*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m6
1774*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m6
1775*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
1776*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
1777*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
1778*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
1779*c0909341SAndroid Build Coastguard Worker    paddw                m2, m8
1780*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7
1781*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m0, m2, m1, m3
1782*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
1783*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m1, m3
1784*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m12, m0
1785*c0909341SAndroid Build Coastguard Worker    ret
1786*c0909341SAndroid Build Coastguard Worker.w64:
1787*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
1788*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+64*0]
1789*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        m13, r3d
1790*c0909341SAndroid Build Coastguard Worker    pminub              m12, m13, [pb_0to63]
1791*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
1792*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m12, [tlq+64*1]
1793*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1794*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
1795*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+56]
1796*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
1797*c0909341SAndroid Build Coastguard Worker    movu                m11, [tlq+8]
1798*c0909341SAndroid Build Coastguard Worker    call .w64_filter
1799*c0909341SAndroid Build Coastguard Worker.w64_main:
1800*c0909341SAndroid Build Coastguard Worker    rorx                r2d, dxd, 30
1801*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_xpos_bc]
1802*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r2d
1803*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+z_xpos_off2a]
1804*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1805*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+z_xpos_off2b]
1806*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1807*c0909341SAndroid Build Coastguard Worker    mova                 m9, m3
1808*c0909341SAndroid Build Coastguard Worker.w64_loop:
1809*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m4
1810*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m3, 3
1811*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m1, m5
1812*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m2, m14
1813*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m6
1814*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m7, m8
1815*c0909341SAndroid Build Coastguard Worker    vpermi2b             m1, m7, m8
1816*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m9
1817*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1818*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1819*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
1820*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
1821*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1822*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1823*c0909341SAndroid Build Coastguard Worker    dec                  hd
1824*c0909341SAndroid Build Coastguard Worker    jz .w64_end
1825*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1826*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
1827*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
1828*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m13, m8
1829*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
1830*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m8
1831*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1832*c0909341SAndroid Build Coastguard Worker    dec                  hd
1833*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
1834*c0909341SAndroid Build Coastguard Worker.w64_end:
1835*c0909341SAndroid Build Coastguard Worker    RET
1836*c0909341SAndroid Build Coastguard Worker
1837*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
1838*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
1839*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1840*c0909341SAndroid Build Coastguard Worker    lea                 dxq, [dr_intra_derivative-90]
1841*c0909341SAndroid Build Coastguard Worker    movzx               dyd, angleb
1842*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
1843*c0909341SAndroid Build Coastguard Worker    mov                  r7, dxq
1844*c0909341SAndroid Build Coastguard Worker    sub                 dxq, dyq
1845*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1846*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1
1847*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1
1848*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r7+dyq]  ; angle - 90
1849*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]
1850*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [dxq+270] ; 180 - angle
1851*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
1852*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pb_63to0]
1853*c0909341SAndroid Build Coastguard Worker    neg                 dyd
1854*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m8, [tlq-64] ; left
1855*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
1856*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+z_frac_table]
1857*c0909341SAndroid Build Coastguard Worker    inc                 tlq
1858*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_512]
1859*c0909341SAndroid Build Coastguard Worker    neg                 dxd
1860*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1861*c0909341SAndroid Build Coastguard Worker.w4:
1862*c0909341SAndroid Build Coastguard Worker    movd                xm7, [tlq]
1863*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m10, [base+z_xpos_off2a]
1864*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1865*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
1866*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1867*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
1868*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1869*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1870*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
1871*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [base+pb_4]
1872*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
1873*c0909341SAndroid Build Coastguard Worker    call .upsample_above
1874*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1875*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m10, [pb_0to63+1]
1876*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm7, xm0, xm7
1877*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1878*c0909341SAndroid Build Coastguard Worker    jmp .w4_filter_left
1879*c0909341SAndroid Build Coastguard Worker.w4_upsample_left:
1880*c0909341SAndroid Build Coastguard Worker    call .upsample_left
1881*c0909341SAndroid Build Coastguard Worker    movsldup            m16, [base+z_ypos_off3]
1882*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_16]
1883*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm8, xm0, xm8
1884*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
1885*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
1886*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1887*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
1888*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1889*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1890*c0909341SAndroid Build Coastguard Worker    jz .w4_no_filter_above
1891*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+pb_3]
1892*c0909341SAndroid Build Coastguard Worker    call .filter_top_w16
1893*c0909341SAndroid Build Coastguard Worker.w4_no_filter_above:
1894*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1895*c0909341SAndroid Build Coastguard Worker    add              angled, 973 ; angle + 883
1896*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1897*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1898*c0909341SAndroid Build Coastguard Worker    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
1899*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pb_90]
1900*c0909341SAndroid Build Coastguard Worker    psubb               ym0, ym17
1901*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k2{k2}, ym0, ym16
1902*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k2
1903*c0909341SAndroid Build Coastguard Worker.w4_filter_left:
1904*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1905*c0909341SAndroid Build Coastguard Worker    jz .w4_main
1906*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
1907*c0909341SAndroid Build Coastguard Worker    call .filter_left_h16
1908*c0909341SAndroid Build Coastguard Worker.w4_main:
1909*c0909341SAndroid Build Coastguard Worker    movsldup            m16, [base+z_ypos_off1]
1910*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_8]
1911*c0909341SAndroid Build Coastguard Worker.w4_main2:
1912*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [base+z_ypos_mul1a]
1913*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
1914*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_xpos_mul]
1915*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, dxd
1916*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [tlq-16], 3
1917*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m8, [tlq-16], 3
1918*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m0
1919*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [base+z_xpos_bc]
1920*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5      ; xpos0..3
1921*c0909341SAndroid Build Coastguard Worker    psllw                m5, 5       ; dx*8
1922*c0909341SAndroid Build Coastguard Worker    psraw                m4, m3, 6
1923*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 1
1924*c0909341SAndroid Build Coastguard Worker    packsswb             m4, m4
1925*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m3, m14 ; 64-frac, frac
1926*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m4
1927*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1928*c0909341SAndroid Build Coastguard Worker    paddb                m4, m16     ; base, base+1
1929*c0909341SAndroid Build Coastguard Worker.w4_loop:
1930*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m1, m2
1931*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m1, 3
1932*c0909341SAndroid Build Coastguard Worker    paddb               m16, m10
1933*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m14
1934*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m16     ; base_x < 0
1935*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m16, m7
1936*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m0
1937*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m8
1938*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16{k1}, m0, m3
1939*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
1940*c0909341SAndroid Build Coastguard Worker    vpmovwb            ym16, m16
1941*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm16
1942*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm16, 1
1943*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm16, 2
1944*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm16, 3
1945*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1946*c0909341SAndroid Build Coastguard Worker    jl .w4_end
1947*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
1948*c0909341SAndroid Build Coastguard Worker    vextracti128       xm16, ym16, 1
1949*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1950*c0909341SAndroid Build Coastguard Worker    paddb                m4, m9
1951*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm16
1952*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm16, 1
1953*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm16, 2
1954*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm16, 3
1955*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1956*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1957*c0909341SAndroid Build Coastguard Worker.w4_end:
1958*c0909341SAndroid Build Coastguard Worker    RET
1959*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8
1960*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-1]
1961*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
1962*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1963*c0909341SAndroid Build Coastguard Worker    jmp .upsample
1964*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8
1965*c0909341SAndroid Build Coastguard Worker    palignr             xm0, xm8, [tlq-16], 15
1966*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm2, hd
1967*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
1968*c0909341SAndroid Build Coastguard Worker.upsample:
1969*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm0, [base+z_filter4_s1]
1970*c0909341SAndroid Build Coastguard Worker    pminub              xm2, [base+z_filter_s4]
1971*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pb_m4_36]
1972*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm2
1973*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
1974*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
1975*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
1976*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm15
1977*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
1978*c0909341SAndroid Build Coastguard Worker    ret
1979*c0909341SAndroid Build Coastguard Worker.filter_strength:
1980*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       ym16, r3d
1981*c0909341SAndroid Build Coastguard Worker    mov                 r3d, angled
1982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
1983*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       ym17, angled
1984*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 8
1985*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k2, ym16, [base+z_filter_wh]
1986*c0909341SAndroid Build Coastguard Worker    mova               xm16, [base+z_filter_t0+r3*8]
1987*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k2}, ym17, ym16
1988*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
1989*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k1
1990*c0909341SAndroid Build Coastguard Worker    ret
1991*c0909341SAndroid Build Coastguard Worker.w8:
1992*c0909341SAndroid Build Coastguard Worker    movq                xm7, [tlq]
1993*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [base+z_xpos_off2a]
1994*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1995*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1996*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
1997*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
1998*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1999*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
2000*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [base+pb_8]
2001*c0909341SAndroid Build Coastguard Worker    sub              angled, 53 ; angle - 53
2002*c0909341SAndroid Build Coastguard Worker    call .upsample_above
2003*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2004*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [pb_0to63+1]
2005*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm7, xm0, xm7
2006*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2007*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
2008*c0909341SAndroid Build Coastguard Worker.w8_upsample_left:
2009*c0909341SAndroid Build Coastguard Worker    call .upsample_left
2010*c0909341SAndroid Build Coastguard Worker    movshdup            m16, [base+z_ypos_off3]
2011*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_8]
2012*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm8, xm0, xm8
2013*c0909341SAndroid Build Coastguard Worker    jmp .w8_main2
2014*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above:
2015*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2016*c0909341SAndroid Build Coastguard Worker    sub              angled, 90 ; angle - 90
2017*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2018*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2019*c0909341SAndroid Build Coastguard Worker    jz .w8_no_filter_above
2020*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+pb_7]
2021*c0909341SAndroid Build Coastguard Worker    call .filter_top_w16
2022*c0909341SAndroid Build Coastguard Worker.w8_no_filter_above:
2023*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-51]
2024*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2025*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2026*c0909341SAndroid Build Coastguard Worker    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
2027*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pb_90]
2028*c0909341SAndroid Build Coastguard Worker    psubb               ym0, ym17
2029*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k2{k2}, ym0, ym16
2030*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k2
2031*c0909341SAndroid Build Coastguard Worker.w8_filter_left:
2032*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2033*c0909341SAndroid Build Coastguard Worker    jz .w8_main
2034*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2035*c0909341SAndroid Build Coastguard Worker    je .w8_filter_left_h32
2036*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2037*c0909341SAndroid Build Coastguard Worker    call .filter_left_h16
2038*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
2039*c0909341SAndroid Build Coastguard Worker.w8_filter_left_h32:
2040*c0909341SAndroid Build Coastguard Worker    call .filter_left_h64
2041*c0909341SAndroid Build Coastguard Worker.w8_main:
2042*c0909341SAndroid Build Coastguard Worker    movshdup            m16, [base+z_ypos_off2]
2043*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_4]
2044*c0909341SAndroid Build Coastguard Worker.w8_main2:
2045*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_ypos_mul1a]
2046*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2047*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_xpos_mul]
2048*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, dxd
2049*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [tlq-16], 3
2050*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m8, [tlq-16], 3
2051*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m0
2052*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pb_1]
2053*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5      ; xpos0..3
2054*c0909341SAndroid Build Coastguard Worker    psllw                m5, 4       ; dx*4
2055*c0909341SAndroid Build Coastguard Worker    psraw                m4, m3, 6
2056*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 1
2057*c0909341SAndroid Build Coastguard Worker    packsswb             m4, m4
2058*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m3, m14 ; 64-frac, frac
2059*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [dxq+(8<<6)]
2060*c0909341SAndroid Build Coastguard Worker    paddsb               m4, m16
2061*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 2
2062*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m4, m2
2063*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2064*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0      ; base, base+1
2065*c0909341SAndroid Build Coastguard Worker.w8_loop:
2066*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m1, m2
2067*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m1, 3
2068*c0909341SAndroid Build Coastguard Worker    paddb               m16, m10
2069*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m14
2070*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m16     ; base_x < 0
2071*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m16, m7
2072*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m0
2073*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m8
2074*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16{k1}, m0, m3
2075*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2076*c0909341SAndroid Build Coastguard Worker    vpmovwb            ym16, m16
2077*c0909341SAndroid Build Coastguard Worker    vextracti128       xm17, ym16, 1
2078*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm16
2079*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm16
2080*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm17
2081*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm17
2082*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2083*c0909341SAndroid Build Coastguard Worker    jz .w8_end
2084*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2085*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2086*c0909341SAndroid Build Coastguard Worker    paddb                m4, m9
2087*c0909341SAndroid Build Coastguard Worker    add                 r3d, dxd
2088*c0909341SAndroid Build Coastguard Worker    jge .w8_loop
2089*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop:
2090*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m4, m8
2091*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m3
2092*c0909341SAndroid Build Coastguard Worker    paddb                m4, m9
2093*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2094*c0909341SAndroid Build Coastguard Worker    vpmovwb            ym16, m16
2095*c0909341SAndroid Build Coastguard Worker    vextracti128       xm17, ym16, 1
2096*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm16
2097*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm16
2098*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm17
2099*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm17
2100*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2101*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2102*c0909341SAndroid Build Coastguard Worker    jg .w8_leftonly_loop
2103*c0909341SAndroid Build Coastguard Worker.w8_end:
2104*c0909341SAndroid Build Coastguard Worker    RET
2105*c0909341SAndroid Build Coastguard Worker.filter_top_w16:
2106*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_s1]
2107*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2108*c0909341SAndroid Build Coastguard Worker    pminub              xm4, xm5, [base+z_filter_s4]
2109*c0909341SAndroid Build Coastguard Worker    vpermi2b            xm0, xm7, xm2
2110*c0909341SAndroid Build Coastguard Worker    pminub              xm5, [base+z_filter_s5]
2111*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm7, [base+z_filter_s2]
2112*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm11, [base+z_filter_k+(r3-1)*4+12*0]
2113*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm7, [base+z_filter_s3]
2114*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm12, [base+z_filter_k+(r3-1)*4+12*1]
2115*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm7, xm4
2116*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm13, [base+z_filter_k+(r3-1)*4+12*2]
2117*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm7, xm5
2118*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm11
2119*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm11
2120*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm6, xm3, xm12
2121*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm12, r7m ; max_width
2122*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm13
2123*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm11
2124*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm11
2125*c0909341SAndroid Build Coastguard Worker    packssdw           xm12, xm12
2126*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm6
2127*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
2128*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
2129*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm5
2130*c0909341SAndroid Build Coastguard Worker    packsswb           xm12, xm12
2131*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm15
2132*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm15
2133*c0909341SAndroid Build Coastguard Worker    vpcmpgtb             k1, xm12, xm9 ; x < max_width
2134*c0909341SAndroid Build Coastguard Worker    packuswb        xm7{k1}, xm0, xm1
2135*c0909341SAndroid Build Coastguard Worker    ret
2136*c0909341SAndroid Build Coastguard Worker.filter_left_h16:
2137*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq-1]
2138*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_s1]
2139*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm5, r5d
2140*c0909341SAndroid Build Coastguard Worker    vpermi2b            xm0, xm8, xm2
2141*c0909341SAndroid Build Coastguard Worker    pminub              xm4, xm5, [base+z_filter_s4]
2142*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm8, [base+z_filter_s2]
2143*c0909341SAndroid Build Coastguard Worker    pminub              xm5, [base+z_filter_s5]
2144*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm8, [base+z_filter_s3]
2145*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm11, [base+z_filter_k+(r3-1)*4+12*0]
2146*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm8, xm4
2147*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm12, [base+z_filter_k+(r3-1)*4+12*1]
2148*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm8, xm5
2149*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm13, [base+z_filter_k+(r3-1)*4+12*2]
2150*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm11
2151*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm11
2152*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm6, xm3, xm12
2153*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm12, r8m ; max_height
2154*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm13
2155*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm11
2156*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm11
2157*c0909341SAndroid Build Coastguard Worker    packssdw           xm12, xm12
2158*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm6
2159*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
2160*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
2161*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm5
2162*c0909341SAndroid Build Coastguard Worker    packsswb           xm12, xm12
2163*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm15
2164*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm15
2165*c0909341SAndroid Build Coastguard Worker    vpcmpgtb             k1, xm12, xm9 ; y < max_height
2166*c0909341SAndroid Build Coastguard Worker    packuswb        xm8{k1}, xm0, xm1
2167*c0909341SAndroid Build Coastguard Worker    ret
2168*c0909341SAndroid Build Coastguard Worker.w16:
2169*c0909341SAndroid Build Coastguard Worker    movu                xm7, [tlq] ; top
2170*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2171*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
2172*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
2173*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
2174*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2175*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2176*c0909341SAndroid Build Coastguard Worker    jz .w16_no_filter_above
2177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+pb_15]
2178*c0909341SAndroid Build Coastguard Worker    call .filter_top_w16
2179*c0909341SAndroid Build Coastguard Worker.w16_no_filter_above:
2180*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
2181*c0909341SAndroid Build Coastguard Worker    jg .w16_filter_left_h64
2182*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pb_90]
2183*c0909341SAndroid Build Coastguard Worker    psubb               ym0, ym17
2184*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k2{k2}, ym0, ym16
2185*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k2
2186*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2187*c0909341SAndroid Build Coastguard Worker    jz .w16_main
2188*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2189*c0909341SAndroid Build Coastguard Worker    call .filter_left_h16
2190*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
2191*c0909341SAndroid Build Coastguard Worker.w16_filter_left_h64:
2192*c0909341SAndroid Build Coastguard Worker    call .filter_left_h64
2193*c0909341SAndroid Build Coastguard Worker.w16_main:
2194*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [base+z_ypos_mul1a] ; 1.. 8
2195*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [base+z_ypos_mul1b] ; 9..15
2196*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2197*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [tlq-16], 3
2198*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pb_1]
2199*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, dxd
2200*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_xpos_mul]
2201*c0909341SAndroid Build Coastguard Worker    pmullw               m6, m0
2202*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_xpos_off2a]
2203*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m0
2204*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_xpos_off2b]
2205*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m12      ; xpos0 xpos1 xpos2 xpos3
2206*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_4]
2207*c0909341SAndroid Build Coastguard Worker    psllw               m12, 4        ; dx*4
2208*c0909341SAndroid Build Coastguard Worker    movshdup            m16, [base+z_ypos_off2]
2209*c0909341SAndroid Build Coastguard Worker    psrlw               m10, m6, 1
2210*c0909341SAndroid Build Coastguard Worker    psrlw               m11, m5, 1
2211*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m10, m14 ; 64-frac, frac
2212*c0909341SAndroid Build Coastguard Worker    psraw                m6, 6
2213*c0909341SAndroid Build Coastguard Worker    vpermw              m11, m11, m14
2214*c0909341SAndroid Build Coastguard Worker    psraw                m5, 6
2215*c0909341SAndroid Build Coastguard Worker    mov                 r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
2216*c0909341SAndroid Build Coastguard Worker    packsswb             m6, m5
2217*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 1<<6
2218*c0909341SAndroid Build Coastguard Worker    paddsb               m6, m16
2219*c0909341SAndroid Build Coastguard Worker    sub                 r5d, dxd      ; left-only threshold
2220*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m6, m2
2221*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 2
2222*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m0   ; base, base+1
2223*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2224*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0
2225*c0909341SAndroid Build Coastguard Worker.w16_loop:
2226*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m1, m2
2227*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m1, 3
2228*c0909341SAndroid Build Coastguard Worker    paddb               m16, m3, m17
2229*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m14
2230*c0909341SAndroid Build Coastguard Worker    paddb               m17, m4
2231*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m16
2232*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m16, m7
2233*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k2, m17
2234*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m17, m7
2235*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m0
2236*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m0
2237*c0909341SAndroid Build Coastguard Worker    add                 r3d, dxd
2238*c0909341SAndroid Build Coastguard Worker    jge .w16_toponly
2239*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2240*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m5, m7
2241*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16{k1}, m0, m10
2242*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2243*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m6, m7
2244*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m17{k2}, m0, m11
2245*c0909341SAndroid Build Coastguard Worker.w16_toponly:
2246*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2247*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2248*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2249*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm16
2250*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+strideq*1], ym16, 1
2251*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m16, 2
2252*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m16, 3
2253*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2254*c0909341SAndroid Build Coastguard Worker    jz .w16_end
2255*c0909341SAndroid Build Coastguard Worker    paddw                m1, m12
2256*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2257*c0909341SAndroid Build Coastguard Worker    paddb                m5, m9
2258*c0909341SAndroid Build Coastguard Worker    paddb                m6, m9
2259*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, r5d
2260*c0909341SAndroid Build Coastguard Worker    jge .w16_loop
2261*c0909341SAndroid Build Coastguard Worker.w16_leftonly_loop:
2262*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m5, m8
2263*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m6, m8
2264*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m10
2265*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m11
2266*c0909341SAndroid Build Coastguard Worker    paddb                m5, m9
2267*c0909341SAndroid Build Coastguard Worker    paddb                m6, m9
2268*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2269*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2270*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2271*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm16
2272*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+strideq*1], ym16, 1
2273*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m16, 2
2274*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m16, 3
2275*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2276*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2277*c0909341SAndroid Build Coastguard Worker    jg .w16_leftonly_loop
2278*c0909341SAndroid Build Coastguard Worker.w16_end:
2279*c0909341SAndroid Build Coastguard Worker    RET
2280*c0909341SAndroid Build Coastguard Worker.w32:
2281*c0909341SAndroid Build Coastguard Worker    movu                ym7, [tlq]
2282*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2283*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
2284*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
2285*c0909341SAndroid Build Coastguard Worker    mova                ym0, [base+z_filter_s1]
2286*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym1, [base+z_filter_s2]
2287*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [base+z_filter_s3]
2288*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [base+z_filter_s4]
2289*c0909341SAndroid Build Coastguard Worker    vpermi2b            ym0, ym7, ym2 ; al bl
2290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym5, [base+pb_31]
2291*c0909341SAndroid Build Coastguard Worker    pminub              ym5, [base+z_filter_s5]
2292*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym7, ym1 ; ah bh
2293*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym11, [base+z_filter_k+4*2+12*0]
2294*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym7, ym3 ; cl ch
2295*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, [base+z_filter_k+4*2+12*1]
2296*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym7, ym4 ; el dl
2297*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym13, [base+z_filter_k+4*2+12*2]
2298*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym5, ym7 ; eh dh
2299*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym11
2300*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym11
2301*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym6, ym3, ym12
2302*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, r6m
2303*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym3, ym13
2304*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym4, ym11
2305*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym5, ym11
2306*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
2307*c0909341SAndroid Build Coastguard Worker    packssdw           ym12, ym12
2308*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym6
2309*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym3
2310*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym4
2311*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym5
2312*c0909341SAndroid Build Coastguard Worker    packsswb           ym12, ym12
2313*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym15
2314*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym15
2315*c0909341SAndroid Build Coastguard Worker    vpcmpgtb             k1, ym12, ym9 ; x < max_width
2316*c0909341SAndroid Build Coastguard Worker    packuswb        ym7{k1}, ym0, ym1
2317*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
2318*c0909341SAndroid Build Coastguard Worker    jg .w32_filter_h64
2319*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 3
2320*c0909341SAndroid Build Coastguard Worker    call .filter_left_h16
2321*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
2322*c0909341SAndroid Build Coastguard Worker.w32_filter_h64:
2323*c0909341SAndroid Build Coastguard Worker    call .filter_left_h64
2324*c0909341SAndroid Build Coastguard Worker.w32_main:
2325*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [base+z_ypos_mul1a] ; 1.. 8
2326*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m5, [base+z_ypos_mul1b] ; 9..15
2327*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2328*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [tlq-16], 3
2329*c0909341SAndroid Build Coastguard Worker    rorx                r2q, dxq, 62 ; dx << 2
2330*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pb_1]
2331*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r2d
2332*c0909341SAndroid Build Coastguard Worker    pmullw               m6, m0
2333*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m3, [base+z_xpos_off2a]
2334*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m0
2335*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m4, [base+z_xpos_off2b]
2336*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1
2337*c0909341SAndroid Build Coastguard Worker    paddw               m12, m1, m1
2338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_2]
2339*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0       ; xpos1 xpos0
2340*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym2
2341*c0909341SAndroid Build Coastguard Worker    psrlw               m10, m6, 1
2342*c0909341SAndroid Build Coastguard Worker    psrlw               m11, m5, 1
2343*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m10, m14 ; 64-frac, frac
2344*c0909341SAndroid Build Coastguard Worker    psraw                m6, 6
2345*c0909341SAndroid Build Coastguard Worker    vpermw              m11, m11, m14
2346*c0909341SAndroid Build Coastguard Worker    psraw                m5, 6
2347*c0909341SAndroid Build Coastguard Worker    mov                 r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
2348*c0909341SAndroid Build Coastguard Worker    packsswb             m6, m5
2349*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 1<<6
2350*c0909341SAndroid Build Coastguard Worker    paddsb               m6, m0
2351*c0909341SAndroid Build Coastguard Worker    sub                 r5d, dxd      ; left-only threshold
2352*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m6, m2
2353*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
2354*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m0   ; base, base+1
2355*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0
2356*c0909341SAndroid Build Coastguard Worker.w32_loop:
2357*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m1, m2
2358*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m1, 3
2359*c0909341SAndroid Build Coastguard Worker    paddb               m16, m3, m17
2360*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m14
2361*c0909341SAndroid Build Coastguard Worker    paddb               m17, m4
2362*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m16
2363*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m16, m7
2364*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k2, m17
2365*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m17, m7
2366*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m0
2367*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m0
2368*c0909341SAndroid Build Coastguard Worker    add                 r3d, dxd
2369*c0909341SAndroid Build Coastguard Worker    jge .w32_toponly
2370*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2371*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m5, m7
2372*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16{k1}, m0, m10
2373*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2374*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m6, m7
2375*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m17{k2}, m0, m11
2376*c0909341SAndroid Build Coastguard Worker.w32_toponly:
2377*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2378*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2379*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2380*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*0], m16, 1
2381*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], ym16
2382*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2383*c0909341SAndroid Build Coastguard Worker    jz .w32_end
2384*c0909341SAndroid Build Coastguard Worker    paddw                m1, m12
2385*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2386*c0909341SAndroid Build Coastguard Worker    paddb                m5, m9
2387*c0909341SAndroid Build Coastguard Worker    paddb                m6, m9
2388*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, r5d
2389*c0909341SAndroid Build Coastguard Worker    jge .w32_loop
2390*c0909341SAndroid Build Coastguard Worker.w32_leftonly_loop:
2391*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m5, m8
2392*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m6, m8
2393*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m10
2394*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m11
2395*c0909341SAndroid Build Coastguard Worker    paddb                m5, m9
2396*c0909341SAndroid Build Coastguard Worker    paddb                m6, m9
2397*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2398*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2399*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2400*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*0], m16, 1
2401*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], ym16
2402*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2403*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2404*c0909341SAndroid Build Coastguard Worker    jg .w32_leftonly_loop
2405*c0909341SAndroid Build Coastguard Worker.w32_end:
2406*c0909341SAndroid Build Coastguard Worker    RET
2407*c0909341SAndroid Build Coastguard Worker.filter_left_h64:
2408*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z_filter_s1]
2409*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
2410*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_filter_s4]
2411*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, r3d
2412*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [base+z_filter_s2]
2413*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_filter_s3]
2414*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m8, m2 ; al bl
2415*c0909341SAndroid Build Coastguard Worker    pminub               m5, [base+z_filter_s5]
2416*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8, m1 ; ah bh
2417*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
2418*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m8, m3 ; cl ch
2419*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
2420*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m8, m4 ; el dl
2421*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
2422*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m5, m8 ; eh dh
2423*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
2424*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
2425*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m3, m12
2426*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, r8m    ; max_height
2427*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m13
2428*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m11
2429*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m11
2430*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m12
2431*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
2432*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2433*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2434*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2435*c0909341SAndroid Build Coastguard Worker    packsswb            m12, m12
2436*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2437*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
2438*c0909341SAndroid Build Coastguard Worker    vpcmpgtb             k1, m12, m9 ; y < max_height
2439*c0909341SAndroid Build Coastguard Worker    packuswb         m8{k1}, m0, m1
2440*c0909341SAndroid Build Coastguard Worker    ret
2441*c0909341SAndroid Build Coastguard Worker.w64:
2442*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq]
2443*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2444*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
2445*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-4]
2446*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z_filter_s1]
2447*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [base+z_filter_s2]
2448*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_filter_s3]
2449*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [base+z_filter_s4]
2450*c0909341SAndroid Build Coastguard Worker    vpermi2b             m0, m7, m2 ; al bl
2451*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_63]
2452*c0909341SAndroid Build Coastguard Worker    pminub               m5, [base+z_filter_s5]
2453*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7, m1 ; ah bh
2454*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+z_filter_k+4*2+12*0]
2455*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7, m3 ; cl ch
2456*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+z_filter_k+4*2+12*1]
2457*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7, m4 ; el dl
2458*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+z_filter_k+4*2+12*2]
2459*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m5, m7 ; eh dh
2460*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
2461*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
2462*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m3, m12
2463*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, r6m
2464*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m13
2465*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m11
2466*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m11
2467*c0909341SAndroid Build Coastguard Worker    mova                 m9, [pb_0to63]
2468*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m12
2469*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
2470*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2471*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2472*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2473*c0909341SAndroid Build Coastguard Worker    packsswb            m12, m12
2474*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2475*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
2476*c0909341SAndroid Build Coastguard Worker    vpcmpgtb             k1, m12, m9 ; x < max_width
2477*c0909341SAndroid Build Coastguard Worker    packuswb         m7{k1}, m0, m1
2478*c0909341SAndroid Build Coastguard Worker    call .filter_left_h64 ; always filter the full 64 pixels for simplicity
2479*c0909341SAndroid Build Coastguard Worker.w64_main:
2480*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, dyd
2481*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tlq-4]
2482*c0909341SAndroid Build Coastguard Worker    rorx                r2q, dxq, 62 ; dx << 2
2483*c0909341SAndroid Build Coastguard Worker    pmullw               m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
2484*c0909341SAndroid Build Coastguard Worker    pmullw               m5, [base+z_ypos_mul1b]     ; pixels aren't selected from the left edge
2485*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r2d     ; xpos
2486*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_xpos_off2a]
2487*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_xpos_off2b]
2488*c0909341SAndroid Build Coastguard Worker    mova                m12, m1
2489*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pb_1]
2490*c0909341SAndroid Build Coastguard Worker    psrlw               m10, m6, 1
2491*c0909341SAndroid Build Coastguard Worker    psrlw               m11, m5, 1
2492*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m10, m14 ; 64-frac, frac
2493*c0909341SAndroid Build Coastguard Worker    psraw                m6, 6
2494*c0909341SAndroid Build Coastguard Worker    vpermw              m11, m11, m14
2495*c0909341SAndroid Build Coastguard Worker    psraw                m5, 6
2496*c0909341SAndroid Build Coastguard Worker    mov                 r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
2497*c0909341SAndroid Build Coastguard Worker    packsswb             m6, m5
2498*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 1<<6
2499*c0909341SAndroid Build Coastguard Worker    paddsb               m0, m6, m2
2500*c0909341SAndroid Build Coastguard Worker    sub                 r5d, dxd      ; left-only threshold
2501*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m0   ; base, base+1
2502*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0
2503*c0909341SAndroid Build Coastguard Worker.w64_loop:
2504*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m1, m2
2505*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m1, 3
2506*c0909341SAndroid Build Coastguard Worker    paddb               m16, m3, m17
2507*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m14
2508*c0909341SAndroid Build Coastguard Worker    paddb               m17, m4
2509*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m16      ; base_x < 0
2510*c0909341SAndroid Build Coastguard Worker    vpermi2b            m16, m7, m9
2511*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k2, m17
2512*c0909341SAndroid Build Coastguard Worker    vpermi2b            m17, m7, m9
2513*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m0
2514*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m0
2515*c0909341SAndroid Build Coastguard Worker    add                 r3d, dxd
2516*c0909341SAndroid Build Coastguard Worker    jge .w64_toponly
2517*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2518*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m5, m9
2519*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m16{k1}, m0, m10
2520*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2521*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m6, m9
2522*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m17{k2}, m0, m11
2523*c0909341SAndroid Build Coastguard Worker.w64_toponly:
2524*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2525*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2526*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2527*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m16
2528*c0909341SAndroid Build Coastguard Worker    dec                  hd
2529*c0909341SAndroid Build Coastguard Worker    jz .w64_end
2530*c0909341SAndroid Build Coastguard Worker    paddw                m1, m12
2531*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2532*c0909341SAndroid Build Coastguard Worker    paddb                m5, m2
2533*c0909341SAndroid Build Coastguard Worker    paddb                m6, m2
2534*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, r5d
2535*c0909341SAndroid Build Coastguard Worker    jge .w64_loop
2536*c0909341SAndroid Build Coastguard Worker.w64_leftonly_loop:
2537*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m5, m8
2538*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m6, m8
2539*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m10
2540*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m11
2541*c0909341SAndroid Build Coastguard Worker    paddb                m5, m2
2542*c0909341SAndroid Build Coastguard Worker    paddb                m6, m2
2543*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15
2544*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15
2545*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2546*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m16
2547*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2548*c0909341SAndroid Build Coastguard Worker    dec                  hd
2549*c0909341SAndroid Build Coastguard Worker    jg .w64_leftonly_loop
2550*c0909341SAndroid Build Coastguard Worker.w64_end:
2551*c0909341SAndroid Build Coastguard Worker    RET
2552*c0909341SAndroid Build Coastguard Worker
2553*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
2554*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]
2555*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
2556*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2557*c0909341SAndroid Build Coastguard Worker    lea                  t0, [dr_intra_derivative+45*2-1]
2558*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
2559*c0909341SAndroid Build Coastguard Worker    sub              angled, 180
2560*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
2561*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2562*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
2563*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e
2564*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+pb_63to0]
2565*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [t0+dyq]
2566*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
2567*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2568*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+z_frac_table]
2569*c0909341SAndroid Build Coastguard Worker    shl                 dyd, 6
2570*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_512]
2571*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2572*c0909341SAndroid Build Coastguard Worker.w4:
2573*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
2574*c0909341SAndroid Build Coastguard Worker    jae .w4_no_upsample
2575*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
2576*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
2577*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
2578*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
2579*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+4]
2580*c0909341SAndroid Build Coastguard Worker    call .upsample
2581*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_ypos_off1]
2582*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_16]
2583*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
2584*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
2585*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
2586*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m9, r3d
2587*c0909341SAndroid Build Coastguard Worker    vpxord               m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
2588*c0909341SAndroid Build Coastguard Worker    pmaxub               m1, m0
2589*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m1, [tlq-64*1]
2590*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2591*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
2592*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, angled
2593*c0909341SAndroid Build Coastguard Worker    shr              angled, 8
2594*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, xm9, [base+z_filter_wh]
2595*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-3]
2596*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
2597*c0909341SAndroid Build Coastguard Worker    kmovw               r5d, k1
2598*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2599*c0909341SAndroid Build Coastguard Worker    jz .w4_main
2600*c0909341SAndroid Build Coastguard Worker    pminub               m9, [pb_0to63]
2601*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
2602*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m9, m0
2603*c0909341SAndroid Build Coastguard Worker.w4_main:
2604*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_ypos_off1]
2605*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_8]
2606*c0909341SAndroid Build Coastguard Worker.w4_main2:
2607*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2608*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+z_ypos_mul2a] ; 1..4
2609*c0909341SAndroid Build Coastguard Worker    pmulhuw              m2, m0 ; ypos >> 1
2610*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2611*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m2, m14 ; 64-frac, frac
2612*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 5
2613*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
2614*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
2615*c0909341SAndroid Build Coastguard Worker    paddsb               m2, m1 ; base, base+1
2616*c0909341SAndroid Build Coastguard Worker.w4_loop:
2617*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m2, m7
2618*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
2619*c0909341SAndroid Build Coastguard Worker    paddsb               m2, m6
2620*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2621*c0909341SAndroid Build Coastguard Worker    vpmovwb             ym0, m0
2622*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
2623*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
2624*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
2625*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
2626*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
2627*c0909341SAndroid Build Coastguard Worker    jl .w4_end
2628*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, ym0, 1
2629*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2630*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
2631*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
2632*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
2633*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
2634*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2635*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
2636*c0909341SAndroid Build Coastguard Worker.w4_end:
2637*c0909341SAndroid Build Coastguard Worker    RET
2638*c0909341SAndroid Build Coastguard Worker.upsample:
2639*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 31 ; 31 - (h + imin(w, h))
2640*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [base+z_xpos_off2a]
2641*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym7, r3d
2642*c0909341SAndroid Build Coastguard Worker    pmaxub              ym7, [base+z3_upsample]
2643*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym1, [base+z_filter_s4]
2644*c0909341SAndroid Build Coastguard Worker    vpermb              ym7, ym7, [tlq-31]
2645*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [base+pb_m4_36]
2646*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym7, ym0
2647*c0909341SAndroid Build Coastguard Worker    psrldq              ym7, 1
2648*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym7, ym1
2649*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym2
2650*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym2
2651*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
2652*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
2653*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym15
2654*c0909341SAndroid Build Coastguard Worker    packuswb            ym0, ym0
2655*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym7, ym0
2656*c0909341SAndroid Build Coastguard Worker    ret
2657*c0909341SAndroid Build Coastguard Worker.w8:
2658*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
2659*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2660*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2661*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
2662*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq*2]
2663*c0909341SAndroid Build Coastguard Worker    call .upsample
2664*c0909341SAndroid Build Coastguard Worker    pshufd               m1, [base+z_ypos_off1], q0000
2665*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_8]
2666*c0909341SAndroid Build Coastguard Worker    jmp .w8_main2
2667*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
2668*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
2669*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2670*c0909341SAndroid Build Coastguard Worker    cmove               r3d, hd
2671*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r3+hq-1]
2672*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 63 ; 63 - (h + imin(w, h))
2673*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, wd
2674*c0909341SAndroid Build Coastguard Worker    pmaxub               m1, m0
2675*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m1, [tlq-64*1]
2676*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2677*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
2678*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2679*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2680*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2681*c0909341SAndroid Build Coastguard Worker    jz .w8_main
2682*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
2683*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m10, m0
2684*c0909341SAndroid Build Coastguard Worker.w8_main:
2685*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_ypos_off2]
2686*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_4]
2687*c0909341SAndroid Build Coastguard Worker.w8_main2:
2688*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2689*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [base+z_ypos_mul2a] ; 1..8
2690*c0909341SAndroid Build Coastguard Worker    pmulhuw              m2, m0 ; ypos >> 1
2691*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2692*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m2, m14 ; 64-frac, frac
2693*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 5
2694*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
2695*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
2696*c0909341SAndroid Build Coastguard Worker    paddsb               m2, m1 ; base, base+1
2697*c0909341SAndroid Build Coastguard Worker.w8_loop:
2698*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m2, m7
2699*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
2700*c0909341SAndroid Build Coastguard Worker    paddsb               m2, m6
2701*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2702*c0909341SAndroid Build Coastguard Worker    vpmovwb             ym0, m0
2703*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
2704*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2705*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
2706*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
2707*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
2708*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2709*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2710*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
2711*c0909341SAndroid Build Coastguard Worker    RET
2712*c0909341SAndroid Build Coastguard Worker.filter_strength:
2713*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-3]
2714*c0909341SAndroid Build Coastguard Worker.filter_strength2:
2715*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m9, r3d
2716*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled
2717*c0909341SAndroid Build Coastguard Worker    shr              angled, 8
2718*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym9, [base+z_filter_wh]
2719*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_t0+angleq*8]
2720*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym0
2721*c0909341SAndroid Build Coastguard Worker    pminub              m10, m9, [pb_0to63]
2722*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
2723*c0909341SAndroid Build Coastguard Worker    ret
2724*c0909341SAndroid Build Coastguard Worker.w16_load:
2725*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, hd
2726*c0909341SAndroid Build Coastguard Worker    cmovae              r3d, hd
2727*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
2728*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tlq-64*1]
2729*c0909341SAndroid Build Coastguard Worker    neg                 r3d ; -(h + imin(w, h))
2730*c0909341SAndroid Build Coastguard Worker    and                 r3d, 63
2731*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, r3d
2732*c0909341SAndroid Build Coastguard Worker    pmaxub               m2, m0, m1
2733*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
2734*c0909341SAndroid Build Coastguard Worker    je .w16_load_h64
2735*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m1, m7
2736*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m2, m7
2737*c0909341SAndroid Build Coastguard Worker    ret
2738*c0909341SAndroid Build Coastguard Worker.w16_load_h64:
2739*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m0, m7
2740*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m2, [tlq-64*2]
2741*c0909341SAndroid Build Coastguard Worker    ret
2742*c0909341SAndroid Build Coastguard Worker.w16:
2743*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16
2744*c0909341SAndroid Build Coastguard Worker    call .w16_load
2745*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2746*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
2747*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-3]
2748*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
2749*c0909341SAndroid Build Coastguard Worker    je .w16_filter64
2750*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
2751*c0909341SAndroid Build Coastguard Worker    call .filter_strength2
2752*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2753*c0909341SAndroid Build Coastguard Worker    jz .w16_main
2754*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
2755*c0909341SAndroid Build Coastguard Worker    pminub              m10, m9, [pb_0to63]
2756*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m9, m0
2757*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m10, m0
2758*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
2759*c0909341SAndroid Build Coastguard Worker.w16_filter64:
2760*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pb_15]
2761*c0909341SAndroid Build Coastguard Worker    valignq              m0, m8, m7, 7
2762*c0909341SAndroid Build Coastguard Worker    pminub              m12, m13, [pb_0to63]
2763*c0909341SAndroid Build Coastguard Worker    valignq             m11, m8, m7, 1
2764*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2765*c0909341SAndroid Build Coastguard Worker.w16_main:
2766*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [base+z_ypos_mul2a] ; 1.. 8
2767*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [base+z_ypos_mul2b] ; 9..15
2768*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2769*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_4]
2770*c0909341SAndroid Build Coastguard Worker    pmulhuw              m3, m0 ; ypos >> 1
2771*c0909341SAndroid Build Coastguard Worker    pmulhuw              m2, m0
2772*c0909341SAndroid Build Coastguard Worker    movshdup             m0, [base+z_ypos_off2]
2773*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2774*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pb_1]
2775*c0909341SAndroid Build Coastguard Worker    vpermw               m4, m3, m14 ; 64-frac, frac
2776*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 5
2777*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m2, m14
2778*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 5
2779*c0909341SAndroid Build Coastguard Worker    packsswb             m3, m2
2780*c0909341SAndroid Build Coastguard Worker    paddsb               m3, m0
2781*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m3
2782*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1 ; base, base+1
2783*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
2784*c0909341SAndroid Build Coastguard Worker.w16_loop:
2785*c0909341SAndroid Build Coastguard Worker%macro Z3_PERM2 0
2786*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
2787*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m2, m8
2788*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
2789*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m3, m8
2790*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
2791*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2792*c0909341SAndroid Build Coastguard Worker    paddsb               m2, m6
2793*c0909341SAndroid Build Coastguard Worker    paddsb               m3, m6
2794*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2795*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15
2796*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2797*c0909341SAndroid Build Coastguard Worker%endmacro
2798*c0909341SAndroid Build Coastguard Worker    Z3_PERM2
2799*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2800*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2801*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
2802*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m0, 3
2803*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2804*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2805*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
2806*c0909341SAndroid Build Coastguard Worker    RET
2807*c0909341SAndroid Build Coastguard Worker.w32:
2808*c0909341SAndroid Build Coastguard Worker    mov                  r3d, 32
2809*c0909341SAndroid Build Coastguard Worker    call .w16_load
2810*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2811*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
2812*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-3]
2813*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
2814*c0909341SAndroid Build Coastguard Worker    je .w32_filter64
2815*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31]
2816*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m9, r3d
2817*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
2818*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m9, m7
2819*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
2820*c0909341SAndroid Build Coastguard Worker.w32_filter64:
2821*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pb_31]
2822*c0909341SAndroid Build Coastguard Worker    valignq              m0, m8, m7, 7
2823*c0909341SAndroid Build Coastguard Worker    pminub              m12, m13, [pb_0to63]
2824*c0909341SAndroid Build Coastguard Worker    valignq             m11, m8, m7, 1
2825*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2826*c0909341SAndroid Build Coastguard Worker.w32_main:
2827*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m3, [base+z_ypos_mul2a] ; 1.. 8
2828*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m2, [base+z_ypos_mul2b] ; 9..15
2829*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, dyd
2830*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pb_1]
2831*c0909341SAndroid Build Coastguard Worker    pmulhuw              m3, m0 ; ypos >> 1
2832*c0909341SAndroid Build Coastguard Worker    pmulhuw              m2, m0
2833*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_2]
2834*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1
2835*c0909341SAndroid Build Coastguard Worker    vpermw               m4, m3, m14 ; 64-frac, frac
2836*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 5
2837*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m2, m14
2838*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 5
2839*c0909341SAndroid Build Coastguard Worker    packsswb             m3, m2
2840*c0909341SAndroid Build Coastguard Worker    paddsb               m3, m0
2841*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m3
2842*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1 ; base, base+1
2843*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
2844*c0909341SAndroid Build Coastguard Worker.w32_loop:
2845*c0909341SAndroid Build Coastguard Worker    Z3_PERM2
2846*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*0], m0, 1
2847*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], ym0
2848*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2849*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2850*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
2851*c0909341SAndroid Build Coastguard Worker    RET
2852*c0909341SAndroid Build Coastguard Worker.w64:
2853*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tlq-64*1]
2854*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
2855*c0909341SAndroid Build Coastguard Worker    je .w64_h64
2856*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq*2-1]
2857*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 63 ; -(h + imin(w, h)) & 63
2858*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, r3d
2859*c0909341SAndroid Build Coastguard Worker    pmaxub               m0, m1
2860*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m1, m7
2861*c0909341SAndroid Build Coastguard Worker    jmp .w64_filter
2862*c0909341SAndroid Build Coastguard Worker.w64_h64:
2863*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m0, [tlq-64*2]
2864*c0909341SAndroid Build Coastguard Worker.w64_filter:
2865*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m0, m7
2866*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2867*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
2868*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
2869*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq-3]
2870*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        m13, r3d
2871*c0909341SAndroid Build Coastguard Worker    valignq              m0, m8, m7, 7
2872*c0909341SAndroid Build Coastguard Worker    pminub              m12, m13, [pb_0to63]
2873*c0909341SAndroid Build Coastguard Worker    valignq             m11, m8, m7, 1
2874*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
2875*c0909341SAndroid Build Coastguard Worker.w64_main:
2876*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, dyd
2877*c0909341SAndroid Build Coastguard Worker    pmulhuw              m3, m2, [base+z_ypos_mul2a]
2878*c0909341SAndroid Build Coastguard Worker    pmulhuw              m2, [base+z_ypos_mul2b]
2879*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_1]
2880*c0909341SAndroid Build Coastguard Worker    vpermw               m4, m3, m14 ; 64-frac, frac
2881*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 5
2882*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m2, m14
2883*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 5
2884*c0909341SAndroid Build Coastguard Worker    packsswb             m3, m2
2885*c0909341SAndroid Build Coastguard Worker    paddsb               m1, m3, m6
2886*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1 ; base, base+1
2887*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
2888*c0909341SAndroid Build Coastguard Worker.w64_loop:
2889*c0909341SAndroid Build Coastguard Worker    Z3_PERM2
2890*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
2891*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2892*c0909341SAndroid Build Coastguard Worker    dec                  hd
2893*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
2894*c0909341SAndroid Build Coastguard Worker    RET
2895*c0909341SAndroid Build Coastguard Worker
2896*c0909341SAndroid Build Coastguard Worker; The ipred_filter code processes 4x2 blocks in the following order
2897*c0909341SAndroid Build Coastguard Worker; which increases parallelism compared to doing things row by row.
2898*c0909341SAndroid Build Coastguard Worker; Some redundant blocks are calculated for w > 4.
2899*c0909341SAndroid Build Coastguard Worker;     w4     w8       w16             w32
2900*c0909341SAndroid Build Coastguard Worker;     1     1 2     1 2 3 4     1 2 3 4 9 a b c
2901*c0909341SAndroid Build Coastguard Worker;     2     2 3     2 3 4 5     2 3 4 5 a b c d
2902*c0909341SAndroid Build Coastguard Worker;     3     3 4     3 4 5 6     3 4 5 6 b c d e
2903*c0909341SAndroid Build Coastguard Worker;     4     4 5     4 5 6 7     4 5 6 7 c d e f
2904*c0909341SAndroid Build Coastguard Worker;     5     5 6     5 6 7 8     5 6 7 8 d e f g
2905*c0909341SAndroid Build Coastguard Worker;     6     6 7     6 7 8 9     6 7 8 9 e f g h
2906*c0909341SAndroid Build Coastguard Worker;     7     7 8     7 8 9 a     7 8 9 a f g h i
2907*c0909341SAndroid Build Coastguard Worker; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
2908*c0909341SAndroid Build Coastguard Worker;           9       9 a b               h i j
2909*c0909341SAndroid Build Coastguard Worker;                   a b                 i j
2910*c0909341SAndroid Build Coastguard Worker;                   b                   j
2911*c0909341SAndroid Build Coastguard Worker
2912*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
2913*c0909341SAndroid Build Coastguard Worker%define base r6-filter_taps
2914*c0909341SAndroid Build Coastguard Worker    lea                  r6, [filter_taps]
2915*c0909341SAndroid Build Coastguard Worker%ifidn fltd, fltm
2916*c0909341SAndroid Build Coastguard Worker    movzx              fltd, fltb
2917*c0909341SAndroid Build Coastguard Worker%else
2918*c0909341SAndroid Build Coastguard Worker    movzx              fltd, byte fltm
2919*c0909341SAndroid Build Coastguard Worker%endif
2920*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [tlq+1]        ; t0 t0 t0 t0
2921*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2922*c0909341SAndroid Build Coastguard Worker    shl                fltd, 6
2923*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pd_8]
2924*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm3, [tlq-2]        ; l1 l0 tl __
2925*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [r6+fltq+16*0] ; p1 p2 p3 p4
2926*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [r6+fltq+16*1]
2927*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [r6+fltq+16*2] ; p6 p5 p0 __
2928*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [r6+fltq+16*3]
2929*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xm6
2930*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm0, xmm2, xm7
2931*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xm6
2932*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm2, xm8
2933*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm0, xmm3, xm9
2934*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm3, xm10
2935*c0909341SAndroid Build Coastguard Worker    packssdw           xmm0, xmm1
2936*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2937*c0909341SAndroid Build Coastguard Worker    jb .w4
2938*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [tlq+5]
2939*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+filter_perm]
2940*c0909341SAndroid Build Coastguard Worker    mov                  r5, 0xffffffffffff000f
2941*c0909341SAndroid Build Coastguard Worker    psrldq             xmm2, 1           ; __ t0
2942*c0909341SAndroid Build Coastguard Worker    kmovq                k1, r5          ; 0x000f
2943*c0909341SAndroid Build Coastguard Worker    psraw               xm5, xmm0, 4
2944*c0909341SAndroid Build Coastguard Worker    packuswb           xmm2, xm5         ; __ t0 a0 b0
2945*c0909341SAndroid Build Coastguard Worker    pshufd          ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0   t1 t1 t1 t1
2946*c0909341SAndroid Build Coastguard Worker    je .w8
2947*c0909341SAndroid Build Coastguard Worker    kxnorb               k3, k3, k3      ; 0x00ff
2948*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [tlq-4]
2949*c0909341SAndroid Build Coastguard Worker    kandnq               k2, k3, k1      ; 0xffffffffffff0000
2950*c0909341SAndroid Build Coastguard Worker    vpermb          ym3{k2}, ym11, ymm2  ; l3 l2 l1 __   b3 a3 t3 __
2951*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym6
2952*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym2, ym7
2953*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym6
2954*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym2, ym8
2955*c0909341SAndroid Build Coastguard Worker    pshufb          ym5{k2}, ym2, ym11   ; a0 b0   __ t0
2956*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+9]
2957*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym3, ym9
2958*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym3, ym10
2959*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [tlq-6]     ; l5 l4 l3 __
2960*c0909341SAndroid Build Coastguard Worker    kunpckbw             k4, k1, k3      ; 0x0fff
2961*c0909341SAndroid Build Coastguard Worker    packssdw            ym0, ym1
2962*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 4           ; a0 d0         a1 b1
2963*c0909341SAndroid Build Coastguard Worker    packuswb            ym5, ym0         ; a0 b0 c0 d0   __ t1 a1 b1
2964*c0909341SAndroid Build Coastguard Worker    pshufd           m2{k3}, m5, q3333   ; d0 d0 d0 d0   b1 b1 b1 b1   t2 t2 t2 t2
2965*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m5     ; l5 l4 l3 __   d3 c3 b3 __   b7 a7 t7 __
2966*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2967*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m2, m7
2968*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
2969*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
2970*c0909341SAndroid Build Coastguard Worker    psrldq               m0, m2, 1       ; __ d0         __ b0         __ t0
2971*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+13]
2972*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m3, m9
2973*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m10
2974*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+filter_end]
2975*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq-6]
2976*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
2977*c0909341SAndroid Build Coastguard Worker    cmovp                hd, r5d         ; w == 16 ? h : h - 6
2978*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m1
2979*c0909341SAndroid Build Coastguard Worker    psraw                m4, 4           ; e0 f0         c1 d1         a2 b2
2980*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4          ; __ d0 e0 f0   __ b1 c1 d1   __ t2 a2 b2
2981*c0909341SAndroid Build Coastguard Worker    pshufd           m2{k4}, m0, q3333   ; f0 f0 f0 f0   d1 d1 d1 d1   b2 b2 b2 b2   t3 t3 t3 t3
2982*c0909341SAndroid Build Coastguard Worker.w16_loop:
2983*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [tlq-8]
2984*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m0     ; l7 l6 l5 __   f3 e3 d3 __   d7 c7 b7 __   bb ab tb __
2985*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
2986*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m7
2987*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6
2988*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m8
2989*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
2990*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m9
2991*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m3, m10
2992*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0
2993*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
2994*c0909341SAndroid Build Coastguard Worker    psraw                m4, m1, 4       ; g0 h0         e1 f1         c2 d2         a3 b3
2995*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4          ; e0 f0 g0 h0   c1 d1 e1 f1   a2 b2 c2 d2   __ __ a3 b3
2996*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q3333   ; h0 h0 h0 h0   f1 f1 f1 f1   d2 d2 d2 d2   b3 b3 b3 b3
2997*c0909341SAndroid Build Coastguard Worker    vpermt2d             m5, m12, m0     ; c0 d0 e0 f0   __ __ c1 d1   a0 a1 a2 a3   b0 b1 b2 b3
2998*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*0], m5, 2
2999*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m5, 3
3000*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3001*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3002*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
3003*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3004*c0909341SAndroid Build Coastguard Worker    je .ret
3005*c0909341SAndroid Build Coastguard Worker    mova               xm13, [filter_perm+16]
3006*c0909341SAndroid Build Coastguard Worker    mova               xmm3, [r6+strideq*0]
3007*c0909341SAndroid Build Coastguard Worker    punpckhdq          xmm3, [r6+strideq*1]
3008*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4   f1 f1 f1 f1   d2 d2 d2 d2   b3 b3 b3 b3
3009*c0909341SAndroid Build Coastguard Worker    pinsrb              xm3, xmm3, [tlq+r5+16], 7
3010*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm13
3011*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m0     ; bf af tf __   h3 g3 f3 __   f7 e7 d7 __   db cb bb __
3012*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6
3013*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m7
3014*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
3015*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
3016*c0909341SAndroid Build Coastguard Worker    kunpckbw             k5, k3, k1      ; 0xff0f
3017*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3018*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m3, m9
3019*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m10
3020*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3021*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4           ; a4 b4         g1 h1         e2 f2         c3 d3
3022*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m0          ; g0 h0 a4 b4   e1 f1 g1 h1   c2 d2 e2 f2   __ __ c3 d3
3023*c0909341SAndroid Build Coastguard Worker    vpblendmb        m1{k3}, m4, m2      ; __ t4 a4 b4   e1 f1 g1 h1   c2 d2 e2 f2   __ __ c3 d3
3024*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [tlq+r5+21]
3025*c0909341SAndroid Build Coastguard Worker    pshufd           m2{k5}, m4, q3333   ; b4 b4 b4 b4   t5 t5 t5 t5   f2 f2 f2 f2   d3 d3 d3 d3
3026*c0909341SAndroid Build Coastguard Worker    vpermt2d             m5, m12, m4     ; e0 f0 g0 h0   __ __ e1 f1   c0 c1 c2 c3   d0 d1 d2 d3
3027*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*0], m5, 2
3028*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m5, 3
3029*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm3, [r6+r3]
3030*c0909341SAndroid Build Coastguard Worker    pinsrb             xmm3, [r6+strideq*2+15], 11
3031*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xmm3, xm13
3032*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m1     ; df cf bf __   bj aj tj __   h7 g7 f7 __   fb eb db __
3033*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3034*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m2, m7
3035*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
3036*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
3037*c0909341SAndroid Build Coastguard Worker    kxnord               k3, k3, k4      ; 0xfffff0ff
3038*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*5]
3039*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m3, m9
3040*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m10
3041*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m1
3042*c0909341SAndroid Build Coastguard Worker    psraw                m4, 4           ; c4 d4         a5 b5         g2 h2         e3 f3
3043*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4          ; a4 b4 c4 d4   g1 h1 a5 b5   e2 f2 g2 h2   __ __ e3 f3
3044*c0909341SAndroid Build Coastguard Worker    vpblendmw        m1{k3}, m2, m0      ; a4 b4 c4 d4   __ t5 a5 b5   e2 f2 g2 h2   __ __ e3 f3
3045*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+r5+25]
3046*c0909341SAndroid Build Coastguard Worker    pshufd           m2{k3}, m0, q3333   ; d4 d4 d4 d4   b5 b5 b5 b5   t6 t6 t6 t6   f3 f3 f3 f3
3047*c0909341SAndroid Build Coastguard Worker    vpermt2d             m5, m12, m0     ; g0 h0 a4 b4   __ __ g1 h1   e0 e1 e2 e3   f0 f1 f2 f3
3048*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m5, 2
3049*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3       ], m5, 3
3050*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm3, [r6+r4]
3051*c0909341SAndroid Build Coastguard Worker    pinsrb             xmm3, [r6+strideq*4+15], 11
3052*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xmm3, xm13
3053*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m1     ; ff ef df __   dj cj bj __   bn an tn __   hb hb fb __
3054*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6
3055*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m7
3056*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
3057*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
3058*c0909341SAndroid Build Coastguard Worker    kunpckwd             k1, k1, k2      ; 0x000f0000
3059*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m3, m9
3060*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m10
3061*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3062*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4           ; e4 f4         c5 d5         a6 b6         g3 h3
3063*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m0          ; c4 d4 e4 f4   a5 b5 c5 d5   g2 h2 a6 b6   __ __ g3 h3
3064*c0909341SAndroid Build Coastguard Worker    vpblendmw        m1{k1}, m4, m2      ; c4 d4 e4 f4   a5 b5 c5 d5   __ t6 a6 b6   __ __ g3 h3
3065*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+r5+29]
3066*c0909341SAndroid Build Coastguard Worker    pshufd           m2{k4}, m4, q3333   ; f4 f4 f4 f4   d5 d5 d5 d5   b6 b6 b6 b6   t7 t7 t7 t7
3067*c0909341SAndroid Build Coastguard Worker    vpermt2d             m5, m12, m4     ; a4 b4 c4 d4   __ __ a5 b5   g0 g1 g2 g3   h0 h1 h2 h3
3068*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*4], m5, 2
3069*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r4       ], m5, 3
3070*c0909341SAndroid Build Coastguard Worker    lea                  r0, [strideq+r3*2]
3071*c0909341SAndroid Build Coastguard Worker.w32_loop:
3072*c0909341SAndroid Build Coastguard Worker    punpckhqdq         xmm3, [r6+r0]
3073*c0909341SAndroid Build Coastguard Worker    pinsrb             xmm3, [r6+r3*2+15], 11
3074*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xmm3, xm13
3075*c0909341SAndroid Build Coastguard Worker    vpermb           m3{k2}, m11, m1     ; hf gf ff __   fj ej dj __   dn cn bn __   br ar tr __
3076*c0909341SAndroid Build Coastguard Worker.w32_loop_tail:
3077*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3078*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m2, m7
3079*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
3080*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
3081*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m3, m9
3082*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m10
3083*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m1
3084*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
3085*c0909341SAndroid Build Coastguard Worker    psraw                m0, m4, 4       ; g4 h4         e5 f5         c6 d6         a7 b7
3086*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m0          ; e4 f4 g4 h4   c5 d5 e5 f5   a6 b6 c6 d6   __ __ a7 b7
3087*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q3333   ; h4 h4 h4 h4   f5 f5 f5 f5   d6 d6 d6 d6   b7 b7 b7 b7
3088*c0909341SAndroid Build Coastguard Worker    vpermt2d             m5, m12, m1     ; c4 d4 e4 f4   __ __ c5 d5   a4 a5 a6 a7   b4 b5 b6 b7
3089*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+strideq*0+16], m5, 2
3090*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+strideq*1+16], m5, 3
3091*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
3092*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
3093*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
3094*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m11, m1
3095*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, -6
3096*c0909341SAndroid Build Coastguard Worker    jg .w32_loop_tail
3097*c0909341SAndroid Build Coastguard Worker.ret:
3098*c0909341SAndroid Build Coastguard Worker    RET
3099*c0909341SAndroid Build Coastguard Worker.w8:
3100*c0909341SAndroid Build Coastguard Worker    vpermb              ym3, ym11, ymm2
3101*c0909341SAndroid Build Coastguard Worker.w8_loop:
3102*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    ym3{k1}, [tlq-4]     ; l3 l2 l1 __   b3 a3 t3 __
3103*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym6
3104*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym2, ym7
3105*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym6
3106*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym2, ym8
3107*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
3108*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym3, ym9
3109*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym3, ym10
3110*c0909341SAndroid Build Coastguard Worker    mova                ym3, ym5
3111*c0909341SAndroid Build Coastguard Worker    packssdw            ym0, ym1
3112*c0909341SAndroid Build Coastguard Worker    psraw               ym5, ym0, 4      ; c0 d0         a1 b1
3113*c0909341SAndroid Build Coastguard Worker    packuswb            ym3, ym5         ; a0 b0 c0 d0   __ __ a1 b1
3114*c0909341SAndroid Build Coastguard Worker    pshufd              ym2, ym3, q3333  ; d0 d0 d0 d0   b1 b1 b1 b1
3115*c0909341SAndroid Build Coastguard Worker    vpermb              ym3, ym11, ym3   ; a0 a1 b0 b1
3116*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm3
3117*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm3
3118*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3119*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3120*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3121*c0909341SAndroid Build Coastguard Worker    RET
3122*c0909341SAndroid Build Coastguard Worker.w4_loop:
3123*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm3, [tlq-4]     ; l3 l2 l1 __
3124*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xm6
3125*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm0, xmm2, xm7
3126*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xm6
3127*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm2, xm8
3128*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
3129*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm0, xmm3, xm9
3130*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm3, xm10
3131*c0909341SAndroid Build Coastguard Worker    packssdw           xmm0, xmm1
3132*c0909341SAndroid Build Coastguard Worker.w4:
3133*c0909341SAndroid Build Coastguard Worker    psraw              xmm0, 4           ; a0 b0
3134*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
3135*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xmm0
3136*c0909341SAndroid Build Coastguard Worker    pshufd             xmm2, xmm0, q1111 ; b0 b0 b0 b0
3137*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xmm2
3138*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3139*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3140*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3141*c0909341SAndroid Build Coastguard Worker    RET
3142*c0909341SAndroid Build Coastguard Worker
3143*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
3144