; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; SMOOTH_WEIGHT_TABLE w0, w1, ...
; For every argument w this emits the byte pair (w-128, 127-w). Both values
; fit a signed byte for any w in 0..255.
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; Two leading zero entries followed by runs of 2, 4, 8, 16, 32 and 64 weights,
; so the run of n weights starts at entry n (each entry expands to 2 bytes).
; NOTE(review): presumably one run per block dimension; confirm against users.
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
filter_taps:  db 10,  0,  0,  0,  2, 10,  0,  0,  1,  1, 10,  0,  1,  1,  2, 10
              db  6,  0,  0,  0,  2,  6,  0,  0,  2,  2,  6,  0,  1,  2,  2,  6
              db  0, 12, -6,  0,  0,  9, -5,  0,  0,  7, -3,  0,  0,  5, -3,  0
              db 12,  2, -4,  0,  9,  2, -3,  0,  7,  2, -3,  0,  5,  3, -3,  0
              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
              db 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16,  0,  0,  0,  0, 16
              db  0, 10,-10,  0,  0,  6, -6,  0,  0,  4, -4,  0,  0,  2, -2,  0
              db 10,  0,-10,  0,  6,  0, -6,  0,  4,  0, -4,  0,  2,  0, -2,  0
              db  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8,  0,  0,  0,  0,  8
              db  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4
              db  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0,  0, 16, -8,  0
              db 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0, 16,  0, -4,  0
              db  8,  0,  0,  0,  3,  8,  0,  0,  2,  3,  8,  0,  1,  2,  3,  8
              db  4,  0,  0,  0,  3,  4,  0,  0,  2,  3,  4,  0,  2,  2,  3,  4
              db  0, 10, -2,  0,  0,  6, -1,  0,  0,  4, -1,  0,  0,  2,  0,  0
              db 10,  3, -1,  0,  6,  4, -1,  0,  4,  4, -1,  0,  3,  3, -1,  0
              db 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14,  0,  0,  0,  0, 14
              db 12,  0,  0,  0,  1, 12,  0,  0,  0,  0, 12,  0,  0,  0,  1, 12
              db  0, 14,-12,  0,  0, 12,-10,  0,  0, 11, -9,  0,  0, 10, -8,  0
              db 14,  0,-10,  0, 12,  0, -9,  0, 11,  1, -8,  0,  9,  1, -7,  0

; Index and multiplier tables for the prediction kernels.
; NOTE(review): judging by their names these are shuffle/permute controls and
; position multipliers; their consumers are not all visible from here.
filter_perm:  db  0,  1,  2,  3, 24, 25, 26, 27,  4,  5,  6,  7, 28, 29, 30, 31
              db 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,  3, 15, 11,  7,131
              db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
              db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
filter_end:   dd  2,  3, 16, 17, -1, -1, 20, 21,  0,  6, 24, 30,  1,  7, 25, 31
smooth_shuf:  db  7,  7,  7,  7,  0,  1,  0,  1,  3,  3,  3,  3,  8,  9,  8,  9
              db  5,  5,  5,  5,  4,  5,  4,  5,  1,  1,  1,  1, 12, 13, 12, 13
              db  6,  6,  6,  6,  2,  3,  2,  3,  2,  2,  2,  2, 10, 11, 10, 11
              db  4,  4,  4,  4,  6,  7,  6,  7,  0,  0,  0,  0, 14, 15, 14, 15
smooth_endA:  db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
              db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
              db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
              db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
smooth_endB:  db  1,  3,  5,  7,  9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
              db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
              db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
              db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4
              db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
pal_unpack:   db  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm:     db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pb_63to0:     db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
              db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
              db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
              db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
; Byte pairs (64-f, f) for every even f in 0..62.
z_frac_table: db 64,  0, 62,  2, 60,  4, 58,  6, 56,  8, 54, 10, 52, 12, 50, 14
              db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
              db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
              db 16, 48, 14, 50, 12, 52, 10, 54,  8, 56,  6, 58,  4, 60,  2, 62
z_filter_s1:  db -1, -1, -1,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6
              db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
              db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
              db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
z_filter_s5:  db 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
              db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
              db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
              db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
z_filter_s3:  db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
z_filter_s2:  db  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
z_filter_s4:  db  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8
z_xpos_bc:    db 17, 17, 17, 17, 33, 33, 33, 33,  9,  9,  9,  9,  9,  9,  9,  9
z_filter4_s1: db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
              db  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8
z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
z_xpos_off2a: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
              db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
              db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
              db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
z_xpos_off2b: db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
              db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
              db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
              db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
z_xpos_mul:   dw  4,  4,  4,  4,  8,  8,  4,  4, 12, 12,  8,  8, 16, 16,  8,  8
              dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
z_ypos_off1:  db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
              db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
              db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
              db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
z_ypos_off2:  db 64, 65, 64, 65,  0,  0,  0,  0, 64, 65, 64, 65,  0,  0,  0,  0
              db 65, 66, 65, 66,  1,  1,  1,  1, 65, 66, 65, 66,  1,  1,  1,  1
              db 66, 67, 66, 67,  2,  2,  2,  2, 66, 67, 66, 67,  2,  2,  2,  2
              db 67, 68, 67, 68,  3,  3,  3,  3, 67, 68, 67, 68,  3,  3,  3,  3
z_ypos_off3:  db  1,  2,  1,  2,  1,  1,  1,  1,  3,  4,  3,  4,  1,  1,  1,  1
              db  5,  6,  5,  6,  3,  3,  3,  3,  7,  8,  7,  8,  3,  3,  3,  3
              db  9, 10,  9, 10,  5,  5,  5,  5, 11, 12, 11, 12,  5,  5,  5,  5
              db 13, 14, 13, 14,  7,  7,  7,  7, 15, 16, 15, 16,  7,  7,  7,  7
z_ypos_mul1a: dw  1,  2,  3,  4,  5,  6,  7,  8, 17, 18, 19, 20, 21, 22, 23, 24
              dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
z_ypos_mul1b: dw  9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
              dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
; Same index pattern as z_ypos_mul1a/z_ypos_mul1b, each value scaled by 512.
z_ypos_mul2a: dw  1*512,  2*512,  3*512,  4*512,  5*512,  6*512,  7*512,  8*512
              dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
              dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
              dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
z_ypos_mul2b: dw  9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
              dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
              dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
              dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z3_upsample:  db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
              db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8
z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
              db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
              db 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16,  0
              db  0, 32,  0, 32,  0, 24,  0, 24,  0, 16,  0, 16

; 4-byte constants. pb_128 is loaded with vpbroadcastd further down.
pb_8_56_0_0:  db 8, 56, 0, 0
pb_m4_36:     times 2 db -4, 36
pb_127_m127:  times 2 db 127, -127
pb_8:         times 4 db 8
pb_15:        times 4 db 15
pb_16:        times 4 db 16
pb_31:        times 4 db 31
pb_63:        times 4 db 63
pb_90:        times 4 db 90
pb_128:       times 4 db 128
pw_128:       times 2 dw 128
pw_255:       times 2 dw 255
pw_512:       times 2 dw 512

; These constants alias 4-byte runs that already exist inside other tables
; (e.g. ipred_h_shuf+24 is "1, 1, 1, 1") instead of taking separate storage.
%define pb_1  (ipred_h_shuf+24)
%define pb_2  (ipred_h_shuf+20)
%define pb_3  (ipred_h_shuf+16)
%define pb_4  (smooth_shuf +48)
%define pb_7  (ipred_h_shuf+ 0)
%define pb_9  (z_xpos_bc   + 8)
%define pb_17 (z_xpos_bc   + 0)
%define pb_33 (z_xpos_bc   + 4)
%define pd_8  (filter_taps+128)

; JMP_TABLE function, cpu_suffix, label, label, ...
; Emits one dword per label: the offset of <function>_<suffix>.<label> from the
; table symbol. The symbol is placed 2 entries (2*4 bytes) before the first
; dword, so the table can be indexed directly with tzcnt(size) when the
; smallest size is 4 (tzcnt = 2).
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The splat (.sN store loop) entries are the last five dwords of the ipred_dc
; table, 10 entries past its start. Those entries are listed as "sN-10*4" so
; that adding this shifted table base yields the real label address.
%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)

JMP_TABLE ipred_h_8bpc,        avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_8bpc,    avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_8bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_8bpc,       avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc,       avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                          s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc,  avx512icl, h4, h8, h16, h32, h64

cextern dr_intra_derivative
cextern pb_0to63

SECTION .text

INIT_ZMM avx512icl

; ipred_dc_top_8bpc(dst, stride, tl, w, h)
; DC fill from the w bytes starting at tl+1. Sets up the accumulator, then
; jumps into the shared reduction labels (.hN) of ipred_dc_left_8bpc, picked
; by log2(w). wq is preloaded with the ipred_dc .sN store loop for this width.
cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
    lea             r5, [ipred_dc_left_8bpc_avx512icl_table]
    movd            xm0, wm                 ; w, turned into the w/2 rounding bias below
    tzcnt           wd, wm                  ; wd = log2(w)
    inc             tlq                     ; tl+1 = first byte that is summed
    movifnidn       hd, hm
    movu            ym1, [tlq]
    movd            xmm3, wd                ; final shift count = log2(w)
    movsxd          r6, [r5+wq*4]           ; reduction entry, indexed by log2(w)
    vpbroadcastd    ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1
    vpdpbusd        ym0, ym1, ym2           ; bytes * 1, summed in groups of 4 into dwords
    add             r6, r5
    add             r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
    movsxd          wq, [r5+wq*4]
    add             wq, r5                  ; wq = .sN store loop
    jmp             r6

; ipred_dc_left_8bpc(dst, stride, tl, w, h)
; DC fill from the h bytes ending just before tl (tl-h .. tl-1). The .hN
; labels fold the dword sums down to one value, shift it right by the count
; in xmm3, broadcast the low byte to m0 and jump to the store loop in wq.
cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_left_8bpc_avx512icl_table]
    mov             hd, hm
    tzcnt           r6d, hd                 ; r6d = log2(h)
    sub             tlq, hq                 ; tl-h = first byte that is summed
    tzcnt           wd, wm
    movd            xm0, hm                 ; h, turned into the h/2 rounding bias below
    movu            ym1, [tlq]
    movd            xmm3, r6d               ; final shift count = log2(h)
    movsxd          r6, [r5+r6*4]
    vpbroadcastd    ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1
    vpdpbusd        ym0, ym1, ym2           ; bytes * 1, summed in groups of 4 into dwords
    add             r6, r5
    add             r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
    movsxd          wq, [r5+wq*4]
    add             wq, r5                  ; wq = .sN store loop selected by log2(w)
    jmp             r6
.h64:
    movu            ym1, [tlq+32] ; unaligned when jumping here from dc_top
    vpdpbusd        ym0, ym1, ym2           ; add bytes 32..63
.h32:
    vextracti32x4   xm1, ym0, 1             ; fold upper 128 bits into the lower
    paddd           xm0, xm1
.h16:
    punpckhqdq      xm1, xm0, xm0           ; fold upper qword
    paddd           xm0, xm1
.h8:
    psrlq           xm1, xm0, 32            ; fold dword 1 into dword 0
    paddd           xm0, xm1
.h4:
    vpsrlvd         xm0, xmm3               ; (sum + n/2) >> log2(n)
    lea             stride3q, [strideq*3]
    vpbroadcastb    m0, xm0
    jmp             wq

; ipred_dc_8bpc(dst, stride, tl, w, h)
; DC fill from both edges: h bytes before tl and w bytes from tl+1.
; Flow: .hN adds the left bytes, .wN adds the top bytes and divides by (w+h),
; .sN stores the broadcast result four rows per iteration.
; The division is a right shift by tzcnt(w+h) (kept in xmm4). When w != h it
; is followed by pmulhuw with 0x5556 (about 65536/3) or 0x3334 (about 65536/5).
cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    movifnidn       hd, hm
    movifnidn       wd, wm
    tzcnt           r6d, hd                 ; r6d = log2(h)
    lea             r5d, [wq+hq]
    movd            xm0, r5d                ; w+h, turned into the rounding bias below
    tzcnt           r5d, r5d
    movd            xmm4, r5d               ; shift count = tzcnt(w+h)
    lea             r5, [ipred_dc_8bpc_avx512icl_table]
    tzcnt           wd, wd
    movsxd          r6, [r5+r6*4]           ; .hN entry
    movsxd          wq, [r5+wq*4+5*4]       ; .wN entry (skips the five .hN entries)
    vpbroadcastd    ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
    psrld           xm0, 1                  ; (w+h)/2
    add             r6, r5
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             r6
.h4:
    movd            xmm1, [tlq-4]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w4:
    movd            xmm1, [tlq+1]
    vpdpbusd        xm0, xmm1, xm3
    cmp             hd, 4
    jg .w4_mul
    psrlw           xmm0, xm0, 3            ; w == h == 4: divide by 8
    jmp .w4_end
.w4_mul:
    punpckhqdq      xmm1, xm0, xm0
    lea             r2d, [hq*2]
    mov             r6d, 0x55563334
    paddd           xmm1, xm0
    shrx            r6d, r6d, r2d           ; count is taken mod 32: h=8 -> 0x5556, h=16 -> low word 0x3334
    psrlq           xmm0, xmm1, 32
    paddd           xmm0, xmm1
    movd            xmm1, r6d
    psrld           xmm0, 2                 ; /4, then *1/3 or *1/5 below
    pmulhuw         xmm0, xmm1
.w4_end:
    vpbroadcastb    xm0, xmm0
.s4:
    movd            [dstq+strideq*0], xm0
    movd            [dstq+strideq*1], xm0
    movd            [dstq+strideq*2], xm0
    movd            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s4
    RET
.h8:
    movq            xmm1, [tlq-8]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w8:
    movq            xmm1, [tlq+1]
    vextracti32x4   xm2, ym0, 1             ; upper 128 bits (non-zero after .h32/.h64)
    vpdpbusd        xm0, xmm1, xm3
    paddd           xmm2, xm2, xm0
    punpckhqdq      xmm0, xmm2, xmm2
    paddd           xmm0, xmm2
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 8
    je .w8_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    cmp             hd, 32
    cmove           r6d, r2d                ; h == 32 uses 0x3334, other heights 0x5556
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w8_end:
    vpbroadcastb    xm0, xmm0
.s8:
    movq            [dstq+strideq*0], xm0
    movq            [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm0
    movq            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s8
    RET
.h16:
    mova            xmm1, [tlq-16]
    vpdpbusd        xm0, xmm1, xm3
    jmp             wq
.w16:
    movu            xmm1, [tlq+1]
    vextracti32x4   xm2, ym0, 1
    vpdpbusd        xm0, xmm1, xm3
    paddd           xmm2, xm2, xm0
    punpckhqdq      xmm0, xmm2, xmm2
    paddd           xmm0, xmm2
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 16
    je .w16_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    test            hb, 8|32
    cmovz           r6d, r2d                ; h is neither 8 nor 32: use 0x3334
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w16_end:
    vpbroadcastb    xm0, xmm0
.s16:
    mova            [dstq+strideq*0], xm0
    mova            [dstq+strideq*1], xm0
    mova            [dstq+strideq*2], xm0
    mova            [dstq+stride3q ], xm0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s16
    RET
.h32:
    mova            ym1, [tlq-32]
    vpdpbusd        ym0, ym1, ym3
    jmp             wq
.w32:
    movu            ym1, [tlq+1]
    vpdpbusd        ym0, ym1, ym3
    vextracti32x4   xm1, ym0, 1
    paddd           xmm1, xm1, xm0
    punpckhqdq      xmm0, xmm1, xmm1
    paddd           xmm0, xmm1
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 32
    je .w32_end
    lea             r2d, [hq*2]
    mov             r6d, 0x33345556
    shrx            r6d, r6d, r2d           ; count mod 32: h=8 -> 0x3334, h=16/64 -> low word 0x5556
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w32_end:
    vpbroadcastb    ym0, xmm0
.s32:
    mova            [dstq+strideq*0], ym0
    mova            [dstq+strideq*1], ym0
    mova            [dstq+strideq*2], ym0
    mova            [dstq+stride3q ], ym0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s32
    RET
.h64:
    mova            ym1, [tlq-64]
    mova            ym2, [tlq-32]
    vpdpbusd        ym0, ym1, ym3
    vpdpbusd        ym0, ym2, ym3
    jmp             wq
.w64:
    movu            ym1, [tlq+ 1]
    movu            ym2, [tlq+33]
    vpdpbusd        ym0, ym1, ym3
    vpdpbusd        ym0, ym2, ym3
    vextracti32x4   xm1, ym0, 1
    paddd           xmm1, xm1, xm0
    punpckhqdq      xmm0, xmm1, xmm1
    paddd           xmm0, xmm1
    psrlq           xmm1, xmm0, 32
    paddd           xmm0, xmm1
    vpsrlvd         xmm0, xmm4
    cmp             hd, 64
    je .w64_end
    mov             r6d, 0x33345556
    shrx            r6d, r6d, hd            ; count mod 32: h=16 -> 0x3334, h=32 -> low word 0x5556
    movd            xmm1, r6d
    pmulhuw         xmm0, xmm1
.w64_end:
    vpbroadcastb    m0, xmm0
.s64:
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .s64
    RET

; ipred_dc_128_8bpc(dst, stride, tl, w, h)
; Fills the block with the byte 128: broadcasts pb_128 into m0 and jumps to
; the ipred_dc .sN store loop selected by log2(w).
cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_splat_8bpc_avx512icl_table]
    tzcnt           wd, wm
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    vpbroadcastd    m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

; ipred_v_8bpc(dst, stride, tl, w, h)
; Loads 64 bytes from tl+1 into m0 and jumps to the ipred_dc .sN store loop
; selected by log2(w), which writes the low w bytes of m0 to every row.
cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    lea             r5, [ipred_dc_splat_8bpc_avx512icl_table]
    tzcnt           wd, wm
    movu            m0, [tlq+1]
    movifnidn       hd, hm
    movsxd          wq, [r5+wq*4]
    add             wq, r5
    lea             stride3q, [strideq*3]
    jmp             wq

; ipred_h_8bpc(dst, stride, tl, w, h)
; Entry point: loads the log2(w)-indexed entry of its own jump table into wq.
cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
%define base r6-ipred_h_8bpc_avx512icl_table
    lea             r6, [ipred_h_8bpc_avx512icl_table]
    tzcnt           wd, wm
    mov             hd, hm
    movsxd          wq, [r6+wq*4]
    lea             stride3q, [strideq*3]
488*c0909341SAndroid Build Coastguard Worker sub tlq, hq 489*c0909341SAndroid Build Coastguard Worker add wq, r6 490*c0909341SAndroid Build Coastguard Worker jmp wq 491*c0909341SAndroid Build Coastguard Worker.w4: 492*c0909341SAndroid Build Coastguard Worker mova xmm1, [base+ipred_h_shuf+16] 493*c0909341SAndroid Build Coastguard Worker.w4_loop: 494*c0909341SAndroid Build Coastguard Worker movd xmm0, [tlq+hq-4] 495*c0909341SAndroid Build Coastguard Worker pshufb xmm0, xmm1 496*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xmm0 497*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xmm0, 1 498*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xmm0, 2 499*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xmm0, 3 500*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 501*c0909341SAndroid Build Coastguard Worker sub hd, 4 502*c0909341SAndroid Build Coastguard Worker jg .w4_loop 503*c0909341SAndroid Build Coastguard Worker RET 504*c0909341SAndroid Build Coastguard Worker.w8: 505*c0909341SAndroid Build Coastguard Worker movsldup xmm2, [base+ipred_h_shuf+16] 506*c0909341SAndroid Build Coastguard Worker movshdup xmm3, [base+ipred_h_shuf+16] 507*c0909341SAndroid Build Coastguard Worker.w8_loop: 508*c0909341SAndroid Build Coastguard Worker movd xmm1, [tlq+hq-4] 509*c0909341SAndroid Build Coastguard Worker pshufb xmm0, xmm1, xmm2 510*c0909341SAndroid Build Coastguard Worker pshufb xmm1, xmm3 511*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xmm0 512*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xmm1 513*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xmm0 514*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xmm1 515*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 516*c0909341SAndroid Build Coastguard Worker sub hd, 4 517*c0909341SAndroid Build Coastguard Worker jg .w8_loop 518*c0909341SAndroid Build 
Coastguard Worker RET 519*c0909341SAndroid Build Coastguard Worker.w16: 520*c0909341SAndroid Build Coastguard Worker movsldup m1, [base+smooth_shuf] 521*c0909341SAndroid Build Coastguard Worker.w16_loop: 522*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [tlq+hq-4] 523*c0909341SAndroid Build Coastguard Worker pshufb m0, m1 524*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 525*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 526*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1 527*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 528*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 529*c0909341SAndroid Build Coastguard Worker sub hd, 4 530*c0909341SAndroid Build Coastguard Worker jg .w16 531*c0909341SAndroid Build Coastguard Worker RET 532*c0909341SAndroid Build Coastguard Worker.w32: 533*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym3, [base+pb_1] 534*c0909341SAndroid Build Coastguard Worker vpord m2, m3, [base+pb_2] {1to16} 535*c0909341SAndroid Build Coastguard Worker.w32_loop: 536*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [tlq+hq-4] 537*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, m2 538*c0909341SAndroid Build Coastguard Worker pshufb m1, m3 539*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 540*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 541*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], ym1 542*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+stride3q ], m1, 1 543*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 544*c0909341SAndroid Build Coastguard Worker sub hd, 4 545*c0909341SAndroid Build Coastguard Worker jg .w32_loop 546*c0909341SAndroid Build Coastguard Worker RET 547*c0909341SAndroid Build Coastguard Worker.w64: 548*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, 
[base+pb_3] 549*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pb_2] 550*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pb_1] 551*c0909341SAndroid Build Coastguard Worker pxor m7, m7 552*c0909341SAndroid Build Coastguard Worker.w64_loop: 553*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [tlq+hq-4] 554*c0909341SAndroid Build Coastguard Worker pshufb m0, m3, m4 555*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m5 556*c0909341SAndroid Build Coastguard Worker pshufb m2, m3, m6 557*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 558*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], m0 559*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], m1 560*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], m2 561*c0909341SAndroid Build Coastguard Worker mova [dstq+stride3q ], m3 562*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 563*c0909341SAndroid Build Coastguard Worker sub hd, 4 564*c0909341SAndroid Build Coastguard Worker jg .w64_loop 565*c0909341SAndroid Build Coastguard Worker RET 566*c0909341SAndroid Build Coastguard Worker 567*c0909341SAndroid Build Coastguard Worker%macro PAETH 0 568*c0909341SAndroid Build Coastguard Worker psubusb m1, m5, m4 569*c0909341SAndroid Build Coastguard Worker psubusb m0, m4, m5 570*c0909341SAndroid Build Coastguard Worker por m1, m0 ; tdiff 571*c0909341SAndroid Build Coastguard Worker pavgb m2, m6, m4 572*c0909341SAndroid Build Coastguard Worker vpcmpub k1, m1, m7, 1 ; tdiff < ldiff 573*c0909341SAndroid Build Coastguard Worker vpblendmb m0{k1}, m4, m6 574*c0909341SAndroid Build Coastguard Worker vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 575*c0909341SAndroid Build Coastguard Worker psubusb m3, m5, m2 576*c0909341SAndroid Build Coastguard Worker psubb m2, m4 577*c0909341SAndroid Build Coastguard Worker psubusb m2, m5 578*c0909341SAndroid Build Coastguard Worker por m2, m3 579*c0909341SAndroid Build Coastguard Worker 
pminub m1, m7 580*c0909341SAndroid Build Coastguard Worker paddusb m2, m2 581*c0909341SAndroid Build Coastguard Worker por m2, m4 ; min(tldiff, 255) 582*c0909341SAndroid Build Coastguard Worker vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff 583*c0909341SAndroid Build Coastguard Worker vmovdqu8 m0{k1}, m5 584*c0909341SAndroid Build Coastguard Worker%endmacro 585*c0909341SAndroid Build Coastguard Worker 586*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 587*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_paeth_8bpc_avx512icl_table] 588*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 589*c0909341SAndroid Build Coastguard Worker vpbroadcastb m5, [tlq] ; topleft 590*c0909341SAndroid Build Coastguard Worker mov hd, hm 591*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 592*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] 593*c0909341SAndroid Build Coastguard Worker lea topq, [tlq+1] 594*c0909341SAndroid Build Coastguard Worker sub tlq, hq 595*c0909341SAndroid Build Coastguard Worker add wq, r6 596*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 597*c0909341SAndroid Build Coastguard Worker jmp wq 598*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl 599*c0909341SAndroid Build Coastguard Worker.w4: 600*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [topq] 601*c0909341SAndroid Build Coastguard Worker mova m9, [ipred_h_shuf] 602*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 603*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 604*c0909341SAndroid Build Coastguard Worker por m7, m0 ; ldiff 605*c0909341SAndroid Build Coastguard Worker.w4_loop: 606*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [tlq+hq-8] 607*c0909341SAndroid Build Coastguard Worker pshufb m4, m9 ; left 608*c0909341SAndroid Build Coastguard Worker PAETH 609*c0909341SAndroid Build 
Coastguard Worker movd [dstq+strideq*0], xm0 610*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 611*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 2 612*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm0, 3 613*c0909341SAndroid Build Coastguard Worker sub hd, 8 614*c0909341SAndroid Build Coastguard Worker jl .w4_ret 615*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, m0, 1 616*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 617*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 618*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 619*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 2 620*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm0, 3 621*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 622*c0909341SAndroid Build Coastguard Worker jg .w4_loop 623*c0909341SAndroid Build Coastguard Worker.w4_ret: 624*c0909341SAndroid Build Coastguard Worker RET 625*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 626*c0909341SAndroid Build Coastguard Worker.w8: 627*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [topq] 628*c0909341SAndroid Build Coastguard Worker movsldup m9, [smooth_shuf] 629*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 630*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 631*c0909341SAndroid Build Coastguard Worker por m7, m0 632*c0909341SAndroid Build Coastguard Worker.w8_loop: 633*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [tlq+hq-8] 634*c0909341SAndroid Build Coastguard Worker pshufb m4, m9 635*c0909341SAndroid Build Coastguard Worker PAETH 636*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m0, 2 637*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, ym0, 1 638*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m0, 3 639*c0909341SAndroid Build Coastguard Worker movq 
[dstq+strideq*0], xm0 640*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 641*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm2 642*c0909341SAndroid Build Coastguard Worker movq [dstq+stride3q ], xm3 643*c0909341SAndroid Build Coastguard Worker sub hd, 8 644*c0909341SAndroid Build Coastguard Worker jl .w8_ret 645*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 646*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*0], xm0 647*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm1 648*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm2 649*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm3 650*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 651*c0909341SAndroid Build Coastguard Worker jg .w8_loop 652*c0909341SAndroid Build Coastguard Worker.w8_ret: 653*c0909341SAndroid Build Coastguard Worker RET 654*c0909341SAndroid Build Coastguard Worker.w16: 655*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [topq] 656*c0909341SAndroid Build Coastguard Worker movsldup m9, [smooth_shuf] 657*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 658*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 659*c0909341SAndroid Build Coastguard Worker por m7, m0 660*c0909341SAndroid Build Coastguard Worker.w16_loop: 661*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [tlq+hq-4] 662*c0909341SAndroid Build Coastguard Worker pshufb m4, m9 663*c0909341SAndroid Build Coastguard Worker PAETH 664*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 665*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 666*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1 667*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 668*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 669*c0909341SAndroid Build Coastguard 
Worker sub hd, 4 670*c0909341SAndroid Build Coastguard Worker jg .w16_loop 671*c0909341SAndroid Build Coastguard Worker RET 672*c0909341SAndroid Build Coastguard Worker.w32: 673*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [topq] 674*c0909341SAndroid Build Coastguard Worker mova ym9, ym8 675*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 676*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 677*c0909341SAndroid Build Coastguard Worker por m7, m0 678*c0909341SAndroid Build Coastguard Worker.w32_loop: 679*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [tlq+hq-2] 680*c0909341SAndroid Build Coastguard Worker pshufb m4, m9 681*c0909341SAndroid Build Coastguard Worker PAETH 682*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 683*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 684*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 685*c0909341SAndroid Build Coastguard Worker sub hd, 2 686*c0909341SAndroid Build Coastguard Worker jg .w32_loop 687*c0909341SAndroid Build Coastguard Worker RET 688*c0909341SAndroid Build Coastguard Worker.w64: 689*c0909341SAndroid Build Coastguard Worker movu m6, [topq] 690*c0909341SAndroid Build Coastguard Worker psubusb m7, m5, m6 691*c0909341SAndroid Build Coastguard Worker psubusb m0, m6, m5 692*c0909341SAndroid Build Coastguard Worker por m7, m0 693*c0909341SAndroid Build Coastguard Worker.w64_loop: 694*c0909341SAndroid Build Coastguard Worker vpbroadcastb m4, [tlq+hq-1] 695*c0909341SAndroid Build Coastguard Worker PAETH 696*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 697*c0909341SAndroid Build Coastguard Worker add dstq, strideq 698*c0909341SAndroid Build Coastguard Worker dec hd 699*c0909341SAndroid Build Coastguard Worker jg .w64_loop 700*c0909341SAndroid Build Coastguard Worker RET 701*c0909341SAndroid Build Coastguard Worker 702*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_8bpc, 3, 7, 7, 
dst, stride, tl, w, h, weights, stride3 703*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_v_8bpc_avx512icl_table 704*c0909341SAndroid Build Coastguard Worker lea r6, [ipred_smooth_v_8bpc_avx512icl_table] 705*c0909341SAndroid Build Coastguard Worker tzcnt wd, wm 706*c0909341SAndroid Build Coastguard Worker mov hd, hm 707*c0909341SAndroid Build Coastguard Worker movsxd wq, [r6+wq*4] 708*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+pb_127_m127] 709*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+pw_128] 710*c0909341SAndroid Build Coastguard Worker lea weightsq, [base+smooth_weights+hq*4] 711*c0909341SAndroid Build Coastguard Worker neg hq 712*c0909341SAndroid Build Coastguard Worker vpbroadcastb m4, [tlq+hq] ; bottom 713*c0909341SAndroid Build Coastguard Worker add wq, r6 714*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 715*c0909341SAndroid Build Coastguard Worker jmp wq 716*c0909341SAndroid Build Coastguard Worker.w4: 717*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq+1] 718*c0909341SAndroid Build Coastguard Worker movshdup m5, [smooth_shuf] 719*c0909341SAndroid Build Coastguard Worker mova ym6, [smooth_endA] 720*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 ; top, bottom 721*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2, m0 722*c0909341SAndroid Build Coastguard Worker paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok 723*c0909341SAndroid Build Coastguard Worker paddw m3, m1 ; 128 * top + 129 * bottom + 128 724*c0909341SAndroid Build Coastguard Worker.w4_loop: 725*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [weightsq+hq*2] 726*c0909341SAndroid Build Coastguard Worker pshufb m0, m5 727*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m0 728*c0909341SAndroid Build Coastguard Worker paddw m0, m3 729*c0909341SAndroid Build Coastguard Worker vpermb m0, m6, m0 730*c0909341SAndroid Build Coastguard Worker vextracti32x4 
xm1, ym0, 1 731*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 732*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], xm1 733*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 2 734*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 2 735*c0909341SAndroid Build Coastguard Worker add hq, 8 736*c0909341SAndroid Build Coastguard Worker jg .ret 737*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 738*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 1 739*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm1, 1 740*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 3 741*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 742*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 743*c0909341SAndroid Build Coastguard Worker jl .w4_loop 744*c0909341SAndroid Build Coastguard Worker.ret: 745*c0909341SAndroid Build Coastguard Worker RET 746*c0909341SAndroid Build Coastguard Worker.w8: 747*c0909341SAndroid Build Coastguard Worker vpbroadcastq m2, [tlq+1] 748*c0909341SAndroid Build Coastguard Worker movshdup m5, [smooth_shuf] 749*c0909341SAndroid Build Coastguard Worker mova ym6, [smooth_endA] 750*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 751*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2, m0 752*c0909341SAndroid Build Coastguard Worker paddw m1, m2 753*c0909341SAndroid Build Coastguard Worker paddw m3, m1 754*c0909341SAndroid Build Coastguard Worker.w8_loop: 755*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [weightsq+hq*2] 756*c0909341SAndroid Build Coastguard Worker pshufb m0, m5 757*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m0 758*c0909341SAndroid Build Coastguard Worker paddw m0, m3 759*c0909341SAndroid Build Coastguard Worker vpermb m0, m6, m0 760*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 761*c0909341SAndroid 
Build Coastguard Worker movq [dstq+strideq*0], xm0 762*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 763*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 764*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 765*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 766*c0909341SAndroid Build Coastguard Worker add hq, 4 767*c0909341SAndroid Build Coastguard Worker jl .w8_loop 768*c0909341SAndroid Build Coastguard Worker RET 769*c0909341SAndroid Build Coastguard Worker.w16: 770*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [tlq+1] 771*c0909341SAndroid Build Coastguard Worker movshdup m6, [smooth_shuf] 772*c0909341SAndroid Build Coastguard Worker mova m7, [smooth_endB] 773*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 774*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 775*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m0 776*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m3, m0 777*c0909341SAndroid Build Coastguard Worker paddw m0, m1, m2 778*c0909341SAndroid Build Coastguard Worker paddw m1, m3 779*c0909341SAndroid Build Coastguard Worker paddw m4, m0 780*c0909341SAndroid Build Coastguard Worker paddw m5, m1 781*c0909341SAndroid Build Coastguard Worker.w16_loop: 782*c0909341SAndroid Build Coastguard Worker vpbroadcastq m1, [weightsq+hq*2] 783*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 784*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m1 785*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m1 786*c0909341SAndroid Build Coastguard Worker paddw m0, m4 787*c0909341SAndroid Build Coastguard Worker paddw m1, m5 788*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m7, m1 789*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 790*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 791*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], 
ym0, 1 792*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 793*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 794*c0909341SAndroid Build Coastguard Worker add hq, 4 795*c0909341SAndroid Build Coastguard Worker jl .w16_loop 796*c0909341SAndroid Build Coastguard Worker RET 797*c0909341SAndroid Build Coastguard Worker.w32: 798*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m3, [tlq+1] 799*c0909341SAndroid Build Coastguard Worker movshdup m6, [smooth_shuf] 800*c0909341SAndroid Build Coastguard Worker mova m7, [smooth_endB] 801*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 802*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 803*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m0 804*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m3, m0 805*c0909341SAndroid Build Coastguard Worker paddw m0, m1, m2 806*c0909341SAndroid Build Coastguard Worker paddw m1, m3 807*c0909341SAndroid Build Coastguard Worker paddw m4, m0 808*c0909341SAndroid Build Coastguard Worker paddw m5, m1 809*c0909341SAndroid Build Coastguard Worker.w32_loop: 810*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [weightsq+hq*2] 811*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 812*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m1 813*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m1 814*c0909341SAndroid Build Coastguard Worker paddw m0, m4 815*c0909341SAndroid Build Coastguard Worker paddw m1, m5 816*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m7, m1 817*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 818*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 819*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 820*c0909341SAndroid Build Coastguard Worker add hq, 2 821*c0909341SAndroid Build Coastguard Worker jl .w32_loop 822*c0909341SAndroid Build Coastguard Worker RET 823*c0909341SAndroid 
Build Coastguard Worker.w64: 824*c0909341SAndroid Build Coastguard Worker movu m3, [tlq+1] 825*c0909341SAndroid Build Coastguard Worker mova m6, [smooth_endB] 826*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 827*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 828*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m2, m0 829*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m3, m0 830*c0909341SAndroid Build Coastguard Worker paddw m0, m1, m2 831*c0909341SAndroid Build Coastguard Worker paddw m1, m3 832*c0909341SAndroid Build Coastguard Worker paddw m4, m0 833*c0909341SAndroid Build Coastguard Worker paddw m5, m1 834*c0909341SAndroid Build Coastguard Worker.w64_loop: 835*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, [weightsq+hq*2] 836*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m1 837*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m1 838*c0909341SAndroid Build Coastguard Worker paddw m0, m4 839*c0909341SAndroid Build Coastguard Worker paddw m1, m5 840*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m6, m1 841*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 842*c0909341SAndroid Build Coastguard Worker add dstq, strideq 843*c0909341SAndroid Build Coastguard Worker inc hq 844*c0909341SAndroid Build Coastguard Worker jl .w64_loop 845*c0909341SAndroid Build Coastguard Worker RET 846*c0909341SAndroid Build Coastguard Worker 847*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 848*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_smooth_h_8bpc_avx512icl_table 849*c0909341SAndroid Build Coastguard Worker lea r5, [ipred_smooth_h_8bpc_avx512icl_table] 850*c0909341SAndroid Build Coastguard Worker mov r6d, wd 851*c0909341SAndroid Build Coastguard Worker tzcnt wd, wd 852*c0909341SAndroid Build Coastguard Worker vpbroadcastb m4, [tlq+r6] ; right 853*c0909341SAndroid Build Coastguard Worker mov hd, hm 
854*c0909341SAndroid Build Coastguard Worker movsxd wq, [r5+wq*4] 855*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pb_127_m127] 856*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pw_128] 857*c0909341SAndroid Build Coastguard Worker sub tlq, hq 858*c0909341SAndroid Build Coastguard Worker add wq, r5 859*c0909341SAndroid Build Coastguard Worker vpmovb2m k1, m6 860*c0909341SAndroid Build Coastguard Worker lea stride3q, [strideq*3] 861*c0909341SAndroid Build Coastguard Worker jmp wq 862*c0909341SAndroid Build Coastguard Worker.w4: 863*c0909341SAndroid Build Coastguard Worker movsldup m3, [smooth_shuf] 864*c0909341SAndroid Build Coastguard Worker vpbroadcastq m7, [smooth_weights+4*2] 865*c0909341SAndroid Build Coastguard Worker mova ym8, [smooth_endA] 866*c0909341SAndroid Build Coastguard Worker.w4_loop: 867*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [tlq+hq-8] 868*c0909341SAndroid Build Coastguard Worker mova m2, m4 869*c0909341SAndroid Build Coastguard Worker vpshufb m2{k1}, m0, m3 ; left, right 870*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 871*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2, m7 872*c0909341SAndroid Build Coastguard Worker paddw m2, m6 873*c0909341SAndroid Build Coastguard Worker paddw m0, m2 874*c0909341SAndroid Build Coastguard Worker paddw m0, m1 875*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 876*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 877*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 878*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*1], xm1 879*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 2 880*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 2 881*c0909341SAndroid Build Coastguard Worker sub hd, 8 882*c0909341SAndroid Build Coastguard Worker jl .ret 883*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 884*c0909341SAndroid 
Build Coastguard Worker pextrd [dstq+strideq*0], xm0, 1 885*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm1, 1 886*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm0, 3 887*c0909341SAndroid Build Coastguard Worker pextrd [dstq+stride3q ], xm1, 3 888*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 889*c0909341SAndroid Build Coastguard Worker jg .w4_loop 890*c0909341SAndroid Build Coastguard Worker.ret: 891*c0909341SAndroid Build Coastguard Worker RET 892*c0909341SAndroid Build Coastguard Worker.w8: 893*c0909341SAndroid Build Coastguard Worker movsldup m3, [smooth_shuf] 894*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [smooth_weights+8*2] 895*c0909341SAndroid Build Coastguard Worker mova ym8, [smooth_endA] 896*c0909341SAndroid Build Coastguard Worker.w8_loop: 897*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [tlq+hq-4] 898*c0909341SAndroid Build Coastguard Worker mova m2, m4 899*c0909341SAndroid Build Coastguard Worker vpshufb m2{k1}, m0, m3 900*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2, m5 901*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2, m7 902*c0909341SAndroid Build Coastguard Worker paddw m2, m6 903*c0909341SAndroid Build Coastguard Worker paddw m0, m2 904*c0909341SAndroid Build Coastguard Worker paddw m0, m1 905*c0909341SAndroid Build Coastguard Worker vpermb m0, m8, m0 906*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 907*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 908*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm1 909*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*2], xm0 910*c0909341SAndroid Build Coastguard Worker movhps [dstq+stride3q ], xm1 911*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 912*c0909341SAndroid Build Coastguard Worker sub hd, 4 913*c0909341SAndroid Build Coastguard Worker jg .w8_loop 914*c0909341SAndroid Build Coastguard 
Worker RET 915*c0909341SAndroid Build Coastguard Worker.w16: 916*c0909341SAndroid Build Coastguard Worker movsldup m7, [smooth_shuf] 917*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [smooth_weights+16*2] 918*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [smooth_weights+16*3] 919*c0909341SAndroid Build Coastguard Worker mova m10, [smooth_endB] 920*c0909341SAndroid Build Coastguard Worker.w16_loop: 921*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [tlq+hq-4] 922*c0909341SAndroid Build Coastguard Worker mova m3, m4 923*c0909341SAndroid Build Coastguard Worker vpshufb m3{k1}, m0, m7 924*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3, m5 925*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3, m8 926*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m9 927*c0909341SAndroid Build Coastguard Worker paddw m3, m6 928*c0909341SAndroid Build Coastguard Worker paddw m2, m3 929*c0909341SAndroid Build Coastguard Worker paddw m0, m2 930*c0909341SAndroid Build Coastguard Worker paddw m1, m2 931*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m10, m1 932*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 933*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], m0, 2 934*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], ym0, 1 935*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+stride3q ], m0, 3 936*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 937*c0909341SAndroid Build Coastguard Worker sub hd, 4 938*c0909341SAndroid Build Coastguard Worker jg .w16_loop 939*c0909341SAndroid Build Coastguard Worker RET 940*c0909341SAndroid Build Coastguard Worker.w32: 941*c0909341SAndroid Build Coastguard Worker mova m10, [smooth_endA] 942*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym7, [pb_1] 943*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m8, [smooth_weights+32*2] 944*c0909341SAndroid Build Coastguard 
Worker vbroadcasti32x8 m9, [smooth_weights+32*3] 945*c0909341SAndroid Build Coastguard Worker vshufi32x4 m10, m10, q3120 946*c0909341SAndroid Build Coastguard Worker.w32_loop: 947*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [tlq+hq-2] 948*c0909341SAndroid Build Coastguard Worker mova m3, m4 949*c0909341SAndroid Build Coastguard Worker vpshufb m3{k1}, m0, m7 950*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3, m5 951*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3, m8 952*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m9 953*c0909341SAndroid Build Coastguard Worker paddw m3, m6 954*c0909341SAndroid Build Coastguard Worker paddw m2, m3 955*c0909341SAndroid Build Coastguard Worker paddw m0, m2 956*c0909341SAndroid Build Coastguard Worker paddw m1, m2 957*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m10, m1 958*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 959*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 960*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 961*c0909341SAndroid Build Coastguard Worker sub hd, 2 962*c0909341SAndroid Build Coastguard Worker jg .w32_loop 963*c0909341SAndroid Build Coastguard Worker RET 964*c0909341SAndroid Build Coastguard Worker.w64: 965*c0909341SAndroid Build Coastguard Worker mova m7, [smooth_weights+64*2] 966*c0909341SAndroid Build Coastguard Worker mova m8, [smooth_weights+64*3] 967*c0909341SAndroid Build Coastguard Worker mova m9, [smooth_endA] 968*c0909341SAndroid Build Coastguard Worker.w64_loop: 969*c0909341SAndroid Build Coastguard Worker mova m3, m4 970*c0909341SAndroid Build Coastguard Worker vpbroadcastb m3{k1}, [tlq+hq-1] 971*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3, m5 972*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m3, m7 973*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m8 974*c0909341SAndroid Build Coastguard Worker paddw m3, m6 975*c0909341SAndroid 
; (continuation) final instructions of the preceding function's .w64_loop,
; whose head lies above this span
    paddw                m2, m3
    paddw                m0, m2
    paddw                m1, m2
    vpermt2b             m0, m9, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET

;-----------------------------------------------------------------------
; ipred_smooth_8bpc(dst, stride, tl, w, h)
;
; cglobal loads the first four arguments into registers; h is fetched from
; hm below. v_weights and stride3 are scratch registers set up here, not
; caller-supplied arguments.
;
; Every output word is the pavgw average of
;   a vertical blend of (top, bottom) weighted by v_weights[row], and
;   a horizontal blend of (left, right) weighted by smooth_weights[w..].
; smooth_weights stores each weight x as the signed byte pair
; (x-128, 127-x), so for an interleaved byte pair (a, b):
;   pmaddubsw((a,b), weights) + (128*a + 129*b) = x*a + (256-x)*b
;
; Registers live across the per-width dispatch:
;   m0 = bottom pixel (byte at tl-h), broadcast
;   m1 = pw_255
;   m6 = right pixel (byte at tl+w), broadcast
;   m7 = pb_127_m127
;   k1 = byte mask taken from the MSB of every byte of m1
;        (presumably the low byte of each word, assuming pw_255 holds
;        words of 0x00ff - TODO confirm against the constant's definition)
;   tlq has been rewound by h, so [tlq+hq+1] is the first top pixel and
;   [tlq+hq-N] are the left pixels of the rows still to be written.
;-----------------------------------------------------------------------
cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
%define base r5-ipred_smooth_8bpc_avx512icl_table
    lea                  r5, [ipred_smooth_8bpc_avx512icl_table]
    mov                 r6d, wd                 ; keep w before it becomes log2(w)
    tzcnt                wd, wd                 ; jump-table index
    mov                  hd, hm
    vpbroadcastb         m6, [tlq+r6] ; right
    sub                 tlq, hq
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m7, [base+pb_127_m127]
    vpbroadcastb         m0, [tlq] ; bottom
    vpbroadcastd         m1, [base+pw_255]
    add                  wq, r5
    lea          v_weightsq, [base+smooth_weights+hq*2]
    vpmovb2m             k1, m1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4: ; 8 rows per iteration; h == 4 leaves after the first four stores
    vpbroadcastd         m8, [tlq+hq+1]
    movsldup             m4, [smooth_shuf]
    movshdup             m5, [smooth_shuf]
    vpbroadcastq         m9, [smooth_weights+4*2]
    mova               ym11, [smooth_endA]

    punpcklbw            m8, m0     ; top, bottom
    pmaddubsw           m10, m8, m7
    paddw                m1, m8     ;   1 * top + 256 * bottom + 255
    paddw               m10, m1     ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq         m1, [tlq+hq-8]
    vbroadcasti32x4      m0, [v_weightsq]
    add          v_weightsq, 16
    mova                 m2, m6
    vpshufb          m2{k1}, m1, m4 ; left, right
    pmaddubsw            m1, m2, m7 ; 127 * left - 127 * right
    pshufb               m0, m5
    pmaddubsw            m0, m8, m0
    paddw                m1, m2     ; 128 * left + 129 * right
    pmaddubsw            m2, m9
    paddw                m0, m10
    paddw                m1, m2
    pavgw                m0, m1
    vpermb               m0, m11, m0
    vextracti32x4       xm1, ym0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+stride3q ], xm1, 2
    sub                  hd, 8
    jl .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+stride3q ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    jg .w4_loop                                 ; flags still from "sub hd, 8"
.ret:
    RET
.w8: ; 4 rows per iteration
    vpbroadcastq         m8, [tlq+hq+1]
    movsldup             m4, [smooth_shuf]
    movshdup             m5, [smooth_shuf]
    vbroadcasti32x4      m9, [smooth_weights+8*2]
    mova               ym11, [smooth_endA]
    punpcklbw            m8, m0                 ; top, bottom
    pmaddubsw           m10, m8, m7
    paddw                m1, m8
    paddw               m10, m1                 ; 128 * top + 129 * bottom + 255
.w8_loop:
    vpbroadcastd         m1, [tlq+hq-4]
    vpbroadcastq         m0, [v_weightsq]
    add          v_weightsq, 8
    mova                 m2, m6
    vpshufb          m2{k1}, m1, m4             ; left, right
    pmaddubsw            m1, m2, m7
    pshufb               m0, m5
    pmaddubsw            m0, m8, m0
    paddw                m1, m2
    pmaddubsw            m2, m9
    paddw                m0, m10
    paddw                m1, m2
    pavgw                m0, m1
    vpermb               m0, m11, m0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
.w16: ; 4 rows per iteration; top row split into low/high halves (m8/m9)
    vbroadcasti32x4      m9, [tlq+hq+1]
    movsldup             m5, [smooth_shuf]
    movshdup            m10, [smooth_shuf]
    vbroadcasti32x4     m11, [smooth_weights+16*2]
    vbroadcasti32x4     m12, [smooth_weights+16*3]
    mova                m15, [smooth_endB]
    punpcklbw            m8, m9, m0             ; top, bottom (low half)
    punpckhbw            m9, m0                 ; top, bottom (high half)
    pmaddubsw           m13, m8, m7
    pmaddubsw           m14, m9, m7
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m13, m0
    paddw               m14, m1
.w16_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    vpbroadcastq         m1, [v_weightsq]
    add          v_weightsq, 8
    mova                 m4, m6
    vpshufb          m4{k1}, m0, m5             ; left, right
    pmaddubsw            m2, m4, m7
    pshufb               m1, m10
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m11
    pmaddubsw            m4, m12
    paddw                m0, m13
    paddw                m1, m14
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3
    pavgw                m1, m4
    vpermt2b             m0, m15, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16_loop
    RET
.w32: ; 2 rows per iteration
    vbroadcasti32x8      m9, [tlq+hq+1]
    movshdup            m10, [smooth_shuf]
    mova                m12, [smooth_weights+32*2]
    vpbroadcastd        ym5, [pb_1]
    mova                m15, [smooth_endB]
    punpcklbw            m8, m9, m0             ; top, bottom (low halves)
    punpckhbw            m9, m0                 ; top, bottom (high halves)
    pmaddubsw           m13, m8, m7
    pmaddubsw           m14, m9, m7
    vshufi32x4          m11, m12, m12, q2020
    vshufi32x4          m12, m12, q3131
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m13, m0
    paddw               m14, m1
.w32_loop:
    vpbroadcastd         m0, [tlq+hq-2]
    vpbroadcastd         m1, [v_weightsq]
    add          v_weightsq, 4
    mova                 m4, m6
    vpshufb          m4{k1}, m0, m5             ; left, right
    pmaddubsw            m2, m4, m7
    pshufb               m1, m10
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m11
    pmaddubsw            m4, m12
    paddw                m0, m13
    paddw                m1, m14
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3
    pavgw                m1, m4
    vpermt2b             m0, m15, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64: ; 1 row per iteration
    movu                 m9, [tlq+hq+1]
    mova                m11, [smooth_weights+64*2]
    mova                 m2, [smooth_weights+64*3]
    mova                m14, [smooth_endB]
    punpcklbw            m8, m9, m0             ; top, bottom (low halves)
    punpckhbw            m9, m0                 ; top, bottom (high halves)
    pmaddubsw           m12, m8, m7
    pmaddubsw           m13, m9, m7
    vshufi32x4          m10, m11, m2, q2020
    vshufi32x4          m11, m2, q3131
    paddw                m0, m1, m8
    paddw                m1, m9
    paddw               m12, m0
    paddw               m13, m1
.w64_loop:
    mova                 m4, m6
    vpbroadcastb     m4{k1}, [tlq+hq-1]         ; left, right
    vpbroadcastw         m1, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw            m2, m4, m7
    pmaddubsw            m0, m8, m1
    pmaddubsw            m1, m9, m1
    paddw                m2, m4
    pmaddubsw            m3, m4, m10
    pmaddubsw            m4, m11
    paddw                m0, m12
    paddw                m1, m13
    paddw                m3, m2
    paddw                m4, m2
    pavgw                m0, m3
    pavgw                m1, m4
    vpermt2b             m0, m14, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET

;-----------------------------------------------------------------------
; pal_pred_8bpc(dst, stride, pal, idx, w, h)
;
; Palette prediction: every output byte is a byte of pal selected by an
; index read from idx (pshufb with the 8 palette bytes as the table).
; Each path consumes half as many idx bytes as it writes pixels, i.e.
; idx presumably packs two 4-bit indices per byte, low nibble first -
; TODO confirm against the caller.
;
; Dispatch is by compare rather than jump table:
;   w <  8 -> .w4      w == 8 -> .w8
;   w >  8 -> .w32, which re-dispatches to .w16 / .w32_loop / .w64
;-----------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
    movifnidn            wd, wm
    movifnidn            hd, hm
    lea            stride3q, [strideq*3]
    cmp                  wd, 8
    jg .w32
    movq               xmm3, [palq]             ; does not touch the flags
    je .w8                                      ; still the "cmp wd, 8" result
.w4: ; 8 idx bytes -> 4 rows of 4 pixels
    movq               xmm0, [idxq]
    add                idxq, 8
    psrlw              xmm1, xmm0, 4            ; second index of each byte
    punpcklbw          xmm0, xmm1               ; interleave first/second
    pshufb             xmm0, xmm3, xmm0
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; 16 idx bytes -> 4 rows of 8 pixels
    movu               xmm2, [idxq]
    add                idxq, 16
    pshufb             xmm1, xmm3, xmm2
    psrlw              xmm2, 4
    pshufb             xmm2, xmm3, xmm2
    punpcklbw          xmm0, xmm1, xmm2
    punpckhbw          xmm1, xmm2
    movq   [dstq+strideq*0], xmm0
    movhps [dstq+strideq*1], xmm0
    movq   [dstq+strideq*2], xmm1
    movhps [dstq+stride3q ], xmm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16: ; 32 idx bytes -> 4 rows of 16 pixels; m3/m5 were set up at .w32
    pmovzxdq             m0, [idxq]
    add                idxq, 32
    vpmultishiftqb       m0, m3, m0
    pshufb               m0, m5, m0
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32: ; entry for every w > 8
    vpbroadcastq         m3, [pal_unpack+0]
    vpbroadcastq         m5, [palq]
    cmp                  wd, 32
    jl .w16
    pmovzxbd             m2, [pal_perm]         ; does not touch the flags
    vpbroadcastq         m4, [pal_unpack+8]
    jg .w64                                     ; still the "cmp wd, 32" result
.w32_loop: ; 64 idx bytes -> 4 rows of 32 pixels
    vpermd               m1, m2, [idxq]
    add                idxq, 64
    vpmultishiftqb       m0, m3, m1
    vpmultishiftqb       m1, m4, m1
    pshufb               m0, m5, m0
    pshufb               m1, m5, m1
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32_loop
    RET
.w64: ; 64 idx bytes -> 2 rows of 64 pixels
    vpermd               m1, m2, [idxq]
    add                idxq, 64
    vpmultishiftqb       m0, m3, m1
    vpmultishiftqb       m1, m4, m1
    pshufb               m0, m5, m0
    pshufb               m1, m5, m1
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET

; Temporary register t0 for the code below: r4 on WIN64, r8 elsewhere.
%if WIN64
    DECLARE_REG_TMP 4
%else
    DECLARE_REG_TMP 8
%endif

; Head of ipred_z1_8bpc; the remainder of the function follows this span.
cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea                  r7, [z_filter_t0]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    lea                  t0, [dr_intra_derivative]
    movsxd               wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
    inc                 tlq
    mov                 dxd, angled
    and                 dxd, 0x7e
    add              angled, 165
; ~90 1307*c0909341SAndroid Build Coastguard Worker movzx dxd, word [t0+dxq] 1308*c0909341SAndroid Build Coastguard Worker lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq] 1309*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1310*c0909341SAndroid Build Coastguard Worker xor angled, 0x4ff ; d = 90 - angle 1311*c0909341SAndroid Build Coastguard Worker mova m14, [base+z_frac_table] 1312*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [base+pw_512] 1313*c0909341SAndroid Build Coastguard Worker jmp wq 1314*c0909341SAndroid Build Coastguard Worker.w4: 1315*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 1316*c0909341SAndroid Build Coastguard Worker pminud m8, m9, [base+pb_7] {1to16} 1317*c0909341SAndroid Build Coastguard Worker vpbroadcastq m7, [tlq] 1318*c0909341SAndroid Build Coastguard Worker pshufb m7, m8 1319*c0909341SAndroid Build Coastguard Worker cmp angleb, 40 1320*c0909341SAndroid Build Coastguard Worker jae .w4_no_upsample 1321*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq-1024] 1322*c0909341SAndroid Build Coastguard Worker sar r3d, 7 1323*c0909341SAndroid Build Coastguard Worker add r3d, hd 1324*c0909341SAndroid Build Coastguard Worker jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) 1325*c0909341SAndroid Build Coastguard Worker pshufb xmm0, xm7, [base+z_filter_s4] 1326*c0909341SAndroid Build Coastguard Worker mova xmm1, [tlq-1] 1327*c0909341SAndroid Build Coastguard Worker pshufb xmm1, [base+z_xpos_off2a] 1328*c0909341SAndroid Build Coastguard Worker vpbroadcastd xmm2, [base+pb_m4_36] 1329*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [pb_0to63] 1330*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm2 1331*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm1, xmm2 1332*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1333*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1 1334*c0909341SAndroid Build Coastguard Worker paddw xmm0, xmm1 
1335*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xmm0, xm15 1336*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 1337*c0909341SAndroid Build Coastguard Worker punpcklbw ym7{k1}, ym0 1338*c0909341SAndroid Build Coastguard Worker jmp .w4_main2 1339*c0909341SAndroid Build Coastguard Worker.w4_no_upsample: 1340*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1341*c0909341SAndroid Build Coastguard Worker jnz .w4_main ; !enable_intra_edge_filter 1342*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1343*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm0, r3d 1344*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm1, angled 1345*c0909341SAndroid Build Coastguard Worker shr angled, 8 ; is_sm << 1 1346*c0909341SAndroid Build Coastguard Worker vpcmpeqb k1, xm0, [base+z_filter_wh] 1347*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] 1348*c0909341SAndroid Build Coastguard Worker kmovw r5d, k1 1349*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1350*c0909341SAndroid Build Coastguard Worker jz .w4_main 1351*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [tlq-1] 1352*c0909341SAndroid Build Coastguard Worker pshufb ym0, [base+z_filter4_s1] 1353*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d ; filter_strength 1354*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym7, [z_filter_s4] 1355*c0909341SAndroid Build Coastguard Worker pshufb ym7, [base+z_filter_s3] 1356*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] 1357*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] 1358*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym11 1359*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym11 1360*c0909341SAndroid Build Coastguard Worker pmaddubsw ym7, ym12 1361*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 1362*c0909341SAndroid Build Coastguard Worker 
paddw ym7, ym0 1363*c0909341SAndroid Build Coastguard Worker pmulhrsw ym7, ym15 1364*c0909341SAndroid Build Coastguard Worker cmp hd, 4 1365*c0909341SAndroid Build Coastguard Worker je .w4_filter_end 1366*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pb_9] 1367*c0909341SAndroid Build Coastguard Worker pminub m8, m9 1368*c0909341SAndroid Build Coastguard Worker.w4_filter_end: 1369*c0909341SAndroid Build Coastguard Worker paddb m8, m8 1370*c0909341SAndroid Build Coastguard Worker vpermb m7, m8, m7 1371*c0909341SAndroid Build Coastguard Worker.w4_main: 1372*c0909341SAndroid Build Coastguard Worker vpbroadcastq m4, [base+z_xpos_off1a] 1373*c0909341SAndroid Build Coastguard Worker.w4_main2: 1374*c0909341SAndroid Build Coastguard Worker movsldup m2, [base+z_xpos_mul] 1375*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, dxd 1376*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_xpos_bc] 1377*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 1378*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 ; xpos 1379*c0909341SAndroid Build Coastguard Worker psllw m5, 5 ; dx*8 1380*c0909341SAndroid Build Coastguard Worker.w4_loop: 1381*c0909341SAndroid Build Coastguard Worker psrlw m1, m2, 3 1382*c0909341SAndroid Build Coastguard Worker pshufb m0, m2, m3 1383*c0909341SAndroid Build Coastguard Worker vpermw m1, m1, m14 ; 64-frac, frac 1384*c0909341SAndroid Build Coastguard Worker paddsb m0, m4 ; base, base+1 1385*c0909341SAndroid Build Coastguard Worker vpermb m0, m0, m7 ; top[base], top[base+1] 1386*c0909341SAndroid Build Coastguard Worker paddsw m2, m5 ; xpos += dx 1387*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m1 ; v 1388*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1389*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1390*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 1391*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm0 1392*c0909341SAndroid 
Build Coastguard Worker pextrd [dstq+strideq*1], xm0, 1 1393*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm1 1394*c0909341SAndroid Build Coastguard Worker pextrd [dstq+r2 ], xm1, 1 1395*c0909341SAndroid Build Coastguard Worker sub hd, 8 1396*c0909341SAndroid Build Coastguard Worker jl .w4_end 1397*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, m0, 2 ; top[max_base_x] 1398*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1399*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm0, m0, 3 1400*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm1 1401*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm1, 1 1402*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*2], xm0 1403*c0909341SAndroid Build Coastguard Worker pextrd [dstq+r2 ], xm0, 1 1404*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1405*c0909341SAndroid Build Coastguard Worker jg .w4_loop 1406*c0909341SAndroid Build Coastguard Worker.w4_end: 1407*c0909341SAndroid Build Coastguard Worker RET 1408*c0909341SAndroid Build Coastguard Worker.w8_filter: 1409*c0909341SAndroid Build Coastguard Worker mova ym0, [base+z_filter_s1] 1410*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 1411*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym1, [base+z_filter_s2] 1412*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [base+z_filter_s3] 1413*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [base+z_filter_s4] 1414*c0909341SAndroid Build Coastguard Worker vpermi2b ym0, ym7, ym2 ; al bl 1415*c0909341SAndroid Build Coastguard Worker mova ym5, [base+z_filter_s5] 1416*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym7, ym1 ; ah bh 1417*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] 1418*c0909341SAndroid Build Coastguard Worker pshufb ym3, ym7, ym3 ; cl ch 1419*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, 
[base+z_filter_k+(r5-1)*4+12*1] 1420*c0909341SAndroid Build Coastguard Worker pshufb ym4, ym7, ym4 ; el dl 1421*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] 1422*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym5, ym7 ; eh dh 1423*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym11 1424*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym11 1425*c0909341SAndroid Build Coastguard Worker pmaddubsw ym2, ym3, ym12 1426*c0909341SAndroid Build Coastguard Worker pmaddubsw ym3, ym13 1427*c0909341SAndroid Build Coastguard Worker pmaddubsw ym4, ym11 1428*c0909341SAndroid Build Coastguard Worker pmaddubsw ym5, ym11 1429*c0909341SAndroid Build Coastguard Worker paddw ym0, ym2 1430*c0909341SAndroid Build Coastguard Worker paddw ym1, ym3 1431*c0909341SAndroid Build Coastguard Worker paddw ym0, ym4 1432*c0909341SAndroid Build Coastguard Worker paddw ym1, ym5 1433*c0909341SAndroid Build Coastguard Worker pmulhrsw ym0, ym15 1434*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym15 1435*c0909341SAndroid Build Coastguard Worker packuswb ym0, ym1 1436*c0909341SAndroid Build Coastguard Worker ret 1437*c0909341SAndroid Build Coastguard Worker.w8: 1438*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+216] 1439*c0909341SAndroid Build Coastguard Worker mov r3b, hb 1440*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 1441*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 1442*c0909341SAndroid Build Coastguard Worker lea r3d, [hq-1] 1443*c0909341SAndroid Build Coastguard Worker mova xm1, [base+z_filter_s4] 1444*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm2, r3d 1445*c0909341SAndroid Build Coastguard Worker mova xm7, [tlq-1] 1446*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym7, [tlq+7], 1 1447*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym0, [base+z_xpos_off1a] 1448*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd ym3, [base+pb_m4_36] 1449*c0909341SAndroid Build Coastguard Worker pminub xm2, xm1 1450*c0909341SAndroid Build Coastguard Worker pshufb ym0, ym7, ym0 1451*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym1, xm2, 1 1452*c0909341SAndroid Build Coastguard Worker psrldq ym7, 1 1453*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym7, ym1 1454*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym3 1455*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym3 1456*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [pb_0to63] 1457*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1458*c0909341SAndroid Build Coastguard Worker paddw ym0, ym1 1459*c0909341SAndroid Build Coastguard Worker pmulhrsw ym0, ym15 1460*c0909341SAndroid Build Coastguard Worker packuswb ym0, ym0 1461*c0909341SAndroid Build Coastguard Worker punpcklbw ym7, ym0 1462*c0909341SAndroid Build Coastguard Worker jmp .w8_main2 1463*c0909341SAndroid Build Coastguard Worker.w8_no_upsample: 1464*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 1465*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 1466*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym0, r3d 1467*c0909341SAndroid Build Coastguard Worker and r3d, 7 1468*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [tlq] 1469*c0909341SAndroid Build Coastguard Worker or r3d, 8 ; imin(h+7, 15) 1470*c0909341SAndroid Build Coastguard Worker vpbroadcastb m8, r3d 1471*c0909341SAndroid Build Coastguard Worker pminub m8, m9 1472*c0909341SAndroid Build Coastguard Worker pshufb m7, m8 1473*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1474*c0909341SAndroid Build Coastguard Worker jnz .w8_main 1475*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym1, angled 1476*c0909341SAndroid Build Coastguard Worker shr angled, 8 1477*c0909341SAndroid Build Coastguard Worker vpcmpeqb k1, ym0, [base+z_filter_wh] 1478*c0909341SAndroid Build Coastguard Worker mova xm0, 
[base+z_filter_t0+angleq*8] 1479*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1{k1}, ym1, ym0 1480*c0909341SAndroid Build Coastguard Worker kmovd r5d, k1 1481*c0909341SAndroid Build Coastguard Worker test r5d, r5d 1482*c0909341SAndroid Build Coastguard Worker jz .w8_main 1483*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym2, [tlq-4] 1484*c0909341SAndroid Build Coastguard Worker call .w8_filter 1485*c0909341SAndroid Build Coastguard Worker cmp hd, 8 1486*c0909341SAndroid Build Coastguard Worker jle .w8_filter_end 1487*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pb_17] 1488*c0909341SAndroid Build Coastguard Worker add r3d, 2 1489*c0909341SAndroid Build Coastguard Worker pminub m8, m9 1490*c0909341SAndroid Build Coastguard Worker.w8_filter_end: 1491*c0909341SAndroid Build Coastguard Worker vpermb m7, m8, m0 1492*c0909341SAndroid Build Coastguard Worker.w8_main: 1493*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [base+z_xpos_off1a] 1494*c0909341SAndroid Build Coastguard Worker.w8_main2: 1495*c0909341SAndroid Build Coastguard Worker movsldup m4, [base+z_xpos_mul] 1496*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, dxd 1497*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1498*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+z_xpos_bc+8*0] 1499*c0909341SAndroid Build Coastguard Worker pmullw m4, m9 ; xpos 1500*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+z_xpos_bc+8*1] 1501*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1502*c0909341SAndroid Build Coastguard Worker shl dxd, 3 1503*c0909341SAndroid Build Coastguard Worker psllw m9, 5 ; dx*8 1504*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 1505*c0909341SAndroid Build Coastguard Worker.w8_loop: 1506*c0909341SAndroid Build Coastguard Worker psrlw m3, m4, 3 1507*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m5 1508*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m6 1509*c0909341SAndroid 
Build Coastguard Worker vpermw m3, m3, m14 1510*c0909341SAndroid Build Coastguard Worker paddsb m0, m8 1511*c0909341SAndroid Build Coastguard Worker paddsb m1, m8 1512*c0909341SAndroid Build Coastguard Worker vpermb m0, m0, m7 1513*c0909341SAndroid Build Coastguard Worker vpermb m1, m1, m7 1514*c0909341SAndroid Build Coastguard Worker paddsw m4, m9 1515*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m3, m3 1516*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1517*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m3 1518*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3 1519*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1520*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1521*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1522*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 1523*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 1524*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 1525*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 1526*c0909341SAndroid Build Coastguard Worker movhps [dstq+r2 ], xm1 1527*c0909341SAndroid Build Coastguard Worker sub hd, 8 1528*c0909341SAndroid Build Coastguard Worker jl .w8_end 1529*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym0, m0, 1 1530*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1531*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm1, ym0, 1 1532*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm0 1533*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm0 1534*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm1 1535*c0909341SAndroid Build Coastguard Worker movhps [dstq+r2 ], xm1 1536*c0909341SAndroid Build Coastguard Worker jz .w8_end 1537*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1538*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1539*c0909341SAndroid Build 
Coastguard Worker jg .w8_loop 1540*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm7, m7, 3 1541*c0909341SAndroid Build Coastguard Worker.w8_end_loop: 1542*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm7 1543*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*1], xm7 1544*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm7 1545*c0909341SAndroid Build Coastguard Worker movq [dstq+r2 ], xm7 1546*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1547*c0909341SAndroid Build Coastguard Worker sub hd, 4 1548*c0909341SAndroid Build Coastguard Worker jg .w8_end_loop 1549*c0909341SAndroid Build Coastguard Worker.w8_end: 1550*c0909341SAndroid Build Coastguard Worker RET 1551*c0909341SAndroid Build Coastguard Worker.w16_filter: 1552*c0909341SAndroid Build Coastguard Worker mova m0, [base+z_filter_s1] 1553*c0909341SAndroid Build Coastguard Worker popcnt r5d, r5d 1554*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [base+z_filter_s2] 1555*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_filter_s3] 1556*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_filter_s4] 1557*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m7, m2 ; al bl 1558*c0909341SAndroid Build Coastguard Worker mova m5, [base+z_filter_s5] 1559*c0909341SAndroid Build Coastguard Worker pshufb m1, m7, m1 ; ah bh 1560*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0] 1561*c0909341SAndroid Build Coastguard Worker pshufb m3, m7, m3 ; cl ch 1562*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1] 1563*c0909341SAndroid Build Coastguard Worker pshufb m4, m7, m4 ; el dl 1564*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2] 1565*c0909341SAndroid Build Coastguard Worker vpermb m5, m5, m7 ; eh dh 1566*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m11 1567*c0909341SAndroid 
Build Coastguard Worker pmaddubsw m1, m11 1568*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3, m12 1569*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 1570*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m11 1571*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m11 1572*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1573*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1574*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1575*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1576*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1577*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1578*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1579*c0909341SAndroid Build Coastguard Worker ret 1580*c0909341SAndroid Build Coastguard Worker.w16: 1581*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+15] 1582*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 1583*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym0, r3d 1584*c0909341SAndroid Build Coastguard Worker and r3d, 15 1585*c0909341SAndroid Build Coastguard Worker movu ym7, [tlq] 1586*c0909341SAndroid Build Coastguard Worker or r3d, 16 ; imin(h+15, 31) 1587*c0909341SAndroid Build Coastguard Worker vpbroadcastb m8, r3d 1588*c0909341SAndroid Build Coastguard Worker pminub m8, m9 1589*c0909341SAndroid Build Coastguard Worker vpermb m7, m8, m7 1590*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1591*c0909341SAndroid Build Coastguard Worker jnz .w16_main 1592*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym1, angled 1593*c0909341SAndroid Build Coastguard Worker shr angled, 8 1594*c0909341SAndroid Build Coastguard Worker vpcmpeqb k1, ym0, [base+z_filter_wh] 1595*c0909341SAndroid Build Coastguard Worker mova xm0, [base+z_filter_t0+angleq*8] 1596*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1{k1}, ym1, ym0 1597*c0909341SAndroid Build Coastguard Worker kmovd r5d, k1 1598*c0909341SAndroid Build 
Coastguard Worker test r5d, r5d 1599*c0909341SAndroid Build Coastguard Worker jz .w16_main 1600*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 1601*c0909341SAndroid Build Coastguard Worker call .w16_filter 1602*c0909341SAndroid Build Coastguard Worker cmp hd, 16 1603*c0909341SAndroid Build Coastguard Worker jle .w16_filter_end 1604*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pb_33] 1605*c0909341SAndroid Build Coastguard Worker add r3d, 2 1606*c0909341SAndroid Build Coastguard Worker pminub m8, m9 1607*c0909341SAndroid Build Coastguard Worker.w16_filter_end: 1608*c0909341SAndroid Build Coastguard Worker vpermb m7, m8, m0 1609*c0909341SAndroid Build Coastguard Worker.w16_main: 1610*c0909341SAndroid Build Coastguard Worker movshdup m3, [base+z_xpos_mul] 1611*c0909341SAndroid Build Coastguard Worker vpbroadcastw m8, dxd 1612*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1613*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_xpos_bc] 1614*c0909341SAndroid Build Coastguard Worker pmullw m3, m8 ; xpos 1615*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [base+z_xpos_off1a] 1616*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1617*c0909341SAndroid Build Coastguard Worker shl dxd, 2 1618*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [base+z_xpos_off1b] 1619*c0909341SAndroid Build Coastguard Worker psllw m8, 4 ; dx*4 1620*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 1621*c0909341SAndroid Build Coastguard Worker.w16_loop: 1622*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m4 1623*c0909341SAndroid Build Coastguard Worker psrlw m2, m3, 3 1624*c0909341SAndroid Build Coastguard Worker paddsb m0, m1, m5 1625*c0909341SAndroid Build Coastguard Worker vpermw m2, m2, m14 1626*c0909341SAndroid Build Coastguard Worker paddsb m1, m6 1627*c0909341SAndroid Build Coastguard Worker vpermb m0, m0, m7 1628*c0909341SAndroid Build Coastguard Worker vpermb m1, m1, m7 
1629*c0909341SAndroid Build Coastguard Worker paddsw m3, m8 1630*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1631*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 1632*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1633*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1634*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1635*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 1636*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym0, 1 1637*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m0, 2 1638*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m0, 3 1639*c0909341SAndroid Build Coastguard Worker sub hd, 4 1640*c0909341SAndroid Build Coastguard Worker jz .w16_end 1641*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1642*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1643*c0909341SAndroid Build Coastguard Worker jg .w16_loop 1644*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm7, m7, 3 1645*c0909341SAndroid Build Coastguard Worker.w16_end_loop: 1646*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm7 1647*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], xm7 1648*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm7 1649*c0909341SAndroid Build Coastguard Worker mova [dstq+r2 ], xm7 1650*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1651*c0909341SAndroid Build Coastguard Worker sub hd, 4 1652*c0909341SAndroid Build Coastguard Worker jg .w16_end_loop 1653*c0909341SAndroid Build Coastguard Worker.w16_end: 1654*c0909341SAndroid Build Coastguard Worker RET 1655*c0909341SAndroid Build Coastguard Worker.w32_filter: 1656*c0909341SAndroid Build Coastguard Worker mova m0, [base+z_filter_s1] 1657*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [base+z_filter_s2] 1658*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, 
[base+z_filter_s3] 1659*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_filter_s4] 1660*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m7, m2 ; al bl 1661*c0909341SAndroid Build Coastguard Worker mova m5, [base+z_filter_s5] 1662*c0909341SAndroid Build Coastguard Worker pshufb m1, m7, m1 ; ah bh 1663*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+z_filter_k+4*2+12*0] 1664*c0909341SAndroid Build Coastguard Worker pshufb m3, m7, m3 ; cl ch 1665*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+z_filter_k+4*2+12*1] 1666*c0909341SAndroid Build Coastguard Worker pshufb m4, m7, m4 ; el dl 1667*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+z_filter_k+4*2+12*2] 1668*c0909341SAndroid Build Coastguard Worker vpermi2b m5, m7, m8 ; eh dh 1669*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m11 1670*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m11 1671*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m3, m12 1672*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 1673*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m11 1674*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m11 1675*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1676*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1677*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1678*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1679*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1680*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1681*c0909341SAndroid Build Coastguard Worker packuswb m7, m0, m1 1682*c0909341SAndroid Build Coastguard Worker ret 1683*c0909341SAndroid Build Coastguard Worker.w32: 1684*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+31] 1685*c0909341SAndroid Build Coastguard Worker vpbroadcastb m9, r3d 1686*c0909341SAndroid Build Coastguard Worker and r3d, 31 1687*c0909341SAndroid Build Coastguard Worker pminub m10, m9, [pb_0to63] 
1688*c0909341SAndroid Build Coastguard Worker or r3d, 32 ; imin(h+31, 63) 1689*c0909341SAndroid Build Coastguard Worker vpermb m7, m10, [tlq] 1690*c0909341SAndroid Build Coastguard Worker vpbroadcastb m8, [tlq+r3] 1691*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 1692*c0909341SAndroid Build Coastguard Worker jnz .w32_main 1693*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 1694*c0909341SAndroid Build Coastguard Worker call .w32_filter 1695*c0909341SAndroid Build Coastguard Worker cmp hd, 64 1696*c0909341SAndroid Build Coastguard Worker je .w32_h64_filter_end 1697*c0909341SAndroid Build Coastguard Worker vpermb m8, m9, m7 1698*c0909341SAndroid Build Coastguard Worker vpermb m7, m10, m7 1699*c0909341SAndroid Build Coastguard Worker jmp .w32_main 1700*c0909341SAndroid Build Coastguard Worker.w32_h64_filter_end: ; edge case for 32x64 1701*c0909341SAndroid Build Coastguard Worker movd xmm0, [tlq+r3-1] 1702*c0909341SAndroid Build Coastguard Worker movd xmm1, [base+pb_8_56_0_0] 1703*c0909341SAndroid Build Coastguard Worker add r3d, 2 1704*c0909341SAndroid Build Coastguard Worker pmaddubsw xmm0, xmm1 1705*c0909341SAndroid Build Coastguard Worker vptestmw k1, xmm1, xmm1 ; 0x01 1706*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xmm0, xm15 1707*c0909341SAndroid Build Coastguard Worker vmovdqu8 m8{k1}, m0 1708*c0909341SAndroid Build Coastguard Worker.w32_main: 1709*c0909341SAndroid Build Coastguard Worker rorx r2d, dxd, 30 1710*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_xpos_bc] 1711*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, r2d 1712*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m5, [base+z_xpos_off2a] 1713*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1714*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [base+z_xpos_off2b] 1715*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1716*c0909341SAndroid Build Coastguard Worker paddw m9, 
m3, m3 1717*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1718*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, ym9, 1 1719*c0909341SAndroid Build Coastguard Worker.w32_loop: 1720*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m4 1721*c0909341SAndroid Build Coastguard Worker psrlw m2, m3, 3 1722*c0909341SAndroid Build Coastguard Worker paddsb m0, m1, m5 1723*c0909341SAndroid Build Coastguard Worker vpermw m2, m2, m14 1724*c0909341SAndroid Build Coastguard Worker paddsb m1, m6 1725*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m7, m8 1726*c0909341SAndroid Build Coastguard Worker vpermi2b m1, m7, m8 1727*c0909341SAndroid Build Coastguard Worker paddsw m3, m9 1728*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1729*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 1730*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1731*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1732*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1733*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 1734*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*1], m0, 1 1735*c0909341SAndroid Build Coastguard Worker sub hd, 2 1736*c0909341SAndroid Build Coastguard Worker jz .w32_end 1737*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1738*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1739*c0909341SAndroid Build Coastguard Worker jg .w32_loop 1740*c0909341SAndroid Build Coastguard Worker punpckhqdq ym8, ym8 1741*c0909341SAndroid Build Coastguard Worker.w32_end_loop: 1742*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym8 1743*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], ym8 1744*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1745*c0909341SAndroid Build Coastguard Worker sub hd, 2 1746*c0909341SAndroid Build Coastguard Worker jg .w32_end_loop 1747*c0909341SAndroid Build Coastguard Worker.w32_end: 
1748*c0909341SAndroid Build Coastguard Worker RET 1749*c0909341SAndroid Build Coastguard Worker.w64_filter: 1750*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_filter_s2] 1751*c0909341SAndroid Build Coastguard Worker mova m1, [base+z_filter_s1] 1752*c0909341SAndroid Build Coastguard Worker pshufb m0, m3 ; al bl 1753*c0909341SAndroid Build Coastguard Worker vpermi2b m1, m7, m2 1754*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_filter_s4] 1755*c0909341SAndroid Build Coastguard Worker pshufb m6, m8, m4 ; el dl 1756*c0909341SAndroid Build Coastguard Worker pshufb m9, m7, m4 1757*c0909341SAndroid Build Coastguard Worker pminub m10, m13, [base+z_filter_s5] 1758*c0909341SAndroid Build Coastguard Worker pshufb m2, m8, m3 ; ah bh 1759*c0909341SAndroid Build Coastguard Worker pshufb m3, m7, m3 1760*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [base+z_filter_s3] 1761*c0909341SAndroid Build Coastguard Worker vpermb m10, m10, m8 ; eh dh 1762*c0909341SAndroid Build Coastguard Worker pshufb m11, m4 1763*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_filter_k+4*2+12*0] 1764*c0909341SAndroid Build Coastguard Worker pshufb m8, m5 ; cl ch 1765*c0909341SAndroid Build Coastguard Worker pshufb m7, m5 1766*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+z_filter_k+4*2+12*1] 1767*c0909341SAndroid Build Coastguard Worker REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11 1768*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m8, m5 1769*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m7, m5 1770*c0909341SAndroid Build Coastguard Worker paddw m0, m6 1771*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+z_filter_k+4*2+12*2] 1772*c0909341SAndroid Build Coastguard Worker paddw m1, m9 1773*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, m6 1774*c0909341SAndroid Build Coastguard Worker pmaddubsw m8, m6 1775*c0909341SAndroid Build Coastguard Worker paddw m2, m10 
1776*c0909341SAndroid Build Coastguard Worker paddw m3, m11 1777*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1778*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1779*c0909341SAndroid Build Coastguard Worker paddw m2, m8 1780*c0909341SAndroid Build Coastguard Worker paddw m3, m7 1781*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m15}, m0, m2, m1, m3 1782*c0909341SAndroid Build Coastguard Worker packuswb m0, m2 1783*c0909341SAndroid Build Coastguard Worker packuswb m7, m1, m3 1784*c0909341SAndroid Build Coastguard Worker vpermb m8, m12, m0 1785*c0909341SAndroid Build Coastguard Worker ret 1786*c0909341SAndroid Build Coastguard Worker.w64: 1787*c0909341SAndroid Build Coastguard Worker lea r3d, [hq-1] 1788*c0909341SAndroid Build Coastguard Worker movu m7, [tlq+64*0] 1789*c0909341SAndroid Build Coastguard Worker vpbroadcastb m13, r3d 1790*c0909341SAndroid Build Coastguard Worker pminub m12, m13, [pb_0to63] 1791*c0909341SAndroid Build Coastguard Worker or r3d, 64 1792*c0909341SAndroid Build Coastguard Worker vpermb m8, m12, [tlq+64*1] 1793*c0909341SAndroid Build Coastguard Worker test angled, 0x400 ; !enable_intra_edge_filter 1794*c0909341SAndroid Build Coastguard Worker jnz .w64_main 1795*c0909341SAndroid Build Coastguard Worker movu m0, [tlq+56] 1796*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 1797*c0909341SAndroid Build Coastguard Worker movu m11, [tlq+8] 1798*c0909341SAndroid Build Coastguard Worker call .w64_filter 1799*c0909341SAndroid Build Coastguard Worker.w64_main: 1800*c0909341SAndroid Build Coastguard Worker rorx r2d, dxd, 30 1801*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [base+z_xpos_bc] 1802*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, r2d 1803*c0909341SAndroid Build Coastguard Worker mova m5, [base+z_xpos_off2a] 1804*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1805*c0909341SAndroid Build Coastguard Worker mova m6, [base+z_xpos_off2b] 1806*c0909341SAndroid Build 
Coastguard Worker sub r3d, dxd 1807*c0909341SAndroid Build Coastguard Worker mova m9, m3 1808*c0909341SAndroid Build Coastguard Worker.w64_loop: 1809*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m4 1810*c0909341SAndroid Build Coastguard Worker psrlw m2, m3, 3 1811*c0909341SAndroid Build Coastguard Worker paddsb m0, m1, m5 1812*c0909341SAndroid Build Coastguard Worker vpermw m2, m2, m14 1813*c0909341SAndroid Build Coastguard Worker paddsb m1, m6 1814*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m7, m8 1815*c0909341SAndroid Build Coastguard Worker vpermi2b m1, m7, m8 1816*c0909341SAndroid Build Coastguard Worker paddsw m3, m9 1817*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1818*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m2 1819*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 1820*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1821*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1822*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 1823*c0909341SAndroid Build Coastguard Worker dec hd 1824*c0909341SAndroid Build Coastguard Worker jz .w64_end 1825*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1826*c0909341SAndroid Build Coastguard Worker sub r3d, dxd 1827*c0909341SAndroid Build Coastguard Worker jg .w64_loop 1828*c0909341SAndroid Build Coastguard Worker vpermb m8, m13, m8 1829*c0909341SAndroid Build Coastguard Worker.w64_end_loop: 1830*c0909341SAndroid Build Coastguard Worker mova [dstq], m8 1831*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1832*c0909341SAndroid Build Coastguard Worker dec hd 1833*c0909341SAndroid Build Coastguard Worker jg .w64_end_loop 1834*c0909341SAndroid Build Coastguard Worker.w64_end: 1835*c0909341SAndroid Build Coastguard Worker RET 1836*c0909341SAndroid Build Coastguard Worker 1837*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy 1838*c0909341SAndroid Build Coastguard 
Worker tzcnt wd, wm 1839*c0909341SAndroid Build Coastguard Worker movifnidn angled, anglem 1840*c0909341SAndroid Build Coastguard Worker lea dxq, [dr_intra_derivative-90] 1841*c0909341SAndroid Build Coastguard Worker movzx dyd, angleb 1842*c0909341SAndroid Build Coastguard Worker xor angled, 0x400 1843*c0909341SAndroid Build Coastguard Worker mov r7, dxq 1844*c0909341SAndroid Build Coastguard Worker sub dxq, dyq 1845*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1846*c0909341SAndroid Build Coastguard Worker and dyd, ~1 1847*c0909341SAndroid Build Coastguard Worker and dxq, ~1 1848*c0909341SAndroid Build Coastguard Worker movzx dyd, word [r7+dyq] ; angle - 90 1849*c0909341SAndroid Build Coastguard Worker lea r7, [z_filter_t0] 1850*c0909341SAndroid Build Coastguard Worker movzx dxd, word [dxq+270] ; 180 - angle 1851*c0909341SAndroid Build Coastguard Worker movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4] 1852*c0909341SAndroid Build Coastguard Worker mova m8, [base+pb_63to0] 1853*c0909341SAndroid Build Coastguard Worker neg dyd 1854*c0909341SAndroid Build Coastguard Worker vpermb m8, m8, [tlq-64] ; left 1855*c0909341SAndroid Build Coastguard Worker lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq] 1856*c0909341SAndroid Build Coastguard Worker mova m14, [base+z_frac_table] 1857*c0909341SAndroid Build Coastguard Worker inc tlq 1858*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [base+pw_512] 1859*c0909341SAndroid Build Coastguard Worker neg dxd 1860*c0909341SAndroid Build Coastguard Worker jmp wq 1861*c0909341SAndroid Build Coastguard Worker.w4: 1862*c0909341SAndroid Build Coastguard Worker movd xm7, [tlq] 1863*c0909341SAndroid Build Coastguard Worker vpbroadcastq m10, [base+z_xpos_off2a] 1864*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1865*c0909341SAndroid Build Coastguard Worker jnz .w4_main ; !enable_intra_edge_filter 1866*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+2] 1867*c0909341SAndroid Build Coastguard Worker 
add angled, 1022 1868*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1869*c0909341SAndroid Build Coastguard Worker test r3d, angled 1870*c0909341SAndroid Build Coastguard Worker jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 1871*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [base+pb_4] 1872*c0909341SAndroid Build Coastguard Worker sub angled, 1075 ; angle - 53 1873*c0909341SAndroid Build Coastguard Worker call .upsample_above 1874*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1875*c0909341SAndroid Build Coastguard Worker vpbroadcastq m10, [pb_0to63+1] 1876*c0909341SAndroid Build Coastguard Worker punpcklbw xm7, xm0, xm7 1877*c0909341SAndroid Build Coastguard Worker call .filter_strength 1878*c0909341SAndroid Build Coastguard Worker jmp .w4_filter_left 1879*c0909341SAndroid Build Coastguard Worker.w4_upsample_left: 1880*c0909341SAndroid Build Coastguard Worker call .upsample_left 1881*c0909341SAndroid Build Coastguard Worker movsldup m16, [base+z_ypos_off3] 1882*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_16] 1883*c0909341SAndroid Build Coastguard Worker punpcklbw xm8, xm0, xm8 1884*c0909341SAndroid Build Coastguard Worker jmp .w4_main2 1885*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above: 1886*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+3] 1887*c0909341SAndroid Build Coastguard Worker sub angled, 1112 ; angle - 90 1888*c0909341SAndroid Build Coastguard Worker call .filter_strength 1889*c0909341SAndroid Build Coastguard Worker test r3d, r3d 1890*c0909341SAndroid Build Coastguard Worker jz .w4_no_filter_above 1891*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+pb_3] 1892*c0909341SAndroid Build Coastguard Worker call .filter_top_w16 1893*c0909341SAndroid Build Coastguard Worker.w4_no_filter_above: 1894*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+2] 1895*c0909341SAndroid Build Coastguard Worker add angled, 973 ; angle + 883 
1896*c0909341SAndroid Build Coastguard Worker shl r3d, 6 1897*c0909341SAndroid Build Coastguard Worker test r3d, angled 1898*c0909341SAndroid Build Coastguard Worker jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 1899*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym0, [base+pb_90] 1900*c0909341SAndroid Build Coastguard Worker psubb ym0, ym17 1901*c0909341SAndroid Build Coastguard Worker vpcmpgtb k2{k2}, ym0, ym16 1902*c0909341SAndroid Build Coastguard Worker kmovd r3d, k2 1903*c0909341SAndroid Build Coastguard Worker.w4_filter_left: 1904*c0909341SAndroid Build Coastguard Worker test r3d, r3d 1905*c0909341SAndroid Build Coastguard Worker jz .w4_main 1906*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 1907*c0909341SAndroid Build Coastguard Worker call .filter_left_h16 1908*c0909341SAndroid Build Coastguard Worker.w4_main: 1909*c0909341SAndroid Build Coastguard Worker movsldup m16, [base+z_ypos_off1] 1910*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_8] 1911*c0909341SAndroid Build Coastguard Worker.w4_main2: 1912*c0909341SAndroid Build Coastguard Worker vpbroadcastq m3, [base+z_ypos_mul1a] 1913*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, dyd 1914*c0909341SAndroid Build Coastguard Worker movsldup m1, [base+z_xpos_mul] 1915*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, dxd 1916*c0909341SAndroid Build Coastguard Worker vinserti32x4 m7, [tlq-16], 3 1917*c0909341SAndroid Build Coastguard Worker vinserti32x4 m8, [tlq-16], 3 1918*c0909341SAndroid Build Coastguard Worker pmullw m3, m0 1919*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [base+z_xpos_bc] 1920*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 ; xpos0..3 1921*c0909341SAndroid Build Coastguard Worker psllw m5, 5 ; dx*8 1922*c0909341SAndroid Build Coastguard Worker psraw m4, m3, 6 1923*c0909341SAndroid Build Coastguard Worker psrlw m3, 1 1924*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 
1925*c0909341SAndroid Build Coastguard Worker vpermw m3, m3, m14 ; 64-frac, frac 1926*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m4 1927*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 1928*c0909341SAndroid Build Coastguard Worker paddb m4, m16 ; base, base+1 1929*c0909341SAndroid Build Coastguard Worker.w4_loop: 1930*c0909341SAndroid Build Coastguard Worker pshufb m16, m1, m2 1931*c0909341SAndroid Build Coastguard Worker psrlw m0, m1, 3 1932*c0909341SAndroid Build Coastguard Worker paddb m16, m10 1933*c0909341SAndroid Build Coastguard Worker vpermw m0, m0, m14 1934*c0909341SAndroid Build Coastguard Worker vpmovw2m k1, m16 ; base_x < 0 1935*c0909341SAndroid Build Coastguard Worker vpermb m16, m16, m7 1936*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m0 1937*c0909341SAndroid Build Coastguard Worker vpermb m0, m4, m8 1938*c0909341SAndroid Build Coastguard Worker pmaddubsw m16{k1}, m0, m3 1939*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 1940*c0909341SAndroid Build Coastguard Worker vpmovwb ym16, m16 1941*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm16 1942*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm16, 1 1943*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm16, 2 1944*c0909341SAndroid Build Coastguard Worker pextrd [dstq+r2 ], xm16, 3 1945*c0909341SAndroid Build Coastguard Worker sub hd, 8 1946*c0909341SAndroid Build Coastguard Worker jl .w4_end 1947*c0909341SAndroid Build Coastguard Worker paddsw m1, m5 1948*c0909341SAndroid Build Coastguard Worker vextracti128 xm16, ym16, 1 1949*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1950*c0909341SAndroid Build Coastguard Worker paddb m4, m9 1951*c0909341SAndroid Build Coastguard Worker movd [dstq+strideq*0], xm16 1952*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*1], xm16, 1 1953*c0909341SAndroid Build Coastguard Worker pextrd [dstq+strideq*2], xm16, 2 1954*c0909341SAndroid 
Build Coastguard Worker pextrd [dstq+r2 ], xm16, 3 1955*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 1956*c0909341SAndroid Build Coastguard Worker jg .w4_loop 1957*c0909341SAndroid Build Coastguard Worker.w4_end: 1958*c0909341SAndroid Build Coastguard Worker RET 1959*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8 1960*c0909341SAndroid Build Coastguard Worker mova xm0, [tlq-1] 1961*c0909341SAndroid Build Coastguard Worker xor angled, 0x7f ; 180 - angle 1962*c0909341SAndroid Build Coastguard Worker add dxd, dxd 1963*c0909341SAndroid Build Coastguard Worker jmp .upsample 1964*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8 1965*c0909341SAndroid Build Coastguard Worker palignr xm0, xm8, [tlq-16], 15 1966*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm2, hd 1967*c0909341SAndroid Build Coastguard Worker add dyd, dyd 1968*c0909341SAndroid Build Coastguard Worker.upsample: 1969*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm0, [base+z_filter4_s1] 1970*c0909341SAndroid Build Coastguard Worker pminub xm2, [base+z_filter_s4] 1971*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, [base+pb_m4_36] 1972*c0909341SAndroid Build Coastguard Worker pshufb xm0, xm2 1973*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm3 1974*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm3 1975*c0909341SAndroid Build Coastguard Worker paddw xm0, xm1 1976*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm15 1977*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm0 1978*c0909341SAndroid Build Coastguard Worker ret 1979*c0909341SAndroid Build Coastguard Worker.filter_strength: 1980*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym16, r3d 1981*c0909341SAndroid Build Coastguard Worker mov r3d, angled 1982*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 1983*c0909341SAndroid Build Coastguard Worker vpbroadcastb ym17, angled 1984*c0909341SAndroid Build Coastguard Worker 
shr r3d, 8 1985*c0909341SAndroid Build Coastguard Worker vpcmpeqb k2, ym16, [base+z_filter_wh] 1986*c0909341SAndroid Build Coastguard Worker mova xm16, [base+z_filter_t0+r3*8] 1987*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1{k2}, ym17, ym16 1988*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 1989*c0909341SAndroid Build Coastguard Worker kmovd r3d, k1 1990*c0909341SAndroid Build Coastguard Worker ret 1991*c0909341SAndroid Build Coastguard Worker.w8: 1992*c0909341SAndroid Build Coastguard Worker movq xm7, [tlq] 1993*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m10, [base+z_xpos_off2a] 1994*c0909341SAndroid Build Coastguard Worker test angled, 0x400 1995*c0909341SAndroid Build Coastguard Worker jnz .w8_main 1996*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq+126] 1997*c0909341SAndroid Build Coastguard Worker mov r3b, hb 1998*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 1999*c0909341SAndroid Build Coastguard Worker ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2000*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm2, [base+pb_8] 2001*c0909341SAndroid Build Coastguard Worker sub angled, 53 ; angle - 53 2002*c0909341SAndroid Build Coastguard Worker call .upsample_above 2003*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2004*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m10, [pb_0to63+1] 2005*c0909341SAndroid Build Coastguard Worker punpcklbw xm7, xm0, xm7 2006*c0909341SAndroid Build Coastguard Worker call .filter_strength 2007*c0909341SAndroid Build Coastguard Worker jmp .w8_filter_left 2008*c0909341SAndroid Build Coastguard Worker.w8_upsample_left: 2009*c0909341SAndroid Build Coastguard Worker call .upsample_left 2010*c0909341SAndroid Build Coastguard Worker movshdup m16, [base+z_ypos_off3] 2011*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_8] 2012*c0909341SAndroid Build Coastguard Worker punpcklbw xm8, xm0, xm8 2013*c0909341SAndroid Build Coastguard Worker 
jmp .w8_main2 2014*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above: 2015*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+7] 2016*c0909341SAndroid Build Coastguard Worker sub angled, 90 ; angle - 90 2017*c0909341SAndroid Build Coastguard Worker call .filter_strength 2018*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2019*c0909341SAndroid Build Coastguard Worker jz .w8_no_filter_above 2020*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+pb_7] 2021*c0909341SAndroid Build Coastguard Worker call .filter_top_w16 2022*c0909341SAndroid Build Coastguard Worker.w8_no_filter_above: 2023*c0909341SAndroid Build Coastguard Worker lea r3d, [angleq-51] 2024*c0909341SAndroid Build Coastguard Worker mov r3b, hb 2025*c0909341SAndroid Build Coastguard Worker cmp r3d, 8 2026*c0909341SAndroid Build Coastguard Worker jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm 2027*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym0, [base+pb_90] 2028*c0909341SAndroid Build Coastguard Worker psubb ym0, ym17 2029*c0909341SAndroid Build Coastguard Worker vpcmpgtb k2{k2}, ym0, ym16 2030*c0909341SAndroid Build Coastguard Worker kmovd r3d, k2 2031*c0909341SAndroid Build Coastguard Worker.w8_filter_left: 2032*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2033*c0909341SAndroid Build Coastguard Worker jz .w8_main 2034*c0909341SAndroid Build Coastguard Worker cmp hd, 32 2035*c0909341SAndroid Build Coastguard Worker je .w8_filter_left_h32 2036*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2037*c0909341SAndroid Build Coastguard Worker call .filter_left_h16 2038*c0909341SAndroid Build Coastguard Worker jmp .w8_main 2039*c0909341SAndroid Build Coastguard Worker.w8_filter_left_h32: 2040*c0909341SAndroid Build Coastguard Worker call .filter_left_h64 2041*c0909341SAndroid Build Coastguard Worker.w8_main: 2042*c0909341SAndroid Build Coastguard Worker movshdup m16, [base+z_ypos_off2] 2043*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m9, [base+pb_4] 2044*c0909341SAndroid Build Coastguard Worker.w8_main2: 2045*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_ypos_mul1a] 2046*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, dyd 2047*c0909341SAndroid Build Coastguard Worker movshdup m1, [base+z_xpos_mul] 2048*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, dxd 2049*c0909341SAndroid Build Coastguard Worker vinserti32x4 m7, [tlq-16], 3 2050*c0909341SAndroid Build Coastguard Worker vinserti32x4 m8, [tlq-16], 3 2051*c0909341SAndroid Build Coastguard Worker pmullw m3, m0 2052*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pb_1] 2053*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 ; xpos0..3 2054*c0909341SAndroid Build Coastguard Worker psllw m5, 4 ; dx*4 2055*c0909341SAndroid Build Coastguard Worker psraw m4, m3, 6 2056*c0909341SAndroid Build Coastguard Worker psrlw m3, 1 2057*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 2058*c0909341SAndroid Build Coastguard Worker vpermw m3, m3, m14 ; 64-frac, frac 2059*c0909341SAndroid Build Coastguard Worker lea r3d, [dxq+(8<<6)] 2060*c0909341SAndroid Build Coastguard Worker paddsb m4, m16 2061*c0909341SAndroid Build Coastguard Worker shl dxd, 2 2062*c0909341SAndroid Build Coastguard Worker paddsb m0, m4, m2 2063*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 2064*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 ; base, base+1 2065*c0909341SAndroid Build Coastguard Worker.w8_loop: 2066*c0909341SAndroid Build Coastguard Worker pshufb m16, m1, m2 2067*c0909341SAndroid Build Coastguard Worker psrlw m0, m1, 3 2068*c0909341SAndroid Build Coastguard Worker paddb m16, m10 2069*c0909341SAndroid Build Coastguard Worker vpermw m0, m0, m14 2070*c0909341SAndroid Build Coastguard Worker vpmovw2m k1, m16 ; base_x < 0 2071*c0909341SAndroid Build Coastguard Worker vpermb m16, m16, m7 2072*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m0 2073*c0909341SAndroid 
Build Coastguard Worker vpermb m0, m4, m8 2074*c0909341SAndroid Build Coastguard Worker pmaddubsw m16{k1}, m0, m3 2075*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2076*c0909341SAndroid Build Coastguard Worker vpmovwb ym16, m16 2077*c0909341SAndroid Build Coastguard Worker vextracti128 xm17, ym16, 1 2078*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm16 2079*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm16 2080*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm17 2081*c0909341SAndroid Build Coastguard Worker movhps [dstq+r2 ], xm17 2082*c0909341SAndroid Build Coastguard Worker sub hd, 4 2083*c0909341SAndroid Build Coastguard Worker jz .w8_end 2084*c0909341SAndroid Build Coastguard Worker paddw m1, m5 2085*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2086*c0909341SAndroid Build Coastguard Worker paddb m4, m9 2087*c0909341SAndroid Build Coastguard Worker add r3d, dxd 2088*c0909341SAndroid Build Coastguard Worker jge .w8_loop 2089*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop: 2090*c0909341SAndroid Build Coastguard Worker vpermb m16, m4, m8 2091*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m3 2092*c0909341SAndroid Build Coastguard Worker paddb m4, m9 2093*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2094*c0909341SAndroid Build Coastguard Worker vpmovwb ym16, m16 2095*c0909341SAndroid Build Coastguard Worker vextracti128 xm17, ym16, 1 2096*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*0], xm16 2097*c0909341SAndroid Build Coastguard Worker movhps [dstq+strideq*1], xm16 2098*c0909341SAndroid Build Coastguard Worker movq [dstq+strideq*2], xm17 2099*c0909341SAndroid Build Coastguard Worker movhps [dstq+r2 ], xm17 2100*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2101*c0909341SAndroid Build Coastguard Worker sub hd, 4 2102*c0909341SAndroid Build Coastguard Worker jg .w8_leftonly_loop 2103*c0909341SAndroid 
Build Coastguard Worker.w8_end: 2104*c0909341SAndroid Build Coastguard Worker RET 2105*c0909341SAndroid Build Coastguard Worker.filter_top_w16: 2106*c0909341SAndroid Build Coastguard Worker mova xm0, [base+z_filter_s1] 2107*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2108*c0909341SAndroid Build Coastguard Worker pminub xm4, xm5, [base+z_filter_s4] 2109*c0909341SAndroid Build Coastguard Worker vpermi2b xm0, xm7, xm2 2110*c0909341SAndroid Build Coastguard Worker pminub xm5, [base+z_filter_s5] 2111*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm7, [base+z_filter_s2] 2112*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] 2113*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm7, [base+z_filter_s3] 2114*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] 2115*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm7, xm4 2116*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] 2117*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm7, xm5 2118*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm11 2119*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm11 2120*c0909341SAndroid Build Coastguard Worker pmaddubsw xm6, xm3, xm12 2121*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm12, r7m ; max_width 2122*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm13 2123*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm11 2124*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm11 2125*c0909341SAndroid Build Coastguard Worker packssdw xm12, xm12 2126*c0909341SAndroid Build Coastguard Worker paddw xm0, xm6 2127*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 2128*c0909341SAndroid Build Coastguard Worker paddw xm0, xm4 2129*c0909341SAndroid Build Coastguard Worker paddw xm1, xm5 2130*c0909341SAndroid Build Coastguard Worker packsswb xm12, xm12 2131*c0909341SAndroid Build Coastguard 
Worker pmulhrsw xm0, xm15 2132*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm15 2133*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1, xm12, xm9 ; x < max_width 2134*c0909341SAndroid Build Coastguard Worker packuswb xm7{k1}, xm0, xm1 2135*c0909341SAndroid Build Coastguard Worker ret 2136*c0909341SAndroid Build Coastguard Worker.filter_left_h16: 2137*c0909341SAndroid Build Coastguard Worker lea r5d, [hq-1] 2138*c0909341SAndroid Build Coastguard Worker mova xm0, [base+z_filter_s1] 2139*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm5, r5d 2140*c0909341SAndroid Build Coastguard Worker vpermi2b xm0, xm8, xm2 2141*c0909341SAndroid Build Coastguard Worker pminub xm4, xm5, [base+z_filter_s4] 2142*c0909341SAndroid Build Coastguard Worker pshufb xm1, xm8, [base+z_filter_s2] 2143*c0909341SAndroid Build Coastguard Worker pminub xm5, [base+z_filter_s5] 2144*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm8, [base+z_filter_s3] 2145*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0] 2146*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm8, xm4 2147*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1] 2148*c0909341SAndroid Build Coastguard Worker pshufb xm5, xm8, xm5 2149*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2] 2150*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm11 2151*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm11 2152*c0909341SAndroid Build Coastguard Worker pmaddubsw xm6, xm3, xm12 2153*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm12, r8m ; max_height 2154*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm13 2155*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm11 2156*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm11 2157*c0909341SAndroid Build Coastguard Worker packssdw xm12, xm12 2158*c0909341SAndroid Build Coastguard Worker paddw xm0, xm6 
2159*c0909341SAndroid Build Coastguard Worker paddw xm1, xm3 2160*c0909341SAndroid Build Coastguard Worker paddw xm0, xm4 2161*c0909341SAndroid Build Coastguard Worker paddw xm1, xm5 2162*c0909341SAndroid Build Coastguard Worker packsswb xm12, xm12 2163*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm15 2164*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm15 2165*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1, xm12, xm9 ; y < max_height 2166*c0909341SAndroid Build Coastguard Worker packuswb xm8{k1}, xm0, xm1 2167*c0909341SAndroid Build Coastguard Worker ret 2168*c0909341SAndroid Build Coastguard Worker.w16: 2169*c0909341SAndroid Build Coastguard Worker movu xm7, [tlq] ; top 2170*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2171*c0909341SAndroid Build Coastguard Worker jnz .w16_main 2172*c0909341SAndroid Build Coastguard Worker lea r3d, [hq+15] 2173*c0909341SAndroid Build Coastguard Worker sub angled, 90 2174*c0909341SAndroid Build Coastguard Worker call .filter_strength 2175*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2176*c0909341SAndroid Build Coastguard Worker jz .w16_no_filter_above 2177*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+pb_15] 2178*c0909341SAndroid Build Coastguard Worker call .filter_top_w16 2179*c0909341SAndroid Build Coastguard Worker.w16_no_filter_above: 2180*c0909341SAndroid Build Coastguard Worker cmp hd, 16 2181*c0909341SAndroid Build Coastguard Worker jg .w16_filter_left_h64 2182*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym0, [base+pb_90] 2183*c0909341SAndroid Build Coastguard Worker psubb ym0, ym17 2184*c0909341SAndroid Build Coastguard Worker vpcmpgtb k2{k2}, ym0, ym16 2185*c0909341SAndroid Build Coastguard Worker kmovd r3d, k2 2186*c0909341SAndroid Build Coastguard Worker test r3d, r3d 2187*c0909341SAndroid Build Coastguard Worker jz .w16_main 2188*c0909341SAndroid Build Coastguard Worker popcnt r3d, r3d 2189*c0909341SAndroid Build Coastguard Worker call 
.filter_left_h16 2190*c0909341SAndroid Build Coastguard Worker jmp .w16_main 2191*c0909341SAndroid Build Coastguard Worker.w16_filter_left_h64: 2192*c0909341SAndroid Build Coastguard Worker call .filter_left_h64 2193*c0909341SAndroid Build Coastguard Worker.w16_main: 2194*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8 2195*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15 2196*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, dyd 2197*c0909341SAndroid Build Coastguard Worker vinserti32x4 m7, [tlq-16], 3 2198*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pb_1] 2199*c0909341SAndroid Build Coastguard Worker vpbroadcastw m12, dxd 2200*c0909341SAndroid Build Coastguard Worker movshdup m1, [base+z_xpos_mul] 2201*c0909341SAndroid Build Coastguard Worker pmullw m6, m0 2202*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_xpos_off2a] 2203*c0909341SAndroid Build Coastguard Worker pmullw m5, m0 2204*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_xpos_off2b] 2205*c0909341SAndroid Build Coastguard Worker pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3 2206*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_4] 2207*c0909341SAndroid Build Coastguard Worker psllw m12, 4 ; dx*4 2208*c0909341SAndroid Build Coastguard Worker movshdup m16, [base+z_ypos_off2] 2209*c0909341SAndroid Build Coastguard Worker psrlw m10, m6, 1 2210*c0909341SAndroid Build Coastguard Worker psrlw m11, m5, 1 2211*c0909341SAndroid Build Coastguard Worker vpermw m10, m10, m14 ; 64-frac, frac 2212*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2213*c0909341SAndroid Build Coastguard Worker vpermw m11, m11, m14 2214*c0909341SAndroid Build Coastguard Worker psraw m5, 6 2215*c0909341SAndroid Build Coastguard Worker mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft 2216*c0909341SAndroid Build Coastguard Worker packsswb m6, m5 2217*c0909341SAndroid 
Build Coastguard Worker mov r3d, 1<<6 2218*c0909341SAndroid Build Coastguard Worker paddsb m6, m16 2219*c0909341SAndroid Build Coastguard Worker sub r5d, dxd ; left-only threshold 2220*c0909341SAndroid Build Coastguard Worker paddsb m0, m6, m2 2221*c0909341SAndroid Build Coastguard Worker shl dxd, 2 2222*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m6, m0 ; base, base+1 2223*c0909341SAndroid Build Coastguard Worker lea r2, [strideq*3] 2224*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 2225*c0909341SAndroid Build Coastguard Worker.w16_loop: 2226*c0909341SAndroid Build Coastguard Worker pshufb m17, m1, m2 2227*c0909341SAndroid Build Coastguard Worker psrlw m0, m1, 3 2228*c0909341SAndroid Build Coastguard Worker paddb m16, m3, m17 2229*c0909341SAndroid Build Coastguard Worker vpermw m0, m0, m14 2230*c0909341SAndroid Build Coastguard Worker paddb m17, m4 2231*c0909341SAndroid Build Coastguard Worker vpmovw2m k1, m16 2232*c0909341SAndroid Build Coastguard Worker vpermb m16, m16, m7 2233*c0909341SAndroid Build Coastguard Worker vpmovw2m k2, m17 2234*c0909341SAndroid Build Coastguard Worker vpermb m17, m17, m7 2235*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m0 2236*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m0 2237*c0909341SAndroid Build Coastguard Worker add r3d, dxd 2238*c0909341SAndroid Build Coastguard Worker jge .w16_toponly 2239*c0909341SAndroid Build Coastguard Worker mova m0, m8 2240*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m5, m7 2241*c0909341SAndroid Build Coastguard Worker pmaddubsw m16{k1}, m0, m10 2242*c0909341SAndroid Build Coastguard Worker mova m0, m8 2243*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m6, m7 2244*c0909341SAndroid Build Coastguard Worker pmaddubsw m17{k2}, m0, m11 2245*c0909341SAndroid Build Coastguard Worker.w16_toponly: 2246*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2247*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2248*c0909341SAndroid 
Build Coastguard Worker packuswb m16, m17 2249*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm16 2250*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], ym16, 1 2251*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m16, 2 2252*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m16, 3 2253*c0909341SAndroid Build Coastguard Worker sub hd, 4 2254*c0909341SAndroid Build Coastguard Worker jz .w16_end 2255*c0909341SAndroid Build Coastguard Worker paddw m1, m12 2256*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2257*c0909341SAndroid Build Coastguard Worker paddb m5, m9 2258*c0909341SAndroid Build Coastguard Worker paddb m6, m9 2259*c0909341SAndroid Build Coastguard Worker cmp r3d, r5d 2260*c0909341SAndroid Build Coastguard Worker jge .w16_loop 2261*c0909341SAndroid Build Coastguard Worker.w16_leftonly_loop: 2262*c0909341SAndroid Build Coastguard Worker vpermb m16, m5, m8 2263*c0909341SAndroid Build Coastguard Worker vpermb m17, m6, m8 2264*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m10 2265*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m11 2266*c0909341SAndroid Build Coastguard Worker paddb m5, m9 2267*c0909341SAndroid Build Coastguard Worker paddb m6, m9 2268*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2269*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2270*c0909341SAndroid Build Coastguard Worker packuswb m16, m17 2271*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm16 2272*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], ym16, 1 2273*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m16, 2 2274*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m16, 3 2275*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 2276*c0909341SAndroid Build Coastguard Worker sub hd, 4 2277*c0909341SAndroid Build Coastguard Worker jg 
.w16_leftonly_loop 2278*c0909341SAndroid Build Coastguard Worker.w16_end: 2279*c0909341SAndroid Build Coastguard Worker RET 2280*c0909341SAndroid Build Coastguard Worker.w32: 2281*c0909341SAndroid Build Coastguard Worker movu ym7, [tlq] 2282*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2283*c0909341SAndroid Build Coastguard Worker jnz .w32_main 2284*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 2285*c0909341SAndroid Build Coastguard Worker mova ym0, [base+z_filter_s1] 2286*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym1, [base+z_filter_s2] 2287*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym3, [base+z_filter_s3] 2288*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 ym4, [base+z_filter_s4] 2289*c0909341SAndroid Build Coastguard Worker vpermi2b ym0, ym7, ym2 ; al bl 2290*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym5, [base+pb_31] 2291*c0909341SAndroid Build Coastguard Worker pminub ym5, [base+z_filter_s5] 2292*c0909341SAndroid Build Coastguard Worker pshufb ym1, ym7, ym1 ; ah bh 2293*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym11, [base+z_filter_k+4*2+12*0] 2294*c0909341SAndroid Build Coastguard Worker pshufb ym3, ym7, ym3 ; cl ch 2295*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, [base+z_filter_k+4*2+12*1] 2296*c0909341SAndroid Build Coastguard Worker pshufb ym4, ym7, ym4 ; el dl 2297*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym13, [base+z_filter_k+4*2+12*2] 2298*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym5, ym7 ; eh dh 2299*c0909341SAndroid Build Coastguard Worker pmaddubsw ym0, ym11 2300*c0909341SAndroid Build Coastguard Worker pmaddubsw ym1, ym11 2301*c0909341SAndroid Build Coastguard Worker pmaddubsw ym6, ym3, ym12 2302*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym12, r6m 2303*c0909341SAndroid Build Coastguard Worker pmaddubsw ym3, ym13 2304*c0909341SAndroid Build Coastguard Worker pmaddubsw ym4, ym11 2305*c0909341SAndroid 
Build Coastguard Worker pmaddubsw ym5, ym11 2306*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 2307*c0909341SAndroid Build Coastguard Worker packssdw ym12, ym12 2308*c0909341SAndroid Build Coastguard Worker paddw ym0, ym6 2309*c0909341SAndroid Build Coastguard Worker paddw ym1, ym3 2310*c0909341SAndroid Build Coastguard Worker paddw ym0, ym4 2311*c0909341SAndroid Build Coastguard Worker paddw ym1, ym5 2312*c0909341SAndroid Build Coastguard Worker packsswb ym12, ym12 2313*c0909341SAndroid Build Coastguard Worker pmulhrsw ym0, ym15 2314*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym15 2315*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1, ym12, ym9 ; x < max_width 2316*c0909341SAndroid Build Coastguard Worker packuswb ym7{k1}, ym0, ym1 2317*c0909341SAndroid Build Coastguard Worker cmp hd, 16 2318*c0909341SAndroid Build Coastguard Worker jg .w32_filter_h64 2319*c0909341SAndroid Build Coastguard Worker mov r3d, 3 2320*c0909341SAndroid Build Coastguard Worker call .filter_left_h16 2321*c0909341SAndroid Build Coastguard Worker jmp .w32_main 2322*c0909341SAndroid Build Coastguard Worker.w32_filter_h64: 2323*c0909341SAndroid Build Coastguard Worker call .filter_left_h64 2324*c0909341SAndroid Build Coastguard Worker.w32_main: 2325*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 
8 2326*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15 2327*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, dyd 2328*c0909341SAndroid Build Coastguard Worker vinserti32x4 m7, [tlq-16], 3 2329*c0909341SAndroid Build Coastguard Worker rorx r2q, dxq, 62 ; dx << 2 2330*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pb_1] 2331*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, r2d 2332*c0909341SAndroid Build Coastguard Worker pmullw m6, m0 2333*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m3, [base+z_xpos_off2a] 2334*c0909341SAndroid Build Coastguard Worker pmullw m5, m0 2335*c0909341SAndroid Build Coastguard Worker vbroadcasti32x8 m4, [base+z_xpos_off2b] 2336*c0909341SAndroid Build Coastguard Worker mova ym0, ym1 2337*c0909341SAndroid Build Coastguard Worker paddw m12, m1, m1 2338*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pb_2] 2339*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; xpos1 xpos0 2340*c0909341SAndroid Build Coastguard Worker mova ym0, ym2 2341*c0909341SAndroid Build Coastguard Worker psrlw m10, m6, 1 2342*c0909341SAndroid Build Coastguard Worker psrlw m11, m5, 1 2343*c0909341SAndroid Build Coastguard Worker vpermw m10, m10, m14 ; 64-frac, frac 2344*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2345*c0909341SAndroid Build Coastguard Worker vpermw m11, m11, m14 2346*c0909341SAndroid Build Coastguard Worker psraw m5, 6 2347*c0909341SAndroid Build Coastguard Worker mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft 2348*c0909341SAndroid Build Coastguard Worker packsswb m6, m5 2349*c0909341SAndroid Build Coastguard Worker mov r3d, 1<<6 2350*c0909341SAndroid Build Coastguard Worker paddsb m6, m0 2351*c0909341SAndroid Build Coastguard Worker sub r5d, dxd ; left-only threshold 2352*c0909341SAndroid Build Coastguard Worker paddsb m0, m6, m2 2353*c0909341SAndroid Build Coastguard Worker add dxd, dxd 2354*c0909341SAndroid Build Coastguard 
Worker punpcklbw m5, m6, m0 ; base, base+1 2355*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 2356*c0909341SAndroid Build Coastguard Worker.w32_loop: 2357*c0909341SAndroid Build Coastguard Worker pshufb m17, m1, m2 2358*c0909341SAndroid Build Coastguard Worker psrlw m0, m1, 3 2359*c0909341SAndroid Build Coastguard Worker paddb m16, m3, m17 2360*c0909341SAndroid Build Coastguard Worker vpermw m0, m0, m14 2361*c0909341SAndroid Build Coastguard Worker paddb m17, m4 2362*c0909341SAndroid Build Coastguard Worker vpmovw2m k1, m16 2363*c0909341SAndroid Build Coastguard Worker vpermb m16, m16, m7 2364*c0909341SAndroid Build Coastguard Worker vpmovw2m k2, m17 2365*c0909341SAndroid Build Coastguard Worker vpermb m17, m17, m7 2366*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m0 2367*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m0 2368*c0909341SAndroid Build Coastguard Worker add r3d, dxd 2369*c0909341SAndroid Build Coastguard Worker jge .w32_toponly 2370*c0909341SAndroid Build Coastguard Worker mova m0, m8 2371*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m5, m7 2372*c0909341SAndroid Build Coastguard Worker pmaddubsw m16{k1}, m0, m10 2373*c0909341SAndroid Build Coastguard Worker mova m0, m8 2374*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m6, m7 2375*c0909341SAndroid Build Coastguard Worker pmaddubsw m17{k2}, m0, m11 2376*c0909341SAndroid Build Coastguard Worker.w32_toponly: 2377*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2378*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2379*c0909341SAndroid Build Coastguard Worker packuswb m16, m17 2380*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*0], m16, 1 2381*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], ym16 2382*c0909341SAndroid Build Coastguard Worker sub hd, 2 2383*c0909341SAndroid Build Coastguard Worker jz .w32_end 2384*c0909341SAndroid Build Coastguard Worker paddw m1, m12 2385*c0909341SAndroid Build 
Coastguard Worker lea dstq, [dstq+strideq*2] 2386*c0909341SAndroid Build Coastguard Worker paddb m5, m9 2387*c0909341SAndroid Build Coastguard Worker paddb m6, m9 2388*c0909341SAndroid Build Coastguard Worker cmp r3d, r5d 2389*c0909341SAndroid Build Coastguard Worker jge .w32_loop 2390*c0909341SAndroid Build Coastguard Worker.w32_leftonly_loop: 2391*c0909341SAndroid Build Coastguard Worker vpermb m16, m5, m8 2392*c0909341SAndroid Build Coastguard Worker vpermb m17, m6, m8 2393*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m10 2394*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m11 2395*c0909341SAndroid Build Coastguard Worker paddb m5, m9 2396*c0909341SAndroid Build Coastguard Worker paddb m6, m9 2397*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2398*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2399*c0909341SAndroid Build Coastguard Worker packuswb m16, m17 2400*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*0], m16, 1 2401*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], ym16 2402*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2403*c0909341SAndroid Build Coastguard Worker sub hd, 2 2404*c0909341SAndroid Build Coastguard Worker jg .w32_leftonly_loop 2405*c0909341SAndroid Build Coastguard Worker.w32_end: 2406*c0909341SAndroid Build Coastguard Worker RET 2407*c0909341SAndroid Build Coastguard Worker.filter_left_h64: 2408*c0909341SAndroid Build Coastguard Worker mova m0, [base+z_filter_s1] 2409*c0909341SAndroid Build Coastguard Worker lea r3d, [hq-1] 2410*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_filter_s4] 2411*c0909341SAndroid Build Coastguard Worker vpbroadcastb m5, r3d 2412*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [base+z_filter_s2] 2413*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_filter_s3] 2414*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m8, m2 ; al bl 2415*c0909341SAndroid 
Build Coastguard Worker pminub m5, [base+z_filter_s5] 2416*c0909341SAndroid Build Coastguard Worker pshufb m1, m8, m1 ; ah bh 2417*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+z_filter_k+4*2+12*0] 2418*c0909341SAndroid Build Coastguard Worker pshufb m3, m8, m3 ; cl ch 2419*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+z_filter_k+4*2+12*1] 2420*c0909341SAndroid Build Coastguard Worker pshufb m4, m8, m4 ; el dl 2421*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+z_filter_k+4*2+12*2] 2422*c0909341SAndroid Build Coastguard Worker vpermb m5, m5, m8 ; eh dh 2423*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m11 2424*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m11 2425*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m3, m12 2426*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, r8m ; max_height 2427*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 2428*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m11 2429*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m11 2430*c0909341SAndroid Build Coastguard Worker packssdw m12, m12 2431*c0909341SAndroid Build Coastguard Worker paddw m0, m6 2432*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2433*c0909341SAndroid Build Coastguard Worker paddw m0, m4 2434*c0909341SAndroid Build Coastguard Worker paddw m1, m5 2435*c0909341SAndroid Build Coastguard Worker packsswb m12, m12 2436*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 2437*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 2438*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1, m12, m9 ; y < max_height 2439*c0909341SAndroid Build Coastguard Worker packuswb m8{k1}, m0, m1 2440*c0909341SAndroid Build Coastguard Worker ret 2441*c0909341SAndroid Build Coastguard Worker.w64: 2442*c0909341SAndroid Build Coastguard Worker movu m7, [tlq] 2443*c0909341SAndroid Build Coastguard Worker test angled, 0x400 2444*c0909341SAndroid Build Coastguard 
Worker jnz .w64_main 2445*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [tlq-4] 2446*c0909341SAndroid Build Coastguard Worker mova m0, [base+z_filter_s1] 2447*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [base+z_filter_s2] 2448*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m3, [base+z_filter_s3] 2449*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+z_filter_s4] 2450*c0909341SAndroid Build Coastguard Worker vpermi2b m0, m7, m2 ; al bl 2451*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [base+pb_63] 2452*c0909341SAndroid Build Coastguard Worker pminub m5, [base+z_filter_s5] 2453*c0909341SAndroid Build Coastguard Worker pshufb m1, m7, m1 ; ah bh 2454*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+z_filter_k+4*2+12*0] 2455*c0909341SAndroid Build Coastguard Worker pshufb m3, m7, m3 ; cl ch 2456*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+z_filter_k+4*2+12*1] 2457*c0909341SAndroid Build Coastguard Worker pshufb m4, m7, m4 ; el dl 2458*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+z_filter_k+4*2+12*2] 2459*c0909341SAndroid Build Coastguard Worker vpermb m5, m5, m7 ; eh dh 2460*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m11 2461*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m11 2462*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m3, m12 2463*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, r6m 2464*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 2465*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m11 2466*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m11 2467*c0909341SAndroid Build Coastguard Worker mova m9, [pb_0to63] 2468*c0909341SAndroid Build Coastguard Worker packssdw m12, m12 2469*c0909341SAndroid Build Coastguard Worker paddw m0, m6 2470*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2471*c0909341SAndroid Build Coastguard Worker paddw m0, m4 2472*c0909341SAndroid Build 
Coastguard Worker paddw m1, m5 2473*c0909341SAndroid Build Coastguard Worker packsswb m12, m12 2474*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15 2475*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 2476*c0909341SAndroid Build Coastguard Worker vpcmpgtb k1, m12, m9 ; x < max_width 2477*c0909341SAndroid Build Coastguard Worker packuswb m7{k1}, m0, m1 2478*c0909341SAndroid Build Coastguard Worker call .filter_left_h64 ; always filter the full 64 pixels for simplicity 2479*c0909341SAndroid Build Coastguard Worker.w64_main: 2480*c0909341SAndroid Build Coastguard Worker vpbroadcastw m5, dyd 2481*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [tlq-4] 2482*c0909341SAndroid Build Coastguard Worker rorx r2q, dxq, 62 ; dx << 2 2483*c0909341SAndroid Build Coastguard Worker pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such 2484*c0909341SAndroid Build Coastguard Worker pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge 2485*c0909341SAndroid Build Coastguard Worker vpbroadcastw m1, r2d ; xpos 2486*c0909341SAndroid Build Coastguard Worker mova m3, [base+z_xpos_off2a] 2487*c0909341SAndroid Build Coastguard Worker mova m4, [base+z_xpos_off2b] 2488*c0909341SAndroid Build Coastguard Worker mova m12, m1 2489*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pb_1] 2490*c0909341SAndroid Build Coastguard Worker psrlw m10, m6, 1 2491*c0909341SAndroid Build Coastguard Worker psrlw m11, m5, 1 2492*c0909341SAndroid Build Coastguard Worker vpermw m10, m10, m14 ; 64-frac, frac 2493*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2494*c0909341SAndroid Build Coastguard Worker vpermw m11, m11, m14 2495*c0909341SAndroid Build Coastguard Worker psraw m5, 6 2496*c0909341SAndroid Build Coastguard Worker mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft 2497*c0909341SAndroid Build Coastguard Worker packsswb m6, m5 2498*c0909341SAndroid Build Coastguard Worker mov r3d, 1<<6 
2499*c0909341SAndroid Build Coastguard Worker paddsb m0, m6, m2 2500*c0909341SAndroid Build Coastguard Worker sub r5d, dxd ; left-only threshold 2501*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m6, m0 ; base, base+1 2502*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0 2503*c0909341SAndroid Build Coastguard Worker.w64_loop: 2504*c0909341SAndroid Build Coastguard Worker pshufb m17, m1, m2 2505*c0909341SAndroid Build Coastguard Worker psrlw m0, m1, 3 2506*c0909341SAndroid Build Coastguard Worker paddb m16, m3, m17 2507*c0909341SAndroid Build Coastguard Worker vpermw m0, m0, m14 2508*c0909341SAndroid Build Coastguard Worker paddb m17, m4 2509*c0909341SAndroid Build Coastguard Worker vpmovw2m k1, m16 ; base_x < 0 2510*c0909341SAndroid Build Coastguard Worker vpermi2b m16, m7, m9 2511*c0909341SAndroid Build Coastguard Worker vpmovw2m k2, m17 2512*c0909341SAndroid Build Coastguard Worker vpermi2b m17, m7, m9 2513*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m0 2514*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m0 2515*c0909341SAndroid Build Coastguard Worker add r3d, dxd 2516*c0909341SAndroid Build Coastguard Worker jge .w64_toponly 2517*c0909341SAndroid Build Coastguard Worker mova m0, m8 2518*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m5, m9 2519*c0909341SAndroid Build Coastguard Worker pmaddubsw m16{k1}, m0, m10 2520*c0909341SAndroid Build Coastguard Worker mova m0, m8 2521*c0909341SAndroid Build Coastguard Worker vpermt2b m0, m6, m9 2522*c0909341SAndroid Build Coastguard Worker pmaddubsw m17{k2}, m0, m11 2523*c0909341SAndroid Build Coastguard Worker.w64_toponly: 2524*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2525*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2526*c0909341SAndroid Build Coastguard Worker packuswb m16, m17 2527*c0909341SAndroid Build Coastguard Worker mova [dstq], m16 2528*c0909341SAndroid Build Coastguard Worker dec hd 2529*c0909341SAndroid Build Coastguard Worker jz 
.w64_end 2530*c0909341SAndroid Build Coastguard Worker paddw m1, m12 2531*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2532*c0909341SAndroid Build Coastguard Worker paddb m5, m2 2533*c0909341SAndroid Build Coastguard Worker paddb m6, m2 2534*c0909341SAndroid Build Coastguard Worker cmp r3d, r5d 2535*c0909341SAndroid Build Coastguard Worker jge .w64_loop 2536*c0909341SAndroid Build Coastguard Worker.w64_leftonly_loop: 2537*c0909341SAndroid Build Coastguard Worker vpermb m16, m5, m8 2538*c0909341SAndroid Build Coastguard Worker vpermb m17, m6, m8 2539*c0909341SAndroid Build Coastguard Worker pmaddubsw m16, m10 2540*c0909341SAndroid Build Coastguard Worker pmaddubsw m17, m11 2541*c0909341SAndroid Build Coastguard Worker paddb m5, m2 2542*c0909341SAndroid Build Coastguard Worker paddb m6, m2 2543*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 2544*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15 2545*c0909341SAndroid Build Coastguard Worker packuswb m16, m17 2546*c0909341SAndroid Build Coastguard Worker mova [dstq], m16 2547*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2548*c0909341SAndroid Build Coastguard Worker dec hd 2549*c0909341SAndroid Build Coastguard Worker jg .w64_leftonly_loop 2550*c0909341SAndroid Build Coastguard Worker.w64_end: 2551*c0909341SAndroid Build Coastguard Worker RET

;-----------------------------------------------------------------------------
; ipred_z3_8bpc -- 8 bpc intra predictor, "z3" variant (AVX-512).
; NOTE(review): presumably the AV1 directional predictor for angles above 180
; degrees (prediction from the left edge); inferred from the name and from
; `sub angled, 180` below -- confirm against the C reference.
;
; Named registers (x86inc cglobal): dst, stride and tl arrive in registers
; (3 loaded arguments); w, h and angle are fetched from their argument slots
; below (wm / hm / anglem); dy is a scratch name that is computed here.
; 8 GPRs and 16 vector registers are declared.
;
; Edge pixels are read at negative offsets from tl ([tlq-64*1], [tlq-64*2],
; [tlq-31], [tlq-3]). Output rows are written through dst, advancing by
; stride. r2 (tl) is recycled as stride*3 once the edge has been loaded.
;
; State prepared before the per-width dispatch:
;   r7     anchor for `base+symbol` addressing
;          NOTE(review): assumes `base` is defined relative to z_filter_t0
;          earlier in the file -- the %define is not visible here.
;   m0     pb_63to0 table (clamped against index limits with pmaxub)
;   m14    z_frac_table, used as a vpermw lookup for the blend weights
;   m15    pw_512, multiplier for the final pmulhrsw rounding step
;   dy     word read from the dr_intra_derivative table, then << 6
;   angle  (angle - 180) ^ 0x400; bit 10 set is tested as
;          "!enable_intra_edge_filter" throughout
;
; Internal subroutines (entered with `call`, left with a plain `ret`):
;   .upsample, .filter_strength / .filter_strength2, .w16_load
; The edge filters themselves are the .w8_filter/.w16_filter/.w32_filter/
; .w64_filter routines of ipred_z1_8bpc; their register contracts are not
; visible from this function.
;-----------------------------------------------------------------------------
cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea             r7, [z_filter_t0]
    tzcnt           wd, wm                      ; jump-table index from w
    movifnidn       angled, anglem
    lea             t0, [dr_intra_derivative+45*2-1]
    movsxd          wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
    sub             angled, 180
    mov             dyd, angled
    neg             dyd                         ; 180 - angle
    xor             angled, 0x400               ; flip bit 10 (see header)
    or              dyq, ~0x7e                  ; keep bits 1..6, force all others to 1;
                                                ; t0 is pre-biased to match this index
    mova            m0, [base+pb_63to0]
    movzx           dyd, word [t0+dyq]
    lea             wq, [base+ipred_z3_8bpc_avx512icl_table+wq] ; table entry -> address
    movifnidn       hd, hm
    mova            m14, [base+z_frac_table]
    shl             dyd, 6
    vpbroadcastd    m15, [base+pw_512]
    jmp             wq                          ; dispatch to .w4/.w8/.w16/.w32/.w64

; ---- w == 4 ----------------------------------------------------------------
.w4:
    cmp             angleb, 40
    jae             .w4_no_upsample
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg              .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    lea             r3d, [hq+4]
    call            .upsample                   ; m7 = upsampled edge, dy doubled
    movshdup        m1, [base+z_ypos_off1]
    vpbroadcastd    m6, [base+pb_16]
    jmp             .w4_main2
.w4_no_upsample:
    lea             r3d, [hq+3]
    vpbroadcastb    m9, r3d
    vpxord          m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
    pmaxub          m1, m0                      ; clamp the pb_63to0 indices from below
    vpermb          m7, m1, [tlq-64*1]          ; m7 = edge bytes gathered by those indices
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w4_main
    vpbroadcastb    xm1, angled
    shr             angled, 8
    vpcmpeqb        k1, xm9, [base+z_filter_wh]
    vpbroadcastd    m2, [tlq-3]
    vpcmpgtb        k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw           r5d, k1
    test            r5d, r5d                    ; no mask bit set -> skip the filter
    jz              .w4_main
    pminub          m9, [pb_0to63]
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
    vpermb          m7, m9, m0                  ; m0 is read as the filter output here
.w4_main:
    movsldup        m1, [base+z_ypos_off1]
    vpbroadcastd    m6, [base+pb_8]
.w4_main2:
    vpbroadcastw    m0, dyd
    vpbroadcastq    m2, [base+z_ypos_mul2a] ; 1..4
    pmulhuw         m2, m0 ; ypos >> 1
    lea             r2, [strideq*3]             ; tl is no longer needed from here on
    vpermw          m3, m2, m14 ; 64-frac, frac
    psrlw           m2, 5
    packsswb        m2, m2
    punpcklbw       m2, m2
    paddsb          m2, m1 ; base, base+1
.w4_loop:
    ; Each pass produces eight 4-pixel rows; the second group of four is
    ; skipped when fewer than eight rows remained.
    vpermb          m0, m2, m7                  ; gather (base, base+1) pixel pairs
    pmaddubsw       m0, m3                      ; weighted sum of each pair
    paddsb          m2, m6                      ; advance the indices for the next pass
    pmulhrsw        m0, m15                     ; rounding step (assuming pw_512 holds words
                                                ; of 512 this is (x + 32) >> 6)
    vpmovwb         ym0, m0
    movd            [dstq+strideq*0], xm0
    pextrd          [dstq+strideq*1], xm0, 1
    pextrd          [dstq+strideq*2], xm0, 2
    pextrd          [dstq+r2       ], xm0, 3
    sub             hd, 8
    jl              .w4_end
    vextracti32x4   xm0, ym0, 1
    lea             dstq, [dstq+strideq*4]
    movd            [dstq+strideq*0], xm0
    pextrd          [dstq+strideq*1], xm0, 1
    pextrd          [dstq+strideq*2], xm0, 2
    pextrd          [dstq+r2       ], xm0, 3
    lea             dstq, [dstq+strideq*4]
    jg              .w4_loop                    ; flags are still those of `sub hd, 8`
.w4_end:
    RET

; ---- subroutine: edge upsampling (used by .w4 and .w8) ------------------------
; In:  r3d = h + imin(w, h) (per the comment on the first instruction), tl, m15
; Out: ym7 = edge bytes interleaved with filtered in-between bytes, dy = dy * 2
; Clobbers: r3d, ym0, ym1, ym2
.upsample:
    xor             r3d, 31 ; 31 - (h + imin(w, h))
    vbroadcasti32x4 ym0, [base+z_xpos_off2a]
    vpbroadcastb    ym7, r3d
    pmaxub          ym7, [base+z3_upsample]
    vbroadcasti32x4 ym1, [base+z_filter_s4]
    vpermb          ym7, ym7, [tlq-31]          ; gather the edge bytes
    vpbroadcastd    ym2, [base+pb_m4_36]        ; filter taps (presumably -4, 36 -- from the name)
    pshufb          ym0, ym7, ym0
    psrldq          ym7, 1
    pshufb          ym1, ym7, ym1
    pmaddubsw       ym0, ym2
    pmaddubsw       ym1, ym2
    add             dyd, dyd                    ; dy *= 2
    paddw           ym0, ym1
    pmulhrsw        ym0, ym15
    packuswb        ym0, ym0
    punpcklbw       ym7, ym0                    ; interleave edge and filtered bytes
    ret

; ---- w == 8 ----------------------------------------------------------------
.w8:
    lea             r3d, [angleq+216]
    mov             r3b, hb                     ; low byte := h, so one unsigned compare checks
    cmp             r3d, 8                      ; h <= 8 and that all higher bits are zero
    ja              .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea             r3d, [hq*2]
    call            .upsample
    pshufd          m1, [base+z_ypos_off1], q0000
    vpbroadcastd    m6, [base+pb_8]
    jmp             .w8_main2
.w8_no_upsample:
    mov             r3d, 8
    cmp             hd, 4
    cmove           r3d, hd                     ; r3d = (h == 4) ? 4 : 8
    lea             r3d, [r3+hq-1]
    xor             r3d, 63 ; 63 - (h + imin(w, h))
    vpbroadcastb    m1, wd                      ; wd names the same register as r3d (w is the
                                                ; 4th cglobal name), i.e. the value just computed
    pmaxub          m1, m0
    vpermb          m7, m1, [tlq-64*1]
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w8_main
    lea             r3d, [hq+7]
    call            .filter_strength
    test            r5d, r5d
    jz              .w8_main
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    vpermb          m7, m10, m0
.w8_main:
    movsldup        m1, [base+z_ypos_off2]
    vpbroadcastd    m6, [base+pb_4]
.w8_main2:
    vpbroadcastw    m0, dyd
    vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8
    pmulhuw         m2, m0 ; ypos >> 1
    lea             r2, [strideq*3]
    vpermw          m3, m2, m14 ; 64-frac, frac
    psrlw           m2, 5
    packsswb        m2, m2
    punpcklbw       m2, m2
    paddsb          m2, m1 ; base, base+1
.w8_loop:
    ; Four 8-pixel rows per pass.
    vpermb          m0, m2, m7
    pmaddubsw       m0, m3
    paddsb          m2, m6
    pmulhrsw        m0, m15
    vpmovwb         ym0, m0
    vextracti32x4   xm1, ym0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    movq            [dstq+strideq*2], xm1
    movhps          [dstq+r2       ], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w8_loop
    RET

; ---- subroutine: filter-strength mask ----------------------------------------
; .filter_strength additionally loads m2 from [tlq-3]; .filter_strength2 is the
; entry for callers that have already loaded m2 themselves.
; In:  r3d (broadcast as a byte and compared with z_filter_wh), angle
; Out: r5d = bits of mask k1 (zero -> callers skip the filter),
;      m9 = broadcast of r3b, m10 = min(m9, pb_0to63), angle >>= 8
; Clobbers: m0 (low 128 bits written), ym1, k1
.filter_strength:
    vpbroadcastd    m2, [tlq-3]
.filter_strength2:
    vpbroadcastb    m9, r3d
    vpbroadcastb    ym1, angled
    shr             angled, 8
    vpcmpeqb        k1, ym9, [base+z_filter_wh]
    mova            xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb        k1{k1}, ym1, ym0
    pminub          m10, m9, [pb_0to63]
    kmovd           r5d, k1
    ret

; ---- subroutine: edge load for w16/w32 ----------------------------------------
; In:  r3d = w, h, tl, m0 = pb_63to0
; Out: m7/m8 = the two 64-byte tables consumed by Z3_PERM2
;      r3d = -(h + imin(w, h)) & 63; m1 = that value broadcast; m2 = max(m0, m1)
.w16_load:
    cmp             r3d, hd
    cmovae          r3d, hd                     ; r3d = imin(w, h)
    add             r3d, hd
    mova            m7, [tlq-64*1]
    neg             r3d ; -(h + imin(w, h))
    and             r3d, 63
    vpbroadcastb    m1, r3d
    pmaxub          m2, m0, m1
    cmp             hd, 64
    je              .w16_load_h64
    vpermb          m8, m1, m7                  ; every byte of m8 = the one edge byte at index r3d
    vpermb          m7, m2, m7
    ret
.w16_load_h64:
    vpermb          m7, m0, m7
    vpermb          m8, m2, [tlq-64*2]          ; h == 64: second table from the next 64 bytes down
    ret

; ---- w == 16 ---------------------------------------------------------------
.w16:
    mov             r3d, 16
    call            .w16_load
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w16_main
    vpbroadcastd    m2, [tlq-3]
    cmp             hd, 64
    je              .w16_filter64
    lea             r3d, [hq+15]
    call            .filter_strength2
    test            r5d, r5d
    jz              .w16_main
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    pminub          m10, m9, [pb_0to63]
    vpermb          m8, m9, m0
    vpermb          m7, m10, m0
    jmp             .w16_main
.w16_filter64:
    vpbroadcastd    m13, [base+pb_15]
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
    vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw    m0, dyd
    vpbroadcastd    m6, [base+pb_4]
    pmulhuw         m3, m0 ; ypos >> 1
    pmulhuw         m2, m0
    movshdup        m0, [base+z_ypos_off2]
    lea             r2, [strideq*3]
    vpbroadcastd    m1, [base+pb_1]
    vpermw          m4, m3, m14 ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m3, m0
    paddsb          m1, m3
    punpcklbw       m2, m3, m1 ; base, base+1
    punpckhbw       m3, m1
.w16_loop:
; Z3_PERM2: one 64-byte output vector from the two-table edge (m7, m8).
;   In:  m2/m3 = (base, base+1) byte indices, m4/m5 = weights,
;        m6 = per-pass index step, m15 = rounding multiplier
;   Out: m0 = packed result; m2/m3 advanced by m6.  Clobbers m1.
; The macro is defined here at its first use and reused by .w32_loop/.w64_loop.
%macro Z3_PERM2 0
    mova            m0, m7
    vpermt2b        m0, m2, m8                  ; select from {m7, m8} by the indices in m2
    mova            m1, m7
    vpermt2b        m1, m3, m8
    pmaddubsw       m0, m4
    pmaddubsw       m1, m5
    paddsb          m2, m6
    paddsb          m3, m6
    pmulhrsw        m0, m15
    pmulhrsw        m1, m15
    packuswb        m0, m1
%endmacro
    Z3_PERM2
    ; Four 16-pixel rows per pass, one per 128-bit lane of m0.
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    vextracti32x4   [dstq+strideq*2], m0, 2
    vextracti32x4   [dstq+r2       ], m0, 3
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w16_loop
    RET

; ---- w == 32 ---------------------------------------------------------------
.w32:
    mov             r3d, 32
    call            .w16_load
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w32_main
    vpbroadcastd    m2, [tlq-3]
    cmp             hd, 64
    je              .w32_filter64
    lea             r3d, [hq+31]
    vpbroadcastb    m9, r3d
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
    vpermb          m8, m9, m7
    jmp             .w32_main
.w32_filter64:
    vpbroadcastd    m13, [base+pb_31]
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
    vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw    m0, dyd
    vpbroadcastd    m1, [base+pb_1]
    pmulhuw         m3, m0 ; ypos >> 1
    pmulhuw         m2, m0
    vpbroadcastd    m6, [base+pb_2]
    mova            ym0, ym1                    ; 256-bit move: m0 = pb_1 in the low half,
                                                ; zero in the high half
    vpermw          m4, m3, m14 ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m3, m0
    paddsb          m1, m3
    punpcklbw       m2, m3, m1 ; base, base+1
    punpckhbw       m3, m1
.w32_loop:
    Z3_PERM2
    ; Two 32-pixel rows per pass: the upper 256 bits go to the first row.
    vextracti32x8   [dstq+strideq*0], m0, 1
    mova            [dstq+strideq*1], ym0
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w32_loop
    RET

; ---- w == 64 ---------------------------------------------------------------
.w64:
    mova            m7, [tlq-64*1]
    cmp             hd, 64
    je              .w64_h64
    lea             r3d, [hq*2-1]
    xor             r3d, 63 ; -(h + imin(w, h)) & 63
    vpbroadcastb    m1, r3d
    pmaxub          m0, m1
    vpermb          m8, m1, m7
    jmp             .w64_filter
.w64_h64:
    vpermb          m8, m0, [tlq-64*2]
.w64_filter:
    vpermb          m7, m0, m7
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w64_main
    lea             r3d, [hq-1]
    vpbroadcastd    m2, [tlq-3]
    vpbroadcastb    m13, r3d
    valignq         m0, m8, m7, 7
    pminub          m12, m13, [pb_0to63]
    valignq         m11, m8, m7, 1
    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
    vpbroadcastw    m2, dyd
    pmulhuw         m3, m2, [base+z_ypos_mul2a]
    pmulhuw         m2, [base+z_ypos_mul2b]
    vpbroadcastd    m6, [base+pb_1]
    vpermw          m4, m3, m14 ; 64-frac, frac
    psrlw           m3, 5
    vpermw          m5, m2, m14
    psrlw           m2, 5
    packsswb        m3, m2
    paddsb          m1, m3, m6
    punpcklbw       m2, m3, m1 ; base, base+1
    punpckhbw       m3, m1
.w64_loop:
    Z3_PERM2
    mova            [dstq], m0                  ; one full 64-pixel row per pass
    add             dstq, strideq
    dec             hd
    jg              .w64_loop
    RET

; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4.
;     w4     w8       w16             w32
;     1     1 2     1 2 3 4     1 2 3 4 9 a b c
;     2     2 3     2 3 4 5     2 3 4 5 a b c d
;     3     3 4     3 4 5 6     3 4 5 6 b c d e
;     4     4 5     4 5 6 7     4 5 6 7 c d e f
;     5     5 6     5 6 7 8     5 6 7 8 d e f g
;     6     6 7     6 7 8 9     6 7 8 9 e f g h
;     7     7 8     7 8 9 a     7 8 9 a f g h i
; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
;             9       9 a b               h i j
;                       a b                 i j
;                         b                   j

;-----------------------------------------------------------------------
; ipred_filter_8bpc(dst, stride, tl, w, h, flt)
;
; Declared through x86inc's cglobal with 4 arguments preloaded into
; registers, 7 general-purpose registers and 14 vector registers.
; h and flt are fetched explicitly below (movifnidn / movzx).
;
; Every output value is computed as (8 + sum(pixel * tap)) >> 4 and then
; clamped to 0..255: the accumulators start from pd_8, vpdpbusd adds the
; unsigned-pixel x signed-tap dot products, psraw shifts right by 4 and
; packuswb saturates to unsigned bytes.
;
; Register roles:
;   m6       pd_8 broadcast (rounding bias / accumulator seed)
;   m7..m10  the four 16-byte rows of the tap set selected by flt
;            (tap sets are 64 bytes apart: flt << 6)
;   m2       top-side input of the block(s) being computed
;   m3       left / top-left input of the block(s) being computed
;   m11      filter_perm, m12 filter_end, xm13 filter_perm+16 (w32 only)
;   k1..k5   lane-select masks, values noted where they are built
;
; NOTE(review): in the lane comments tN / lN / tl appear to denote top,
; left and top-left neighbour pixels, a0 b0 c0 ... the output rows of
; successive 4x2 blocks, and __ a don't-care element.
;-----------------------------------------------------------------------
cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
%define base r6-filter_taps
    lea             r6, [filter_taps]
%ifidn fltd, fltm
    movzx           fltd, fltb
%else
    movzx           fltd, byte fltm
%endif
    vpbroadcastd    xmm2, [tlq+1]           ; t0 t0 t0 t0
    movifnidn       hd, hm
    shl             fltd, 6                 ; 64 bytes of taps per filter
    vpbroadcastd    m6, [base+pd_8]
    vpbroadcastd    xmm3, [tlq-2]           ; l1 l0 tl __
    vbroadcasti32x4 m7, [r6+fltq+16*0]      ; p1 p2 p3 p4
    vbroadcasti32x4 m8, [r6+fltq+16*1]
    vbroadcasti32x4 m9, [r6+fltq+16*2]      ; p6 p5 p0 __
    vbroadcasti32x4 m10, [r6+fltq+16*3]
    ; First 4x2 block; shared by every width.
    mova            xmm0, xm6
    vpdpbusd        xmm0, xmm2, xm7
    mova            xmm1, xm6
    vpdpbusd        xmm1, xmm2, xm8
    vpdpbusd        xmm0, xmm3, xm9
    vpdpbusd        xmm1, xmm3, xm10
    packssdw        xmm0, xmm1
    ; The flags set here are consumed by jb .w4, je .w8 and, much later,
    ; by cmovp; nothing in between modifies EFLAGS.
    cmp             wd, 8
    jb .w4
    vpbroadcastd    ym2, [tlq+5]
    mova            m11, [base+filter_perm]
    mov             r5, 0xffffffffffff000f
    psrldq          xmm2, 1                 ; __ t0
    kmovq           k1, r5                  ; 0x000f
    psraw           xm5, xmm0, 4
    packuswb        xmm2, xm5               ; __ t0 a0 b0
    pshufd          ym2{k1}, ymm2, q3333    ; b0 b0 b0 b0 t1 t1 t1 t1
    je .w8
    ; w >= 16: ramp up from one to four blocks in flight.
    kxnorb          k3, k3, k3              ; 0x00ff
    vpbroadcastd    xm3, [tlq-4]
    kandnq          k2, k3, k1              ; 0xffffffffffff0000
    vpermb          ym3{k2}, ym11, ymm2     ; l3 l2 l1 __ b3 a3 t3 __
    mova            ym0, ym6
    vpdpbusd        ym0, ym2, ym7
    mova            ym1, ym6
    vpdpbusd        ym1, ym2, ym8
    pshufb          ym5{k2}, ym2, ym11      ; a0 b0 __ t1
    vpbroadcastd    m2, [tlq+9]
    vpdpbusd        ym0, ym3, ym9
    vpdpbusd        ym1, ym3, ym10
    vpbroadcastd    xm3, [tlq-6]            ; l5 l4 l3 __
    kunpckbw        k4, k1, k3              ; 0x0fff
    packssdw        ym0, ym1
    psraw           ym0, 4                  ; c0 d0 a1 b1
    packuswb        ym5, ym0                ; a0 b0 c0 d0 __ t1 a1 b1
    pshufd          m2{k3}, m5, q3333       ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
    vpermb          m3{k2}, m11, m5         ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
    mova            m4, m6
    vpdpbusd        m4, m2, m7
    mova            m1, m6
    vpdpbusd        m1, m2, m8
    psrldq          m0, m2, 1               ; __ d0 __ b1 __ t2
    vpbroadcastd    m2, [tlq+13]
    vpdpbusd        m4, m3, m9
    vpdpbusd        m1, m3, m10
    mova            m12, [base+filter_end]  ; last use of `base`: r6 is repurposed next
    lea             r5d, [hq-6]
    mov             r6, dstq                ; r6 = start of dst, used by the w32 pass
    ; PF still comes from `cmp wd, 8`: w-8 = 8 (odd parity) for w16,
    ; 24 (even parity) for w32.
    cmovp           hd, r5d                 ; w == 16 ? h : h - 6
    packssdw        m4, m1
    psraw           m4, 4                   ; e0 f0 c1 d1 a2 b2
    packuswb        m0, m4                  ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
    pshufd          m2{k4}, m0, q3333       ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
.w16_loop:
    ; Steady state: four blocks per iteration, two finished rows of the
    ; left 16 columns stored per iteration.
    vpbroadcastd    xm3, [tlq-8]
    vpermb          m3{k2}, m11, m0         ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
    mova            m1, m6
    vpdpbusd        m1, m2, m7
    mova            m0, m6
    vpdpbusd        m0, m2, m8
    sub             tlq, 2
    vpdpbusd        m1, m3, m9
    vpdpbusd        m0, m3, m10
    packssdw        m1, m0
    mova            m0, m4
    psraw           m4, m1, 4               ; g0 h0 e1 f1 c2 d2 a3 b3
    packuswb        m0, m4                  ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
    pshufd          m2, m0, q3333           ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    vpermt2d        m5, m12, m0             ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
    vextracti32x4   [dstq+strideq*0], m5, 2
    vextracti32x4   [dstq+strideq*1], m5, 3
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w16_loop
    cmp             wd, 16
    je .ret
    ; w == 32. The loop above ran (h-6)/2 times, so tlq has dropped by
    ; h-6 = r5 and tlq+r5 is the tl pointer as it was on entry. r6 still
    ; points at the first dst row; bytes already stored to the left 16
    ; columns are read back from it to feed m3.
    mova            xm13, [filter_perm+16]  ; `base` is unusable here (r6 = dst)
    mova            xmm3, [r6+strideq*0]
    punpckhdq       xmm3, [r6+strideq*1]
    vpbroadcastd    m2{k1}, [tlq+r5+17]     ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
    pinsrb          xm3, xmm3, [tlq+r5+16], 7
    pshufb          xm3, xm13
    vpermb          m3{k2}, m11, m0         ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
    mova            m0, m6
    vpdpbusd        m0, m2, m7
    mova            m1, m6
    vpdpbusd        m1, m2, m8
    kunpckbw        k5, k3, k1              ; 0xff0f
    lea             r3, [strideq*3]
    vpdpbusd        m0, m3, m9
    vpdpbusd        m1, m3, m10
    packssdw        m0, m1
    psraw           m0, 4                   ; a4 b4 g1 h1 e2 f2 c3 d3
    packuswb        m4, m0                  ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpblendmb       m1{k3}, m4, m2          ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
    vpbroadcastd    ym2, [tlq+r5+21]
    pshufd          m2{k5}, m4, q3333       ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
    vpermt2d        m5, m12, m4             ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
    vextracti32x4   [dstq+strideq*0], m5, 2
    vextracti32x4   [dstq+strideq*1], m5, 3
    punpckhqdq      xmm3, [r6+r3]
    pinsrb          xmm3, [r6+strideq*2+15], 11
    pshufb          xm3, xmm3, xm13
    vpermb          m3{k2}, m11, m1         ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
    mova            m4, m6
    vpdpbusd        m4, m2, m7
    mova            m1, m6
    vpdpbusd        m1, m2, m8
    kxnord          k3, k3, k4              ; 0xfffff0ff
    lea             r4, [strideq*5]
    vpdpbusd        m4, m3, m9
    vpdpbusd        m1, m3, m10
    packssdw        m4, m1
    psraw           m4, 4                   ; c4 d4 a5 b5 g2 h2 e3 f3
    packuswb        m0, m4                  ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpblendmw       m1{k3}, m2, m0          ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
    vpbroadcastd    m2, [tlq+r5+25]
    pshufd          m2{k3}, m0, q3333       ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
    vpermt2d        m5, m12, m0             ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
    vextracti32x4   [dstq+strideq*2], m5, 2
    vextracti32x4   [dstq+r3], m5, 3
    punpckhqdq      xmm3, [r6+r4]
    pinsrb          xmm3, [r6+strideq*4+15], 11
    pshufb          xm3, xmm3, xm13
    vpermb          m3{k2}, m11, m1         ; ff ef df __ dj cj bj __ bn an tn __ hb gb fb __
    mova            m0, m6
    vpdpbusd        m0, m2, m7
    mova            m1, m6
    vpdpbusd        m1, m2, m8
    kunpckwd        k1, k1, k2              ; 0x000f0000
    vpdpbusd        m0, m3, m9
    vpdpbusd        m1, m3, m10
    packssdw        m0, m1
    psraw           m0, 4                   ; e4 f4 c5 d5 a6 b6 g3 h3
    packuswb        m4, m0                  ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
    vpblendmw       m1{k1}, m4, m2          ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
    vpbroadcastd    m2, [tlq+r5+29]
    pshufd          m2{k4}, m4, q3333       ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
    vpermt2d        m5, m12, m4             ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
    vextracti32x4   [dstq+strideq*4], m5, 2
    vextracti32x4   [dstq+r4], m5, 3
    lea             r0, [strideq+r3*2]      ; r0 = stride*7 (dstq is not used past this point)
.w32_loop:
    ; Right 16 columns, two rows per iteration; r5d counts rows down
    ; from h-6 and r6 walks the destination rows.
    punpckhqdq      xmm3, [r6+r0]
    pinsrb          xmm3, [r6+r3*2+15], 11
    pshufb          xm3, xmm3, xm13
    vpermb          m3{k2}, m11, m1         ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
.w32_loop_tail:
    mova            m4, m6
    vpdpbusd        m4, m2, m7
    mova            m1, m6
    vpdpbusd        m1, m2, m8
    vpdpbusd        m4, m3, m9
    vpdpbusd        m1, m3, m10
    packssdw        m4, m1
    mova            m1, m0
    psraw           m0, m4, 4               ; g4 h4 e5 f5 c6 d6 a7 b7
    packuswb        m1, m0                  ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
    pshufd          m2, m1, q3333           ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
    vpermt2d        m5, m12, m1             ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
    vextracti32x4   [r6+strideq*0+16], m5, 2
    vextracti32x4   [r6+strideq*1+16], m5, 3
    lea             r6, [r6+strideq*2]
    sub             r5d, 2
    jg .w32_loop
    ; Drain: three more passes (r5d = 0, -2, -4) that skip the reload of
    ; left-column bytes from dst and permute m1 without a mask instead.
    vpermb          m3, m11, m1
    cmp             r5d, -6
    jg .w32_loop_tail
.ret:
    RET
.w8:
    vpermb          ym3, ym11, ymm2
.w8_loop:
    ; Two blocks per iteration; one finished pair of 8-pixel rows stored.
    vpbroadcastd    ym3{k1}, [tlq-4]        ; l3 l2 l1 __ b3 a3 t3 __
    mova            ym0, ym6
    vpdpbusd        ym0, ym2, ym7
    mova            ym1, ym6
    vpdpbusd        ym1, ym2, ym8
    sub             tlq, 2
    vpdpbusd        ym0, ym3, ym9
    vpdpbusd        ym1, ym3, ym10
    mova            ym3, ym5
    packssdw        ym0, ym1
    psraw           ym5, ym0, 4             ; c0 d0 a1 b1
    packuswb        ym3, ym5                ; a0 b0 c0 d0 __ __ a1 b1
    pshufd          ym2, ym3, q3333         ; d0 d0 d0 d0 b1 b1 b1 b1
    vpermb          ym3, ym11, ym3          ; a0 a1 b0 b1
    movq            [dstq+strideq*0], xm3
    movhps          [dstq+strideq*1], xm3
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w8_loop
    RET
.w4_loop:
    vpbroadcastd    xmm3, [tlq-4]           ; l3 l2 l1 __
    mova            xmm0, xm6
    vpdpbusd        xmm0, xmm2, xm7
    mova            xmm1, xm6
    vpdpbusd        xmm1, xmm2, xm8
    sub             tlq, 2
    vpdpbusd        xmm0, xmm3, xm9
    vpdpbusd        xmm1, xmm3, xm10
    packssdw        xmm0, xmm1
.w4:
    ; Entered directly from the prologue with the first block's sums
    ; already in xmm0, then re-entered from .w4_loop for each later block.
    psraw           xmm0, 4                 ; a0 b0
    packuswb        xmm0, xmm0
    movd            [dstq+strideq*0], xmm0
    pshufd          xmm2, xmm0, q1111       ; b0 b0 b0 b0
    movd            [dstq+strideq*1], xmm2
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w4_loop
    RET

%endif ; ARCH_X86_64