xref: /aosp_15_r20/external/libdav1d/src/x86/msac.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2019, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2019, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
; Read-only constants used by the symbol decoders below.
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64 ; avoids cacheline splits
30*c0909341SAndroid Build Coastguard Worker
; Per-symbol minimum-probability offsets, descending in steps of 4 down to 0.
; The decoders address this table backwards from pw_0xff00 ([rax+t2*2] with
; rax = pw_0xff00 and t2 = ~n_symbols), so it must stay immediately in front of
; pw_0xff00 and must keep ending with the 0 entry.
31*c0909341SAndroid Build Coastguard Workermin_prob:  dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
32*c0909341SAndroid Build Coastguard Workerpw_0xff00: times 8 dw 0xff00 ; word mask that keeps only the high byte of rng
33*c0909341SAndroid Build Coastguard Workerpw_32:     times 8 dw 32 ; subtracted from the min_prob vector for the second half of adapt16
34*c0909341SAndroid Build Coastguard Worker
; Architecture-dependent helpers:
;   resp   - reserve one pointer-sized struct field
;   movp   - SIMD move of one pointer-sized value (used to load msac.dif)
;   c_shuf - pshuflw selector that replicates the most significant 16-bit word
;            of the pointer-sized dif value (word 3 of 64 bits, word 1 of 32)
;   DECODE_SYMBOL_ADAPT_INIT [hi_tok] - prologue of the symbol decoders
35*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
36*c0909341SAndroid Build Coastguard Worker%define resp   resq
37*c0909341SAndroid Build Coastguard Worker%define movp   movq
38*c0909341SAndroid Build Coastguard Worker%define c_shuf q3333
; x86-64: the prologue expands to nothing; the optional argument is accepted
; and ignored. NOTE(review): this relies on t0-t2 being the registers that
; already carry the first three arguments - confirm against x86inc.
39*c0909341SAndroid Build Coastguard Worker%macro DECODE_SYMBOL_ADAPT_INIT 0-1
40*c0909341SAndroid Build Coastguard Worker%endmacro
41*c0909341SAndroid Build Coastguard Worker%else
42*c0909341SAndroid Build Coastguard Worker%define resp   resd
43*c0909341SAndroid Build Coastguard Worker%define movp   movd
44*c0909341SAndroid Build Coastguard Worker%define c_shuf q1111
; x86-32: load the arguments from their stack slots into t0/t1 (and t2 unless
; hi_tok is non-zero), then reserve 40 bytes of scratch stack (36 for hi_tok).
45*c0909341SAndroid Build Coastguard Worker%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
46*c0909341SAndroid Build Coastguard Worker    mov            t0, r0m
47*c0909341SAndroid Build Coastguard Worker    mov            t1, r1m
48*c0909341SAndroid Build Coastguard Worker%if %1 == 0
49*c0909341SAndroid Build Coastguard Worker    mov            t2, r2m
50*c0909341SAndroid Build Coastguard Worker%endif
51*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
52*c0909341SAndroid Build Coastguard Worker    sub           esp, 40-%1*4
53*c0909341SAndroid Build Coastguard Worker%else
    ; The incoming stack is not known to be 16-byte aligned: align it by hand
    ; and keep the original esp at [esp], from where the epilogue at .renorm2
    ; restores it with "mov esp, [esp]".
54*c0909341SAndroid Build Coastguard Worker    mov           eax, esp
55*c0909341SAndroid Build Coastguard Worker    and           esp, ~15
56*c0909341SAndroid Build Coastguard Worker    sub           esp, 40-%1*4
57*c0909341SAndroid Build Coastguard Worker    mov         [esp], eax
58*c0909341SAndroid Build Coastguard Worker%endif
59*c0909341SAndroid Build Coastguard Worker%endmacro
60*c0909341SAndroid Build Coastguard Worker%endif
61*c0909341SAndroid Build Coastguard Worker
; Field offsets of the decoder context addressed through t0/t7 below.
; NOTE(review): presumably this has to mirror the layout of the C-side context
; structure - confirm against the C header whenever either side changes.
62*c0909341SAndroid Build Coastguard Workerstruc msac
63*c0909341SAndroid Build Coastguard Worker    .buf:        resp 1 ; current read position in the input (advanced by the refill code)
64*c0909341SAndroid Build Coastguard Worker    .end:        resp 1 ; end of the input; the refill code never loads past it
65*c0909341SAndroid Build Coastguard Worker    .dif:        resp 1 ; pointer-sized decoding window; compared against v in its top 16 bits
66*c0909341SAndroid Build Coastguard Worker    .rng:        resd 1 ; current range
67*c0909341SAndroid Build Coastguard Worker    .cnt:        resd 1 ; bit counter; a refill is triggered when it drops below zero
68*c0909341SAndroid Build Coastguard Worker    .update_cdf: resd 1 ; non-zero enables CDF adaptation
69*c0909341SAndroid Build Coastguard Workerendstruc
70*c0909341SAndroid Build Coastguard Worker
; m(x, y): fully mangled name of another function in this file
; (private_prefix + "_" + x + y). It is used to reach local labels of
; msac_decode_symbol_adapt4 from the other functions, e.g.
; m(msac_decode_symbol_adapt4, SUFFIX).renorm.
71*c0909341SAndroid Build Coastguard Worker%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
72*c0909341SAndroid Build Coastguard Worker
73*c0909341SAndroid Build Coastguard WorkerSECTION .text
74*c0909341SAndroid Build Coastguard Worker
; Per-ABI mapping of the temporaries t0..t8 onto x86inc register numbers, and
; the address of the scratch area `buf` that receives rng and the v[] vector.
; Some temporaries share a register on purpose: t7 is the same register as t3
; on WIN64 and as t0 elsewhere; on x86-32 t6 is also t4 and t8 is also t1.
75*c0909341SAndroid Build Coastguard Worker%if WIN64
76*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
77*c0909341SAndroid Build Coastguard Worker%define buf rsp+stack_offset+8 ; shadow space
78*c0909341SAndroid Build Coastguard Worker%elif UNIX64
79*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
80*c0909341SAndroid Build Coastguard Worker%define buf rsp-40 ; red zone
81*c0909341SAndroid Build Coastguard Worker%else
82*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
83*c0909341SAndroid Build Coastguard Worker%define buf esp+8 ; inside the frame reserved by DECODE_SYMBOL_ADAPT_INIT
84*c0909341SAndroid Build Coastguard Worker%endif
85*c0909341SAndroid Build Coastguard Worker
; Everything below is built as SSE2 code (x86inc: mN are xmm registers).
86*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse2
;------------------------------------------------------------------------------
; msac_decode_symbol_adapt4
; In:   t0 = msac context, t1 = cdf (16-bit entries), t2 = n_symbols
;       (taken from the stack by DECODE_SYMBOL_ADAPT_INIT on x86-32)
; Out:  eax = index of the decoded symbol
; Decodes one symbol against the four CDF words loaded from [t1]. When
; msac.update_cdf is non-zero the CDF and the counter kept at cdf[n_symbols]
; are adapted. Finally rng/dif/cnt are renormalized and dif is refilled from
; the input if needed.
;
; The labels .renorm, .renorm2, .renorm3, .renorm4 and .renorm5 are also
; entered from the other functions in this file, each with the register state
; described at the label.
;------------------------------------------------------------------------------
87*c0909341SAndroid Build Coastguard Workercglobal msac_decode_symbol_adapt4, 0, 6, 6
88*c0909341SAndroid Build Coastguard Worker    DECODE_SYMBOL_ADAPT_INIT
89*c0909341SAndroid Build Coastguard Worker    LEA           rax, pw_0xff00 ; base for both the mask and the min_prob lookup
90*c0909341SAndroid Build Coastguard Worker    movd           m2, [t0+msac.rng]
91*c0909341SAndroid Build Coastguard Worker    movq           m1, [t1] ; cdf[0..3]
92*c0909341SAndroid Build Coastguard Worker    movp           m3, [t0+msac.dif]
93*c0909341SAndroid Build Coastguard Worker    mov           t3d, [t0+msac.update_cdf]
94*c0909341SAndroid Build Coastguard Worker    mov           t4d, t2d ; keep n_symbols; t2 becomes the table offset
95*c0909341SAndroid Build Coastguard Worker    not            t2     ; -(n_symbols + 1)
96*c0909341SAndroid Build Coastguard Worker    pshuflw        m2, m2, q0000 ; rng replicated into the four low words
97*c0909341SAndroid Build Coastguard Worker    movd     [buf+12], m2 ; rng lands at buf+14, directly below v[0]
98*c0909341SAndroid Build Coastguard Worker    pand           m2, [rax] ; rng & 0xff00
99*c0909341SAndroid Build Coastguard Worker    mova           m0, m1 ; m0 = unmodified cdf for the update step
    ; v[i] = (((cdf[i] >> 6) << 7) * (rng & 0xff00) >> 16) + min_prob entry
100*c0909341SAndroid Build Coastguard Worker    psrlw          m1, 6
101*c0909341SAndroid Build Coastguard Worker    psllw          m1, 7
102*c0909341SAndroid Build Coastguard Worker    pmulhuw        m1, m2
103*c0909341SAndroid Build Coastguard Worker    movq           m2, [rax+t2*2] ; last n_symbols+1 entries of min_prob
104*c0909341SAndroid Build Coastguard Worker    pshuflw        m3, m3, c_shuf ; c = top 16 bits of dif in every low word
105*c0909341SAndroid Build Coastguard Worker    paddw          m1, m2
106*c0909341SAndroid Build Coastguard Worker    mova     [buf+16], m1 ; spill v[] so that .renorm can index it
107*c0909341SAndroid Build Coastguard Worker    psubusw        m1, m3 ; saturating v - c is zero exactly when c >= v
108*c0909341SAndroid Build Coastguard Worker    pxor           m2, m2
109*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m1, m2 ; c >= v
110*c0909341SAndroid Build Coastguard Worker    pmovmskb      eax, m1 ; two mask bits per word
111*c0909341SAndroid Build Coastguard Worker    test          t3d, t3d
112*c0909341SAndroid Build Coastguard Worker    jz .renorm ; !allow_update_cdf
113*c0909341SAndroid Build Coastguard Worker
114*c0909341SAndroid Build Coastguard Worker; update_cdf:
115*c0909341SAndroid Build Coastguard Worker    movzx         t3d, word [t1+t4*2] ; count
116*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m2, m2 ; all ones
117*c0909341SAndroid Build Coastguard Worker    mov           t2d, t3d
118*c0909341SAndroid Build Coastguard Worker    shr           t3d, 4
119*c0909341SAndroid Build Coastguard Worker    cmp           t4d, 3
120*c0909341SAndroid Build Coastguard Worker    sbb           t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
121*c0909341SAndroid Build Coastguard Worker    cmp           t2d, 32
122*c0909341SAndroid Build Coastguard Worker    adc           t2d, 0  ; count + (count < 32)
123*c0909341SAndroid Build Coastguard Worker    movd           m3, t3d ; rate
124*c0909341SAndroid Build Coastguard Worker    pavgw          m2, m1 ; i >= val ? -1 : 32768
125*c0909341SAndroid Build Coastguard Worker    psubw          m2, m0 ; for (i = 0; i < val; i++)
126*c0909341SAndroid Build Coastguard Worker    psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
127*c0909341SAndroid Build Coastguard Worker    psraw          m2, m3 ; for (; i < n_symbols; i++)
128*c0909341SAndroid Build Coastguard Worker    paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
129*c0909341SAndroid Build Coastguard Worker    movq         [t1], m0
130*c0909341SAndroid Build Coastguard Worker    mov     [t1+t4*2], t2w ; written after the vector store, so the counter slot ends up with the new count
131*c0909341SAndroid Build Coastguard Worker
; .renorm: eax = comparison mask from pmovmskb, t0 = context, v[] at buf+16
; with rng in the word just below it (buf+14).
132*c0909341SAndroid Build Coastguard Worker.renorm:
133*c0909341SAndroid Build Coastguard Worker    tzcnt         eax, eax ; 2 * index of the first set word
134*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif]
135*c0909341SAndroid Build Coastguard Worker    movzx         t1d, word [buf+rax+16] ; v
136*c0909341SAndroid Build Coastguard Worker    movzx         t2d, word [buf+rax+14] ; u
137*c0909341SAndroid Build Coastguard Worker    shr           eax, 1 ; byte offset -> symbol index (result)
; .renorm2: t1d = v, t2d = u, t4 = dif, t0 = context, eax = result.
; On x86-32 the frame set up by DECODE_SYMBOL_ADAPT_INIT is released here.
138*c0909341SAndroid Build Coastguard Worker.renorm2:
139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
140*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
141*c0909341SAndroid Build Coastguard Worker    add           esp, 40
142*c0909341SAndroid Build Coastguard Worker%else
143*c0909341SAndroid Build Coastguard Worker    mov           esp, [esp]
144*c0909341SAndroid Build Coastguard Worker%endif
145*c0909341SAndroid Build Coastguard Worker%endif
146*c0909341SAndroid Build Coastguard Worker    sub           t2d, t1d ; rng
147*c0909341SAndroid Build Coastguard Worker    shl            t1, gprsize*8-16 ; move v to the top 16 bits of the register
148*c0909341SAndroid Build Coastguard Worker    sub            t4, t1  ; dif - v
; .renorm3: t2d = new (not yet normalized) rng, t4 = new dif, t0 = context.
149*c0909341SAndroid Build Coastguard Worker.renorm3:
150*c0909341SAndroid Build Coastguard Worker    mov           t1d, [t0+msac.cnt]
151*c0909341SAndroid Build Coastguard Worker    movifnidn      t7, t0 ; from here on the context is addressed through t7
; .renorm4: as .renorm3, but t1d = cnt and t7 = context are already loaded.
152*c0909341SAndroid Build Coastguard Worker.renorm4:
153*c0909341SAndroid Build Coastguard Worker    bsr           ecx, t2d
154*c0909341SAndroid Build Coastguard Worker    xor           ecx, 15  ; d
; .renorm5: as .renorm4, with the shift amount d already in ecx.
155*c0909341SAndroid Build Coastguard Worker.renorm5:
156*c0909341SAndroid Build Coastguard Worker    shl           t2d, cl
157*c0909341SAndroid Build Coastguard Worker    shl            t4, cl
158*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.rng], t2d
159*c0909341SAndroid Build Coastguard Worker    sub           t1d, ecx ; cnt -= d
160*c0909341SAndroid Build Coastguard Worker    jae .end ; no refill required
161*c0909341SAndroid Build Coastguard Worker
162*c0909341SAndroid Build Coastguard Worker; refill:
163*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
164*c0909341SAndroid Build Coastguard Worker    push           t5 ; t5 is needed as scratch; restored at .refill_end2
165*c0909341SAndroid Build Coastguard Worker%endif
166*c0909341SAndroid Build Coastguard Worker    mov            t2, [t7+msac.buf]
167*c0909341SAndroid Build Coastguard Worker    mov            t5, [t7+msac.end]
168*c0909341SAndroid Build Coastguard Worker    lea           rcx, [t2+gprsize]
169*c0909341SAndroid Build Coastguard Worker    sub           rcx, t5 ; rcx = buf + gprsize - end
170*c0909341SAndroid Build Coastguard Worker    ja .refill_eob ; a full register-sized load would run past the end
171*c0909341SAndroid Build Coastguard Worker    mov            t5, [t2]
172*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t1+16-gprsize*8]
    ; The input bytes are complemented and byte-swapped before being shifted
    ; into position and merged into dif.
173*c0909341SAndroid Build Coastguard Worker    not            t5
174*c0909341SAndroid Build Coastguard Worker    bswap          t5
175*c0909341SAndroid Build Coastguard Worker    shr            t5, cl
176*c0909341SAndroid Build Coastguard Worker    neg           ecx
177*c0909341SAndroid Build Coastguard Worker    shr           ecx, 3 ; num_bytes_read
178*c0909341SAndroid Build Coastguard Worker    or             t4, t5
179*c0909341SAndroid Build Coastguard Worker.refill_end:
180*c0909341SAndroid Build Coastguard Worker    add            t2, rcx ; buf += num_bytes_read
181*c0909341SAndroid Build Coastguard Worker    lea           t1d, [t1+rcx*8] ; cnt += num_bits_read
182*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.buf], t2
183*c0909341SAndroid Build Coastguard Worker.refill_end2:
184*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
185*c0909341SAndroid Build Coastguard Worker    pop            t5
186*c0909341SAndroid Build Coastguard Worker%endif
187*c0909341SAndroid Build Coastguard Worker.end:
188*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.cnt], t1d
189*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.dif], t4
190*c0909341SAndroid Build Coastguard Worker    RET
; .pad_with_ones: nothing is left to read. A mask derived from cnt is ORed
; into dif; msac.buf is not advanced and cnt is stored unchanged at .end.
191*c0909341SAndroid Build Coastguard Worker.pad_with_ones:
192*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t1-16]
193*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
194*c0909341SAndroid Build Coastguard Worker    ror           rcx, cl
195*c0909341SAndroid Build Coastguard Worker%else
196*c0909341SAndroid Build Coastguard Worker    shr           ecx, cl
197*c0909341SAndroid Build Coastguard Worker%endif
198*c0909341SAndroid Build Coastguard Worker    or             t4, rcx
199*c0909341SAndroid Build Coastguard Worker    jmp .refill_end2
; .refill_eob: t2 = buf, t5 = end, rcx = buf + gprsize - end (> 0), i.e. the
; number of bytes by which a full load from buf would overrun.
200*c0909341SAndroid Build Coastguard Worker.refill_eob: ; avoid overreading the input buffer
201*c0909341SAndroid Build Coastguard Worker    cmp            t2, t5
202*c0909341SAndroid Build Coastguard Worker    jae .pad_with_ones ; eob reached
203*c0909341SAndroid Build Coastguard Worker    ; We can safely do a register-sized load of the last bytes of the buffer
204*c0909341SAndroid Build Coastguard Worker    ; as this code is only reached if the msac buffer size is >= gprsize.
205*c0909341SAndroid Build Coastguard Worker    mov            t5, [t5-gprsize]
206*c0909341SAndroid Build Coastguard Worker    shl           ecx, 3 ; overrun in bits
207*c0909341SAndroid Build Coastguard Worker    shr            t5, cl ; drop the bytes that precede buf
208*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t1+16-gprsize*8]
209*c0909341SAndroid Build Coastguard Worker    not            t5
210*c0909341SAndroid Build Coastguard Worker    bswap          t5
211*c0909341SAndroid Build Coastguard Worker    shr            t5, cl
212*c0909341SAndroid Build Coastguard Worker    neg           ecx
213*c0909341SAndroid Build Coastguard Worker    or             t4, t5
214*c0909341SAndroid Build Coastguard Worker    mov           t5d, [t7+msac.end]
215*c0909341SAndroid Build Coastguard Worker    shr           ecx, 3
216*c0909341SAndroid Build Coastguard Worker    sub           t5d, t2d ; num_bytes_left
217*c0909341SAndroid Build Coastguard Worker    cmp           ecx, t5d
218*c0909341SAndroid Build Coastguard Worker    cmovae        ecx, t5d ; num_bytes_read
219*c0909341SAndroid Build Coastguard Worker    jmp .refill_end
220*c0909341SAndroid Build Coastguard Worker
;------------------------------------------------------------------------------
; msac_decode_symbol_adapt8
; In:   t0 = msac context, t1 = cdf (16-bit entries), t2 = n_symbols
; Out:  eax = index of the decoded symbol (produced by the shared tail)
; Eight-word variant of msac_decode_symbol_adapt4: the CDF is read and written
; as one full vector with aligned moves (mova), and rng/c are broadcast to all
; eight words. Both exits jump into msac_decode_symbol_adapt4's .renorm, which
; uses the same buf layout (rng at buf+14, v[] at buf+16).
;------------------------------------------------------------------------------
221*c0909341SAndroid Build Coastguard Workercglobal msac_decode_symbol_adapt8, 0, 6, 6
222*c0909341SAndroid Build Coastguard Worker    DECODE_SYMBOL_ADAPT_INIT
223*c0909341SAndroid Build Coastguard Worker    LEA           rax, pw_0xff00
224*c0909341SAndroid Build Coastguard Worker    movd           m2, [t0+msac.rng]
225*c0909341SAndroid Build Coastguard Worker    mova           m1, [t1] ; cdf[0..7]
226*c0909341SAndroid Build Coastguard Worker    movp           m3, [t0+msac.dif]
227*c0909341SAndroid Build Coastguard Worker    mov           t3d, [t0+msac.update_cdf]
228*c0909341SAndroid Build Coastguard Worker    mov           t4d, t2d ; keep n_symbols
229*c0909341SAndroid Build Coastguard Worker    not            t2 ; -(n_symbols + 1), offset into min_prob
230*c0909341SAndroid Build Coastguard Worker    pshuflw        m2, m2, q0000
231*c0909341SAndroid Build Coastguard Worker    movd     [buf+12], m2 ; rng lands at buf+14, directly below v[0]
232*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m2, m2 ; rng in all eight words
233*c0909341SAndroid Build Coastguard Worker    mova           m0, m1 ; m0 = unmodified cdf for the update step
234*c0909341SAndroid Build Coastguard Worker    psrlw          m1, 6
235*c0909341SAndroid Build Coastguard Worker    pand           m2, [rax] ; rng & 0xff00
236*c0909341SAndroid Build Coastguard Worker    psllw          m1, 7
237*c0909341SAndroid Build Coastguard Worker    pmulhuw        m1, m2
238*c0909341SAndroid Build Coastguard Worker    movu           m2, [rax+t2*2] ; min_prob tail; unaligned because the offset depends on n_symbols
239*c0909341SAndroid Build Coastguard Worker    pshuflw        m3, m3, c_shuf
240*c0909341SAndroid Build Coastguard Worker    paddw          m1, m2 ; v[]
241*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m3, m3 ; c in all eight words
242*c0909341SAndroid Build Coastguard Worker    mova     [buf+16], m1
243*c0909341SAndroid Build Coastguard Worker    psubusw        m1, m3
244*c0909341SAndroid Build Coastguard Worker    pxor           m2, m2
245*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m1, m2 ; c >= v
246*c0909341SAndroid Build Coastguard Worker    pmovmskb      eax, m1
247*c0909341SAndroid Build Coastguard Worker    test          t3d, t3d
248*c0909341SAndroid Build Coastguard Worker    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm ; adaptation disabled
    ; CDF update, same arithmetic as in msac_decode_symbol_adapt4.
249*c0909341SAndroid Build Coastguard Worker    movzx         t3d, word [t1+t4*2] ; count
250*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m2, m2
251*c0909341SAndroid Build Coastguard Worker    mov           t2d, t3d
252*c0909341SAndroid Build Coastguard Worker    shr           t3d, 4
253*c0909341SAndroid Build Coastguard Worker    cmp           t4d, 3 ; may be called with n_symbols <= 2
254*c0909341SAndroid Build Coastguard Worker    sbb           t3d, -5 ; rate = (count >> 4) + (n_symbols > 2) + 4
255*c0909341SAndroid Build Coastguard Worker    cmp           t2d, 32
256*c0909341SAndroid Build Coastguard Worker    adc           t2d, 0 ; count + (count < 32)
257*c0909341SAndroid Build Coastguard Worker    movd           m3, t3d
258*c0909341SAndroid Build Coastguard Worker    pavgw          m2, m1 ; i >= val ? -1 : 32768
259*c0909341SAndroid Build Coastguard Worker    psubw          m2, m0
260*c0909341SAndroid Build Coastguard Worker    psubw          m0, m1
261*c0909341SAndroid Build Coastguard Worker    psraw          m2, m3
262*c0909341SAndroid Build Coastguard Worker    paddw          m0, m2
263*c0909341SAndroid Build Coastguard Worker    mova         [t1], m0
264*c0909341SAndroid Build Coastguard Worker    mov     [t1+t4*2], t2w ; store the new count after the vector store
265*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
266*c0909341SAndroid Build Coastguard Worker
;------------------------------------------------------------------------------
; msac_decode_symbol_adapt16
; In:   t0 = msac context, t1 = cdf (16-bit entries), t2 = n_symbols
; Out:  eax = index of the decoded symbol
; Sixteen-word variant working on two vectors (cdf[0..7] and cdf[8..15]).
; The scratch layout differs from the 4/8 variants: rng is stored at buf-4
; (so that it occupies the word at buf-2), v[0..7] at buf and v[8..15] at
; buf+16. It therefore has its own .renorm, which continues at
; msac_decode_symbol_adapt4's .renorm2.
;------------------------------------------------------------------------------
267*c0909341SAndroid Build Coastguard Workercglobal msac_decode_symbol_adapt16, 0, 6, 6
268*c0909341SAndroid Build Coastguard Worker    DECODE_SYMBOL_ADAPT_INIT
269*c0909341SAndroid Build Coastguard Worker    LEA           rax, pw_0xff00
270*c0909341SAndroid Build Coastguard Worker    movd           m4, [t0+msac.rng]
271*c0909341SAndroid Build Coastguard Worker    mova           m2, [t1] ; cdf[0..7]
272*c0909341SAndroid Build Coastguard Worker    mova           m3, [t1+16] ; cdf[8..15]
273*c0909341SAndroid Build Coastguard Worker    movp           m5, [t0+msac.dif]
274*c0909341SAndroid Build Coastguard Worker    mov           t3d, [t0+msac.update_cdf]
275*c0909341SAndroid Build Coastguard Worker    mov           t4d, t2d ; keep n_symbols
276*c0909341SAndroid Build Coastguard Worker    not            t2 ; -(n_symbols + 1), offset into min_prob
277*c0909341SAndroid Build Coastguard Worker%if WIN64
278*c0909341SAndroid Build Coastguard Worker    sub           rsp, 48 ; need 36 bytes, shadow space is only 32
279*c0909341SAndroid Build Coastguard Worker%endif
280*c0909341SAndroid Build Coastguard Worker    pshuflw        m4, m4, q0000
281*c0909341SAndroid Build Coastguard Worker    movd      [buf-4], m4 ; rng ends up in the word directly below v[0]
282*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m4, m4 ; rng in all eight words
283*c0909341SAndroid Build Coastguard Worker    mova           m0, m2 ; m0/m1 = unmodified cdf halves for the update step
284*c0909341SAndroid Build Coastguard Worker    psrlw          m2, 6
285*c0909341SAndroid Build Coastguard Worker    mova           m1, m3
286*c0909341SAndroid Build Coastguard Worker    psrlw          m3, 6
287*c0909341SAndroid Build Coastguard Worker    pand           m4, [rax] ; rng & 0xff00
288*c0909341SAndroid Build Coastguard Worker    psllw          m2, 7
289*c0909341SAndroid Build Coastguard Worker    psllw          m3, 7
290*c0909341SAndroid Build Coastguard Worker    pmulhuw        m2, m4
291*c0909341SAndroid Build Coastguard Worker    pmulhuw        m3, m4
292*c0909341SAndroid Build Coastguard Worker    movu           m4, [rax+t2*2] ; min_prob entries for symbols 0..7
293*c0909341SAndroid Build Coastguard Worker    pshuflw        m5, m5, c_shuf
294*c0909341SAndroid Build Coastguard Worker    paddw          m2, m4
295*c0909341SAndroid Build Coastguard Worker    psubw          m4, [rax-pw_0xff00+pw_32] ; entries for symbols 8..15 are 8 steps of 4 (= 32) lower
296*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m5, m5 ; c in all eight words
297*c0909341SAndroid Build Coastguard Worker    paddw          m3, m4
298*c0909341SAndroid Build Coastguard Worker    mova        [buf], m2 ; v[0..7]
299*c0909341SAndroid Build Coastguard Worker    psubusw        m2, m5
300*c0909341SAndroid Build Coastguard Worker    mova     [buf+16], m3 ; v[8..15]
301*c0909341SAndroid Build Coastguard Worker    psubusw        m3, m5
302*c0909341SAndroid Build Coastguard Worker    pxor           m4, m4
303*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m2, m4 ; c >= v for symbols 0..7
304*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m3, m4 ; c >= v for symbols 8..15
305*c0909341SAndroid Build Coastguard Worker    packsswb       m5, m2, m3 ; narrow both word masks to bytes: one mask bit per symbol
306*c0909341SAndroid Build Coastguard Worker    pmovmskb      eax, m5
307*c0909341SAndroid Build Coastguard Worker    test          t3d, t3d
308*c0909341SAndroid Build Coastguard Worker    jz .renorm ; adaptation disabled
309*c0909341SAndroid Build Coastguard Worker    movzx         t3d, word [t1+t4*2] ; count
310*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m4, m4
311*c0909341SAndroid Build Coastguard Worker    mova           m5, m4
312*c0909341SAndroid Build Coastguard Worker    lea           t2d, [t3+80] ; only support n_symbols > 2
313*c0909341SAndroid Build Coastguard Worker    shr           t2d, 4 ; rate = (count + 80) >> 4 = (count >> 4) + 5
314*c0909341SAndroid Build Coastguard Worker    cmp           t3d, 32
315*c0909341SAndroid Build Coastguard Worker    adc           t3d, 0 ; count + (count < 32)
316*c0909341SAndroid Build Coastguard Worker    pavgw          m4, m2 ; i >= val ? -1 : 32768
317*c0909341SAndroid Build Coastguard Worker    pavgw          m5, m3
318*c0909341SAndroid Build Coastguard Worker    psubw          m4, m0
319*c0909341SAndroid Build Coastguard Worker    psubw          m0, m2
320*c0909341SAndroid Build Coastguard Worker    movd           m2, t2d ; m2 is free now and carries the shift count
321*c0909341SAndroid Build Coastguard Worker    psubw          m5, m1
322*c0909341SAndroid Build Coastguard Worker    psubw          m1, m3
323*c0909341SAndroid Build Coastguard Worker    psraw          m4, m2
324*c0909341SAndroid Build Coastguard Worker    psraw          m5, m2
325*c0909341SAndroid Build Coastguard Worker    paddw          m0, m4
326*c0909341SAndroid Build Coastguard Worker    paddw          m1, m5
327*c0909341SAndroid Build Coastguard Worker    mova         [t1], m0
328*c0909341SAndroid Build Coastguard Worker    mova      [t1+16], m1
329*c0909341SAndroid Build Coastguard Worker    mov     [t1+t4*2], t3w ; store the new count after the vector stores
; .renorm: eax = one-bit-per-symbol mask, so tzcnt yields the symbol index
; directly and the shared tail is entered at .renorm2 (after its "shr eax, 1").
330*c0909341SAndroid Build Coastguard Worker.renorm:
331*c0909341SAndroid Build Coastguard Worker    tzcnt         eax, eax
332*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif]
333*c0909341SAndroid Build Coastguard Worker    movzx         t1d, word [buf+rax*2] ; v
334*c0909341SAndroid Build Coastguard Worker    movzx         t2d, word [buf+rax*2-2] ; u (rng when the index is 0)
335*c0909341SAndroid Build Coastguard Worker%if WIN64
336*c0909341SAndroid Build Coastguard Worker    add           rsp, 48 ; undo the extra scratch allocation
337*c0909341SAndroid Build Coastguard Worker%endif
338*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
339*c0909341SAndroid Build Coastguard Worker
;------------------------------------------------------------------------------
; msac_decode_bool_adapt
; In:   r0 = msac context (-> t0), r1 = cdf (-> t1); cdf[0] is the probability
;       word and cdf[1] the adaptation counter
; Out:  eax = decoded bit (1 when dif < vw)
; Computes v = (((cdf[0] & ~63) * (rng >> 8)) >> 7) + 4, picks the bit,
; optionally adapts cdf[0]/cdf[1], then continues in the shared renormalize /
; refill tail of msac_decode_symbol_adapt4.
;------------------------------------------------------------------------------
340*c0909341SAndroid Build Coastguard Workercglobal msac_decode_bool_adapt, 0, 6, 0
341*c0909341SAndroid Build Coastguard Worker    movifnidn      t1, r1mp
342*c0909341SAndroid Build Coastguard Worker    movifnidn      t0, r0mp
343*c0909341SAndroid Build Coastguard Worker    movzx         eax, word [t1] ; cdf[0]
344*c0909341SAndroid Build Coastguard Worker    movzx         t3d, byte [t0+msac.rng+1] ; r >> 8
345*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif]
346*c0909341SAndroid Build Coastguard Worker    mov           t2d, [t0+msac.rng]
347*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
348*c0909341SAndroid Build Coastguard Worker    mov           t5d, eax ; keep cdf[0] for the update (x86-32 reloads it instead)
349*c0909341SAndroid Build Coastguard Worker%endif
350*c0909341SAndroid Build Coastguard Worker    and           eax, ~63
351*c0909341SAndroid Build Coastguard Worker    imul          eax, t3d
352*c0909341SAndroid Build Coastguard Worker%if UNIX64
353*c0909341SAndroid Build Coastguard Worker    mov            t6, t4 ; copy of dif for the cmovb below
354*c0909341SAndroid Build Coastguard Worker%endif
355*c0909341SAndroid Build Coastguard Worker    shr           eax, 7
356*c0909341SAndroid Build Coastguard Worker    add           eax, 4            ; v
357*c0909341SAndroid Build Coastguard Worker    mov           t3d, eax
358*c0909341SAndroid Build Coastguard Worker    shl           rax, gprsize*8-16 ; vw
359*c0909341SAndroid Build Coastguard Worker    sub           t2d, t3d          ; r - v
360*c0909341SAndroid Build Coastguard Worker    sub            t4, rax          ; dif - vw
    ; CF = (dif < vw) selects the bit. Everything up to "test" below either
    ; consumes CF or leaves the flags untouched.
361*c0909341SAndroid Build Coastguard Worker    setb           al
362*c0909341SAndroid Build Coastguard Worker    cmovb         t2d, t3d ; bit set: rng = v
363*c0909341SAndroid Build Coastguard Worker    mov           t3d, [t0+msac.update_cdf]
364*c0909341SAndroid Build Coastguard Worker%if UNIX64
365*c0909341SAndroid Build Coastguard Worker    cmovb          t4, t6 ; bit set: dif stays unchanged
366*c0909341SAndroid Build Coastguard Worker%else
367*c0909341SAndroid Build Coastguard Worker    cmovb          t4, [t0+msac.dif] ; bit set: dif stays unchanged
368*c0909341SAndroid Build Coastguard Worker%endif
369*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
370*c0909341SAndroid Build Coastguard Worker    movzx         eax, al ; after the 16-bit shift the upper half of eax still holds v
371*c0909341SAndroid Build Coastguard Worker%endif
372*c0909341SAndroid Build Coastguard Worker    test          t3d, t3d
373*c0909341SAndroid Build Coastguard Worker    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 ; adaptation disabled
374*c0909341SAndroid Build Coastguard Worker%if UNIX64 == 0
375*c0909341SAndroid Build Coastguard Worker    push           t6 ; on x86-32 t6 is the same register as t4, so this preserves dif
376*c0909341SAndroid Build Coastguard Worker%endif
377*c0909341SAndroid Build Coastguard Worker    movzx         t6d, word [t1+2] ; count
378*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
379*c0909341SAndroid Build Coastguard Worker    push           t5
380*c0909341SAndroid Build Coastguard Worker    movzx         t5d, word [t1] ; cdf[0]
381*c0909341SAndroid Build Coastguard Worker%endif
    ; NOTE(review): the context is copied to t7 before ecx is written below;
    ; presumably t0 is rcx on WIN64, which is also why that path skips
    ; .renorm3 (it reads t0) - confirm against x86inc's register order.
382*c0909341SAndroid Build Coastguard Worker    movifnidn      t7, t0
383*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t6+64]
384*c0909341SAndroid Build Coastguard Worker    cmp           t6d, 32
385*c0909341SAndroid Build Coastguard Worker    adc           t6d, 0 ; count + (count < 32)
386*c0909341SAndroid Build Coastguard Worker    mov        [t1+2], t6w
387*c0909341SAndroid Build Coastguard Worker    imul          t6d, eax, -32769 ; bit ? -32769 : 0
388*c0909341SAndroid Build Coastguard Worker    shr           ecx, 4   ; rate
389*c0909341SAndroid Build Coastguard Worker    add           t6d, t5d ; if (bit)
390*c0909341SAndroid Build Coastguard Worker    sub           t5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
391*c0909341SAndroid Build Coastguard Worker    sar           t6d, cl  ; else
392*c0909341SAndroid Build Coastguard Worker    sub           t5d, t6d ;     cdf[0] -= cdf[0] >> rate;
393*c0909341SAndroid Build Coastguard Worker    mov          [t1], t5w
394*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; Load cnt through t7 and enter the tail at .renorm4, which expects
    ; t1d = cnt and t7 = context.
395*c0909341SAndroid Build Coastguard Worker    mov           t1d, [t7+msac.cnt]
396*c0909341SAndroid Build Coastguard Worker    pop            t6
397*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
398*c0909341SAndroid Build Coastguard Worker%else
399*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
400*c0909341SAndroid Build Coastguard Worker    pop            t5
401*c0909341SAndroid Build Coastguard Worker    pop            t6 ; restores dif into the register shared with t4
402*c0909341SAndroid Build Coastguard Worker%endif
403*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
404*c0909341SAndroid Build Coastguard Worker%endif
405*c0909341SAndroid Build Coastguard Worker
;------------------------------------------------------------------------------
; msac_decode_bool_equi
; In:   r0 = msac context (-> t0)
; Out:  eax = decoded bit (1 when dif < vw)
; Decodes a bit without a probability argument: v is derived from rng alone as
; ((rng & 0xff00) | 8) >> 1. The shift amount d is computed here without bsr,
; so the shared tail of msac_decode_symbol_adapt4 is entered at .renorm5 with
; t1d = cnt, t2d = rng, t4 = dif, t7 = context and ecx = d.
;------------------------------------------------------------------------------
406*c0909341SAndroid Build Coastguard Workercglobal msac_decode_bool_equi, 0, 6, 0
407*c0909341SAndroid Build Coastguard Worker    movifnidn      t0, r0mp
408*c0909341SAndroid Build Coastguard Worker    mov           t1d, [t0+msac.rng]
409*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif]
410*c0909341SAndroid Build Coastguard Worker    mov           t2d, t1d ; r
411*c0909341SAndroid Build Coastguard Worker    mov           t1b, 8 ; replace the low byte of rng: (rng & 0xff00) | 8
412*c0909341SAndroid Build Coastguard Worker    mov            t3, t4 ; copy of dif for the cmovb below
413*c0909341SAndroid Build Coastguard Worker    mov           eax, t1d
414*c0909341SAndroid Build Coastguard Worker    shr           t1d, 1            ; v
415*c0909341SAndroid Build Coastguard Worker    shl           rax, gprsize*8-17 ; vw
416*c0909341SAndroid Build Coastguard Worker    sub           t2d, t1d          ; r - v
417*c0909341SAndroid Build Coastguard Worker    sub            t4, rax          ; dif - vw
    ; CF = (dif < vw) stays valid until setb: the mov/cmov instructions in
    ; between do not modify the flags.
418*c0909341SAndroid Build Coastguard Worker    cmovb         t2d, t1d ; bit set: rng = v
419*c0909341SAndroid Build Coastguard Worker    mov           t1d, [t0+msac.cnt]
420*c0909341SAndroid Build Coastguard Worker    cmovb          t4, t3 ; bit set: dif stays unchanged
421*c0909341SAndroid Build Coastguard Worker    movifnidn      t7, t0
422*c0909341SAndroid Build Coastguard Worker    mov           ecx, 0xbfff
423*c0909341SAndroid Build Coastguard Worker    setb           al ; the upper 32 bits contain garbage but that's OK
424*c0909341SAndroid Build Coastguard Worker    sub           ecx, t2d
425*c0909341SAndroid Build Coastguard Worker    ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
426*c0909341SAndroid Build Coastguard Worker    ;   i.e. (0 <= d <= 2) and v < (3 << 14)
427*c0909341SAndroid Build Coastguard Worker    shr           ecx, 14           ; d
428*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
429*c0909341SAndroid Build Coastguard Worker    movzx         eax, al ; clear the leftover bits of vw above al
430*c0909341SAndroid Build Coastguard Worker%endif
431*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
432*c0909341SAndroid Build Coastguard Worker
;------------------------------------------------------------------------------
; msac_decode_bool
; In:   r0 = msac context (-> t0), r1 = probability value (-> t1d)
; Out:  eax = decoded bit (1 when dif < vw)
; Non-adaptive bit decode: v = (((rng >> 8) * (t1d & ~63)) >> 7) + 4. The
; result is handed to the shared tail of msac_decode_symbol_adapt4 at .renorm3
; with t2d = new rng, t4 = new dif and t0 = context.
;------------------------------------------------------------------------------
433*c0909341SAndroid Build Coastguard Workercglobal msac_decode_bool, 0, 6, 0
434*c0909341SAndroid Build Coastguard Worker    movifnidn      t0, r0mp
435*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, r1m
436*c0909341SAndroid Build Coastguard Worker    movzx         eax, byte [t0+msac.rng+1] ; r >> 8
437*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif]
438*c0909341SAndroid Build Coastguard Worker    mov           t2d, [t0+msac.rng]
439*c0909341SAndroid Build Coastguard Worker    and           t1d, ~63 ; drop the low 6 bits of the probability
440*c0909341SAndroid Build Coastguard Worker    imul          eax, t1d
441*c0909341SAndroid Build Coastguard Worker    mov            t3, t4 ; copy of dif for the cmovb below
442*c0909341SAndroid Build Coastguard Worker    shr           eax, 7
443*c0909341SAndroid Build Coastguard Worker    add           eax, 4            ; v
444*c0909341SAndroid Build Coastguard Worker    mov           t1d, eax
445*c0909341SAndroid Build Coastguard Worker    shl           rax, gprsize*8-16 ; vw
446*c0909341SAndroid Build Coastguard Worker    sub           t2d, t1d          ; r - v
447*c0909341SAndroid Build Coastguard Worker    sub            t4, rax          ; dif - vw
448*c0909341SAndroid Build Coastguard Worker    cmovb         t2d, t1d ; bit set: rng = v
449*c0909341SAndroid Build Coastguard Worker    cmovb          t4, t3 ; bit set: dif stays unchanged
450*c0909341SAndroid Build Coastguard Worker    setb           al ; bit = (dif < vw); cmov leaves CF intact
451*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
452*c0909341SAndroid Build Coastguard Worker    movzx         eax, al ; after the 16-bit shift the upper half of eax still holds v
453*c0909341SAndroid Build Coastguard Worker%endif
454*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
455*c0909341SAndroid Build Coastguard Worker
; HI_TOK update_cdf
; Loop body of msac_decode_hi_tok. %1 != 0 also adapts the CDF stored at [t1].
; Each iteration decodes one symbol from a 4-word CDF, renormalises, refills
; dif when cnt underflows, and adds the result to a running accumulator. The
; loop ends when that addition produces a carry. The macro ends in RET, so
; control never falls out of the bottom of an expansion.
; Register/memory roles as used below:
;   t1  = CDF pointer (words 0..2 compared, word 3 read/written as a counter)
;   t7  = context pointer (msac.rng/.dif/.cnt/.buf/.end)
;   t4  = dif, t5d = cnt
;   t6d = accumulator on x86-64; on x86-32 it lives in eax / [buf+8]
;   m0  = CDF words, m2 = rng (low word), m3 = dif, m4/m5 = constants loaded by
;         the caller
;   buf, c_shuf = defined outside this chunk (buf is scratch memory)
456*c0909341SAndroid Build Coastguard Worker%macro HI_TOK 1 ; update_cdf
457*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
458*c0909341SAndroid Build Coastguard Worker    mov           eax, -24 ; x86-32: accumulator starts at -24 (x86-64 caller sets t6d = -24)
459*c0909341SAndroid Build Coastguard Worker%endif
460*c0909341SAndroid Build Coastguard Worker%%loop:
461*c0909341SAndroid Build Coastguard Worker%if %1
462*c0909341SAndroid Build Coastguard Worker    movzx         t2d, word [t1+3*2] ; word 3 of the CDF: adaptation counter (see cmp/adc below)
463*c0909341SAndroid Build Coastguard Worker%endif
464*c0909341SAndroid Build Coastguard Worker    mova           m1, m0
465*c0909341SAndroid Build Coastguard Worker    pshuflw        m2, m2, q0000 ; replicate the low word (rng) across the low 4 words
466*c0909341SAndroid Build Coastguard Worker    psrlw          m1, 6 ; cdf >> 6
467*c0909341SAndroid Build Coastguard Worker    movd     [buf+12], m2 ; rng at buf+12 and buf+14: word before v[0], read as "previous v" for lane 0
468*c0909341SAndroid Build Coastguard Worker    pand           m2, m4 ; rng masked with the pw_0xff00 constant
469*c0909341SAndroid Build Coastguard Worker    psllw          m1, 7 ; (cdf >> 6) << 7
470*c0909341SAndroid Build Coastguard Worker    pmulhuw        m1, m2 ; high 16 bits of the unsigned product
471*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0
472*c0909341SAndroid Build Coastguard Worker    add           eax, 5 ; same +5 bias that x86-64 applies to t6d below
473*c0909341SAndroid Build Coastguard Worker    mov       [buf+8], eax ; spill the accumulator; eax is reused for the mask
474*c0909341SAndroid Build Coastguard Worker%endif
475*c0909341SAndroid Build Coastguard Worker    pshuflw        m3, m3, c_shuf ; pick the dif word(s) to compare against (c_shuf defined elsewhere)
476*c0909341SAndroid Build Coastguard Worker    paddw          m1, m5 ; add the per-lane constants from m5 -> v for each lane
477*c0909341SAndroid Build Coastguard Worker    movq     [buf+16], m1 ; v[0..3] at buf+16
478*c0909341SAndroid Build Coastguard Worker    psubusw        m1, m3 ; saturating v - c: zero exactly where v <= c
479*c0909341SAndroid Build Coastguard Worker    pxor           m2, m2
480*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m1, m2 ; mask = 0xffff in lanes where v <= c
481*c0909341SAndroid Build Coastguard Worker    pmovmskb      eax, m1 ; two identical mask bits per word lane
482*c0909341SAndroid Build Coastguard Worker%if %1
483*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t2+80]
484*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m2, m2 ; m2 = all ones
485*c0909341SAndroid Build Coastguard Worker    shr           ecx, 4 ; rate = (count + 80) >> 4
486*c0909341SAndroid Build Coastguard Worker    cmp           t2d, 32
487*c0909341SAndroid Build Coastguard Worker    adc           t2d, 0 ; count += (count < 32)
488*c0909341SAndroid Build Coastguard Worker    movd           m3, ecx ; shift amount; m3 (dif) is reloaded at %%end
489*c0909341SAndroid Build Coastguard Worker    pavgw          m2, m1 ; avg(0xffff, mask): 0xffff where mask set, 0x8000 where clear
490*c0909341SAndroid Build Coastguard Worker    psubw          m2, m0 ; minus the current cdf
491*c0909341SAndroid Build Coastguard Worker    psubw          m0, m1 ; cdf - mask, i.e. cdf + 1 in lanes where the mask is set
492*c0909341SAndroid Build Coastguard Worker    psraw          m2, m3 ; arithmetic shift of the difference by rate
493*c0909341SAndroid Build Coastguard Worker    paddw          m0, m2 ; adapted cdf
494*c0909341SAndroid Build Coastguard Worker    movq         [t1], m0 ; write back all 4 words ...
495*c0909341SAndroid Build Coastguard Worker    mov      [t1+3*2], t2w ; ... then overwrite word 3 with the updated counter
496*c0909341SAndroid Build Coastguard Worker%endif
497*c0909341SAndroid Build Coastguard Worker    tzcnt         eax, eax ; 2 * index of the first lane with v <= c (also a byte offset)
498*c0909341SAndroid Build Coastguard Worker    movzx         ecx, word [buf+rax+16] ; v of the selected lane
499*c0909341SAndroid Build Coastguard Worker    movzx         t2d, word [buf+rax+14] ; v of the previous lane, or rng for lane 0
500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
501*c0909341SAndroid Build Coastguard Worker    add           t6d, 5
502*c0909341SAndroid Build Coastguard Worker%endif
503*c0909341SAndroid Build Coastguard Worker    sub           eax, 5   ; setup for merging the tok_br and tok branches (eax = 2*lane - 5)
504*c0909341SAndroid Build Coastguard Worker    sub           t2d, ecx ; new range = previous v - selected v
505*c0909341SAndroid Build Coastguard Worker    shl           rcx, gprsize*8-16 ; selected v moved to the top 16 bits of the register
506*c0909341SAndroid Build Coastguard Worker    sub            t4, rcx ; dif -= that value
507*c0909341SAndroid Build Coastguard Worker    bsr           ecx, t2d ; index of the highest set bit of the new range
508*c0909341SAndroid Build Coastguard Worker    xor           ecx, 15 ; d = 15 - bsr (holds while the range fits in 16 bits)
509*c0909341SAndroid Build Coastguard Worker    shl           t2d, cl ; renormalise the range
510*c0909341SAndroid Build Coastguard Worker    shl            t4, cl ; shift dif by the same amount
511*c0909341SAndroid Build Coastguard Worker    movd           m2, t2d ; new rng for the next iteration
512*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.rng], t2d
513*c0909341SAndroid Build Coastguard Worker    sub           t5d, ecx ; cnt -= d
514*c0909341SAndroid Build Coastguard Worker    jae %%end ; no borrow: skip the refill
; Refill path. t8 is saved and restored around it on every target except UNIX64.
515*c0909341SAndroid Build Coastguard Worker%if UNIX64 == 0
516*c0909341SAndroid Build Coastguard Worker    push           t8
517*c0909341SAndroid Build Coastguard Worker%endif
518*c0909341SAndroid Build Coastguard Worker    mov            t2, [t7+msac.buf] ; t2 = current input pointer
519*c0909341SAndroid Build Coastguard Worker    mov            t8, [t7+msac.end] ; t8 = end of input
520*c0909341SAndroid Build Coastguard Worker    lea           rcx, [t2+gprsize]
521*c0909341SAndroid Build Coastguard Worker    sub           rcx, t8 ; rcx = buf + gprsize - end
522*c0909341SAndroid Build Coastguard Worker    ja %%refill_eob ; a full-register load would run past end
523*c0909341SAndroid Build Coastguard Worker    mov            t8, [t2] ; load gprsize input bytes
524*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t5+16-gprsize*8] ; cnt + 16 - register width; low bits of cl are the shift count
525*c0909341SAndroid Build Coastguard Worker    not            t8 ; input bits are merged into dif inverted
526*c0909341SAndroid Build Coastguard Worker    bswap          t8 ; first input byte becomes the most significant
527*c0909341SAndroid Build Coastguard Worker    shr            t8, cl
528*c0909341SAndroid Build Coastguard Worker    neg           ecx ; register width - 16 - cnt
529*c0909341SAndroid Build Coastguard Worker    shr           ecx, 3 ; in bytes: how far buf advances
530*c0909341SAndroid Build Coastguard Worker    or             t4, t8 ; merge the new bits into dif
531*c0909341SAndroid Build Coastguard Worker%%refill_end:
532*c0909341SAndroid Build Coastguard Worker    add            t2, rcx ; advance the input pointer
533*c0909341SAndroid Build Coastguard Worker    lea           t5d, [t5+rcx*8] ; cnt += 8 * bytes consumed
534*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.buf], t2
535*c0909341SAndroid Build Coastguard Worker%%refill_end2:
536*c0909341SAndroid Build Coastguard Worker%if UNIX64 == 0
537*c0909341SAndroid Build Coastguard Worker    pop            t8
538*c0909341SAndroid Build Coastguard Worker%endif
539*c0909341SAndroid Build Coastguard Worker%%end:
540*c0909341SAndroid Build Coastguard Worker    movp           m3, t4 ; reload dif into m3 for the next iteration's compare
; Accumulator update: biased accumulator + (2*lane - 5). Lane 3 adds +1, which
; carries only once the biased accumulator has reached -1; lower lanes add a
; negative value, which carries immediately.
541*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
542*c0909341SAndroid Build Coastguard Worker    add           t6d, eax ; CF = tok_br < 3 || tok == 15
543*c0909341SAndroid Build Coastguard Worker    jnc %%loop
544*c0909341SAndroid Build Coastguard Worker    lea           eax, [t6+30]
545*c0909341SAndroid Build Coastguard Worker%else
546*c0909341SAndroid Build Coastguard Worker    add           eax, [buf+8] ; same addition using the spilled accumulator
547*c0909341SAndroid Build Coastguard Worker    jnc %%loop
548*c0909341SAndroid Build Coastguard Worker    add           eax, 30
; Undo the stack adjustment made before this macro (not visible in this chunk;
; presumably done by DECODE_SYMBOL_ADAPT_INIT -- confirm).
549*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
550*c0909341SAndroid Build Coastguard Worker    add           esp, 36
551*c0909341SAndroid Build Coastguard Worker%else
552*c0909341SAndroid Build Coastguard Worker    mov           esp, [esp]
553*c0909341SAndroid Build Coastguard Worker%endif
554*c0909341SAndroid Build Coastguard Worker%endif
555*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.dif], t4 ; store dif back to the context
556*c0909341SAndroid Build Coastguard Worker    shr           eax, 1 ; return value = (accumulator + 30) >> 1
557*c0909341SAndroid Build Coastguard Worker    mov [t7+msac.cnt], t5d ; store cnt back to the context
558*c0909341SAndroid Build Coastguard Worker    RET
559*c0909341SAndroid Build Coastguard Worker%%pad_with_ones:
560*c0909341SAndroid Build Coastguard Worker    ; ensure that dif is padded with at least 15 bits of ones at the end
    ; (reached when buf >= end, i.e. no input bytes remain; buf is not advanced)
561*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t5-16] ; cnt - 16, used both as the bit pattern and as the shift count
562*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
563*c0909341SAndroid Build Coastguard Worker    ror           rcx, cl
564*c0909341SAndroid Build Coastguard Worker%else
565*c0909341SAndroid Build Coastguard Worker    shr           ecx, cl
566*c0909341SAndroid Build Coastguard Worker%endif
567*c0909341SAndroid Build Coastguard Worker    or             t4, rcx
568*c0909341SAndroid Build Coastguard Worker    jmp %%refill_end2
; Fewer than gprsize bytes remain. On entry rcx = buf + gprsize - end (> 0),
; t2 = buf, t8 = end.
569*c0909341SAndroid Build Coastguard Worker%%refill_eob:
570*c0909341SAndroid Build Coastguard Worker    cmp            t2, t8
571*c0909341SAndroid Build Coastguard Worker    jae %%pad_with_ones ; nothing left to read
; NOTE(review): the load below reads the gprsize bytes ending at end, which
; assumes that many readable bytes precede end -- confirm with the caller.
572*c0909341SAndroid Build Coastguard Worker    mov            t8, [t8-gprsize]
573*c0909341SAndroid Build Coastguard Worker    shl           ecx, 3 ; bytes loaded from before buf, in bits
574*c0909341SAndroid Build Coastguard Worker    shr            t8, cl ; drop them (lower addresses are the low-order bytes)
575*c0909341SAndroid Build Coastguard Worker    lea           ecx, [t5+16-gprsize*8] ; from here on: same steps as the fast path above
576*c0909341SAndroid Build Coastguard Worker    not            t8
577*c0909341SAndroid Build Coastguard Worker    bswap          t8
578*c0909341SAndroid Build Coastguard Worker    shr            t8, cl
579*c0909341SAndroid Build Coastguard Worker    neg           ecx
580*c0909341SAndroid Build Coastguard Worker    or             t4, t8
581*c0909341SAndroid Build Coastguard Worker    mov           t8d, [t7+msac.end]
582*c0909341SAndroid Build Coastguard Worker    shr           ecx, 3 ; bytes the fast path would consume
583*c0909341SAndroid Build Coastguard Worker    sub           t8d, t2d ; bytes actually remaining (end - buf, low 32 bits)
584*c0909341SAndroid Build Coastguard Worker    cmp           ecx, t8d
585*c0909341SAndroid Build Coastguard Worker    cmovae        ecx, t8d ; advance by min(wanted, remaining)
586*c0909341SAndroid Build Coastguard Worker    jmp %%refill_end
587*c0909341SAndroid Build Coastguard Worker%endmacro
588*c0909341SAndroid Build Coastguard Worker
; msac_decode_hi_tok: entry point that loads the decoder state and CDF into
; registers, then runs one of two HI_TOK expansions depending on whether
; msac.update_cdf is non-zero. Both expansions finish with RET inside the
; macro, so the first never falls through into .no_update_cdf.
; DECODE_SYMBOL_ADAPT_INIT, min_prob and pw_0xff00 are defined outside this
; chunk; t0 is used as the context pointer and t1 as the CDF pointer.
589*c0909341SAndroid Build Coastguard Workercglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
590*c0909341SAndroid Build Coastguard Worker    DECODE_SYMBOL_ADAPT_INIT 1
; On 32-bit PIC builds constants are addressed relative to t2, which is
; pointed at min_prob+12*2. Otherwise they are addressed directly (base = 0).
591*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 == 0 && PIC
592*c0909341SAndroid Build Coastguard Worker    LEA            t2, min_prob+12*2
593*c0909341SAndroid Build Coastguard Worker    %define base t2-(min_prob+12*2)
594*c0909341SAndroid Build Coastguard Worker%else
595*c0909341SAndroid Build Coastguard Worker    %define base 0
596*c0909341SAndroid Build Coastguard Worker%endif
597*c0909341SAndroid Build Coastguard Worker    movq           m0, [t1] ; m0 = the 4 CDF words
598*c0909341SAndroid Build Coastguard Worker    movd           m2, [t0+msac.rng] ; m2 = rng
599*c0909341SAndroid Build Coastguard Worker    mov           eax, [t0+msac.update_cdf] ; selects the HI_TOK variant below
600*c0909341SAndroid Build Coastguard Worker    movq           m4, [base+pw_0xff00] ; mask applied to rng in HI_TOK
601*c0909341SAndroid Build Coastguard Worker    movp           m3, [t0+msac.dif] ; m3 = dif (vector copy used for the compare)
602*c0909341SAndroid Build Coastguard Worker    movq           m5, [base+min_prob+12*2] ; 4 per-lane constants added to v in HI_TOK
603*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif] ; t4 = dif (scalar copy updated by the loop)
604*c0909341SAndroid Build Coastguard Worker    mov           t5d, [t0+msac.cnt] ; t5d = cnt
605*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
606*c0909341SAndroid Build Coastguard Worker    mov           t6d, -24 ; loop accumulator (x86-32 initialises it inside HI_TOK)
607*c0909341SAndroid Build Coastguard Worker%endif
608*c0909341SAndroid Build Coastguard Worker    movifnidn      t7, t0 ; HI_TOK addresses the context through t7
609*c0909341SAndroid Build Coastguard Worker    test          eax, eax
610*c0909341SAndroid Build Coastguard Worker    jz .no_update_cdf ; update_cdf == 0: use the variant without CDF adaptation
611*c0909341SAndroid Build Coastguard Worker    HI_TOK          1
612*c0909341SAndroid Build Coastguard Worker.no_update_cdf:
613*c0909341SAndroid Build Coastguard Worker    HI_TOK          0
614*c0909341SAndroid Build Coastguard Worker
; msac_decode_symbol_adapt16 (AVX2, x86-64 only): decode one symbol from a CDF
; of up to 16 words using 256-bit vectors, optionally adapt the CDF, then jump
; to the SSE2 renormalisation tail.
;   t0 = context pointer, t1 = CDF pointer (loaded with an aligned mova),
;   t2 = third argument, used as the word index of the adaptation counter
;        (presumably n_symbols -- confirm against the C prototype)
; State handed to .renorm2: eax = symbol index, t1d = v of the selected lane,
; t2d = v of the previous lane (or rng for lane 0), t4 = dif.
615*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
616*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
617*c0909341SAndroid Build Coastguard Workercglobal msac_decode_symbol_adapt16, 3, 6, 6
618*c0909341SAndroid Build Coastguard Worker    lea           rax, [pw_0xff00] ; base address for the constant loads below
619*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m2, [t0+msac.rng] ; rng in all 16 lanes
620*c0909341SAndroid Build Coastguard Worker    mova           m0, [t1] ; 16 CDF words
621*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m3, [t0+msac.dif+6] ; c = word at byte offset 6 of dif, in all lanes
622*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m4, [rax] ; pw_0xff00 mask in both 128-bit halves
623*c0909341SAndroid Build Coastguard Worker    mov           t3d, [t0+msac.update_cdf]
624*c0909341SAndroid Build Coastguard Worker    mov           t4d, t2d ; t4 = third argument, kept for indexing the counter word
625*c0909341SAndroid Build Coastguard Worker    not            t2 ; t2 = ~arg = -(arg + 1), negative index used by the paddw below
626*c0909341SAndroid Build Coastguard Worker    mov            r5, rsp
; Scratch buffer setup. WIN64 aligns and lowers rsp (original kept in r5 and
; restored before the tail jump); buf for that case is defined outside this
; chunk. Other targets leave rsp alone and place buf below the aligned stack
; pointer (this appears to rely on the red zone -- confirm).
627*c0909341SAndroid Build Coastguard Worker%if WIN64
628*c0909341SAndroid Build Coastguard Worker    and           rsp, ~31
629*c0909341SAndroid Build Coastguard Worker    sub           rsp, 40
630*c0909341SAndroid Build Coastguard Worker%else
631*c0909341SAndroid Build Coastguard Worker    and            r5, ~31
632*c0909341SAndroid Build Coastguard Worker    %define buf r5-32
633*c0909341SAndroid Build Coastguard Worker%endif
634*c0909341SAndroid Build Coastguard Worker    psrlw          m1, m0, 6 ; cdf >> 6
635*c0909341SAndroid Build Coastguard Worker    movd      [buf-4], xm2 ; rng at buf-4 and buf-2: word before v[0], read as "previous v" for lane 0
636*c0909341SAndroid Build Coastguard Worker    pand           m2, m4 ; rng masked with pw_0xff00
637*c0909341SAndroid Build Coastguard Worker    psllw          m1, 7 ; (cdf >> 6) << 7
638*c0909341SAndroid Build Coastguard Worker    pmulhuw        m1, m2 ; high 16 bits of the unsigned product
639*c0909341SAndroid Build Coastguard Worker    paddw          m1, [rax+t2*2] ; add 16 words read from pw_0xff00 - 2*(arg+1) (table laid out before pw_0xff00, presumably min_prob)
640*c0909341SAndroid Build Coastguard Worker    mova        [buf], m1 ; v[0..15]
641*c0909341SAndroid Build Coastguard Worker    pmaxuw         m1, m3
642*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m1, m3 ; mask = 0xffff where max(v, c) == c, i.e. v <= c
643*c0909341SAndroid Build Coastguard Worker    pmovmskb      eax, m1 ; two identical mask bits per word lane
644*c0909341SAndroid Build Coastguard Worker    test          t3d, t3d
645*c0909341SAndroid Build Coastguard Worker    jz .renorm ; update_cdf == 0: skip the adaptation
646*c0909341SAndroid Build Coastguard Worker    movzx         t3d, word [t1+t4*2] ; adaptation counter stored at cdf[arg]
647*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m2, m2 ; m2 = all ones
648*c0909341SAndroid Build Coastguard Worker    lea           t2d, [t3+80]
649*c0909341SAndroid Build Coastguard Worker    shr           t2d, 4 ; rate = (count + 80) >> 4
650*c0909341SAndroid Build Coastguard Worker    cmp           t3d, 32
651*c0909341SAndroid Build Coastguard Worker    adc           t3d, 0 ; count += (count < 32)
652*c0909341SAndroid Build Coastguard Worker    movd          xm3, t2d ; shift amount
653*c0909341SAndroid Build Coastguard Worker    pavgw          m2, m1 ; avg(0xffff, mask): 0xffff where mask set, 0x8000 where clear
654*c0909341SAndroid Build Coastguard Worker    psubw          m2, m0 ; minus the current cdf
655*c0909341SAndroid Build Coastguard Worker    psubw          m0, m1 ; cdf - mask, i.e. cdf + 1 in lanes where the mask is set
656*c0909341SAndroid Build Coastguard Worker    psraw          m2, xm3 ; arithmetic shift of the difference by rate
657*c0909341SAndroid Build Coastguard Worker    paddw          m0, m2 ; adapted cdf
658*c0909341SAndroid Build Coastguard Worker    mova         [t1], m0 ; write back all 16 words ...
659*c0909341SAndroid Build Coastguard Worker    mov     [t1+t4*2], t3w ; ... then store the updated counter at cdf[arg]
660*c0909341SAndroid Build Coastguard Worker.renorm:
661*c0909341SAndroid Build Coastguard Worker    tzcnt         eax, eax ; 2 * index of the first lane with v <= c (also a byte offset)
662*c0909341SAndroid Build Coastguard Worker    mov            t4, [t0+msac.dif] ; t4 = dif for the shared tail
663*c0909341SAndroid Build Coastguard Worker    movzx         t1d, word [buf+rax-0] ; v of the selected lane
664*c0909341SAndroid Build Coastguard Worker    movzx         t2d, word [buf+rax-2] ; v of the previous lane, or rng for lane 0
665*c0909341SAndroid Build Coastguard Worker    shr           eax, 1 ; byte offset -> symbol index
666*c0909341SAndroid Build Coastguard Worker%if WIN64
667*c0909341SAndroid Build Coastguard Worker    mov           rsp, r5
668*c0909341SAndroid Build Coastguard Worker%endif
669*c0909341SAndroid Build Coastguard Worker    vzeroupper ; clear the upper ymm halves before entering SSE2 code
; Tail-jump into the SSE2 msac_decode_symbol_adapt4 at .renorm2 (label defined
; outside this chunk).
670*c0909341SAndroid Build Coastguard Worker    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
671*c0909341SAndroid Build Coastguard Worker%endif
672