; xref: /aosp_15_r20/external/libdav1d/src/x86/msac.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64 ; avoids cacheline splits

; Per-symbol minimum-probability offsets: entry i is 4*(15-i), i.e. the
; minimum probability (EC_MIN_PROB = 4 in the AV1 spec) scaled by the number
; of symbols remaining. Indexed backwards via [rax+t2*2] with
; t2 = ~n_symbols = -(n_symbols+1), so the last n_symbols entries are used.
min_prob:  dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
pw_0xff00: times 8 dw 0xff00 ; mask to keep the high byte of rng per lane
pw_32:     times 8 dw 32

%if ARCH_X86_64
; On x86-64 the arguments already arrive in registers (via cglobal), pointers
; are 64-bit, and the scratch buffer is taken from the shadow space/red zone,
; so no init code is needed.
%define resp   resq   ; pointer-sized struct reserve
%define movp   movq   ; pointer-sized mov into an XMM register
%define c_shuf q3333  ; broadcast the top word of the 64-bit dif
%macro DECODE_SYMBOL_ADAPT_INIT 0-1
%endmacro
%else
%define resp   resd
%define movp   movd
%define c_shuf q1111  ; broadcast the top word of the 32-bit dif
; 32-bit: load the stack arguments into the temp registers and carve an
; aligned scratch buffer out of the stack.
%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
    mov            t0, r0m
    mov            t1, r1m
%if %1 == 0
    mov            t2, r2m ; n_symbols (not used by msac_decode_hi_tok)
%endif
%if STACK_ALIGNMENT >= 16
    sub           esp, 40-%1*4
%else
    ; esp may be unaligned: align manually and stash the old esp at [esp]
    ; so the matching epilogue can do "mov esp, [esp]".
    mov           eax, esp
    and           esp, ~15
    sub           esp, 40-%1*4
    mov         [esp], eax
%endif
%endmacro
%endif

; Offsets into the msac context object passed as the first argument.
; NOTE(review): assumed to mirror the C-side MsacContext layout (see
; src/msac.h) — field order must be kept in sync with the C struct.
struc msac
    .buf:        resp 1 ; current read position in the input buffer
    .end:        resp 1 ; one past the last input byte
    .dif:        resp 1 ; arithmetic-decoder window (pointer-sized)
    .rng:        resd 1 ; current range
    .cnt:        resd 1 ; number of valid bits remaining in dif
    .update_cdf: resd 1 ; nonzero = adapt the CDF after each decode
endstruc

; Build the mangled name of another function in this file so we can jump
; into its shared tail code, e.g. m(msac_decode_symbol_adapt4, SUFFIX).renorm.
%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)

SECTION .text

; Per-ABI temp-register assignment (t0..t8) and location of the 16-byte
; scratch buffer used to spill the CDF/rng vectors for the scalar tail.
%if WIN64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
%define buf rsp+stack_offset+8 ; shadow space
%elif UNIX64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
%define buf rsp-40 ; red zone
%else
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
%define buf esp+8  ; inside the area reserved by DECODE_SYMBOL_ADAPT_INIT
%endif

INIT_XMM sse2
;-----------------------------------------------------------------------------
; unsigned msac_decode_symbol_adapt4(msac *s, uint16_t *cdf, size_t n_symbols)
; Decodes one symbol from a CDF with up to 4 symbols (one 64-bit SIMD vector),
; optionally adapting the CDF, then renormalizes and refills the decoder.
; The .renorm*/.refill labels below are shared tail code that the other
; entry points in this file jump into.
;-----------------------------------------------------------------------------
cglobal msac_decode_symbol_adapt4, 0, 6, 6
    DECODE_SYMBOL_ADAPT_INIT
    LEA           rax, pw_0xff00
    movd           m2, [t0+msac.rng]
    movq           m1, [t1]                ; m1 = cdf[0..3]
    movp           m3, [t0+msac.dif]
    mov           t3d, [t0+msac.update_cdf]
    mov           t4d, t2d                 ; t4 = n_symbols
    not            t2     ; -(n_symbols + 1)
    pshuflw        m2, m2, q0000           ; broadcast rng
    movd     [buf+12], m2                  ; u for symbol 0 = rng
    pand           m2, [rax]               ; rng & 0xff00 per lane
    mova           m0, m1                  ; keep original cdf for adaptation
    psrlw          m1, 6
    psllw          m1, 7                   ; (cdf >> 6) << 7
    pmulhuw        m1, m2                  ; v = ((rng >> 8) * (cdf >> 6)) >> 1
    movq           m2, [rax+t2*2]          ; min_prob tail for n_symbols
    pshuflw        m3, m3, c_shuf          ; broadcast top 16 bits of dif (c)
    paddw          m1, m2                  ; v += min_prob offsets
    mova     [buf+16], m1                  ; spill v for the scalar tail
    psubusw        m1, m3
    pxor           m2, m2
    pcmpeqw        m1, m2 ; c >= v
    pmovmskb      eax, m1                  ; bitmask -> symbol index (scaled)
    test          t3d, t3d
    jz .renorm ; !allow_update_cdf

; update_cdf:
    movzx         t3d, word [t1+t4*2] ; count
    pcmpeqw        m2, m2
    mov           t2d, t3d
    shr           t3d, 4
    cmp           t4d, 3
    sbb           t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
    cmp           t2d, 32
    adc           t2d, 0  ; count + (count < 32)
    movd           m3, t3d                 ; m3 = rate
    pavgw          m2, m1 ; i >= val ? -1 : 32768
    psubw          m2, m0 ; for (i = 0; i < val; i++)
    psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
    psraw          m2, m3 ; for (; i < n_symbols; i++)
    paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
    movq         [t1], m0
    mov     [t1+t4*2], t2w                 ; store updated count

.renorm:
    tzcnt         eax, eax                 ; 2*symbol (byte mask -> index)
    mov            t4, [t0+msac.dif]
    movzx         t1d, word [buf+rax+16] ; v
    movzx         t2d, word [buf+rax+14] ; u
    shr           eax, 1                   ; eax = decoded symbol (return value)
.renorm2:
%if ARCH_X86_64 == 0
    ; undo the stack setup from DECODE_SYMBOL_ADAPT_INIT
%if STACK_ALIGNMENT >= 16
    add           esp, 40
%else
    mov           esp, [esp]
%endif
%endif
    sub           t2d, t1d ; rng
    shl            t1, gprsize*8-16        ; align v with the top of dif
    sub            t4, t1  ; dif - v
.renorm3:
    mov           t1d, [t0+msac.cnt]
    movifnidn      t7, t0
.renorm4:
    bsr           ecx, t2d
    xor           ecx, 15  ; d
.renorm5:
    shl           t2d, cl                  ; renormalize rng
    shl            t4, cl                  ; renormalize dif
    mov [t7+msac.rng], t2d
    sub           t1d, ecx
    jae .end ; no refill required

; refill:
%if ARCH_X86_64 == 0
    push           t5
%endif
    mov            t2, [t7+msac.buf]
    mov            t5, [t7+msac.end]
    lea           rcx, [t2+gprsize]
    sub           rcx, t5
    ja .refill_eob                         ; fewer than gprsize bytes left
    mov            t5, [t2]                ; register-sized load
    lea           ecx, [t1+16-gprsize*8]
    not            t5                      ; dif is stored inverted
    bswap          t5                      ; big-endian bitstream order
    shr            t5, cl                  ; drop bits already in dif
    neg           ecx
    shr           ecx, 3 ; num_bytes_read
    or             t4, t5
.refill_end:
    add            t2, rcx
    lea           t1d, [t1+rcx*8] ; cnt += num_bits_read
    mov [t7+msac.buf], t2
.refill_end2:
%if ARCH_X86_64 == 0
    pop            t5
%endif
.end:
    mov [t7+msac.cnt], t1d
    mov [t7+msac.dif], t4
    RET
.pad_with_ones:
    ; buffer exhausted: fill dif with ones so decoding can drain cleanly
    lea           ecx, [t1-16]
%if ARCH_X86_64
    ror           rcx, cl
%else
    shr           ecx, cl
%endif
    or             t4, rcx
    jmp .refill_end2
.refill_eob: ; avoid overreading the input buffer
    cmp            t2, t5
    jae .pad_with_ones ; eob reached
    ; We can safely do a register-sized load of the last bytes of the buffer
    ; as this code is only reached if the msac buffer size is >= gprsize.
    mov            t5, [t5-gprsize]
    shl           ecx, 3
    shr            t5, cl                  ; discard bytes past the end
    lea           ecx, [t1+16-gprsize*8]
    not            t5
    bswap          t5
    shr            t5, cl
    neg           ecx
    or             t4, t5
    mov           t5d, [t7+msac.end]
    shr           ecx, 3
    sub           t5d, t2d ; num_bytes_left
    cmp           ecx, t5d
    cmovae        ecx, t5d ; num_bytes_read
    jmp .refill_end

;-----------------------------------------------------------------------------
; Same as msac_decode_symbol_adapt4, but for CDFs with up to 8 symbols:
; a full 128-bit vector is used, and the lane broadcasts are widened with
; punpcklqdq. Renormalization/refill is shared with the adapt4 tail.
;-----------------------------------------------------------------------------
cglobal msac_decode_symbol_adapt8, 0, 6, 6
    DECODE_SYMBOL_ADAPT_INIT
    LEA           rax, pw_0xff00
    movd           m2, [t0+msac.rng]
    mova           m1, [t1]                ; cdf[0..7]
    movp           m3, [t0+msac.dif]
    mov           t3d, [t0+msac.update_cdf]
    mov           t4d, t2d
    not            t2                      ; -(n_symbols + 1)
    pshuflw        m2, m2, q0000
    movd     [buf+12], m2
    punpcklqdq     m2, m2                  ; broadcast rng to all 8 lanes
    mova           m0, m1
    psrlw          m1, 6
    pand           m2, [rax]
    psllw          m1, 7
    pmulhuw        m1, m2
    movu           m2, [rax+t2*2]          ; min_prob tail (may be unaligned)
    pshuflw        m3, m3, c_shuf
    paddw          m1, m2
    punpcklqdq     m3, m3                  ; broadcast c to all 8 lanes
    mova     [buf+16], m1
    psubusw        m1, m3
    pxor           m2, m2
    pcmpeqw        m1, m2                  ; c >= v
    pmovmskb      eax, m1
    test          t3d, t3d
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
    ; CDF adaptation (same算法 replaced -- same algorithm as adapt4)
    movzx         t3d, word [t1+t4*2]
    pcmpeqw        m2, m2
    mov           t2d, t3d
    shr           t3d, 4
    cmp           t4d, 3 ; may be called with n_symbols <= 2
    sbb           t3d, -5
    cmp           t2d, 32
    adc           t2d, 0
    movd           m3, t3d
    pavgw          m2, m1
    psubw          m2, m0
    psubw          m0, m1
    psraw          m2, m3
    paddw          m0, m2
    mova         [t1], m0
    mov     [t1+t4*2], t2w
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm

;-----------------------------------------------------------------------------
; Same as msac_decode_symbol_adapt4, but for CDFs with up to 16 symbols:
; the CDF is processed in two 128-bit halves (m2/m3). Requires n_symbols > 2.
;-----------------------------------------------------------------------------
cglobal msac_decode_symbol_adapt16, 0, 6, 6
    DECODE_SYMBOL_ADAPT_INIT
    LEA           rax, pw_0xff00
    movd           m4, [t0+msac.rng]
    mova           m2, [t1]                ; cdf[0..7]
    mova           m3, [t1+16]             ; cdf[8..15]
    movp           m5, [t0+msac.dif]
    mov           t3d, [t0+msac.update_cdf]
    mov           t4d, t2d
    not            t2
%if WIN64
    sub           rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
    pshuflw        m4, m4, q0000
    movd      [buf-4], m4
    punpcklqdq     m4, m4
    mova           m0, m2                  ; keep original cdf halves
    psrlw          m2, 6
    mova           m1, m3
    psrlw          m3, 6
    pand           m4, [rax]
    psllw          m2, 7
    psllw          m3, 7
    pmulhuw        m2, m4
    pmulhuw        m3, m4
    movu           m4, [rax+t2*2]          ; min_prob tail for first half
    pshuflw        m5, m5, c_shuf
    paddw          m2, m4
    psubw          m4, [rax-pw_0xff00+pw_32] ; offset tail for second half
    punpcklqdq     m5, m5
    paddw          m3, m4
    mova        [buf], m2
    psubusw        m2, m5
    mova     [buf+16], m3
    psubusw        m3, m5
    pxor           m4, m4
    pcmpeqw        m2, m4
    pcmpeqw        m3, m4
    packsswb       m5, m2, m3              ; combine both halves' masks
    pmovmskb      eax, m5
    test          t3d, t3d
    jz .renorm
    ; CDF adaptation across both halves
    movzx         t3d, word [t1+t4*2]
    pcmpeqw        m4, m4
    mova           m5, m4
    lea           t2d, [t3+80] ; only support n_symbols > 2
    shr           t2d, 4
    cmp           t3d, 32
    adc           t3d, 0
    pavgw          m4, m2
    pavgw          m5, m3
    psubw          m4, m0
    psubw          m0, m2
    movd           m2, t2d                 ; rate
    psubw          m5, m1
    psubw          m1, m3
    psraw          m4, m2
    psraw          m5, m2
    paddw          m0, m4
    paddw          m1, m5
    mova         [t1], m0
    mova      [t1+16], m1
    mov     [t1+t4*2], t3w
.renorm:
    tzcnt         eax, eax                 ; here eax indexes bytes directly
    mov            t4, [t0+msac.dif]
    movzx         t1d, word [buf+rax*2]    ; v
    movzx         t2d, word [buf+rax*2-2]  ; u
%if WIN64
    add           rsp, 48
%endif
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2

;-----------------------------------------------------------------------------
; unsigned msac_decode_bool_adapt(msac *s, uint16_t *cdf)
; Decodes a single adaptive boolean: scalar path, no SIMD. Returns the bit
; in eax and (if update_cdf) adapts cdf[0] and the counter cdf[1].
;-----------------------------------------------------------------------------
cglobal msac_decode_bool_adapt, 0, 6, 0
    movifnidn      t1, r1mp
    movifnidn      t0, r0mp
    movzx         eax, word [t1]           ; f = cdf[0]
    movzx         t3d, byte [t0+msac.rng+1] ; r >> 8
    mov            t4, [t0+msac.dif]
    mov           t2d, [t0+msac.rng]
%if ARCH_X86_64
    mov           t5d, eax                 ; save cdf[0] for adaptation
%endif
    and           eax, ~63
    imul          eax, t3d
%if UNIX64
    mov            t6, t4                  ; save dif for the cmovb below
%endif
    shr           eax, 7
    add           eax, 4            ; v
    mov           t3d, eax
    shl           rax, gprsize*8-16 ; vw
    sub           t2d, t3d          ; r - v
    sub            t4, rax          ; dif - vw
    setb           al                      ; bit = dif < vw
    cmovb         t2d, t3d
    mov           t3d, [t0+msac.update_cdf]
%if UNIX64
    cmovb          t4, t6
%else
    cmovb          t4, [t0+msac.dif]
%endif
%if ARCH_X86_64 == 0
    movzx         eax, al
%endif
    test          t3d, t3d
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
    push           t6
%endif
    movzx         t6d, word [t1+2]         ; count
%if ARCH_X86_64 == 0
    push           t5
    movzx         t5d, word [t1]
%endif
    movifnidn      t7, t0
    lea           ecx, [t6+64]
    cmp           t6d, 32
    adc           t6d, 0                   ; count + (count < 32)
    mov        [t1+2], t6w
    imul          t6d, eax, -32769
    shr           ecx, 4   ; rate
    add           t6d, t5d ; if (bit)
    sub           t5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
    sar           t6d, cl  ; else
    sub           t5d, t6d ;     cdf[0] -= cdf[0] >> rate;
    mov          [t1], t5w
%if WIN64
    mov           t1d, [t7+msac.cnt]
    pop            t6
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
%else
%if ARCH_X86_64 == 0
    pop            t5
    pop            t6
%endif
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%endif

;-----------------------------------------------------------------------------
; unsigned msac_decode_bool_equi(msac *s)
; Decodes an equiprobable boolean (p = 1/2). v = (rng >> 8) << 7 | 8/2,
; computed here via "mov t1b, 8; shr t1d, 1" on a copy of rng.
;-----------------------------------------------------------------------------
cglobal msac_decode_bool_equi, 0, 6, 0
    movifnidn      t0, r0mp
    mov           t1d, [t0+msac.rng]
    mov            t4, [t0+msac.dif]
    mov           t2d, t1d
    mov           t1b, 8                   ; replace rng's low byte with 8
    mov            t3, t4                  ; save dif for the cmovb below
    mov           eax, t1d
    shr           t1d, 1            ; v
    shl           rax, gprsize*8-17 ; vw
    sub           t2d, t1d          ; r - v
    sub            t4, rax          ; dif - vw
    cmovb         t2d, t1d
    mov           t1d, [t0+msac.cnt]
    cmovb          t4, t3
    movifnidn      t7, t0
    mov           ecx, 0xbfff
    setb           al ; the upper 32 bits contains garbage but that's OK
    sub           ecx, t2d
    ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
    ;   i.e. (0 <= d <= 2) and v < (3 << 14)
    shr           ecx, 14           ; d
%if ARCH_X86_64 == 0
    movzx         eax, al
%endif
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5

;-----------------------------------------------------------------------------
; unsigned msac_decode_bool(msac *s, unsigned f)
; Decodes a boolean with fixed (non-adaptive) probability f.
;-----------------------------------------------------------------------------
cglobal msac_decode_bool, 0, 6, 0
    movifnidn      t0, r0mp
    movifnidn     t1d, r1m
    movzx         eax, byte [t0+msac.rng+1] ; r >> 8
    mov            t4, [t0+msac.dif]
    mov           t2d, [t0+msac.rng]
    and           t1d, ~63
    imul          eax, t1d
    mov            t3, t4                  ; save dif for the cmovb below
    shr           eax, 7
    add           eax, 4            ; v
    mov           t1d, eax
    shl           rax, gprsize*8-16 ; vw
    sub           t2d, t1d          ; r - v
    sub            t4, rax          ; dif - vw
    cmovb         t2d, t1d
    cmovb          t4, t3
    setb           al                      ; bit = dif < vw
%if ARCH_X86_64 == 0
    movzx         eax, al
%endif
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3

; Body of msac_decode_hi_tok, instantiated twice (with and without CDF
; adaptation) so the update_cdf test happens once instead of per iteration.
; Loops decoding 3-symbol "hi_tok" CDFs, accumulating 5 per iteration into
; the token value; exits when the decoded symbol is < 3. Contains its own
; inlined copy of the renorm/refill tail (using t5/t8 instead of t1/t5).
%macro HI_TOK 1 ; update_cdf
%if ARCH_X86_64 == 0
    mov           eax, -24                 ; 32-bit: token accumulator in [buf+8]
%endif
%%loop:
%if %1
    movzx         t2d, word [t1+3*2]       ; count
%endif
    mova           m1, m0
    pshuflw        m2, m2, q0000           ; broadcast rng
    psrlw          m1, 6
    movd     [buf+12], m2
    pand           m2, m4                  ; rng & 0xff00
    psllw          m1, 7
    pmulhuw        m1, m2
%if ARCH_X86_64 == 0
    add           eax, 5
    mov       [buf+8], eax                 ; spill accumulator (eax is clobbered)
%endif
    pshuflw        m3, m3, c_shuf
    paddw          m1, m5                  ; += min_prob offsets
    movq     [buf+16], m1
    psubusw        m1, m3
    pxor           m2, m2
    pcmpeqw        m1, m2                  ; c >= v
    pmovmskb      eax, m1
%if %1
    ; CDF adaptation for the fixed 3-symbol hi_tok CDF
    lea           ecx, [t2+80]
    pcmpeqw        m2, m2
    shr           ecx, 4
    cmp           t2d, 32
    adc           t2d, 0
    movd           m3, ecx
    pavgw          m2, m1
    psubw          m2, m0
    psubw          m0, m1
    psraw          m2, m3
    paddw          m0, m2
    movq         [t1], m0
    mov      [t1+3*2], t2w
%endif
    tzcnt         eax, eax
    movzx         ecx, word [buf+rax+16]   ; v
    movzx         t2d, word [buf+rax+14]   ; u
%if ARCH_X86_64
    add           t6d, 5                   ; token accumulator
%endif
    sub           eax, 5   ; setup for merging the tok_br and tok branches
    sub           t2d, ecx                 ; rng
    shl           rcx, gprsize*8-16
    sub            t4, rcx                 ; dif - v
    bsr           ecx, t2d
    xor           ecx, 15                  ; d
    shl           t2d, cl
    shl            t4, cl
    movd           m2, t2d                 ; keep rng in m2 for the next pass
    mov [t7+msac.rng], t2d
    sub           t5d, ecx                 ; cnt -= d
    jae %%end
    ; refill (same scheme as the adapt4 tail, with t5=cnt and t8=scratch)
%if UNIX64 == 0
    push           t8
%endif
    mov            t2, [t7+msac.buf]
    mov            t8, [t7+msac.end]
    lea           rcx, [t2+gprsize]
    sub           rcx, t8
    ja %%refill_eob
    mov            t8, [t2]
    lea           ecx, [t5+16-gprsize*8]
    not            t8
    bswap          t8
    shr            t8, cl
    neg           ecx
    shr           ecx, 3                   ; num_bytes_read
    or             t4, t8
%%refill_end:
    add            t2, rcx
    lea           t5d, [t5+rcx*8]          ; cnt += num_bits_read
    mov [t7+msac.buf], t2
%%refill_end2:
%if UNIX64 == 0
    pop            t8
%endif
%%end:
    movp           m3, t4                  ; updated dif for the next pass
%if ARCH_X86_64
    add           t6d, eax ; CF = tok_br < 3 || tok == 15
    jnc %%loop
    lea           eax, [t6+30]
%else
    add           eax, [buf+8]
    jnc %%loop
    add           eax, 30
%if STACK_ALIGNMENT >= 16
    add           esp, 36
%else
    mov           esp, [esp]
%endif
%endif
    mov [t7+msac.dif], t4
    shr           eax, 1                   ; final token value
    mov [t7+msac.cnt], t5d
    RET
%%pad_with_ones:
    ; ensure that dif is padded with at least 15 bits of ones at the end
    lea           ecx, [t5-16]
%if ARCH_X86_64
    ror           rcx, cl
%else
    shr           ecx, cl
%endif
    or             t4, rcx
    jmp %%refill_end2
%%refill_eob: ; avoid overreading the input buffer
    cmp            t2, t8
    jae %%pad_with_ones
    mov            t8, [t8-gprsize]
    shl           ecx, 3
    shr            t8, cl
    lea           ecx, [t5+16-gprsize*8]
    not            t8
    bswap          t8
    shr            t8, cl
    neg           ecx
    or             t4, t8
    mov           t8d, [t7+msac.end]
    shr           ecx, 3
    sub           t8d, t2d                 ; num_bytes_left
    cmp           ecx, t8d
    cmovae        ecx, t8d                 ; num_bytes_read
    jmp %%refill_end
%endmacro

;-----------------------------------------------------------------------------
; unsigned msac_decode_hi_tok(msac *s, uint16_t *cdf)
; Sets up registers/constants and dispatches to one of the two HI_TOK
; instantiations depending on s->update_cdf.
;-----------------------------------------------------------------------------
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
    DECODE_SYMBOL_ADAPT_INIT 1
%if ARCH_X86_64 == 0 && PIC
    LEA            t2, min_prob+12*2
    %define base t2-(min_prob+12*2)
%else
    %define base 0
%endif
    movq           m0, [t1]                ; cdf[0..3]
    movd           m2, [t0+msac.rng]
    mov           eax, [t0+msac.update_cdf]
    movq           m4, [base+pw_0xff00]
    movp           m3, [t0+msac.dif]
    movq           m5, [base+min_prob+12*2] ; min_prob tail for 3 symbols
    mov            t4, [t0+msac.dif]
    mov           t5d, [t0+msac.cnt]
%if ARCH_X86_64
    mov           t6d, -24                 ; token accumulator start
%endif
    movifnidn      t7, t0
    test          eax, eax
    jz .no_update_cdf
    HI_TOK          1
.no_update_cdf:
    HI_TOK          0

%if ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; AVX2 version of msac_decode_symbol_adapt16: the whole 16-entry CDF fits in
; one 256-bit register, so no two-half processing is needed. Tail-jumps into
; the SSE2 renorm2 code (after vzeroupper to avoid AVX->SSE penalties).
;-----------------------------------------------------------------------------
cglobal msac_decode_symbol_adapt16, 3, 6, 6
    lea           rax, [pw_0xff00]
    vpbroadcastw   m2, [t0+msac.rng]
    mova           m0, [t1]                ; cdf[0..15]
    vpbroadcastw   m3, [t0+msac.dif+6]     ; broadcast top 16 bits of dif (c)
    vbroadcasti128 m4, [rax]
    mov           t3d, [t0+msac.update_cdf]
    mov           t4d, t2d
    not            t2                      ; -(n_symbols + 1)
    mov            r5, rsp
%if WIN64
    and           rsp, ~31                 ; need a 32-byte-aligned scratch buf
    sub           rsp, 40
%else
    and            r5, ~31
    %define buf r5-32                      ; aligned spot in the red zone
%endif
    psrlw          m1, m0, 6
    movd      [buf-4], xm2
    pand           m2, m4
    psllw          m1, 7
    pmulhuw        m1, m2
    paddw          m1, [rax+t2*2]          ; += min_prob offsets
    mova        [buf], m1
    pmaxuw         m1, m3
    pcmpeqw        m1, m3                  ; c >= v
    pmovmskb      eax, m1
    test          t3d, t3d
    jz .renorm
    ; CDF adaptation in a single 256-bit pass
    movzx         t3d, word [t1+t4*2]
    pcmpeqw        m2, m2
    lea           t2d, [t3+80]
    shr           t2d, 4                   ; rate
    cmp           t3d, 32
    adc           t3d, 0
    movd          xm3, t2d
    pavgw          m2, m1
    psubw          m2, m0
    psubw          m0, m1
    psraw          m2, xm3
    paddw          m0, m2
    mova         [t1], m0
    mov     [t1+t4*2], t3w
.renorm:
    tzcnt         eax, eax
    mov            t4, [t0+msac.dif]
    movzx         t1d, word [buf+rax-0]    ; v
    movzx         t2d, word [buf+rax-2]    ; u
    shr           eax, 1
%if WIN64
    mov           rsp, r5
%endif
    vzeroupper
    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
%endif
