xref: /aosp_15_r20/external/libdav1d/src/x86/pal.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2023, VideoLAN and dav1d authors
2; Copyright © 2023, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
; Read-only constants shared by the pal_idx_finish implementations below.
; The section is opened through the SECTION_RODATA macro with an argument of
; 64; the AVX-512 code loads pb_0to63 with a full-width mova.
29SECTION_RODATA 64
30
; pb_0to63: the byte values 0, 1, 2, ... in ascending order. The code clamps
; it with pminub against a broadcast "last valid position" value to build
; shuffle masks for horizontal padding. Only the first 16 bytes are emitted
; when ARCH_X86_64 is not set; the SSSE3 code reads just those 16 bytes.
31const pb_0to63,  db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
32%if ARCH_X86_64
33                 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
34                 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
35                 db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
36%endif
; pal_idx_w8_padh: fixed pshufb mask used by the .w8_padh paths. Within each
; 8-byte half it keeps bytes 0..3 and repeats byte 3 into positions 4..7.
37pal_idx_w8_padh: db  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11
38
; pb_1_16: pmaddubsw multipliers. Each source byte pair (a, b) becomes
; a*1 + b*16.
39pb_1_16: times 4 db  1, 16
40%if ARCH_X86_64
; pb_32: four bytes of 32, broadcast as a dword by the AVX-512 .w32 setup and
; added to the upper half of its clamp vector.
41pb_32:   times 4 db 32
42%endif
43
; JMP_TABLE name, w0, w1, ...
; Emits one 32-bit entry per listed width for the function
; private_prefix_<name>. Each entry is the offset of that function's
; .w<width> label from a biased base, (%%table - 2*4), and <name>_table is
; defined as that same biased base. The two-entry bias lets callers index the
; table directly with tzcnt(bw): the smallest width used below is 4, whose
; tzcnt is 2. Callers load an entry with movsxd and add the table address to
; obtain the jump target.
44%macro JMP_TABLE 2-*
45    %xdefine %1_table (%%table - 2*4)
46    %xdefine %%base mangle(private_prefix %+ _%1)
47    %%table:
    ; One iteration per width argument; %rotate moves the next width into %2.
48    %rep %0 - 1
49        dd %%base %+ .w%2 - (%%table - 2*4)
50        %rotate 1
51    %endrep
52%endmacro
53
; One jump table per pal_idx_finish implementation, each covering the block
; widths 4, 8, 16, 32 and 64. The AVX2 and AVX-512 tables are only emitted
; when ARCH_X86_64 is set, matching the guard around those functions.
54JMP_TABLE pal_idx_finish_ssse3,     4, 8, 16, 32, 64
55%if ARCH_X86_64
56JMP_TABLE pal_idx_finish_avx2,      4, 8, 16, 32, 64
57JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
58%endif
59
60SECTION .text
61
; ----------------------------------------------------------------------------
; pal_idx_finish, SSSE3 version (128-bit registers).
;
; Arguments, as named on the cglobal line: dst, src, bw, bh, w, h.
; NOTE(review): the C prototype is not visible in this file; presumably
;   void pal_idx_finish(uint8_t *dst, const uint8_t *src,
;                       int bw, int bh, int w, int h) -- confirm.
;
; Behaviour, as implemented below:
;   * Each pair of adjacent source bytes (a, b) is turned into a + 16*b by
;     pmaddubsw with pb_1_16 and stored as one byte, so an output row is half
;     the length of a source row. Presumably the source bytes are palette
;     indices below 16 -- TODO confirm.
;   * tzcnt(bw) indexes the jump table and selects the .w4 ... .w64 path; the
;     path for width N consumes N source bytes per row.
;   * If w is smaller than the block width, positions from w onwards in every
;     row are replaced by the byte at position w-1 before packing (.wN_padh).
;   * h rows are converted. Then the last output row is repeated while the
;     counter bh - h stays positive (.wN_padv).
;   * The loops step several rows at a time (see each path) and only test for
;     "counter > 0". NOTE(review): presumably h and bh - h are multiples of the
;     step size -- confirm against the caller.
;
; Register roles:
;   r6   table base (also used by the `base` define); overwritten by
;        .setup_padh, after which `base` is not used again
;   bwq  jump target          hd   source rows remaining
;   bhd  bh - h, rows still to be padded vertically
;   m3   pb_1_16 multipliers  m2   pal_idx_w8_padh mask (w8), scratch (w64)
;   m4   min(pb_0to63, (w-1)&15): per-byte clamp mask for pshufb
;   m5   (w-1)&15 in every byte: pshufb mask that broadcasts the last valid byte
;
; Source is read with mova and destination is written with movu/movq.
; ----------------------------------------------------------------------------
62INIT_XMM ssse3
63cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
64%define base r6-pal_idx_finish_ssse3_table
65    LEA                  r6, pal_idx_finish_ssse3_table
66    tzcnt               bwd, bwm
67    movifnidn           bhd, bhm
68    movifnidn            wd, wm
69    movifnidn            hd, hm
70    movsxd              bwq, [r6+bwq*4]
71    movddup              m3, [base+pb_1_16]
72    add                 bwq, r6
    ; bhd = number of rows to add below the h converted rows.
73    sub                 bhd, hd
74    jmp                 bwq
    ; Width 4: 16 source bytes (4 rows) -> 8 output bytes per iteration.
    ; This path has no horizontal padding.
75.w4:
76    mova                 m0, [srcq]
77    add                srcq, 16
78    pmaddubsw            m0, m3
79    packuswb             m0, m0
80    movq             [dstq], m0
81    add                dstq, 8
82    sub                  hd, 4
83    jg .w4
84    test                bhd, bhd
85    jz .w4_end
    ; Repeat word 3 (the last 2-byte output row) across the low 8 bytes.
86    pshuflw              m0, m0, q3333
    ; 8 bytes = 4 padded rows per iteration.
87.w4_padv:
88    movq             [dstq], m0
89    add                dstq, 8
90    sub                 bhd, 4
91    jg .w4_padv
92.w4_end:
93    RET
    ; Width 8, w < 8: the mask is fixed and always repeats byte 3 of each
    ; 8-byte row. NOTE(review): this treats every w < 8 as w == 4 -- presumably
    ; w is a multiple of 4; confirm.
94.w8_padh:
95    pshufb               m0, m2
96    pshufb               m1, m2
97    jmp .w8_main
98.w8:
99    mova                 m2, [base+pal_idx_w8_padh]
    ; 32 source bytes (4 rows) -> 16 output bytes per iteration.
100.w8_loop:
101    mova                 m0, [srcq+16*0]
102    mova                 m1, [srcq+16*1]
103    cmp                  wd, 8
104    jl .w8_padh
105.w8_main:
106    pmaddubsw            m0, m3
107    pmaddubsw            m1, m3
108    add                srcq, 16*2
109    packuswb             m0, m1
110    movu             [dstq], m0
111    add                dstq, 16
112    sub                  hd, 4
113    jg .w8_loop
114    test                bhd, bhd
115    jz .w8_end
    ; Repeat dword 3 (the last 4-byte output row) across the register.
116    pshufd               m0, m0, q3333
    ; 16 bytes = 4 padded rows per iteration.
117.w8_padv:
118    movu             [dstq], m0
119    add                dstq, 16
120    sub                 bhd, 4
121    jg .w8_padv
122.w8_end:
123    RET
    ; Width 16, w < 16: each register holds one row; clamp it with m4.
124.w16_padh:
125    pshufb               m0, m4
126    pshufb               m1, m4
127    jmp .w16_main
128.w16:
129    cmp                  wd, 16
130    je .w16_loop
    ; Padding masks are only built when w differs from the block width.
131    call .setup_padh
    ; 32 source bytes (2 rows) -> 16 output bytes per iteration.
132.w16_loop:
133    mova                 m0, [srcq+16*0]
134    mova                 m1, [srcq+16*1]
135    cmp                  wd, 16
136    jl .w16_padh
137.w16_main:
138    pmaddubsw            m0, m3
139    pmaddubsw            m1, m3
140    add                srcq, 16*2
141    packuswb             m0, m1
142    movu             [dstq], m0
143    add                dstq, 16
144    sub                  hd, 2
145    jg .w16_loop
146    test                bhd, bhd
147    jz .w16_end
    ; Copy the upper 8 bytes (the last output row) into both halves.
148    punpckhqdq           m0, m0
    ; 32 bytes = 4 padded rows per iteration.
149.w16_padv:
150    movu        [dstq+16*0], m0
151    movu        [dstq+16*1], m0
152    add                dstq, 16*2
153    sub                 bhd, 4
154    jg .w16_padv
155.w16_end:
156    RET
    ; Width 32, w < 32: m0 and m1 hold the two 16-byte halves of one row.
157.w32_padh:
158    cmp                  wd, 16
159    jg .w32_padh2
    ; w <= 16: the second half is entirely the last valid byte of the first
    ; half (m5 broadcast), and the first half is clamped with m4.
160    pshufb               m1, m0, m5
161    pshufb               m0, m4
162    jmp .w32_main
    ; w > 16: only the second half needs clamping.
163.w32_padh2:
164    pshufb               m1, m4
165    jmp .w32_main
166.w32:
167    cmp                  wd, 32
168    je .w32_loop
169    call .setup_padh
    ; 32 source bytes (1 row) -> 16 output bytes per iteration.
170.w32_loop:
171    mova                 m0, [srcq+16*0]
172    mova                 m1, [srcq+16*1]
173    cmp                  wd, 32
174    jl .w32_padh
175.w32_main:
176    pmaddubsw            m0, m3
177    pmaddubsw            m1, m3
178    add                srcq, 16*2
179    packuswb             m0, m1
180    movu             [dstq], m0
181    add                dstq, 16
182    dec                  hd
183    jg .w32_loop
184    test                bhd, bhd
185    jz .w32_end
    ; m0 already is exactly the last output row; 64 bytes = 4 padded rows per
    ; iteration.
186.w32_padv:
187    movu        [dstq+16*0], m0
188    movu        [dstq+16*1], m0
189    movu        [dstq+16*2], m0
190    movu        [dstq+16*3], m0
191    add                dstq, 16*4
192    sub                 bhd, 4
193    jg .w32_padv
194.w32_end:
195    RET
    ; Width 64, w <= 32: only the first two 16-byte source quarters (m0, m1)
    ; have been loaded. This path does its own multiply and pack, and leaves
    ; m0 = first output half and m1 = second output half (all padding) for
    ; .w64_main.
196.w64_padh:
197    cmp                  wd, 16
198    jg .w64_padh2
    ; w <= 16: everything after the first quarter is the last valid byte.
199    pshufb               m1, m0, m5
200    pshufb               m0, m4
201    pmaddubsw            m0, m3
202    pmaddubsw            m1, m3
203    packuswb             m0, m1
204    packuswb             m1, m1
205    jmp .w64_main
    ; 16 < w <= 32: clamp the second quarter, then broadcast its last valid
    ; byte to form the padded second output half.
206.w64_padh2:
207    pshufb               m1, m4
208    pmaddubsw            m0, m3
209    pmaddubsw            m2, m1, m3
210    pshufb               m1, m5
211    pmaddubsw            m1, m3
212    packuswb             m0, m2
213    packuswb             m1, m1
214    jmp .w64_main
    ; 32 < w < 64: m0 already holds the packed first half; m1 and m2 hold the
    ; third and fourth source quarters.
215.w64_padh3:
216    cmp                  wd, 48
217    jg .w64_padh4
    ; w <= 48: fourth quarter is all padding, third quarter is clamped.
218    pshufb               m2, m1, m5
219    pshufb               m1, m4
220    jmp .w64_main2
    ; w > 48: only the fourth quarter needs clamping.
221.w64_padh4:
222    pshufb               m2, m4
223    jmp .w64_main2
224.w64:
225    cmp                  wd, 64
226    je .w64_loop
227    call .setup_padh
    ; 64 source bytes (1 row) -> 32 output bytes per iteration.
228.w64_loop:
229    mova                 m0, [srcq+16*0]
230    mova                 m1, [srcq+16*1]
231    cmp                  wd, 32
232    jle .w64_padh
233    pmaddubsw            m0, m3
234    pmaddubsw            m1, m3
235    packuswb             m0, m1
236    mova                 m1, [srcq+16*2]
237    mova                 m2, [srcq+16*3]
238    cmp                  wd, 64
239    jl .w64_padh3
240.w64_main2:
241    pmaddubsw            m1, m3
242    pmaddubsw            m2, m3
243    packuswb             m1, m2
244.w64_main:
245    add                srcq, 16*4
246    movu        [dstq+16*0], m0
247    movu        [dstq+16*1], m1
248    add                dstq, 16*2
249    dec                  hd
250    jg .w64_loop
251    test                bhd, bhd
252    jz .w64_end
    ; m0:m1 is the last output row; 64 bytes = 2 padded rows per iteration,
    ; hence the step of 2 here.
253.w64_padv:
254    movu        [dstq+16*0], m0
255    movu        [dstq+16*1], m1
256    movu        [dstq+16*2], m0
257    movu        [dstq+16*3], m1
258    add                dstq, 16*4
259    sub                 bhd, 2
260    jg .w64_padv
261.w64_end:
262    RET
    ; Local subroutine: builds the horizontal padding masks.
    ;   m5 = (w-1)&15 in every byte, m4 = min(pb_0to63[0..15], m5).
    ; Overwrites r6 (so `base` is invalid afterwards) and m0. It is only called
    ; before the row loops, which reload m0.
263.setup_padh:
264    mova                 m4, [base+pb_0to63]
265    lea                 r6d, [wq-1]
266    and                 r6d, 15
267    movd                 m5, r6d
    ; pshufb with an all-zero mask copies byte 0 into every byte.
268    pxor                 m0, m0
269    pshufb               m5, m0
270    pminub               m4, m5
271    ret
272
273%if ARCH_X86_64
274
; ----------------------------------------------------------------------------
; pal_idx_finish, AVX2 version (x86-64 only; 256-bit registers, with the
; w4/w8 paths using the 128-bit xm views).
;
; Arguments, as named on the cglobal line: dst, src, bw, bh, w, h.
; Same operation as the SSSE3 version earlier in this file: adjacent source
; byte pairs (a, b) are stored as a + 16*b, rows are padded horizontally from
; position w onwards with the byte at position w-1, and the last output row is
; repeated while the counter bh - h stays positive.
;
; Difference from the SSSE3 version: wd is decremented once in the prologue,
; so every later comparison is against "block width - 1" and wd is the index
; of the last valid byte in a row.
;
; Register roles:
;   r6   table base; reused as scratch in the .w64 setup
;   m2   pb_1_16 multipliers
;   m3   pshufb clamp mask (pal_idx_w8_padh for w8, min(index, wd) otherwise)
;   m4   (w64 only) wd&31 in every byte: broadcasts the last valid byte
;   hd   source rows remaining      bhd  bh - h, rows still to pad
;
; NOTE(review): the loops step several rows at a time and only test for
; "counter > 0"; presumably h and bh - h are multiples of the step -- confirm.
; ----------------------------------------------------------------------------
275INIT_YMM avx2
276cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
277%define base r6-pal_idx_finish_avx2_table
278    lea                  r6, [pal_idx_finish_avx2_table]
279    tzcnt               bwd, bwd
280    movifnidn            wd, wm
281    movifnidn            hd, hm
282    movsxd              bwq, [r6+bwq*4]
283    vpbroadcastd         m2, [base+pb_1_16]
    ; From here on wd = w - 1.
284    dec                  wd
285    add                 bwq, r6
286    sub                 bhd, hd
287    jmp                 bwq
    ; Width 4: 16 source bytes (4 rows) -> 8 output bytes per iteration.
    ; This path has no horizontal padding.
288.w4:
289    mova                xm0, [srcq]
290    add                srcq, 16
291    pmaddubsw           xm0, xm2
292    packuswb            xm0, xm0
293    movq             [dstq], xm0
294    add                dstq, 8
295    sub                  hd, 4
296    jg .w4
297    test                bhd, bhd
298    jz .w4_end
    ; Repeat word 3 (the last 2-byte output row) across the low 8 bytes.
299    pshuflw             xm0, xm0, q3333
300.w4_padv:
301    movq             [dstq], xm0
302    add                dstq, 8
303    sub                 bhd, 4
304    jg .w4_padv
305.w4_end:
306    RET
    ; Width 8, w < 8: fixed mask that repeats byte 3 of each 8-byte row.
    ; NOTE(review): this treats every w < 8 as w == 4 -- confirm.
307.w8_padh:
308    pshufb              xm0, xm3
309    pshufb              xm1, xm3
310    jmp .w8_main
311.w8:
312    mova                xm3, [base+pal_idx_w8_padh]
    ; 32 source bytes (4 rows) -> 16 output bytes per iteration.
313.w8_loop:
314    mova                xm0, [srcq+16*0]
315    mova                xm1, [srcq+16*1]
316    cmp                  wd, 7
317    jl .w8_padh
318.w8_main:
319    pmaddubsw           xm0, xm2
320    pmaddubsw           xm1, xm2
321    add                srcq, 16*2
322    packuswb            xm0, xm1
323    movu             [dstq], xm0
324    add                dstq, 16
325    sub                  hd, 4
326    jg .w8_loop
327    test                bhd, bhd
328    jz .w8_end
    ; Repeat dword 3 (the last 4-byte output row) across the register.
329    pshufd              xm0, xm0, q3333
330.w8_padv:
331    movu             [dstq], xm0
332    add                dstq, 16
333    sub                 bhd, 4
334    jg .w8_padv
335.w8_end:
336    RET
    ; Width 16, w < 16: every 128-bit lane holds one row; pshufb works per
    ; lane, so the same 16-byte clamp applies to each row.
337.w16_padh:
338    pshufb               m0, m3
339    pshufb               m1, m3
340    jmp .w16_main
341.w16:
342    cmp                  wd, 15
343    je .w16_loop
    ; m3 = min(0..15, wd) in both lanes.
344    vbroadcasti128       m0, [base+pb_0to63]
345    movd                xm3, wd
346    vpbroadcastb         m3, xm3
347    pminub               m3, m0
    ; 64 source bytes (4 rows) -> 32 output bytes per iteration.
348.w16_loop:
349    mova                 m0, [srcq+32*0]
350    mova                 m1, [srcq+32*1]
351    cmp                  wd, 15
352    jl .w16_padh
353.w16_main:
354    pmaddubsw            m0, m2
355    pmaddubsw            m1, m2
356    add                srcq, 32*2
    ; packuswb packs within each 128-bit lane, leaving the rows ordered
    ; 0, 2, 1, 3; vpermq q3120 restores 0, 1, 2, 3 for the store.
357    packuswb             m0, m1
358    vpermq               m1, m0, q3120
359    movu             [dstq], m1
360    add                dstq, 32
361    sub                  hd, 4
362    jg .w16_loop
363    test                bhd, bhd
364    jz .w16_end
    ; Qword 3 of the unpermuted m0 is the last row; repeat it four times.
365    vpermq               m0, m0, q3333
    ; 32 bytes = 4 padded rows per iteration.
366.w16_padv:
367    movu             [dstq], m0
368    add                dstq, 32
369    sub                 bhd, 4
370    jg .w16_padv
371.w16_end:
372    RET
    ; Width 32, w < 32: each register holds one row (two 16-byte lanes).
373.w32_padh:
374    cmp                  wd, 15
375    jg .w32_padh2
    ; Last valid byte lies in the low lane: copy the low lane into the high
    ; lane so the per-lane pshufb below can reach it from both lanes.
376    vinserti128          m0, xm0, 1
377    vinserti128          m1, xm1, 1
378.w32_padh2:
379    pshufb               m0, m3
380    pshufb               m1, m3
381    jmp .w32_main
382.w32:
383    cmp                  wd, 31
384    je .w32_loop
    ; m3 = min(0..31, wd); pshufb uses only the low 4 bits of each mask byte
    ; as the in-lane index.
385    movd                xm3, wd
386    vpbroadcastb         m3, xm3
387    pminub               m3, [base+pb_0to63]
    ; 64 source bytes (2 rows) -> 32 output bytes per iteration.
388.w32_loop:
389    mova                 m0, [srcq+32*0]
390    mova                 m1, [srcq+32*1]
391    cmp                  wd, 31
392    jl .w32_padh
393.w32_main:
394    pmaddubsw            m0, m2
395    pmaddubsw            m1, m2
396    add                srcq, 32*2
    ; In-lane pack interleaves the two rows' halves; q3120 puts them in order.
397    packuswb             m0, m1
398    vpermq               m1, m0, q3120
399    movu             [dstq], m1
400    add                dstq, 32
401    sub                  hd, 2
402    jg .w32_loop
403    test                bhd, bhd
404    jz .w32_end
    ; Qwords 1 and 3 of the unpermuted m0 are the two halves of the last row;
    ; q3131 lays that row out twice.
405    vpermq               m0, m0, q3131
    ; 64 bytes = 4 padded rows per iteration.
406.w32_padv:
407    movu        [dstq+32*0], m0
408    movu        [dstq+32*1], m0
409    add                dstq, 32*2
410    sub                 bhd, 4
411    jg .w32_padv
412.w32_end:
413    RET
    ; Width 64, w < 64: m0 and m1 hold the two 32-byte halves of one row.
    ; m3 = min(0..31, wd&31) clamps, m4 = wd&31 broadcasts the last valid byte.
    ; Four cases follow, by which 16-byte quarter contains the last valid byte.
414.w64_padh:
415    cmp                  wd, 15
416    jg .w64_padh2
    ; Quarter 0: duplicate it into both lanes, clamp for the first half,
    ; broadcast for the second half.
417    vinserti128          m1, m0, xm0, 1
418    pshufb               m0, m1, m3
419    pshufb               m1, m4
420    jmp .w64_main
421.w64_padh2:
422    cmp                  wd, 31
423    jg .w64_padh3
    ; Quarter 1: second half is a broadcast taken from the high lane of m0.
424    vperm2i128           m1, m0, m0, 0x11
425    pshufb               m0, m3
426    pshufb               m1, m4
427    jmp .w64_main
428.w64_padh3:
429    cmp                  wd, 47
430    jg .w64_padh4
    ; Quarter 2: first half is untouched; copy the low lane of m1 upwards.
431    vinserti128          m1, xm1, 1
    ; Quarter 3 (and fall-through from quarter 2): clamp the second half.
432.w64_padh4:
433    pshufb               m1, m3
434    jmp .w64_main
435.w64:
436    cmp                  wd, 63
437    je .w64_loop
    ; r6 is reused as scratch here, so pb_0to63 is addressed without `base`.
438    mov                 r6d, wd
439    and                 r6d, 31
440    movd                xm4, r6d
441    vpbroadcastb         m4, xm4
442    pminub               m3, m4, [pb_0to63]
    ; 64 source bytes (1 row) -> 32 output bytes per iteration.
443.w64_loop:
444    mova                 m0, [srcq+32*0]
445    mova                 m1, [srcq+32*1]
446    cmp                  wd, 63
447    jl .w64_padh
448.w64_main:
449    pmaddubsw            m0, m2
450    pmaddubsw            m1, m2
451    add                srcq, 32*2
452    packuswb             m0, m1
    ; Permuted in place, so m0 is also the correctly ordered last row for the
    ; vertical padding below.
453    vpermq               m0, m0, q3120
454    movu             [dstq], m0
455    add                dstq, 32
456    dec                  hd
457    jg .w64_loop
458    test                bhd, bhd
459    jz .w64_end
    ; 128 bytes = 4 padded rows per iteration.
460.w64_padv:
461    movu        [dstq+32*0], m0
462    movu        [dstq+32*1], m0
463    movu        [dstq+32*2], m0
464    movu        [dstq+32*3], m0
465    add                dstq, 32*4
466    sub                 bhd, 4
467    jg .w64_padv
468.w64_end:
469    RET
469    RET
470
; ----------------------------------------------------------------------------
; pal_idx_finish, AVX-512 (icl) version (x86-64 only; 512-bit registers, with
; the w4/w8 paths written against explicit xmm registers).
;
; Arguments, as named on the cglobal line: dst, src, bw, bh, w, h.
; Same operation as the other versions in this file: adjacent source byte
; pairs (a, b) are stored as a + 16*b, rows are padded horizontally from
; position w onwards with the byte at position w-1, and the last output row is
; repeated while the counter bh - h stays positive.
;
; As in the AVX2 version, wd is decremented once in the prologue, so it is the
; index of the last valid byte in a row and comparisons use "width - 1".
; Every width path from w8 upwards processes 4 rows per loop iteration.
;
; Register roles:
;   r6   table base; reused as scratch in the .w32 setup
;   m4   pb_1_16 multipliers
;   m2   clamp mask for w8/w16/w32    m5  clamp mask for w64
;   m3   2*(0..63), vpermt2b index for w32    m6  the same for w64
;   hd   source rows remaining        bhd bh - h, rows still to pad
;
; NOTE(review): the loops step 4 rows at a time and only test for
; "counter > 0"; presumably h and bh - h are multiples of 4 -- confirm.
; ----------------------------------------------------------------------------
471INIT_ZMM avx512icl
472cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
473%define base r6-pal_idx_finish_avx512icl_table
474    lea                  r6, [pal_idx_finish_avx512icl_table]
475    tzcnt               bwd, bwd
476    movifnidn            wd, wm
477    movifnidn            hd, hm
478    movsxd              bwq, [r6+bwq*4]
479    vpbroadcastd         m4, [base+pb_1_16]
    ; From here on wd = w - 1.
480    dec                  wd
481    add                 bwq, r6
482    sub                 bhd, hd
483    jmp                 bwq
    ; Width 4: 16 source bytes (4 rows) -> 8 output bytes per iteration.
    ; This path has no horizontal padding.
484.w4:
485    mova               xmm0, [srcq]
486    add                srcq, 16
487    pmaddubsw          xmm0, xm4
488    packuswb           xmm0, xmm0
489    movq             [dstq], xmm0
490    add                dstq, 8
491    sub                  hd, 4
492    jg .w4
493    test                bhd, bhd
494    jz .w4_end
    ; Repeat word 3 (the last 2-byte output row) across the low 8 bytes.
495    pshuflw            xmm0, xmm0, q3333
496.w4_padv:
497    movq             [dstq], xmm0
498    add                dstq, 8
499    sub                 bhd, 4
500    jg .w4_padv
501.w4_end:
502    RET
    ; Width 8, w < 8: fixed mask that repeats byte 3 of each 8-byte row.
    ; NOTE(review): this treats every w < 8 as w == 4 -- confirm.
503.w8_padh:
504    pshufb             xmm0, xmm2
505    pshufb             xmm1, xmm2
506    jmp .w8_main
507.w8:
508    mova               xmm2, [base+pal_idx_w8_padh]
    ; 32 source bytes (4 rows) -> 16 output bytes per iteration.
509.w8_loop:
510    mova               xmm0, [srcq+16*0]
511    mova               xmm1, [srcq+16*1]
512    cmp                  wd, 7
513    jl .w8_padh
514.w8_main:
515    pmaddubsw          xmm0, xm4
516    pmaddubsw          xmm1, xm4
517    add                srcq, 16*2
518    packuswb           xmm0, xmm1
519    movu             [dstq], xmm0
520    add                dstq, 16
521    sub                  hd, 4
522    jg .w8_loop
523    test                bhd, bhd
524    jz .w8_end
    ; Repeat dword 3 (the last 4-byte output row) across the register.
525    pshufd             xmm0, xmm0, q3333
526.w8_padv:
527    movu             [dstq], xmm0
528    add                dstq, 16
529    sub                 bhd, 4
530    jg .w8_padv
531.w8_end:
532    RET
    ; Width 16, w < 16: one row per 128-bit lane, clamped by per-lane pshufb.
533.w16_padh:
534    pshufb               m0, m2
535    jmp .w16_main
536.w16:
537    cmp                  wd, 15
538    je .w16_loop
    ; m2 = min(0..15, wd) in every 128-bit lane.
539    vbroadcasti32x4      m2, [base+pb_0to63]
540    vpbroadcastb         m0, wd
541    pminub               m2, m0
    ; 64 source bytes (4 rows) -> 32 output bytes per iteration.
542.w16_loop:
543    mova                 m0, [srcq]
544    cmp                  wd, 15
545    jl .w16_padh
546.w16_main:
547    pmaddubsw            m0, m4
548    add                srcq, 64
    ; Narrow each word to its low byte; rows stay in order.
549    vpmovwb             ym0, m0
550    movu             [dstq], ym0
551    add                dstq, 32
552    sub                  hd, 4
553    jg .w16_loop
554    test                bhd, bhd
555    jz .w16_end
    ; Qword 3 is the last 8-byte output row; repeat it four times.
556    vpermq              ym0, ym0, q3333
557.w16_padv:
558    movu             [dstq], ym0
559    add                dstq, 32
560    sub                 bhd, 4
561    jg .w16_padv
562.w16_end:
563    RET
    ; Width 32, w < 32: each register holds two rows; vpermb indexes across
    ; the whole register, with m2 as the index vector.
564.w32_padh:
565    vpermb               m0, m2, m0
566    vpermb               m1, m2, m1
567    jmp .w32_main
568.w32:
569    mova                 m2, [base+pb_0to63]
    ; m3 = 0, 2, 4, ... 126, computed before m2 is clamped. Used by vpermt2b to
    ; take the low byte of every word from its two table registers.
570    paddb                m3, m2, m2
571    cmp                  wd, 31
572    je .w32_loop
    ; Build the clamp for two rows: wd in bytes 0..31 and wd+32 in bytes
    ; 32..63 (k1 = 0xff00 selects the upper eight dwords), then
    ; m2 = min(0..63, that). r6 is reused as scratch, so pb_32 is addressed
    ; without `base`.
573    vpbroadcastb         m0, wd
574    mov                 r6d, 0xff00
575    kmovw                k1, r6d
576    vpaddd           m0{k1}, [pb_32] {1to16}
577    pminub               m2, m0
    ; 128 source bytes (4 rows) -> 64 output bytes per iteration.
578.w32_loop:
579    mova                 m0, [srcq+64*0]
580    mova                 m1, [srcq+64*1]
581    cmp                  wd, 31
582    jl .w32_padh
583.w32_main:
584    pmaddubsw            m0, m4
585    pmaddubsw            m1, m4
586    add                srcq, 64*2
    ; Gather the low bytes of m0's words, then of m1's words, into m0.
587    vpermt2b             m0, m3, m1
588    movu             [dstq], m0
589    add                dstq, 64
590    sub                  hd, 4
591    jg .w32_loop
592    test                bhd, bhd
593    jz .w32_end
    ; The top 128-bit lane is the last 16-byte output row; repeat it 4 times.
594    vshufi32x4           m0, m0, q3333
595.w32_padv:
596    movu             [dstq], m0
597    add                dstq, 64
598    sub                 bhd, 4
599    jg .w32_padv
600.w32_end:
601    RET
    ; Width 64, w < 64: each register holds one row; clamp all four.
602.w64_padh:
603    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
604    jmp .w64_main
605.w64:
606    mova                 m5, [base+pb_0to63]
    ; m6 = 0, 2, 4, ... 126 (vpermt2b index), computed before m5 is clamped.
607    paddb                m6, m5, m5
608    cmp                  wd, 63
609    je .w64_loop
    ; m5 = min(0..63, wd).
610    vpbroadcastb         m0, wd
611    pminub               m5, m0
    ; 256 source bytes (4 rows) -> 128 output bytes per iteration.
612.w64_loop:
613    mova                 m0, [srcq+64*0]
614    mova                 m1, [srcq+64*1]
615    mova                 m2, [srcq+64*2]
616    mova                 m3, [srcq+64*3]
617    cmp                  wd, 63
618    jl .w64_padh
619.w64_main:
620    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
621    add                srcq, 64*4
    ; m0 = output rows 0 and 1, m2 = output rows 2 and 3.
622    vpermt2b             m0, m6, m1
623    vpermt2b             m2, m6, m3
624    movu        [dstq+64*0], m0
625    movu        [dstq+64*1], m2
626    add                dstq, 64*2
627    sub                  hd, 4
628    jg .w64_loop
629    test                bhd, bhd
630    jz .w64_end
    ; The upper 256 bits of m2 are the last 32-byte output row; lay it out
    ; twice.
631    vshufi32x4           m2, m2, q3232
    ; 128 bytes = 4 padded rows per iteration.
632.w64_padv:
633    movu        [dstq+64*0], m2
634    movu        [dstq+64*1], m2
635    add                dstq, 64*2
636    sub                 bhd, 4
637    jg .w64_padv
638.w64_end:
639    RET
640
641%endif ; ARCH_X86_64
642