; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

;-----------------------------------------------------------------------
; HIGH_GET_FILTERS_4
;   Filter setup for the 4-pixel-wide kernels.
;
;   Reads the eight 16-bit taps k0..k7 through the filter pointer in
;   arg(5) and stores them as interleaved word pairs, ready for pmaddwd:
;       k0k6 = (k0,k6) x4     k1k7 = (k1,k7) x4
;       k2k5 = (k2,k5) x4     k3k4 = (k3,k4) x4
;   Also stores:
;       krd  = 0x40 in each dword (rounding term for the later >> 7)
;       max  = (1 << arg(6)) - 1 in each word (upper clamp)
;       min  = 0 in each word (lower clamp)
;
; Expects:  the enclosing function has %define'd k0k6, k2k5, k3k4, k1k7,
;           krd, max and min as 16-byte-aligned memory operands (they are
;           written with movdqa). The filter table is read with movdqa,
;           so it must be 16-byte aligned as well.
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x00000040             ;rounding constant, 1 << 6

    movdqa      xmm7, [rdx]                 ;load filters
    ; Broadcast each tap across the low four words of its own register.
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    ; Interleave into the tap pairs used by HIGH_APPLY_FILTER_4.
    punpcklwd   xmm0, xmm6
    punpcklwd   xmm2, xmm5
    punpcklwd   xmm3, xmm4
    punpcklwd   xmm1, xmm7

    movdqa      k0k6, xmm0
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3
    movdqa      k1k7, xmm1

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;0x40 in every dword
    movdqa      krd, xmm6

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;two words of 1
    movsxd      rcx, DWORD PTR arg(6)      ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                  ;max value (for clamping)
    movdqa      min, xmm1                  ;min value (for clamping)

%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   Computes four filtered 16-bit pixels and stores them (8 bytes) at
;   [rdi].
;
; In:       xmm0..xmm7 = source samples for taps 0..7, four words each in
;                        the low qword of every register
;           rdi        = destination pointer
;           k0k6, k1k7, k2k5, k3k4, krd, max, min as written by
;           HIGH_GET_FILTERS_4
; %1 (avg): when non-zero the result is averaged (pavgw) with the four
;           pixels already at [rdi] before it is stored.
; Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    ; Pair the samples the same way the taps were paired.
    punpcklwd   xmm0, xmm6                  ;two row in one register
    punpcklwd   xmm1, xmm7
    punpcklwd   xmm2, xmm5
    punpcklwd   xmm3, xmm4

    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
    pmaddwd     xmm1, k1k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4

    paddd       xmm0, xmm1                  ;sum
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ;tap3/tap4 product is added last

    paddd       xmm0, krd                   ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------
; HIGH_GET_FILTERS
;   Filter setup for the 8- and 16-pixel-wide kernels.
;
;   Loads rsi = arg(0) (source) and rdi = arg(2) (destination), then
;   reads the eight 16-bit taps k0..k7 through the filter pointer in
;   arg(5) and stores them as interleaved word pairs for pmaddwd:
;       k0k1 = (k0,k1) x4     k6k7 = (k6,k7) x4
;       k2k5 = (k2,k5) x4     k3k4 = (k3,k4) x4
;   Also stores:
;       krd  = 0x40 in each dword (rounding term for the later >> 7)
;       max  = (1 << arg(6)) - 1 in each word (upper clamp)
;       min  = 0 in each word (lower clamp)
;
; Expects:  the enclosing function has %define'd k0k1, k6k7, k2k5, k3k4,
;           krd, max and min as 16-byte-aligned memory operands (they are
;           written with movdqa). The filter table is read with movdqa,
;           so it must be 16-byte aligned as well.
; Out:      rsi = source pointer, rdi = destination pointer.
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding constant, 1 << 6

    movdqa      xmm7, [rdx]                 ;load filters
    ; k0..k3 are broadcast across the low four words, k4..k7 across the
    ; high four words of their registers.
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    ; Copy k2 / k3 into the high qword so they can be paired with k5 / k4,
    ; which live in the high qword.
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3
    punpcklwd   xmm0, xmm1
    punpckhwd   xmm6, xmm7
    punpckhwd   xmm2, xmm5
    punpckhwd   xmm3, xmm4

    movdqa      k0k1, xmm0                  ;store filter factors on stack
    movdqa      k6k7, xmm6
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;0x40 in every dword
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;two words of 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                  ;max value (for clamping)
    movdqa      min, xmm1                  ;min value (for clamping)
%endm

;-----------------------------------------------------------------------
; LOAD_VERT_8 offset
;   Loads eight consecutive source rows (16 bytes each, unaligned) for a
;   vertical filter step: xmmN receives row N, N = 0..7, read at byte
;   offset %1 within the row.
;
; In:       rsi = address of row 0
;           rax = source stride in bytes
;           rdx = 3 * rax
; Out:      xmm0..xmm7 = rows 0..7
;           rsi is advanced by one row (rsi += rax). A caller that wants
;           to reload the same rows must undo this itself.
;-----------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;0
    movdqu      xmm1, [rsi + rax + %1]      ;1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;rsi now addresses row 1
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    movdqu      xmm2, [rsi + rax + %1]      ;2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    movdqu      xmm4, [rsi + rdx + %1]      ;4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, offset
;   Computes eight filtered 16-bit pixels and stores them (16 bytes,
;   unaligned store) at [rdi + offset].
;
; In:       xmm0..xmm7 = source samples for taps 0..7, eight words each
;           rdi        = destination pointer
;           k0k1, k6k7, k2k5, k3k4, krd, max, min as written by
;           HIGH_GET_FILTERS
;           temp       = 16-byte scratch slot %define'd by the caller
; %1 (avg):    when non-zero the result is averaged (pavgw) with the
;              pixels already at [rdi + offset] before it is stored.
; %2 (offset): byte offset added to rdi for the load/store.
; Clobbers: xmm0-xmm7 and the temp slot.
;
; All eight xmm registers are live on entry, so temp is used to spill
; one register while the low/high halves are interleaved.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqu      temp, xmm4                  ;spill tap-4 samples
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps 0/1, pixels 0-3
    punpckhwd   xmm4, xmm1                  ;taps 0/1, pixels 4-7
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps 6/7, pixels 0-3
    punpckhwd   xmm1, xmm7                  ;taps 6/7, pixels 4-7
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps 2/5, pixels 0-3
    punpckhwd   xmm7, xmm5                  ;taps 2/5, pixels 4-7

    movdqu      xmm5, temp                  ;reload tap-4 samples
    movdqu      temp, xmm4                  ;spill taps 0/1 high half
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps 3/4, pixels 0-3
    punpckhwd   xmm4, xmm5                  ;taps 3/4, pixels 4-7
    movdqu      xmm5, temp                  ;taps 0/1, pixels 4-7

    pmaddwd     xmm0, k0k1
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    ; xmm0 accumulates pixels 0-3, xmm5 pixels 4-7; the tap3/tap4 product
    ; is added last in each chain.
    paddd       xmm0, xmm6
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3
    paddd       xmm5, xmm1
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi + %2], xmm0
%endm

SECTION .text

;void aom_highbd_filter_block1d4_v8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Vertical 8-tap filter over a column four 16-bit pixels wide.
; For each of output_height output rows it reads source rows 0..7
; relative to the current source position (8 bytes per row), writes four
; clamped pixels to the destination (no averaging: HIGH_APPLY_FILTER_4 0),
; then advances source and destination by one row.
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row 0     rax = source stride in bytes   rdx = 3 * rax
;   rdi = destination      rbx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d4_v8_sse2)
sym(aom_highbd_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: seven 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 7
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_highbd_filter_block1d8_v8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Vertical 8-tap filter over a column eight 16-bit pixels wide.
; For each of output_height output rows it loads source rows 0..7
; (16 bytes per row) with LOAD_VERT_8, writes eight clamped pixels to the
; destination (no averaging: HIGH_APPLY_FILTER_8 0, 0) and steps the
; destination by one row. The source pointer is stepped by one row inside
; LOAD_VERT_8.
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row 0     rax = source stride in bytes   rdx = 3 * rax
;   rdi = destination      rbx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d8_v8_sse2)
sym(aom_highbd_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;rows 0..7; rsi += rax
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_highbd_filter_block1d16_v8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Vertical 8-tap filter over a column sixteen 16-bit pixels wide.
; Each output row is produced as two 8-pixel halves: byte offsets 0 and 16
; of source rows 0..7 are filtered and written to byte offsets 0 and 16 of
; the destination row (no averaging).
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row 0     rax = source stride in bytes   rdx = 3 * rax
;   rdi = destination      rbx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d16_v8_sse2)
sym(aom_highbd_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;left half; rsi += rax
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the row step for the right half

    LOAD_VERT_8 16                          ;right half; rsi += rax (net one row)
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_highbd_filter_block1d4_h8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Horizontal 8-tap filter producing four 16-bit pixels per row.
; Tap n is applied to the pixel at src + (n - 3), so each row reads
; pixels from three before src_ptr onwards: two unaligned 16-byte loads at
; byte offsets -6 and +2. Results are clamped and written without
; averaging (HIGH_APPLY_FILTER_4 0).
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row       rax = source stride in bytes
;   rdi = destination row  rdx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d4_h8_sse2)
sym(aom_highbd_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: seven 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm4,   [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ; Byte shifts line the samples up so that the low four words of xmmN
    ; start at pixel (N - 3): xmm0 = -3, xmm1 = -2, ... xmm7 = +4.
    psrldq      xmm1, 2
    psrldq      xmm6, 4
    psrldq      xmm7, 6
    psrldq      xmm2, 4
    psrldq      xmm3, 6
    psrldq      xmm5, 2

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 7
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_highbd_filter_block1d8_h8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Horizontal 8-tap filter producing eight 16-bit pixels per row.
; Tap n is applied to the pixels starting at src + (n - 3): eight
; unaligned 16-byte loads at byte offsets -6, -4, ... +8, so each row is
; read from 6 bytes before src_ptr to 24 bytes after it. Results are
; clamped and written without averaging (HIGH_APPLY_FILTER_8 0, 0).
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row       rax = source stride in bytes
;   rdi = destination row  rdx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d8_h8_sse2)
sym(aom_highbd_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    ; xmmN = eight pixels starting at src + (N - 3)
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_highbd_filter_block1d16_h8_sse2
;(
;    const uint16_t  *src_ptr,
;    const ptrdiff_t  src_pitch,
;    uint16_t        *output_ptr,
;    ptrdiff_t        out_pitch,
;    unsigned int     output_height,
;    const int16_t   *filter,
;    int              bd
;)
;
; Horizontal 8-tap filter producing sixteen 16-bit pixels per row, as two
; 8-pixel halves. The first half loads at byte offsets -6 .. +8 and is
; written to [rdi]; the second half loads 16 bytes further along
; (+10 .. +24) and is written to [rdi + 16]. Each row is therefore read
; from 6 bytes before src_ptr to 40 bytes after it. Results are clamped
; and written without averaging.
;
; Notes:
;   - src_pitch and out_pitch are given in pixels and are read as 32-bit
;     signed values (movsxd from DWORD), then doubled to get bytes.
;   - The loop tests its counter after the body, so output_height must be
;     non-zero.
;
; Register roles inside the loop:
;   rsi = source row       rax = source stride in bytes
;   rdi = destination row  rdx = destination stride in bytes
;   rcx = rows remaining
globalsym(aom_highbd_filter_block1d16_h8_sse2)
sym(aom_highbd_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight 16-byte slots used by the filter macros.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    ; Left half: xmmN = eight pixels starting at src + (N - 3)
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    ; Right half: the same eight windows, 16 bytes (8 pixels) further on
    movdqu      xmm0,   [rsi + 10]           ;load src
    movdqu      xmm1,   [rsi + 12]
    movdqu      xmm2,   [rsi + 14]
    movdqu      xmm3,   [rsi + 16]
    movdqu      xmm4,   [rsi + 18]
    movdqu      xmm5,   [rsi + 20]
    movdqu      xmm6,   [rsi + 22]
    movdqu      xmm7,   [rsi + 24]

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    ; Release the scratch area. The pop rsp presumably reloads the stack
    ; pointer saved by ALIGN_STACK - confirm against x86_abi_support.asm.
    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
