xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Argument and constant setup used by the 4-pixel-wide kernels.
;
; Register state on exit:
;   rsi  = src_ptr          (arg 0)
;   rax  = pixels_per_line  (arg 1, read as a DWORD)
;   rdi  = output_ptr       (arg 2)
;   rdx  = out_pitch        (arg 3, read as a DWORD)
;   rcx  = output_height    (arg 4, read as a DWORD)
;   xmm4 = filter words 3 and 4 interleaved: k3,k4,k3,k4,...
;   xmm3 = 64 in every dword (rounding term)
;   xmm5 = (1 << bd) - 1 in every word (upper clamp), bd = arg 6
;   xmm2 = 0 (lower clamp)
; xmm1 is overwritten as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov         rsi, arg(0)                 ; source pixels
    mov         rdi, arg(2)                 ; destination pixels
    mov         rdx, arg(5)                 ; filter table

    ; Build the k3,k4 word pairs consumed by pmaddwd.
    movdqa      xmm3, [rdx]                 ; aligned load: table must be 16-byte aligned
    pshuflw     xmm4, xmm3, 0xff            ; low four words <- word 3 (k3)
    psrldq      xmm3, 8                     ; words 4..7 move into the low half
    pshuflw     xmm3, xmm3, 0               ; low four words <- word 4 (k4)
    punpcklwd   xmm4, xmm3                  ; k3,k4 repeated four times

    ; Rounding term, replicated into all four dwords.
    mov         rcx, 64
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0

    ; Upper clamp: (1 << bd) - 1 in each word.
    mov         rdx, 0x00010001
    movq        xmm5, rdx
    pshufd      xmm5, xmm5, 0               ; 1 in every word
    movdqa      xmm1, xmm5                  ; keep the ones for the subtraction
    movsxd      rcx, DWORD PTR arg(6)       ; bd
    movq        xmm2, rcx                   ; shift count for psllw
    psllw       xmm5, xmm2                  ; 1 << bd
    psubw       xmm5, xmm1                  ; (1 << bd) - 1

    ; Lower clamp.
    pxor        xmm2, xmm2

    ; Strides (in pixels) and row count.
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
42
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters one row of four 16-bit pixels and steps to the next row.
;
;   %1   = 0: store the filtered row; non-zero: pavgw it with the row
;          already at [rdi] before storing
; In:    xmm0 = first-tap pixels (low 4 words), xmm1 = second-tap pixels
;        xmm4 = k3,k4 pairs, xmm3 = rounding, xmm5 = max, xmm2 = 0
;        rsi/rdi = source/destination, rax/rdx = strides in pixels
; Out:   8 bytes written to [rdi]; rsi += 2*rax, rdi += 2*rdx, rcx -= 1.
;        The final dec rcx leaves ZF set when the row counter reaches 0.
; Clobb: xmm0 (and xmm1 when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm1                  ; pair each first-tap word with its second-tap word
    lea         rsi, [rsi + 2*rax]          ; source -> next row (2 bytes per pixel)
%if %1
    movq        xmm1, [rdi]                 ; current destination row, for averaging
%endif

    pmaddwd     xmm0, xmm4                  ; a*k3 + b*k4 per dword
    paddd       xmm0, xmm3                  ; + rounding
    psrad       xmm0, 7                     ; >> 7
    packssdw    xmm0, xmm0                  ; dwords -> words (signed saturation)

    ; clamp to [0, max]
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    pavgw       xmm0, xmm1                  ; (filtered + existing + 1) >> 1
%endif

    movq        [rdi], xmm0                 ; store four pixels
    lea         rdi, [rdi + 2*rdx]          ; destination -> next row
    dec         rcx                         ; one row done; flags feed the caller's branch
%endm
66
67%if VPX_ARCH_X86_64
;-----------------------------------------------------------------------
; HIGH_GET_PARAM
; Argument and constant setup used by the 8- and 16-pixel-wide kernels.
; It uses xmm8 and sits inside the VPX_ARCH_X86_64 conditional.
;
; Register state on exit:
;   rsi  = src_ptr          (arg 0)
;   rax  = pixels_per_line  (arg 1, read as a DWORD)
;   rdi  = output_ptr       (arg 2)
;   rdx  = out_pitch        (arg 3, read as a DWORD)
;   rcx  = output_height    (arg 4, read as a DWORD)
;   xmm7 = filter words 3 and 4 interleaved: k3,k4,k3,k4,...
;   xmm4 = 64 in every dword (rounding term)
;   xmm8 = (1 << bd) - 1 in every word (upper clamp), bd = arg 6
;   xmm5 = 0 (lower clamp)
; xmm1 and xmm6 are overwritten as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov         rsi, arg(0)                 ; source pixels
    mov         rdi, arg(2)                 ; destination pixels
    mov         rdx, arg(5)                 ; filter table

    ; Build the k3,k4 word pairs consumed by pmaddwd.
    movdqa      xmm6, [rdx]                 ; aligned load: table must be 16-byte aligned
    pshuflw     xmm7, xmm6, 0xff            ; low four words <- word 3 (k3)
    pshufhw     xmm6, xmm6, 0               ; high four words <- word 4 (k4)
    psrldq      xmm6, 8                     ; bring the k4 copies into the low half
    punpcklwd   xmm7, xmm6                  ; k3,k4 repeated four times

    ; Rounding term, replicated into all four dwords.
    mov         rcx, 64
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0

    ; Upper clamp: (1 << bd) - 1 in each word.
    mov         rdx, 0x00010001
    movq        xmm8, rdx
    pshufd      xmm8, xmm8, 0               ; 1 in every word
    movdqa      xmm1, xmm8                  ; keep the ones for the subtraction
    movsxd      rcx, DWORD PTR arg(6)       ; bd
    movq        xmm5, rcx                   ; shift count for psllw
    psllw       xmm8, xmm5                  ; 1 << bd
    psubw       xmm8, xmm1                  ; (1 << bd) - 1

    ; Lower clamp.
    pxor        xmm5, xmm5

    ; Strides (in pixels) and row count.
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
98
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
; Filters one row of eight 16-bit pixels and steps to the next row.
;
;   %1   = 0: store the filtered row; non-zero: pavgw it with the row
;          already at [rdi] before storing
; In:    xmm0 = first-tap pixels, xmm1 = second-tap pixels
;        xmm7 = k3,k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
;        rsi/rdi = source/destination, rax/rdx = strides in pixels
; Out:   16 bytes written to [rdi] (unaligned store);
;        rsi += 2*rax, rdi += 2*rdx, rcx -= 1 (flags from dec rcx).
; Clobb: xmm0, xmm6 (and xmm1 when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0                  ; copy taken before xmm0 is overwritten

    ; pixels 0..3
    punpcklwd   xmm0, xmm1                  ; (first tap, second tap) word pairs
    pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 per dword
    paddd       xmm0, xmm4                  ; + rounding
    psrad       xmm0, 7                     ; >> 7

    ; pixels 4..7
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm6, xmm7
    paddd       xmm6, xmm4
    psrad       xmm6, 7

    packssdw    xmm0, xmm6                  ; eight words again (signed saturation)

    ; clamp to [0, max]
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

%if %1
    movdqu      xmm1, [rdi]                 ; current destination row
    pavgw       xmm0, xmm1                  ; (filtered + existing + 1) >> 1
%endif
    movdqu      [rdi], xmm0                 ; store eight pixels

    lea         rdi, [rdi + 2*rdx]          ; destination -> next row
    lea         rsi, [rsi + 2*rax]          ; source -> next row
    dec         rcx                         ; one row done; flags feed the caller's branch
%endm
126
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
; Filters one row of sixteen 16-bit pixels and steps to the next row.
;
;   %1   = 0: store the filtered row; non-zero: pavgw it with the row
;          already at [rdi] before storing
; In:    xmm0 / xmm2 = first-tap pixels 0..7 / 8..15
;        xmm1 / xmm3 = second-tap pixels 0..7 / 8..15
;        xmm7 = k3,k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
;        rsi/rdi = source/destination, rax/rdx = strides in pixels
; Out:   32 bytes written to [rdi] (unaligned stores);
;        rsi += 2*rax, rdi += 2*rdx, rcx -= 1 (flags from dec rcx).
; Clobb: xmm0, xmm2, xmm6, xmm9 (and xmm1, xmm3 when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    ; ---- pixels 0..7: xmm0 (low four) and xmm9 (high four) ----
    movdqa      xmm9, xmm0
    punpcklwd   xmm0, xmm1                  ; (first tap, second tap) word pairs
    punpckhwd   xmm9, xmm1
    pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 per dword
    pmaddwd     xmm9, xmm7
    paddd       xmm0, xmm4                  ; + rounding
    paddd       xmm9, xmm4
    psrad       xmm0, 7                     ; >> 7
    psrad       xmm9, 7
    packssdw    xmm0, xmm9                  ; eight words again (signed saturation)
    pminsw      xmm0, xmm8                  ; clamp to [0, max]
    pmaxsw      xmm0, xmm5

    ; ---- pixels 8..15: xmm2 (low four) and xmm6 (high four) ----
    movdqa      xmm6, xmm2
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm2, xmm7
    pmaddwd     xmm6, xmm7
    paddd       xmm2, xmm4
    paddd       xmm6, xmm4
    psrad       xmm2, 7
    psrad       xmm6, 7
    packssdw    xmm2, xmm6
    pminsw      xmm2, xmm8
    pmaxsw      xmm2, xmm5

%if %1
    movdqu      xmm1, [rdi]                 ; current destination row
    movdqu      xmm3, [rdi + 16]
    pavgw       xmm0, xmm1                  ; (filtered + existing + 1) >> 1
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ; pixels 0..7
    movdqu      [rdi + 16], xmm2            ; pixels 8..15

    lea         rdi, [rdi + 2*rdx]          ; destination -> next row
    lea         rsi, [rsi + 2*rax]          ; source -> next row
    dec         rcx                         ; one row done; flags feed the caller's branch
%endm
172%endif
173
174SECTION .text
175
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_v2_sse2
; Two-tap vertical filter over rows of four 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM_4 through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; where "below" is the pixel pixels_per_line further on in the source.
; The loop body runs before the row counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d4_v2_sse2)
sym(vpx_highbd_filter_block1d4_v2_sse2):
    ; prologue (SHADOW_ARGS_TO_STACK comes from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm1, [rsi + 2*rax]         ; four pixels from the row below
    movq        xmm0, [rsi]                 ; four pixels from the current row

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_4 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
199
200%if VPX_ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_v2_sse2
; Two-tap vertical filter over rows of eight 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm8. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d8_v2_sse2)
sym(vpx_highbd_filter_block1d8_v2_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; eight pixels from the row below
    movdqu      xmm0, [rsi]                 ; eight pixels from the current row

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_8 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
226
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_v2_sse2
; Two-tap vertical filter over rows of sixteen 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm9. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d16_v2_sse2)
sym(vpx_highbd_filter_block1d16_v2_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    ; current row: pixels 0..7 and 8..15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ; row below: pixels 0..7 and 8..15
    movdqu      xmm1, [rsi + 2*rax]
    movdqu      xmm3, [rsi + 2*rax + 16]

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_16 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
254%endif
255
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_v2_avg_sse2
; Two-tap vertical filter over rows of four 16-bit pixels, averaged
; into the destination.
; Arguments, as fetched by HIGH_GET_PARAM_4 through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The loop body runs before the row counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2)
sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
    ; prologue (SHADOW_ARGS_TO_STACK comes from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm1, [rsi + 2*rax]         ; four pixels from the row below
    movq        xmm0, [rsi]                 ; four pixels from the current row

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_4 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
279
280%if VPX_ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_v2_avg_sse2
; Two-tap vertical filter over rows of eight 16-bit pixels, averaged
; into the destination.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm8. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2)
sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; eight pixels from the row below
    movdqu      xmm0, [rsi]                 ; eight pixels from the current row

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_8 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
306
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_v2_avg_sse2
; Two-tap vertical filter over rows of sixteen 16-bit pixels, averaged
; into the destination.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((cur*k3 + below*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm9. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2)
sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    ; pixels 0..7: current row, then the row below
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi + 2*rax]
    ; pixels 8..15: current row, then the row below
    movdqu      xmm2, [rsi + 16]
    movdqu      xmm3, [rsi + 2*rax + 16]

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_16 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
334%endif
335
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_h2_sse2
; Two-tap horizontal filter over rows of four 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM_4 through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
;
; Only source pixels 0..4 of a row are read (10 bytes): two 8-byte
; loads supply exactly the low four words that HIGH_APPLY_FILTER_4
; consumes from xmm0 and xmm1, so nothing beyond the last needed pixel
; is touched.
; The loop body runs before the row counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d4_h2_sse2)
sym(vpx_highbd_filter_block1d4_h2_sse2):
    ; prologue (SHADOW_ARGS_TO_STACK comes from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm0, [rsi]                 ; pixels 0..3 (first tap)
    movq        xmm1, [rsi + 2]             ; pixels 1..4 (second tap)

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_4 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
360
361%if VPX_ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_h2_sse2
; Two-tap horizontal filter over rows of eight 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm8. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d8_h2_sse2)
sym(vpx_highbd_filter_block1d8_h2_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2]             ; pixels 1..8 (second tap)
    movdqu      xmm0, [rsi]                 ; pixels 0..7 (first tap)

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_8 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
387
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_h2_sse2
; Two-tap horizontal filter over rows of sixteen 16-bit pixels.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is
;   clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm9. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d16_h2_sse2)
sym(vpx_highbd_filter_block1d16_h2_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    ; first tap: pixels 0..7 and 8..15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ; second tap: the same spans shifted one pixel (2 bytes) to the right
    movdqu      xmm1, [rsi + 2]
    movdqu      xmm3, [rsi + 18]

    ; filter, store, advance; the macro ends with dec rcx
    HIGH_APPLY_FILTER_16 0
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
415%endif
416
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_h2_avg_sse2
; Two-tap horizontal filter over rows of four 16-bit pixels, averaged
; into the destination.
; Arguments, as fetched by HIGH_GET_PARAM_4 through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
;
; Only source pixels 0..4 of a row are read (10 bytes): two 8-byte
; loads supply exactly the low four words that HIGH_APPLY_FILTER_4
; consumes from xmm0 and xmm1, so nothing beyond the last needed pixel
; is touched.
; The loop body runs before the row counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2)
sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
    ; prologue (SHADOW_ARGS_TO_STACK comes from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm0, [rsi]                 ; pixels 0..3 (first tap)
    movq        xmm1, [rsi + 2]             ; pixels 1..4 (second tap)

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_4 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
441
442%if VPX_ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_h2_avg_sse2
; Two-tap horizontal filter over rows of eight 16-bit pixels, averaged
; into the destination.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm8. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2)
sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2]             ; pixels 1..8 (second tap)
    movdqu      xmm0, [rsi]                 ; pixels 0..7 (first tap)

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_8 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
468
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_h2_avg_sse2
; Two-tap horizontal filter over rows of sixteen 16-bit pixels,
; averaged into the destination.
; Arguments, as fetched by HIGH_GET_PARAM through arg(n):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter, 6 bd
; Each output pixel is (f + old + 1) >> 1, where old is the value
; already in the destination and
;   f = clamp((src[x]*k3 + src[x+1]*k4 + 64) >> 7, 0, (1 << bd) - 1)
; The kernel touches xmm0..xmm9. The loop body runs before the row
; counter is tested.
;-----------------------------------------------------------------------
globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2)
sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
    ; prologue (helper macros come from the included ABI support file)
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    ; first tap: pixels 0..7 and 8..15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ; second tap: the same spans shifted one pixel (2 bytes) to the right
    movdqu      xmm1, [rsi + 2]
    movdqu      xmm3, [rsi + 18]

    ; filter, average with destination, store, advance; ends with dec rcx
    HIGH_APPLY_FILTER_16 1
    jnz         .next_row

    ; epilogue
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
496%endif
497