xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; GET_PARAM_4 -- register setup shared by the 4-pixel-wide kernels.
;   Reads : arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;           arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
;   Leaves: rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
;           rdx = out_pitch, rcx = output_height (the three DWORD args are
;           sign-extended), xmm2 = 0x0100 in every word,
;           xmm3 = byte pair (filter word 3, filter word 4) in each of the
;           low four words.
;   The filter is fetched with movdqa, so its address must be 16-byte aligned.
%macro GET_PARAM_4 0
    ; Multiplier for pmulhrsw: x * 0x0100 with rounding == (x + 64) >> 7.
    mov         ecx, 0x01000100
    movd        xmm2, ecx
    pshufd      xmm2, xmm2, 0               ;broadcast to all four dwords

    ; Build the tap pair. rdx is only a temporary here; it is reloaded with
    ; out_pitch below.
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm3, [rdx]                 ;all filter words
    psrldq      xmm3, 6                     ;discard words 0..2
    packsswb    xmm3, xmm3                  ;words -> signed bytes
    pshuflw     xmm3, xmm3, 0b              ;replicate word 0 (k3_k4)

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
31
; APPLY_FILTER_4 avg -- filter one row of 4 pixels and step to the next row.
;   In  : xmm0 / xmm1 = the two source byte vectors to blend,
;         xmm2 / xmm3 as prepared by GET_PARAM_4,
;         rsi / rdi = current source / output row, rax / rdx = their strides,
;         rcx = rows remaining.
;   %1  : non-zero -> average the result with the bytes already at [rdi].
;   Out : 4 bytes stored at [rdi]; rsi, rdi advanced by one row; rcx
;         decremented, and ZF reflects the new rcx for the caller's jnz.
;   Clobbers xmm0, xmm1.
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ;pair up bytes: a0 b0 a1 b1 ...
    lea         rsi, [rsi + rax]            ;source row is consumed; advance

    pmaddubsw   xmm0, xmm3                  ;a*k3 + b*k4 per pair
    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;saturate words to bytes

%if %1
    movd        xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1                  ;rounded byte average
%endif
    movd        [rdi], xmm0                 ;write 4 pixels
    lea         rdi, [rdi + rdx]            ;next output row

    dec         rcx                         ;sets ZF for the loop branch
%endm
48
; GET_PARAM -- register setup shared by the 8- and 16-pixel-wide kernels.
;   Reads : arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;           arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
;   Leaves: rsi = src_ptr, rdi = output_ptr, rax = pixels_per_line,
;           rdx = out_pitch, rcx = output_height (the three DWORD args are
;           sign-extended), xmm6 = 0x0100 in every word,
;           xmm7 = byte pair (filter word 3, filter word 4) in all eight words.
;   The filter is fetched with movdqa, so its address must be 16-byte aligned.
%macro GET_PARAM 0
    ; Multiplier for pmulhrsw: x * 0x0100 with rounding == (x + 64) >> 7.
    mov         ecx, 0x01000100
    movd        xmm6, ecx
    pshufd      xmm6, xmm6, 0               ;broadcast to all four dwords

    ; Build the tap pair. rdx is only a temporary here; it is reloaded with
    ; out_pitch below.
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm7, [rdx]                 ;all filter words
    psrldq      xmm7, 6                     ;discard words 0..2
    packsswb    xmm7, xmm7                  ;words -> signed bytes
    pshuflw     xmm7, xmm7, 0b              ;replicate word 0 (k3_k4)
    punpcklwd   xmm7, xmm7                  ;extend the pair to the high half

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
68
; APPLY_FILTER_8 avg -- filter one row of 8 pixels and step to the next row.
;   In  : xmm0 / xmm1 = the two source byte vectors to blend (low 8 bytes
;         are used), xmm6 / xmm7 as prepared by GET_PARAM,
;         rsi / rdi = current source / output row, rax / rdx = their strides,
;         rcx = rows remaining.
;   %1  : non-zero -> average the result with the bytes already at [rdi].
;   Out : 8 bytes stored at [rdi]; rsi, rdi advanced by one row; rcx
;         decremented, and ZF reflects the new rcx for the caller's jnz.
;   Clobbers xmm0, xmm1.
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ;pair up bytes: a0 b0 a1 b1 ...
    lea         rsi, [rsi + rax]            ;source row is consumed; advance

    pmaddubsw   xmm0, xmm7                  ;a*k3 + b*k4 per pair
    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;saturate words to bytes

%if %1
    movq        xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1                  ;rounded byte average
%endif
    movq        [rdi], xmm0                 ;write 8 pixels
    lea         rdi, [rdi + rdx]            ;next output row

    dec         rcx                         ;sets ZF for the loop branch
%endm
86
; APPLY_FILTER_16 avg -- filter one row of 16 pixels and step to the next row.
;   In  : xmm0 = first source vector, xmm2 = a copy of xmm0,
;         xmm1 = second source vector, xmm6 / xmm7 as prepared by GET_PARAM,
;         rsi / rdi = current source / output row, rax / rdx = their strides,
;         rcx = rows remaining.
;   %1  : non-zero -> average the result with the bytes already at [rdi].
;   Out : 16 bytes stored at [rdi] (unaligned store); rsi, rdi advanced by one
;         row; rcx decremented, and ZF reflects the new rcx for the caller's
;         jnz.
;   Clobbers xmm0, xmm1, xmm2.
%macro APPLY_FILTER_16 1
    ; pixels 0..7
    punpcklbw   xmm0, xmm1                  ;pair up the low 8 bytes
    pmaddubsw   xmm0, xmm7                  ;a*k3 + b*k4 per pair
    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)

    ; pixels 8..15
    punpckhbw   xmm2, xmm1                  ;pair up the high 8 bytes
    pmaddubsw   xmm2, xmm7
    pmulhrsw    xmm2, xmm6

    packuswb    xmm0, xmm2                  ;merge both halves as bytes

%if %1
    movdqu      xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1                  ;rounded byte average
%endif
    movdqu      [rdi], xmm0                 ;write 16 pixels

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the loop branch
%endm
107
108SECTION .text
109
;------------------------------------------------------------------------------
; vpx_filter_block1d4_v2_ssse3
;   Vertical 2-tap filter, 4 pixels per row; the result overwrites the output.
;   Arguments (read through arg(n) in GET_PARAM_4):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it.
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d4_v2_ssse3)
sym(vpx_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;row below
    movd        xmm0, [rsi]                 ;current row

    APPLY_FILTER_4 0                        ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
133
;------------------------------------------------------------------------------
; vpx_filter_block1d8_v2_ssse3
;   Vertical 2-tap filter, 8 pixels per row; the result overwrites the output.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d8_v2_ssse3)
sym(vpx_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;row below
    movq        xmm0, [rsi]                 ;current row

    APPLY_FILTER_8 0                        ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
159
;------------------------------------------------------------------------------
; vpx_filter_block1d16_v2_ssse3
;   Vertical 2-tap filter, 16 pixels per row; the result overwrites the output.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it. Source and output accesses are unaligned (movdqu).
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d16_v2_ssse3)
sym(vpx_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;current row
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqu      xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_16 0                       ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
186
;------------------------------------------------------------------------------
; vpx_filter_block1d4_v2_avg_ssse3
;   Vertical 2-tap filter, 4 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM_4):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it.
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d4_v2_avg_ssse3)
sym(vpx_filter_block1d4_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;row below
    movd        xmm0, [rsi]                 ;current row

    APPLY_FILTER_4 1                        ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
210
;------------------------------------------------------------------------------
; vpx_filter_block1d8_v2_avg_ssse3
;   Vertical 2-tap filter, 8 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d8_v2_avg_ssse3)
sym(vpx_filter_block1d8_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;row below
    movq        xmm0, [rsi]                 ;current row

    APPLY_FILTER_8 1                        ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
236
;------------------------------------------------------------------------------
; vpx_filter_block1d16_v2_avg_ssse3
;   Vertical 2-tap filter, 16 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Every output row blends the source row at rsi with the row one stride
;   (rax) below it. Source and output accesses are unaligned (movdqu).
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d16_v2_avg_ssse3)
sym(vpx_filter_block1d16_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;current row
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqu      xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_16 1                       ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
263
;------------------------------------------------------------------------------
; vpx_filter_block1d4_h2_ssse3
;   Horizontal 2-tap filter, 4 pixels per row; the result overwrites the
;   output.
;   Arguments (read through arg(n) in GET_PARAM_4):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row, so exactly
;   5 source bytes per row contribute. The row is fetched with two 4-byte
;   loads (offsets 0 and 1) rather than one 16-byte load, so nothing beyond
;   those 5 bytes is touched.
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d4_h2_ssse3)
sym(vpx_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4], the right neighbours

    APPLY_FILTER_4 0                        ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
288
;------------------------------------------------------------------------------
; vpx_filter_block1d8_h2_ssse3
;   Horizontal 2-tap filter, 8 pixels per row; the result overwrites the
;   output.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row, so exactly
;   9 source bytes per row contribute. The row is fetched with two 8-byte
;   loads (offsets 0 and 1) rather than one 16-byte load, so nothing beyond
;   those 9 bytes is touched.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d8_h2_ssse3)
sym(vpx_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8], the right neighbours

    APPLY_FILTER_8 0                        ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
315
;------------------------------------------------------------------------------
; vpx_filter_block1d16_h2_ssse3
;   Horizontal 2-tap filter, 16 pixels per row; the result overwrites the
;   output.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row; the
;   neighbours come from a second unaligned load at offset 1, so 17 source
;   bytes are read per row.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d16_h2_ssse3)
sym(vpx_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqu      xmm1, [rsi + 1]             ;src[1..16], the right neighbours

    APPLY_FILTER_16 0                       ;ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
342
;------------------------------------------------------------------------------
; vpx_filter_block1d4_h2_avg_ssse3
;   Horizontal 2-tap filter, 4 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM_4):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row, so exactly
;   5 source bytes per row contribute. The row is fetched with two 4-byte
;   loads (offsets 0 and 1) rather than one 16-byte load, so nothing beyond
;   those 5 bytes is touched.
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d4_h2_avg_ssse3)
sym(vpx_filter_block1d4_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4], the right neighbours

    APPLY_FILTER_4 1                        ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
367
;------------------------------------------------------------------------------
; vpx_filter_block1d8_h2_avg_ssse3
;   Horizontal 2-tap filter, 8 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row, so exactly
;   9 source bytes per row contribute. The row is fetched with two 8-byte
;   loads (offsets 0 and 1) rather than one 16-byte load, so nothing beyond
;   those 9 bytes is touched.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d8_h2_avg_ssse3)
sym(vpx_filter_block1d8_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8], the right neighbours

    APPLY_FILTER_8 1                        ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
394
;------------------------------------------------------------------------------
; vpx_filter_block1d16_h2_avg_ssse3
;   Horizontal 2-tap filter, 16 pixels per row; the filtered row is averaged
;   (pavgb) with the bytes already in the output before being stored.
;   Arguments (read through arg(n) in GET_PARAM):
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr.
;   Output pixel x blends source bytes x and x+1 of the same row; the
;   neighbours come from a second unaligned load at offset 1, so 17 source
;   bytes are read per row.
;   xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros come
;   from x86_abi_support.asm).
;   The loop body runs before the row count is tested, so a height of 0 is
;   not handled.
;------------------------------------------------------------------------------
globalsym(vpx_filter_block1d16_h2_avg_ssse3)
sym(vpx_filter_block1d16_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqu      xmm1, [rsi + 1]             ;src[1..16], the right neighbours

    APPLY_FILTER_16 1                       ;averaging store; ends with dec rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
421