xref: /aosp_15_r20/external/libvpx/vp8/encoder/x86/dct_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; STACK_FRAME_CREATE
; Common prologue for the routines in this file.  After it has run, the
; names input / output / pitch are aliases for the registers that hold the
; three C arguments (short *input, short *output, int pitch):
;   32-bit ABI   : rsi / rdi / rax, loaded from the stack through arg(n)
;   Win64        : rcx / rdx / r8,  the incoming argument registers
;   other 64-bit : rdi / rsi / rdx, the incoming argument registers
; pitch is a C int.  On every path it is sign-extended to full register
; width before use, because the routines add it to a pointer and the 64-bit
; calling conventions do not define the upper half of a register that
; carries a 32-bit argument.
; arg(), GET_GOT and SAVE_XMM are provided by the included
; vpx_ports/x86_abi_support.asm; see that file for their exact effect.
14%macro STACK_FRAME_CREATE 0
15%if ABI_IS_32BIT
16  %define       input       rsi
17  %define       output      rdi
18  %define       pitch       rax
19    push        rbp
20    mov         rbp, rsp
21    GET_GOT     rbx
22    push        rsi
23    push        rdi
24    ; end prolog
25
26    mov         rsi, arg(0)
27    mov         rdi, arg(1)
28
29    movsxd      rax, dword ptr arg(2)
    ; NOTE(review): rcx is not read by either routine in this file on the
    ; 32-bit path; the lea below looks like a leftover - confirm before removing.
30    lea         rcx, [rsi + rax*2]
31%else
32  %if LIBVPX_YASM_WIN64
33    %define     input       rcx
34    %define     output      rdx
35    %define     pitch       r8
36    SAVE_XMM 7, u
    movsxd      r8, r8d                         ; pitch: int -> 64-bit, as on the 32-bit path
37  %else
38    %define     input       rdi
39    %define     output      rsi
40    %define     pitch       rdx
    movsxd      rdx, edx                        ; pitch: int -> 64-bit, as on the 32-bit path
41  %endif
42%endif
43%endmacro
44
; STACK_FRAME_DESTROY
; Common epilogue matching STACK_FRAME_CREATE; ends with ret, so it must be
; the last thing in a routine.
; The input / output / pitch aliases are redefined to empty text so they no
; longer name a register after this point.
45%macro STACK_FRAME_DESTROY 0
46  %define     input
47  %define     output
48  %define     pitch
49
50%if ABI_IS_32BIT
    ; undo the 32-bit prologue in reverse order of its pushes
51    pop         rdi
52    pop         rsi
53    RESTORE_GOT
54    pop         rbp
55%else
56  %if LIBVPX_YASM_WIN64
    ; counterpart of the SAVE_XMM issued in STACK_FRAME_CREATE
57    RESTORE_XMM
58  %endif
59%endif
60    ret
61%endmacro
62
63SECTION .text
64
65;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Forward 4x4 transform of 16-bit samples, done as two 1-D passes.
;   input  - four rows of four words, read at input, input+pitch,
;            input+2*pitch and input+3*pitch (pitch is a byte offset).
;   output - 16 words, written with two aligned 16-byte stores, so the
;            destination has to be 16-byte aligned.
;   Uses xmm0-xmm5.
; Lane listings in the comments run from the highest word of the register
; (left) to the lowest (right); unless stated otherwise the two digits of
; an entry are (row, column).
66globalsym(vp8_short_fdct4x4_sse2)
67sym(vp8_short_fdct4x4_sse2):
68
69    STACK_FRAME_CREATE
70
71    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
72    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
73    lea         input,          [input+2*pitch] ;step down two rows
74    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
75    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
76
77    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
78    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
79
    ; regroup so that each dword of xmm0 holds columns 0,1 of one row and the
    ; matching dword of xmm1 holds columns 3,2 of the same row
80    movdqa      xmm2, xmm0
81    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
82    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
83    movdqa      xmm1, xmm0
84    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
85    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
86    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
87
88    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    ; first pass (along each row): a1 = c0+c3, b1 = c1+c2, c1 = c1-c2, d1 = c0-c3
89    movdqa      xmm3, xmm0
90    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
91    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
92    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
93    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
94
    ; pmaddwd multiplies word pairs and sums them into one dword per row
95    movdqa      xmm1, xmm0
96    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
97    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
98    movdqa      xmm4, xmm3
99    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
100    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
101
102    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
103    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
104    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
105    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
106
107    packssdw    xmm0, xmm1                      ;op[2] op[0]
108    packssdw    xmm3, xmm4                      ;op[3] op[1]
109    ; xmm0: 23 22 21 20 03 02 01 00
110    ;       (in this listing and the next three, the digits are
111    ; xmm3: 33 32 31 30 13 12 11 10
112    ;        coefficient index then source row)
113    movdqa      xmm2, xmm0
114    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
115    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20
116
    ; word interleaves finish the regrouping; listings below are (row, column)
    ; of the first-pass result
117    movdqa      xmm3, xmm0
118    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
119    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
120    movdqa      xmm2, xmm0
121    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
122    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
123
    ; second pass (down each column): row0 +/- row3 and row1 +/- row2
124    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;rounding term for the >>4 below
125    pshufd      xmm2, xmm2, 04eh                ;swap 64-bit halves: 23 22 21 20 33 32 31 30
126    movdqa      xmm3, xmm0
127    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
128    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
129
    ; shuffle so that a1/b1 (and d1/c1) of the same column sit in one dword
130    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
131    movdqa      xmm2, xmm3                      ;save d1 (low four words) for compare
132    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
133    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
134    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
135    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
136    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
137    movdqa      xmm1, xmm0
138    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
139    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
140
141    pxor        xmm4, xmm4                      ;zero out for compare
142    paddd       xmm0, xmm5
143    paddd       xmm1, xmm5
144    pcmpeqw     xmm2, xmm4                      ;0xffff in every lane that equals 0
145    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
146    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
147    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;~xmm2 & mask: upper four words 0,
148                                                     ;lower four words 1 where d1 != 0
149
150    movdqa      xmm4, xmm3
151    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
152    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
153    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
154    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
155    packssdw    xmm0, xmm1                      ;op[8] op[0]
156    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
157    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
158
159    packssdw    xmm3, xmm4                      ;op[12] op[4]
160    movdqa      xmm1, xmm0
161    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
162    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
163    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
164
    ; aligned stores: words 0-7, then words 8-15
165    movdqa      XMMWORD PTR[output +  0], xmm0
166    movdqa      XMMWORD PTR[output + 16], xmm1
167
168    STACK_FRAME_DESTROY
169
170;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Forward transform of two horizontally adjacent 4x4 blocks at once
; (columns 0-3 and columns 4-7 of an 8-wide, 4-high area).
;   input  - four rows of eight words, read with aligned 16-byte loads at
;            input, input+pitch, input+2*pitch and input+3*pitch (pitch is
;            a byte offset), so every row has to be 16-byte aligned.
;   output - 32 words written with four aligned 16-byte stores: the 16
;            coefficients for columns 0-3 first, then the 16 for columns 4-7.
;   Uses xmm0-xmm6.
; Lane listings in the comments run from the lowest word of the register
; (left) to the highest (right); the two digits are (row, column).
171globalsym(vp8_short_fdct8x4_sse2)
172sym(vp8_short_fdct8x4_sse2):
173
174    STACK_FRAME_CREATE
175
176        ; read the input data
177        movdqa      xmm0,       [input        ]
178        movdqa      xmm2,       [input+  pitch]
179        lea         input,      [input+2*pitch]
180        movdqa      xmm4,       [input        ]
181        movdqa      xmm3,       [input+  pitch]
182
183        ; transpose for the first stage
184        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
185        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
186
187        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
188        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
189
190        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
191        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
192
193        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
194        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
195
196        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
197
198        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
199        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
200
201        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
202        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
203
204        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
205        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
206
207        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
208        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
209
210        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
211
        ; each register now holds one column of both blocks (block column
        ; n of the left block in the low half, of the right block in the high half)
212        ; xmm0 0
213        ; xmm1 1
214        ; xmm2 2
215        ; xmm3 3
216
217        ; first stage
218        movdqa      xmm5,       xmm0
219        movdqa      xmm4,       xmm1
220
221        paddw       xmm0,       xmm3        ; a1 = 0 + 3
222        paddw       xmm1,       xmm2        ; b1 = 1 + 2
223
224        psubw       xmm4,       xmm2        ; c1 = 1 - 2
225        psubw       xmm5,       xmm3        ; d1 = 0 - 3
226
        ; scale all four terms by 8
227        psllw       xmm5,        3
228        psllw       xmm4,        3
229
230        psllw       xmm0,        3
231        psllw       xmm1,        3
232
233        ; output 0 and 2
234        movdqa      xmm2,       xmm0        ; a1
235
236        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
237        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
238
239        ; output 1 and 3
240        ; interleave c1, d1
241        movdqa      xmm1,       xmm5        ; d1
242        punpcklwd   xmm1,       xmm4        ; c1 d1
243        punpckhwd   xmm5,       xmm4        ; c1 d1
244
245        movdqa      xmm3,       xmm1
246        movdqa      xmm4,       xmm5
247
248        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
249        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
250
251        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
252        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
253
254        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
255        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
256        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
257        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
258
259        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
260        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
261        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
262        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
263
264        packssdw    xmm1,       xmm4        ; op[1]
265        packssdw    xmm3,       xmm5        ; op[3]
266
267        ; done with vertical
268        ; transpose for the second stage
269        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
270        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
271
272        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
273        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
274
275        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
276        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
277
278        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
279        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
280
281        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
282
283        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
284        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
285
286        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
287        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
288
289        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
290        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
291
292        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
293        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
294
295        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
296
297        ; xmm0 row 0
298        ; xmm4 row 1
299        ; xmm1 row 2
300        ; xmm3 row 3
301
        ; second stage
302        movdqa      xmm5,       xmm0
303        movdqa      xmm2,       xmm1        ; keep row 2 for c1
304
305        paddw       xmm0,       xmm3        ; a1 = 0 + 3
306        paddw       xmm1,       xmm4        ; b1 = 1 + 2
307
308        psubw       xmm4,       xmm2        ; c1 = 1 - 2
309        psubw       xmm5,       xmm3        ; d1 = 0 - 3
310
311        pxor        xmm6,       xmm6        ; zero out for compare
312
313        pcmpeqw     xmm6,       xmm5        ; 0xffff in lanes where d1 == 0
314
315        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; ~xmm6 & mask: 1 in every
316                                                                    ; lane where d1 != 0
317
318        ; output 0 and 2
319        movdqa      xmm2,       xmm0        ; a1
320
321        paddw       xmm0,       xmm1        ; a1 + b1
322        psubw       xmm2,       xmm1        ; a1 - b1
323
324        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
325        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
326
327        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
328        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
329
330        ; output 1 and 3
331        ; interleave c1, d1
332        movdqa      xmm1,       xmm5        ; d1
333        punpcklwd   xmm1,       xmm4        ; c1 d1
334        punpckhwd   xmm5,       xmm4        ; c1 d1
335
336        movdqa      xmm3,       xmm1
337        movdqa      xmm4,       xmm5
338
339        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
340        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
341
342        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
343        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
344
345        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
346        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
347        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
348        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
349
350        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
351        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
352        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
353        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
354
355        packssdw    xmm1,       xmm4        ; op[4]
356        packssdw    xmm3,       xmm5        ; op[12]
357
358        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
359
        ; split each result row into its left-block (low) and right-block
        ; (high) half and pair the halves up for the stores
360        movdqa      xmm4,       xmm0
361        movdqa      xmm5,       xmm2
362
363        punpcklqdq  xmm0,       xmm1        ; columns 0-3: op[0..3]  op[4..7]
364        punpckhqdq  xmm4,       xmm1        ; columns 4-7: op[0..3]  op[4..7]
365
366        punpcklqdq  xmm2,       xmm3        ; columns 0-3: op[8..11] op[12..15]
367        punpckhqdq  xmm5,       xmm3        ; columns 4-7: op[8..11] op[12..15]
368
369        movdqa      XMMWORD PTR[output + 0 ],  xmm0
370        movdqa      XMMWORD PTR[output + 16],  xmm2
371        movdqa      XMMWORD PTR[output + 32],  xmm4
372        movdqa      XMMWORD PTR[output + 48],  xmm5
373
374    STACK_FRAME_DESTROY
375
; Constant tables used by the two routines above.  Each table is 16 bytes
; and 16-byte aligned because it is used as a full-width memory operand.
; Word order matters: the first word of each pair lines up with the lower
; word of the matching register pair.
376SECTION_RODATA
377align 16
; pmaddwd operand: lower word * 5352 + upper word * 2217 (register pairs are d1 low, c1 high)
378_5352_2217:
379    dw 5352
380    dw 2217
381    dw 5352
382    dw 2217
383    dw 5352
384    dw 2217
385    dw 5352
386    dw 2217
387align 16
; pmaddwd operand: lower word * 2217 - upper word * 5352
388_2217_neg5352:
389    dw 2217
390    dw -5352
391    dw 2217
392    dw -5352
393    dw 2217
394    dw -5352
395    dw 2217
396    dw -5352
397align 16
; pmaddwd operand: sum of each word pair
398_mult_add:
399    times 8 dw 1
400align 16
; 4x4 routine: 1 in the four low words, 0 in the four high words
401_cmp_mask:
402    times 4 dw 1
403    times 4 dw 0
404align 16
; 8x4 routine: 1 in all eight words
405_cmp_mask8x4:
406    times 8 dw 1
407align 16
; pmaddwd operand: lower word minus upper word of each pair
408_mult_sub:
409    dw 1
410    dw -1
411    dw 1
412    dw -1
413    dw 1
414    dw -1
415    dw 1
416    dw -1
417align 16
; dword rounding term added before the >>4 in the 4x4 routine
418_7:
419    times 4 dd 7
420align 16
; word rounding term added before the >>4 in the 8x4 routine
421_7w:
422    times 8 dw 7
423align 16
; dword rounding terms added before the >>12 of the first pass
424_14500:
425    times 4 dd 14500
426align 16
427_7500:
428    times 4 dd 7500
429align 16
; dword rounding terms added before the >>16 of the second pass
430_12000:
431    times 4 dd 12000
432align 16
433_51000:
434    times 4 dd 51000
435
435