xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jidctflt-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1;
2; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2018, Matthias Räncker.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; This file contains a floating-point implementation of the inverse DCT
19; (Discrete Cosine Transform). The following code is based directly on
20; the IJG's original jidctflt.c; see jidctflt.c for more details.
21
22%include "jsimdext.inc"
23%include "jdct.inc"
24
25; --------------------------------------------------------------------------
26
; unpcklps2 dst, src
; Combine the low halves of two 4-float vectors: dst keeps its own elements
; 0 and 1 in lanes 0-1 and receives elements 0 and 1 of src in lanes 2-3.
; The shufps immediate 0x44 = 01_00_01_00b encodes the four 2-bit selectors
; (lane0=dst[0], lane1=dst[1], lane2=src[0], lane3=src[1]).
27%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
28    shufps      %1, %2, 0x44
29%endmacro
30
; unpckhps2 dst, src
; Combine the high halves of two 4-float vectors: dst moves its own elements
; 2 and 3 into lanes 0-1 and receives elements 2 and 3 of src in lanes 2-3.
; The shufps immediate 0xEE = 11_10_11_10b encodes the four 2-bit selectors
; (lane0=dst[2], lane1=dst[3], lane2=src[2], lane3=src[3]).
31%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
32    shufps      %1, %2, 0xEE
33%endmacro
34
35; --------------------------------------------------------------------------
36    SECTION     SEG_CONST
37
; Constant table used by jsimd_idct_float_sse2.  Every PD_* entry is one
; single-precision value replicated four times so that it can be used
; directly as a packed operand of mulps/addps/movaps.
; (alignz is a macro from the included headers, not shown here; presumably
; it aligns with zero fill -- confirm in jsimdext.inc.)
38    alignz      32
39    GLOBAL_DATA(jconst_idct_float_sse2)
40
41EXTN(jconst_idct_float_sse2):
42
; PD_1_414        : sqrt(2); scales the (in2 - in6) and (z11 - z13) terms.
; PD_1_847        : multiplier that forms z5 from (z10 + z12).
; PD_1_082        : multiplier applied to z12 when forming tmp10.
; PD_M2_613       : (negative) multiplier applied to z10 when forming tmp12.
; PD_RNDINT_MAGIC : 100663296.0 = 1.5 * 2^26; added to the pass-2 results so
;                   that the rounded, divided-by-8 integer ends up in the low
;                   16 bits of each float's bit pattern.
; PB_CENTERJSAMP  : CENTERJSAMPLE in every byte; added to the packed signed
;                   bytes with paddb at the end of pass 2.
43PD_1_414        times 4  dd  1.414213562373095048801689
44PD_1_847        times 4  dd  1.847759065022573512256366
45PD_1_082        times 4  dd  1.082392200292393968799446
46PD_M2_613       times 4  dd -2.613125929752753055713286
47PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
48PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
49
50    alignz      32
51
52; --------------------------------------------------------------------------
53    SECTION     SEG_TEXT
54    BITS        64
55;
56; Perform dequantization and inverse DCT on one block of coefficients.
57;
58; GLOBAL(void)
59; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
60;                       JSAMPARRAY output_buf, JDIMENSION output_col)
61;
62
; Argument registers as used by the function body (the body reads exactly
; these four registers after invoking collect_args):
63; r10 = void *dct_table
64; r11 = JCOEFPTR coef_block
65; r12 = JSAMPARRAY output_buf
66; r13d = JDIMENSION output_col
67
; Stack frame layout, expressed relative to the 16-byte-aligned rbp that the
; function prologue establishes:
;   [rbp + 0]          original_rbp : slot where the prologue stores the
;                                     stack pointer value from before the
;                                     alignment step
;   [rbp - 1*XMMWORD]  wk(1)        \ WK_NUM xmmword scratch slots;
;   [rbp - 2*XMMWORD]  wk(0)        / wk(0) is the lowest address
;   below wk(0)        workspace    : DCTSIZE2 FAST_FLOAT values holding the
;                                     pass-1 results consumed by pass 2
; Note that wk() refers to WK_NUM before WK_NUM is defined; this works
; because the %define bodies are only expanded at the point of use.
68%define original_rbp  rbp + 0
69%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
70                                        ; xmmword wk[WK_NUM]
71%define WK_NUM        2
72%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
73                                        ; FAST_FLOAT workspace[DCTSIZE2]
74
; --------------------------------------------------------------------------
; jsimd_idct_float_sse2
;
; Dequantizes one block of DCT coefficients and applies a floating-point
; inverse DCT in two passes:
;   Pass 1 reads coef_block and dct_table four columns at a time, runs the
;          1-D IDCT on them in parallel, and stores the results (transposed
;          in 4x4 tiles) into the on-stack workspace.
;   Pass 2 reads the workspace four lines at a time, runs the same 1-D IDCT,
;          converts to integer samples, and writes 8 bytes to each of four
;          output rows at column offset output_col.
;
; Inputs are taken from r10/r11/r12/r13d after collect_args (a macro from
; the included headers, not shown here; it is expected to place the four
; C arguments in those registers -- confirm in jsimdext.inc).
;
; Registers written by this code: rax, rcx, rdx, rsi, rdi, rbx (saved and
; restored), xmm0-xmm7, flags.  rbp is used as the aligned frame base and
; is restored before returning.
; --------------------------------------------------------------------------
75    align       32
76    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
77
78EXTN(jsimd_idct_float_sse2):
    ; Prologue: build a 16-byte-aligned frame.  The pre-alignment stack
    ; pointer (which points at the saved rbp) is kept at [rbp + 0] so the
    ; epilogue can recover it with "pop rsp".
79    push        rbp
80    mov         rax, rsp                     ; rax = rsp after push (-> saved rbp)
81    sub         rsp, byte 4
82    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
83    mov         [rsp], rax
84    mov         rbp, rsp                     ; rbp = aligned rbp
    ; Move rsp below the wk[] slots and the workspace array so that later
    ; pushes do not overlap them.
85    lea         rsp, [workspace]
86    collect_args 4
    ; rbx is used as a row pointer in pass 2; it is restored before return.
87    push        rbx
88
89    ; ---- Pass 1: process columns from input, store into work array.
90
91    mov         rdx, r10                ; quantptr
92    mov         rsi, r11                ; inptr
93    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
94    mov         rcx, DCTSIZE/4          ; ctr
    ; Each iteration handles four adjacent columns (one movq per row).
95.columnloop:
96%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Shortcut for columns whose AC terms are all zero.  A cheap early-out
    ; first checks one dword from row 1 and one from row 2; only if both
    ; are zero is the full test on rows 1..7 performed.
    ; (DWBLOCK/MMBLOCK/XMMBLOCK are addressing macros from the included
    ; headers, not shown here.)
97    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
98    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
99    jnz         near .columnDCT
100
    ; Full test: OR together rows 1..7 of the four columns.
101    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
102    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
103    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
104    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
105    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
106    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
107    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
108    por         xmm1, xmm2
109    por         xmm3, xmm4
110    por         xmm5, xmm6
111    por         xmm1, xmm3
112    por         xmm5, xmm7
113    por         xmm1, xmm5
    ; Signed saturation turns every nonzero word into a nonzero byte, so
    ; the low dword of the packed result is zero iff all four OR-ed words
    ; are zero.  movd into eax zero-extends, so testing rax is equivalent
    ; to testing eax.
114    packsswb    xmm1, xmm1
115    movd        eax, xmm1
116    test        rax, rax
117    jnz         short .columnDCT
118
119    ; -- AC terms all zero
120
    ; Dequantize only the DC terms, then broadcast each column's value to
    ; all eight workspace positions of that column.
121    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
122
    ; Duplicating each word and arithmetic-shifting right by
    ; (DWORD_BIT-WORD_BIT) sign-extends the four words to dwords.
123    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
124    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
125    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
126
127    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
128
129    movaps      xmm1, xmm0
130    movaps      xmm2, xmm0
131    movaps      xmm3, xmm0
132
133    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
134    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
135    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
136    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
137
138    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
139    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
140    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
141    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
142    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
143    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
144    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
145    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
146    jmp         near .nextcolumn
147%endif
148.columnDCT:
149
150    ; -- Even part
151
    ; Load rows 0, 2, 4, 6 of the four columns, sign-extend to dwords,
    ; convert to float and multiply by the matching dct_table rows.
152    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
153    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
154    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
155    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
156
157    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
158    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
159    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
160    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
161    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
162    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
163
164    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
165    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
166    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
167    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
168    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
169    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
170
171    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
172    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
175
176    movaps      xmm4, xmm0
177    movaps      xmm5, xmm1
178    subps       xmm0, xmm2              ; xmm0=tmp11
179    subps       xmm1, xmm3
180    addps       xmm4, xmm2              ; xmm4=tmp10
181    addps       xmm5, xmm3              ; xmm5=tmp13
182
183    mulps       xmm1, [rel PD_1_414]
184    subps       xmm1, xmm5              ; xmm1=tmp12
185
186    movaps      xmm6, xmm4
187    movaps      xmm7, xmm0
188    subps       xmm4, xmm5              ; xmm4=tmp3
189    subps       xmm0, xmm1              ; xmm0=tmp2
190    addps       xmm6, xmm5              ; xmm6=tmp0
191    addps       xmm7, xmm1              ; xmm7=tmp1
192
    ; All eight xmm registers are needed for the odd part, so tmp2/tmp3
    ; are spilled to the wk[] scratch slots.
193    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
194    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
195
196    ; -- Odd part
197
198    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
199    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
200    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
201    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
202
203    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
204    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
205    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
206    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
207    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
208    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
209
210    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
211    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
212    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
213    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
214    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
215    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
216
217    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
218    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
221
222    movaps      xmm4, xmm2
223    movaps      xmm0, xmm5
224    addps       xmm2, xmm1              ; xmm2=z11
225    addps       xmm5, xmm3              ; xmm5=z13
226    subps       xmm4, xmm1              ; xmm4=z12
227    subps       xmm0, xmm3              ; xmm0=z10
228
229    movaps      xmm1, xmm2
230    subps       xmm2, xmm5
231    addps       xmm1, xmm5              ; xmm1=tmp7
232
233    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
234
235    movaps      xmm3, xmm0
236    addps       xmm0, xmm4
237    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
238    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
239    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
240    addps       xmm3, xmm0              ; xmm3=tmp12
241    subps       xmm4, xmm0              ; xmm4=tmp10
242
243    ; -- Final output stage
244
245    subps       xmm3, xmm1              ; xmm3=tmp6
246    movaps      xmm5, xmm6
247    movaps      xmm0, xmm7
248    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
249    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
250    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
251    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
252    subps       xmm2, xmm3              ; xmm2=tmp5
253
    ; The results are transposed in 4x4 tiles before being stored, so that
    ; pass 2 can again load four lines per register with aligned movaps.
254    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
255    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
256    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
257    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
258    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
259    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
260
    ; Swap roles of the scratch slots: reload tmp2/tmp3, then reuse the
    ; same slots to park the half-transposed data6/data7 pairs.
261    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
262    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
263
264    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
265    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
266
267    addps       xmm4, xmm2              ; xmm4=tmp4
268    movaps      xmm0, xmm7
269    movaps      xmm3, xmm5
270    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
271    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
272    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
273    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
274
275    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
276    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
277    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
278    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
279    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
280    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
281
282    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
283    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
284    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
285    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
286    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
287    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
288
289    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
290    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
291
292    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
293    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
294    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
295    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
296
297    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
298    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
299    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
300    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
301    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
302    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
303
304    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
305    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
306    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
307    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
308
    ; Advance to the next group of four columns: four coefficients and four
    ; multipliers to the right in the inputs, four full lines further in
    ; the workspace.
309.nextcolumn:
310    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
311    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
312    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
313    dec         rcx                                    ; ctr
314    jnz         near .columnloop
315
316    ; -- Prefetch the next coefficient block
317
    ; rsi has advanced by DCTSIZE coefficients in the loop above, so with
    ; DCTSIZE == 8 (assumed; defined in the included headers) the base
    ; address below equals coef_block + DCTSIZE2*SIZEOF_JCOEF, i.e. the
    ; first byte after the block just processed.
318    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
319    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
320    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
321    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
322
323    ; ---- Pass 2: process rows from work array, store into output array.
324
    ; NOTE(review): the value loaded into rax on the next line is never
    ; read; rax is overwritten by "mov eax, r13d" three lines below.  This
    ; looks like a harmless leftover -- confirm before removing.
325    mov         rax, [original_rbp]
326    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
327    mov         rdi, r12                ; (JSAMPROW *)
    ; rax = output_col; the 32-bit move zero-extends into the full register,
    ; which is then used as an index in the stores at the end of the loop.
328    mov         eax, r13d
329    mov         rcx, DCTSIZE/4          ; ctr
    ; Each iteration produces four output rows of eight samples.
330.rowloop:
331
332    ; -- Even part
333
334    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
335    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
336    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
337    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
338
339    movaps      xmm4, xmm0
340    movaps      xmm5, xmm1
341    subps       xmm0, xmm2              ; xmm0=tmp11
342    subps       xmm1, xmm3
343    addps       xmm4, xmm2              ; xmm4=tmp10
344    addps       xmm5, xmm3              ; xmm5=tmp13
345
346    mulps       xmm1, [rel PD_1_414]
347    subps       xmm1, xmm5              ; xmm1=tmp12
348
349    movaps      xmm6, xmm4
350    movaps      xmm7, xmm0
351    subps       xmm4, xmm5              ; xmm4=tmp3
352    subps       xmm0, xmm1              ; xmm0=tmp2
353    addps       xmm6, xmm5              ; xmm6=tmp0
354    addps       xmm7, xmm1              ; xmm7=tmp1
355
356    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
357    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
358
359    ; -- Odd part
360
361    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
362    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
363    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
364    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
365
366    movaps      xmm4, xmm2
367    movaps      xmm0, xmm5
368    addps       xmm2, xmm1              ; xmm2=z11
369    addps       xmm5, xmm3              ; xmm5=z13
370    subps       xmm4, xmm1              ; xmm4=z12
371    subps       xmm0, xmm3              ; xmm0=z10
372
373    movaps      xmm1, xmm2
374    subps       xmm2, xmm5
375    addps       xmm1, xmm5              ; xmm1=tmp7
376
377    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
378
379    movaps      xmm3, xmm0
380    addps       xmm0, xmm4
381    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
382    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
383    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
384    addps       xmm3, xmm0              ; xmm3=tmp12
385    subps       xmm4, xmm0              ; xmm4=tmp10
386
387    ; -- Final output stage
388
389    subps       xmm3, xmm1              ; xmm3=tmp6
390    movaps      xmm5, xmm6
391    movaps      xmm0, xmm7
392    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
393    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
394    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
395    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
396    subps       xmm2, xmm3              ; xmm2=tmp5
397
    ; Float -> integer conversion by magic-number addition.
    ; PD_RNDINT_MAGIC is 1.5 * 2^26; for sums that stay in [2^26, 2^27) the
    ; spacing between adjacent floats is 8, so the addition itself rounds
    ; data to a multiple of 8 and leaves round(data/8) in the low 16 bits
    ; of each float's bit pattern (as a two's-complement word).  This
    ; relies on the current SSE rounding mode being round-to-nearest.
    ; Results for even columns keep that low word in place (pand), results
    ; for odd columns are shifted into the high word (pslld), and the two
    ; are merged with por to give eight 16-bit values per register.
398    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
399    pcmpeqd     xmm3, xmm3
400    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
401
402    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
403    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
404    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
405    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
406
407    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
408    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
409    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
410    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
411    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
412    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
413
414    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
415    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
416
417    addps       xmm4, xmm2              ; xmm4=tmp4
418    movaps      xmm7, xmm1
419    movaps      xmm5, xmm3
420    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
421    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
422    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
423    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
424
    ; Same conversion as above for data2..data5; the magic constant and the
    ; word mask are rebuilt because their registers were reused.
425    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
426    pcmpeqd     xmm4, xmm4
427    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
428
429    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
430    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
431    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
432    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
433
434    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
435    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
436    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
437    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
438    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
439    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
440
441    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
442
    ; Narrow the words to bytes with signed saturation (this is the range
    ; clamp), then add CENTERJSAMPLE to every byte.  paddb is a wrapping
    ; add, which re-centers the signed bytes into the output sample range.
443    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
444    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
445    paddb       xmm6, xmm2
446    paddb       xmm1, xmm2
447
    ; Interleave words, then dwords, so that each 64-bit half of a register
    ; holds the eight samples of one output row.
448    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
449    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
450    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
451
452    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
453    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
454    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
455
    ; movq can only store the low 64 bits, so swap the halves into fresh
    ; registers to get rows 1 and 3 into a low half as well.
456    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
457    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
458
    ; Fetch four row pointers from output_buf and store eight samples to
    ; each at column offset rax (= output_col): rows 0 and 2 first, then
    ; rows 1 and 3.  (rdxp/rbxp are register-name macros from the included
    ; headers, not shown here; the stores address through rdx/rbx.)
459    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
460    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
461    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
462    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
463    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
464    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
465    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
466    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
467
    ; Next iteration: four floats further into each workspace line and four
    ; row pointers further into output_buf.
468    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
469    add         rdi, byte 4*SIZEOF_JSAMPROW
470    dec         rcx                            ; ctr
471    jnz         near .rowloop
472
    ; Epilogue: undo the prologue in reverse order.  "pop rsp" reloads the
    ; pre-alignment stack pointer that the prologue stored at [rbp + 0],
    ; after which the caller's rbp can be popped normally.
473    pop         rbx
474    uncollect_args 4
475    mov         rsp, rbp                ; rsp <- aligned rbp
476    pop         rsp                     ; rsp <- value saved at [original_rbp]
477    pop         rbp
478    ret
479
480; Pad the end of the section to a 32-byte boundary.  For some reason, the
481; OS X linker does not honor the request to align the segment unless we do this.
482    align       32
483