xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jdsample-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1;
2; jdsample.asm - upsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2018, Matthias Räncker.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17
18%include "jsimdext.inc"
19
; --------------------------------------------------------------------------
; Constant table used by the "fancy" (triangle filter) upsamplers below.
; Every entry is one XMM word: eight identical 16-bit lanes.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE:   dw 1, 1, 1, 1, 1, 1, 1, 1     ; h2v1: bias added to the even outputs
PW_TWO:   dw 2, 2, 2, 2, 2, 2, 2, 2     ; h2v1: bias added to the odd outputs
PW_THREE: dw 3, 3, 3, 3, 3, 3, 3, 3     ; weight of the nearer sample
PW_SEVEN: dw 7, 7, 7, 7, 7, 7, 7, 7     ; h2v2: bias added to the odd outputs
PW_EIGHT: dw 8, 8, 8, 8, 8, 8, 8, 8     ; h2v2: bias added to the even outputs

    alignz      32

; --------------------------------------------------------------------------
; Everything from here on is 64-bit code.
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample in[i] two output samples are produced:
;   out[2i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2i+1] = (3 * in[i] + in[i+1] + 2) >> 2
; At the left edge in[-1] is taken to be in[0]; at the right edge the sample
; following the last one is taken to be a copy of the last one.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Memory access notes (from the code below):
;   * Rows are read and written with movdqa, one XMM word of input and two
;     XMM words of output per column block, so row buffers must be 16-byte
;     aligned and padded up to a whole number of blocks.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;     sample is WRITTEN to the input row at index downsampled_width.
;

; r10 = int max_v_samp_factor       (only the low 32 bits are meaningful)
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the loops:
;   rax = colctr (samples left in the row, rounded up to a block multiple)
;   rcx = rowctr
;   rsi = input row pointer array  / inptr  inside the column loop
;   rdi = output row pointer array / outptr inside the column loop
;   xmm0 = all zeros (used to widen bytes to words)
;   xmm7 = left neighbour carried from the previous block (byte lane 0)
;   xmm6 = right neighbour taken from the next block (byte lane 15)

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = frame base, set up before collect_args
    mov         rbp, rsp
    collect_args 4

    mov         eax, r11d               ; colctr (32-bit move zero-extends)
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of the argument
    ; register is not defined by the calling convention, so widen it
    ; explicitly instead of trusting all 64 bits of r10.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    ; If the row does not end on a block boundary, duplicate the last sample
    ; one position to the right so it serves as its own right-hand neighbour.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm7=sample 0 (left edge)

    ; Round colctr up to a whole number of blocks.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: its right neighbour is its own sample 15.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: the right neighbour is sample 0 of the next block,
    ; moved into byte lane 15.
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)

    ; Carry sample 15 over as the left neighbour of the next block.
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    ; Widen all three vectors from bytes to words.
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]      ; 3 * in[i]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]        ; in[i-1] + 1
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]        ; in[i+1] + 2
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; Interleave: odd results go to the high byte of each word, even results
    ; stay in the low byte.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret
180
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Each input row produces two output rows.  The work is done in two passes:
;   vertical:    Int0[i] = 3 * row[0][i] + row[-1][i]     (upper output row)
;                Int1[i] = 3 * row[0][i] + row[+1][i]     (lower output row)
;   horizontal:  out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; The 16-bit intermediate values are parked in the output rows themselves
; until the horizontal pass overwrites them with the final bytes.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Memory access notes (from the code below):
;   * input_data[-1] and input_data[+1] are dereferenced for every row, so
;     the caller must supply valid row pointers on both sides.
;   * Rows are accessed with movdqa and therefore must be 16-byte aligned and
;     padded to whole column blocks.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;     sample is WRITTEN to each of the three input rows at index
;     downsampled_width.
;

; r10 = int max_v_samp_factor       (only the low 32 bits are meaningful)
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the column loop:
;   rax = colctr       rcx = inptr1(above)   rbx = inptr0
;   rsi = inptr1(below) rdx = outptr0         rdi = outptr1
;
; Scratch slots on the aligned stack frame:
;   wk(0) = left-neighbour word for the upper row
;   wk(1) = left-neighbour word for the lower row
;   wk(2) = right-neighbour word for the upper row
;   wk(3) = right-neighbour word for the lower row

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr (32-bit move zero-extends)
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of the argument
    ; register is not defined by the calling convention, so widen it
    ; explicitly instead of trusting all 64 bits of r10.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the row does not end on a block boundary, duplicate the last sample
    ; of all three input rows one position to the right.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is used as scratch below
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; xmm7=mask selecting word lane 0

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    ; Left edge: the first value acts as its own left neighbour.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    ; Round colctr up to a whole number of blocks.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block
    ; Right edge: the last intermediate value acts as its own right neighbour.

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block
    ; Run the vertical pass one block ahead so that the right-hand neighbour
    ; of the current block is available.

    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4     ; left neighbour for the next block

    pmullw      xmm7, [rel PW_THREE]
    pmullw      xmm3, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm5, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_SEVEN]
    paddw       xmm2, [rel PW_SEVEN]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3     ; left neighbour for the next block

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop            ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    ; One input row yields two output rows.
    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- unaligned rsp saved in the prologue
    pop         rbp
    ret
480
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is simply written twice in a row to the output.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Memory access notes (from the code below):
;   * output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, and that
;     many bytes are written per output row with movdqa (half as many are
;     read per input row), so rows must be 16-byte aligned and padded
;     accordingly.
;

; r10 = int max_v_samp_factor       (only the low 32 bits are meaningful)
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the loops:
;   rdx = padded output width (loop-invariant)
;   rax = colctr (output bytes left in the row)
;   rcx = rowctr
;   rsi = input row pointer array  / inptr  inside the column loop
;   rdi = output row pointer array / outptr inside the column loop

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = frame base, set up before collect_args
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d               ; 32-bit move zero-extends
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of the argument
    ; register is not defined by the calling convention, so widen it
    ; explicitly instead of trusting all 64 bits of r10.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr
    mov         rax, rdx                ; colctr
.columnloop:
    ; The loop body is unrolled twice; each half turns one XMM word of input
    ; into two XMM words of output.

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
    punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret
567
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice in a row, and the resulting row is
; stored to two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Memory access notes (from the code below):
;   * output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, and that
;     many bytes are written to each of the two output rows with movdqa
;     (half as many are read per input row), so rows must be 16-byte aligned
;     and padded accordingly.
;

; r10 = int max_v_samp_factor       (only the low 32 bits are meaningful)
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the loops:
;   rdx = padded output width (loop-invariant)
;   rax = colctr (output bytes left in the row)
;   rcx = rowctr (counts output rows, so it drops by 2 per input row)
;   rsi = input row pointer array  / inptr   inside the column loop
;   rdi = output row pointer array / outptr1 inside the column loop
;   rbx = outptr0 inside the column loop

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = frame base, set up before collect_args
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d               ; 32-bit move zero-extends
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int; the upper half of the argument
    ; register is not defined by the calling convention, so widen it
    ; explicitly instead of trusting all 64 bits of r10.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]                   ; inptr
    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:
    ; The loop body is unrolled twice; each half turns one XMM word of input
    ; into two XMM words that are stored to both output rows.

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
    punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    ; One input row yields two output rows.
    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
666