xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jcsample-avx2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1;
2; jcsample.asm - downsampling (64-bit AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2015, Intel Corporation.
7; Copyright (C) 2018, Matthias Räncker.
8;
9; Based on the x86 SIMD extension for IJG JPEG library
10; Copyright (C) 1999-2006, MIYASAKA Masaru.
11; For conditions of distribution and use, see copyright notice in jsimdext.inc
12;
13; This file should be assembled with NASM (Netwide Assembler),
14; can *not* be assembled with Microsoft's MASM or any compatible
15; assembler (including Borland's Turbo Assembler).
16; NASM is available from http://nasm.sourceforge.net/ or
17; http://sourceforge.net/project/showfiles.php?group_id=6208
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22    SECTION     SEG_TEXT
23    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Processing steps:
;   1. output_cols = width_in_blocks * 8.  Return immediately if it is zero.
;   2. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      replicate the sample at column image_width-1 into columns
;      image_width .. 2*output_cols-1 (skipped when there is nothing to pad).
;   3. For each of v_samp_factor rows:
;        out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; The SIMD loop produces 32 output samples per iteration.  A final partial
; group of 8, 16 or 24 output samples is handled by loading only the valid
; input bytes and zero-filling the rest; the store is still a full YMMWORD,
; so the bytes past output_cols in that group are written as zero.
; NOTE(review): this assumes each output row has room for output_cols rounded
; up to a multiple of 32 bytes -- confirm against the row allocator.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        rbp
    ; NOTE(review): rax is loaded with the entry stack pointer before
    ; collect_args (defined in jsimdext.inc, not visible here); presumably the
    ; macro uses it to reach stack-passed arguments.  Keep this ordering.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; zero blocks: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to pad per row
    jle         short .expand_end       ; image already fills the padded width

    ; max_v_samp_factor is a 32-bit int.  Only the low 32 bits of its argument
    ; register are defined by the calling convention, so read r11d (which
    ; zero-extends into rax) and test it as a signed 32-bit value.
    mov         eax, r11d               ; rax = row counter
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must move forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to hold the fill sample
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; fill rcx bytes at rdi with al

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (words 0, 1)
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data row-pointer cursor
    push        rsi                     ; input_data row-pointer cursor

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    ; 24 outputs left: 48 valid input bytes (32 + 16).  The 128-bit VEX load
    ; clears the upper half of ymm1.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    ; 16 outputs left: 32 valid input bytes.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop_r8:
    ; 8 outputs left: 16 valid input bytes.  The 128-bit VEX load clears the
    ; upper half of ymm0.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; Split each 16-bit lane into its two bytes (one horizontal sample pair):
    ; high byte via the shift, low byte via the 0x00FF mask.
    vpsrlw      ymm2, ymm0, BYTE_BIT
    vpand       ymm0, ymm0, ymm6
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; pair sums, first 16 outputs
    vpaddw      ymm1, ymm1, ymm3        ; pair sums, next 16 outputs
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    ; vpackuswb packs within each 128-bit lane, leaving the quadwords in the
    ; order {ymm0.lo, ymm1.lo, ymm0.hi, ymm1.hi}; vpermq 0xd8 swaps the middle
    ; two so the 32 output bytes are in column order.
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24        ; 8/16/24 outputs remain

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; leave the upper YMM halves clean
    uncollect_args 6
    pop         rbp
    ret
184
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Processing steps:
;   1. output_cols = width_in_blocks * 8.  Return immediately if it is zero.
;   2. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      replicate the sample at column image_width-1 into columns
;      image_width .. 2*output_cols-1 (skipped when there is nothing to pad).
;   3. For each of v_samp_factor output rows, consume two input rows:
;        out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; The SIMD loop produces 32 output samples per iteration.  A final partial
; group of 8, 16 or 24 output samples is handled by loading only the valid
; input bytes and zero-filling the rest; the store is still a full YMMWORD,
; so the bytes past output_cols in that group are written as zero.
; NOTE(review): this assumes each output row has room for output_cols rounded
; up to a multiple of 32 bytes -- confirm against the row allocator.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    ; NOTE(review): rax is loaded with the entry stack pointer before
    ; collect_args (defined in jsimdext.inc, not visible here); presumably the
    ; macro uses it to reach stack-passed arguments.  Keep this ordering.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; zero blocks: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to pad per row
    jle         short .expand_end       ; image already fills the padded width

    ; max_v_samp_factor is a 32-bit int.  Only the low 32 bits of its argument
    ; register are defined by the calling convention, so read r11d (which
    ; zero-extends into rax) and test it as a signed 32-bit value.
    mov         eax, r11d               ; rax = row counter
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must move forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to hold the fill sample
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; fill rcx bytes at rdi with al

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    ; Test the 32-bit value: after the zero-extending load a 64-bit test could
    ; never observe a negative count, so jle would only catch zero.
    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (words 1, 2)
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data row-pointer cursor
    push        rsi                     ; input_data row-pointer cursor

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    ; 24 outputs left: 48 valid input bytes per row (32 + 16).  The 128-bit
    ; VEX loads clear the upper halves of ymm2/ymm3.
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    ; 16 outputs left: 32 valid input bytes per row.
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop_r8:
    ; 8 outputs left: 16 valid input bytes per row.  The 128-bit VEX loads
    ; clear the upper halves of ymm0/ymm1.
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD     ; make this the final iteration
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; For each source register, add the low byte (mask) and the high byte
    ; (shift) of every 16-bit lane: one horizontal pair sum per lane.
    vpand       ymm4, ymm0, ymm6
    vpsrlw      ymm0, ymm0, BYTE_BIT
    vpand       ymm5, ymm1, ymm6
    vpsrlw      ymm1, ymm1, BYTE_BIT
    vpaddw      ymm0, ymm0, ymm4        ; row 0 pair sums, first 16 outputs
    vpaddw      ymm1, ymm1, ymm5        ; row 1 pair sums, first 16 outputs

    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4        ; row 0 pair sums, next 16 outputs
    vpaddw      ymm3, ymm3, ymm5        ; row 1 pair sums, next 16 outputs

    vpaddw      ymm0, ymm0, ymm1        ; 2x2 sums, first 16 outputs
    vpaddw      ymm2, ymm2, ymm3        ; 2x2 sums, next 16 outputs
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    ; vpackuswb packs within each 128-bit lane, leaving the quadwords in the
    ; order {ymm0.lo, ymm2.lo, ymm0.hi, ymm2.hi}; vpermq 0xd8 swaps the middle
    ; two so the 32 output bytes are in column order.
    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24        ; 8/16/24 outputs remain

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data (two rows consumed)
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; leave the upper YMM halves clean
    uncollect_args 6
    pop         rbp
    ret
364
365; For some reason, the OS X linker does not honor the request to align the
366; segment unless we do this.
367    align       32
368