xref: /aosp_15_r20/bionic/libc/arch-x86_64/string/avx2-memset-kbl.S (revision 8d67ca893c1523eb926b9080dbe4e2ffd2a27ba1)
1*8d67ca89SAndroid Build Coastguard Worker/*
2*8d67ca89SAndroid Build Coastguard WorkerCopyright (c) 2014, Intel Corporation
3*8d67ca89SAndroid Build Coastguard WorkerAll rights reserved.
4*8d67ca89SAndroid Build Coastguard Worker
5*8d67ca89SAndroid Build Coastguard WorkerRedistribution and use in source and binary forms, with or without
6*8d67ca89SAndroid Build Coastguard Workermodification, are permitted provided that the following conditions are met:
7*8d67ca89SAndroid Build Coastguard Worker
8*8d67ca89SAndroid Build Coastguard Worker    * Redistributions of source code must retain the above copyright notice,
9*8d67ca89SAndroid Build Coastguard Worker    * this list of conditions and the following disclaimer.
10*8d67ca89SAndroid Build Coastguard Worker
11*8d67ca89SAndroid Build Coastguard Worker    * Redistributions in binary form must reproduce the above copyright notice,
12*8d67ca89SAndroid Build Coastguard Worker    * this list of conditions and the following disclaimer in the documentation
13*8d67ca89SAndroid Build Coastguard Worker    * and/or other materials provided with the distribution.
14*8d67ca89SAndroid Build Coastguard Worker
15*8d67ca89SAndroid Build Coastguard Worker    * Neither the name of Intel Corporation nor the names of its contributors
16*8d67ca89SAndroid Build Coastguard Worker    * may be used to endorse or promote products derived from this software
17*8d67ca89SAndroid Build Coastguard Worker    * without specific prior written permission.
18*8d67ca89SAndroid Build Coastguard Worker
19*8d67ca89SAndroid Build Coastguard WorkerTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20*8d67ca89SAndroid Build Coastguard WorkerANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21*8d67ca89SAndroid Build Coastguard WorkerWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22*8d67ca89SAndroid Build Coastguard WorkerDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23*8d67ca89SAndroid Build Coastguard WorkerANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24*8d67ca89SAndroid Build Coastguard Worker(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25*8d67ca89SAndroid Build Coastguard WorkerLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26*8d67ca89SAndroid Build Coastguard WorkerANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27*8d67ca89SAndroid Build Coastguard Worker(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28*8d67ca89SAndroid Build Coastguard WorkerSOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*8d67ca89SAndroid Build Coastguard Worker*/
30*8d67ca89SAndroid Build Coastguard Worker
31*8d67ca89SAndroid Build Coastguard Worker#include <private/bionic_asm.h>
32*8d67ca89SAndroid Build Coastguard Worker
33*8d67ca89SAndroid Build Coastguard Worker
34*8d67ca89SAndroid Build Coastguard Worker#ifndef L
35*8d67ca89SAndroid Build Coastguard Worker# define L(label)	.L##label
36*8d67ca89SAndroid Build Coastguard Worker#endif
37*8d67ca89SAndroid Build Coastguard Worker
38*8d67ca89SAndroid Build Coastguard Worker#ifndef ALIGN
39*8d67ca89SAndroid Build Coastguard Worker# define ALIGN(n)	.p2align n
40*8d67ca89SAndroid Build Coastguard Worker#endif
41*8d67ca89SAndroid Build Coastguard Worker
42*8d67ca89SAndroid Build Coastguard Worker	.section .text.avx2,"ax",@progbits
43*8d67ca89SAndroid Build Coastguard Worker
/*
 * __memset_chk_avx2 -- length-checked front end for memset_avx2.
 *
 * In:  %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
 *
 * When n > dst_len (unsigned compare) control transfers to
 * __memset_chk_fail.  Otherwise there is deliberately no ret: execution
 * runs off the end of this function into the code that follows it, so
 * memset_avx2 has to stay the next function emitted.  None of the
 * argument registers are modified here.
 */
ENTRY(__memset_chk_avx2)
	cmpq	%rcx, %rdx		// flags <- n - dst_len
	ja	__memset_chk_fail	// n > dst_len: report the overflow
	// In bounds: continue straight into memset_avx2.
END(__memset_chk_avx2)
50*8d67ca89SAndroid Build Coastguard Worker
/*
 * memset_avx2 -- fill n bytes at dst with a byte value.
 *
 * In:    %rdi = dst, %rsi = byte (only the low 8 bits are used), %rdx = n
 * Out:   %rax = dst
 * Uses:  %rcx, %rdx, %rsi, %r8, %xmm0/%ymm0, flags
 *
 * Strategy by size:
 *   n <  16   branch on the bits of n; 8/4/2-byte sizes are written with
 *             two (possibly overlapping) scalar stores, one anchored at
 *             the start and one at the end of the buffer.
 *   n <= 128  pairs of unaligned 16-byte stores anchored at both ends.
 *   n <= 256  additionally 32-byte stores covering bytes 64..127 from
 *             each end.
 *   n >  256  the first and last 128 bytes are written as above, then the
 *             128-byte-aligned interior is filled 128 bytes per iteration.
 *             When n is above __x86_shared_cache_size the interior uses
 *             non-temporal stores, followed by sfence.
 */
ENTRY(memset_avx2)
	movq	%rdi, %rax			// return value is dst
	and	$0xff, %rsi			// keep only the fill byte
	mov	$0x0101010101010101, %rcx
	imul	%rsi, %rcx			// %rcx = fill byte repeated 8 times
	cmpq	$16, %rdx
	jae	L(16bytesormore)

	// n < 16: the highest set bit of n selects the store width.
	testb	$8, %dl
	jnz	L(8_15bytes)
	testb	$4, %dl
	jnz	L(4_7bytes)
	testb	$2, %dl
	jnz	L(2_3bytes)
	testb	$1, %dl
	jz	1f				// n == 0: nothing to write
	movb	%cl, (%rdi)			// n == 1
1:	ret

	// Each small case stores once at dst and once ending at dst + n;
	// the two stores overlap unless n is exactly twice the store width.
L(8_15bytes):
	movq	%rcx, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

L(4_7bytes):
	movl	%ecx, (%rdi)
	movl	%ecx, -4(%rdi, %rdx)
	ret

L(2_3bytes):
	movw	%cx, (%rdi)
	movw	%cx, -2(%rdi, %rdx)
	ret

	ALIGN (4)
L(16bytesormore):
	movd	%rcx, %xmm0
	pshufd	$0, %xmm0, %xmm0		// broadcast low dword: 16 fill bytes
	movdqu	%xmm0, (%rdi)			// first 16 bytes
	movdqu	%xmm0, -16(%rdi, %rdx)		// last 16 bytes
	cmpq	$32, %rdx
	jbe	L(done)
	movdqu	%xmm0, 16(%rdi)
	movdqu	%xmm0, -32(%rdi, %rdx)
	cmpq	$64, %rdx
	jbe	L(done)
	movdqu	%xmm0, 32(%rdi)
	movdqu	%xmm0, 48(%rdi)
	movdqu	%xmm0, -64(%rdi, %rdx)
	movdqu	%xmm0, -48(%rdi, %rdx)
	cmpq	$128, %rdx
	jbe	L(done)
	// From here on only VEX-encoded vector instructions are used, so the
	// upper half of %ymm0 is never dirty while a legacy SSE op executes.
	vpbroadcastb %xmm0, %ymm0		// widen the pattern to 32 bytes
	vmovdqu	%ymm0, 64(%rdi)
	vmovdqu	%ymm0, 96(%rdi)
	vmovdqu	%ymm0, -128(%rdi, %rdx)
	vmovdqu	%ymm0, -96(%rdi, %rdx)
	cmpq	$256, %rdx
	jbe	L(done)

	// n > 256: bytes [dst, dst+128) and [dst+n-128, dst+n) are already
	// written.  Compute the 128-byte-aligned interior [%rcx, %rdx).
	ALIGN (4)
	leaq	128(%rdi), %rcx
	andq	$-128, %rcx			// %rcx = (dst + 128) rounded down to 128
	movq	%rdx, %r8			// %r8  = n, kept for the cache-size test
	addq	%rdi, %rdx
	andq	$-128, %rdx			// %rdx = (dst + n) rounded down to 128
	cmpq	%rcx, %rdx
	je	L(done)				// empty interior

	// n above the shared cache size: use non-temporal stores.
	cmp	__x86_shared_cache_size(%rip), %r8
	ja	L(non_temporal_loop)

	ALIGN (4)
L(normal_loop):
	// %rcx is 128-byte aligned, so the aligned store form is safe.
	vmovdqa	%ymm0, (%rcx)
	vmovdqa	%ymm0, 32(%rcx)
	vmovdqa	%ymm0, 64(%rcx)
	vmovdqa	%ymm0, 96(%rcx)
	addq	$128, %rcx
	cmpq	%rcx, %rdx
	jne	L(normal_loop)
	jmp	L(done)

	ALIGN (4)
L(non_temporal_loop):
	// Four 32-byte VEX non-temporal stores per 128-byte chunk.  They
	// write the same bytes as eight 16-byte movntdq stores would, but do
	// not mix legacy SSE encodings with the dirty upper half of %ymm0.
	// vmovntdq needs 32-byte alignment, which the 128-byte-aligned %rcx
	// guarantees.
	vmovntdq %ymm0, (%rcx)
	vmovntdq %ymm0, 32(%rcx)
	vmovntdq %ymm0, 64(%rcx)
	vmovntdq %ymm0, 96(%rcx)
	leaq	128(%rcx), %rcx
	cmpq	%rcx, %rdx
	jne	L(non_temporal_loop)
	// We used non-temporal stores, so we need a fence here.
	sfence

L(done):
	// We used the ymm registers, and that can break SSE2 performance
	// unless you do this.  The paths that only touched %xmm0 also land
	// here; they execute the same vzeroupper before returning.
	vzeroupper
	ret

END(memset_avx2)
157