xref: /aosp_15_r20/bionic/libc/arch-arm64/oryon/memset-nt.S (revision 8d67ca893c1523eb926b9080dbe4e2ffd2a27ba1)
/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */
32*8d67ca89SAndroid Build Coastguard Worker#include <private/bionic_asm.h>
33*8d67ca89SAndroid Build Coastguard Worker
/* Register aliases used by __memset_aarch64_nt.  */
#define dstin		x0	/* Incoming destination; copied to dst and never written, so it is still the return value.  */
#define val		w1	/* Fill value; only the low 8 bits are used.  */
#define count		x2	/* Bytes still to be written.  */
#define tmp1		x3	/* Scratch.  */
#define tmp1w		w3	/* 32-bit view of tmp1.  */
#define tmp2		x4	/* Scratch, mostly "bytes needed to reach alignment".  */
#define tmp2w		w4	/* 32-bit view of tmp2 (currently unused by __memset_aarch64_nt).  */
#define zva_len_x	x5	/* DC ZVA block length in bytes.  */
#define zva_len		w5	/* 32-bit view of zva_len_x.  */
#define zva_bits_x	x6	/* zva_len_x - 1, used as an alignment mask.  */
#define A_l		x7	/* Fill byte replicated across 64 bits (or zero).  */
#define A_lw		w7	/* 32-bit view of A_l.  */
#define dst		x8	/* Working destination pointer.  */
#define tmp3w		w9	/* Scratch: holds the constant 4 while computing zva_len.  */
#define tmp4		x10	/* Scratch: holds SMALL_BUFFER_SIZE for the size compare.  */

/* Threshold, in KiB, above which non-temporal stores are used: the value is
 * shifted left by 10 before being compared with count, so 96 means 96 KiB.  */
#define SMALL_BUFFER_SIZE	96
50*8d67ca89SAndroid Build Coastguard Worker
/*
 * __memset_aarch64_nt
 *
 * Fill count bytes starting at dstin with the low byte of val.
 *
 * In:       dstin = destination pointer
 *           val   = fill value (only bits 7:0 are used)
 *           count = number of bytes
 * Out:      x0 still holds dstin (it is never written).
 * Clobbers: count, tmp1, tmp2, zva_len_x, zva_bits_x, A_l, dst, tmp3w,
 *           tmp4 and the condition flags.  No stack is used.
 *
 * Strategy:
 *   - count < 64                      : straight-line tail code.
 *   - 64 <= count <= threshold        : 64 bytes per iteration with STP.
 *   - count > threshold               : 64 bytes per iteration with STNP
 *                                       (non-temporal hint), where
 *                                       threshold = SMALL_BUFFER_SIZE << 10.
 *   - fill byte == 0, >= 128 bytes left after 16-byte alignment and
 *     DC ZVA permitted                : zero whole blocks with DC ZVA.
 *
 * count is tested with signed conditions throughout, so a count with
 * bit 63 set is treated as negative and ends up in the tiny-tail path.
 * NOTE(review): presumably unreachable for real object sizes - confirm.
 */
ENTRY(__memset_aarch64_nt)
    mov	dst, dstin		/* Preserve return value.  */
    ands	A_lw, val, #255
    b.eq	.Lzero_mem  /* Fill byte is zero: try the DC ZVA path.  */
    /* Replicate the fill byte into all eight bytes of A_l.  */
    orr	A_lw, A_lw, A_lw, lsl #8
    orr	A_lw, A_lw, A_lw, lsl #16
    orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
    cmp	count, #64
    b.ge	.Lnot_short
.Ltail_maybe_tiny:
    cmp	count, #15
    b.le	.Ltail15tiny
.Ltail63:
    /* Only the low six bits of count are looked at here, so this is also
     * entered with the negative residual left by the 64-byte loops.
     * Bits 5:4 select how many 16-byte stores are needed (0..3).  */
    ands	tmp1, count, #0x30
    b.eq	.Ltail15
    add	dst, dst, tmp1
    cmp	tmp1w, #0x20
    b.eq	1f			/* 0x20: two stores.  */
    b.lt	2f			/* 0x10: one store.  */
    stp	A_l, A_l, [dst, #-48]	/* 0x30: three stores.  */
1:
    stp	A_l, A_l, [dst, #-32]
2:
    stp	A_l, A_l, [dst, #-16]
.Ltail15:
    /* Write the last 16 bytes ending at the true end of the buffer.  This
     * relies on at least 16 bytes having been written before it.  */
    and	count, count, #15
    add	dst, dst, count
    stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
    ret
.Ltail15tiny:
    /* Set up to 15 bytes.  Does not assume earlier memory
       being set.  Bits 3..0 of count select an 8-, 4-, 2- and 1-byte
       store respectively.  */
    tbz	count, #3, 1f
    str	A_l, [dst], #8
1:
    tbz	count, #2, 1f
    str	A_lw, [dst], #4
1:
    tbz	count, #1, 1f
    strh	A_lw, [dst], #2
1:
    tbz	count, #0, 1f
    strb	A_lw, [dst]
1:
    ret
    /* Start .Lnot_short at a 64-byte boundary.
     * NOTE(review): the three-instruction size check below sits ahead of
     * the alignment prologue, which places the STP loop at byte offsets
     * 52..75 from this boundary, i.e. it straddles two 64-byte lines.  The
     * earlier claim that the whole loop fits in one line no longer holds;
     * re-measure before relying on it.  */
    .p2align 6
.Lnot_short:
    /* Use non-temporal stores if count > SMALL_BUFFER_SIZE KiB
     * (SMALL_BUFFER_SIZE << 10 bytes).  */
    mov	tmp4, #SMALL_BUFFER_SIZE
    cmp	count, tmp4, lsl #10
    b.gt	.Lnot_short_nt
    neg	tmp2, dst
    ands	tmp2, tmp2, #15		/* tmp2 = bytes to 16-byte alignment.  */
    b.eq	2f
    /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
     * more than that to set, so we simply store 16 bytes and advance by
     * the amount required to reach alignment.  */
    sub	count, count, tmp2
    stp	A_l, A_l, [dst]
    add	dst, dst, tmp2
    /* There may be fewer than 64 bytes to go now.  */
    cmp	count, #63
    b.le	.Ltail63
2:
    sub	dst, dst, #16		/* Pre-bias.  */
    sub	count, count, #64
1:
    stp	A_l, A_l, [dst, #16]
    stp	A_l, A_l, [dst, #32]
    stp	A_l, A_l, [dst, #48]
    stp	A_l, A_l, [dst, #64]!
    subs	count, count, #64
    b.ge	1b
    /* count is now in [-64, -1]; its low six bits are the bytes left.  */
    tst	count, #0x3f
    add	dst, dst, #16		/* Undo the pre-bias.  */
    b.ne	.Ltail63
    ret

    /* Same structure as above, using STNP.  This path is entered only
     * with count > (SMALL_BUFFER_SIZE << 10), and alignment consumes at
     * most 15 bytes, so at least 64 bytes always remain for the loop and
     * no "short after alignment" check is needed.  */
#if (SMALL_BUFFER_SIZE << 10) < 128
#error "SMALL_BUFFER_SIZE is too small: the non-temporal path assumes at least 64 bytes remain after alignment"
#endif
.Lnot_short_nt:
    neg	tmp2, dst
    ands	tmp2, tmp2, #15		/* tmp2 = bytes to 16-byte alignment.  */
    b.eq	2f
    /* Bring DST to 128-bit (16-byte) alignment: store 16 bytes and
     * advance by the amount required to reach alignment.  */
    sub	count, count, tmp2
    stnp	A_l, A_l, [dst]
    add	dst, dst, tmp2
2:
    sub	dst, dst, #16		/* Pre-bias.  */
    sub	count, count, #64
1:
    stnp	A_l, A_l, [dst, #16]
    stnp	A_l, A_l, [dst, #32]
    stnp	A_l, A_l, [dst, #48]
    stnp	A_l, A_l, [dst, #64]
    add	dst, dst, #64		/* STNP has no writeback form.  */
    subs	count, count, #64
    b.ge	1b
    /* count is now in [-64, -1]; its low six bits are the bytes left.  */
    tst	count, #0x3f
    add	dst, dst, #16		/* Undo the pre-bias.  */
    b.ne	.Ltail63
    ret

.Lzero_mem:
    mov	A_l, #0
    cmp	count, #63
    b.le	.Ltail_maybe_tiny
    /* Align dst to 16 bytes with one (possibly overlapping) store.  */
    neg	tmp2, dst
    ands	tmp2, tmp2, #15
    b.eq	1f
    sub	count, count, tmp2
    stp	A_l, A_l, [dst]
    add	dst, dst, tmp2
    cmp	count, #63
    b.le	.Ltail63
1:
    /* For zeroing small amounts of memory, it's not worth setting up
     * the line-clear code.  */
    cmp	count, #128
    b.lt	.Lnot_short
    mrs	tmp1, dczid_el0
    tbnz	tmp1, #4, .Lnot_short	/* Bit 4 set: do not use DC ZVA.  */
    /* zva_len = 4 << (DCZID_EL0 & 15): block length in bytes.  */
    mov	tmp3w, #4
    and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
    lsl	zva_len, tmp3w, zva_len
.Lzero_by_line:
    /* Compute how far we need to go to become suitably aligned.  We're
     * already at quad-word alignment.  */
    cmp	count, zva_len_x
    b.lt	.Lnot_short		/* Not enough to reach alignment.  */
    sub	zva_bits_x, zva_len_x, #1
    neg	tmp2, dst
    ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes to the next block boundary.  */
    b.eq	1f			/* Already aligned.  */
    /* Not aligned, check that there's enough to zero after alignment:
     * fall back unless tmp1 >= 64 and tmp1 >= zva_len.  When tmp1 < 64 the
     * ccmp forces NZCV=0b1000, which makes the b.lt below taken.  */
    sub	tmp1, count, tmp2
    cmp	tmp1, #64
    ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
    b.lt	.Lnot_short
    /* We know that there's at least 64 bytes to zero and that it's safe
     * to overrun by 64 bytes.  */
    mov	count, tmp1
2:
    stp	A_l, A_l, [dst]
    stp	A_l, A_l, [dst, #16]
    stp	A_l, A_l, [dst, #32]
    subs	tmp2, tmp2, #64
    stp	A_l, A_l, [dst, #48]
    add	dst, dst, #64
    b.ge	2b
    /* We've overrun a bit, so adjust dst downwards (tmp2 is negative).  */
    add	dst, dst, tmp2
1:
    sub	count, count, zva_len_x	/* Pre-bias for the loop test.  */
3:
    dc	zva, dst
    add	dst, dst, zva_len_x
    subs	count, count, zva_len_x
    b.ge	3b
    /* Low bits of count are the bytes left after the last whole block.  */
    ands	count, count, zva_bits_x
    b.ne	.Ltail_maybe_long
    ret
END(__memset_aarch64_nt)
219