/*
 * memset - fill memory with a constant
 *
 * Copyright (c) 2010-2021, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   Written by Dave Gilbert <david.gilbert@linaro.org>

   This memset routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.

 */
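
/*
   Rough C-level sketch of the strategy used below, for reference only.
   It is not part of the build, takes aliasing/alignment liberties, and
   the name memset_sketch is invented for this comment; the assembly
   below is the actual implementation.

	#include <stddef.h>
	#include <stdint.h>

	static void *memset_sketch (void *dst, int ch, size_t n)
	{
	  unsigned char *p = dst;
	  unsigned char c = (unsigned char) ch;

	  while (n && ((uintptr_t) p & 7))   // byte-fill up to 8-byte alignment
	    {
	      *p++ = c;
	      n--;
	    }

	  if (n >= 16)
	    {
	      uint32_t w = c * 0x01010101u;  // replicate the byte across a word

	      do                             // 16 bytes per iteration (the stmia loop)
		{
		  ((uint32_t *) p)[0] = w;
		  ((uint32_t *) p)[1] = w;
		  ((uint32_t *) p)[2] = w;
		  ((uint32_t *) p)[3] = w;
		  p += 16;
		  n -= 16;
		}
	      while (n >= 16);

	      if (n & 8)                     // one conditional 8-byte store
		{
		  ((uint32_t *) p)[0] = w;
		  ((uint32_t *) p)[1] = w;
		  p += 8;
		  n -= 8;
		}
	    }

	  while (n--)                        // trailing bytes
	    *p++ = c;

	  return dst;
	}
 */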

	.syntax unified
	.arch armv7-a

@ 2011-08-30 david.gilbert@linaro.org
@    Extracted from local git 2f11b436

@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
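@ Note: CHARTSTMASK is not referenced anywhere in this routine; it appears to
@ be shared boilerplate carried over from the sibling Linaro string routines.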
	.thumb

@ ---------------------------------------------------------------------------
	.thumb_func
	.align 2
	.p2align 4,,15
	.global __memset_arm
	.type __memset_arm,%function
__memset_arm:
	@ r0 = address
	@ r1 = character
	@ r2 = count
	@ returns original address in r0
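	@ i.e. the standard memset(dest, ch, count) contract; r3 is used
	@ below as the running write pointer so r0 can be returned unchanged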

	mov	r3, r0		@ Leave r0 alone
	cbz	r2, 10f		@ Exit if 0 length

	tst	r0, #7
	beq	2f		@ Already aligned

	@ Ok, so we're misaligned here
1:
	strb	r1, [r3], #1
	subs	r2,r2,#1
	tst	r3, #7
	cbz	r2, 10f		@ Exit if we hit the end
	bne	1b		@ go round again if still misaligned
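	@ Note: the flags from the subs are overwritten by the tst, hence the
	@ cbz for the length check: cbz tests r2 directly and does not touch
	@ the flags, so the Z result of the tst is what the bne sees.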

2:
	@ OK, so we're aligned
	push	{r4,r5,r6,r7}
	bics	r4, r2, #15	@ if less than 16 bytes then need to finish it off
	beq	5f
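	@ The bics above clears the low four bits, so r4 = count rounded down
	@ to a multiple of 16 (the amount covered by the 16-byte store loop),
	@ and the beq skips straight to the tail at 5: when that is zero.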

3:
	@ POSIX says that ch is cast to an unsigned char.  A uxtb is two
	@ bytes and takes two cycles, where an AND is four bytes but one
	@ cycle.
	and	r1, #0xFF
	orr	r1, r1, r1, lsl#8	@ Same character into all bytes
	orr	r1, r1, r1, lsl#16
	mov	r5,r1
	mov	r6,r1
	mov	r7,r1
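	@ r1, r5, r6 and r7 now all hold the fill character replicated into
	@ every byte lane, so each stmia below stores 16 identical bytes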

4:
	subs	r4,r4,#16
	stmia	r3!,{r1,r5,r6,r7}
	bne	4b
	and	r2,r2,#15

	@ At this point we're still aligned and we have up to align-1 (15) bytes left to write;
	@ we can avoid some of the byte-at-a-time stores by testing for an 8-byte chunk
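	@ The itt ne below makes the next two instructions conditional (a
	@ Thumb-2 IT block): when bit 3 of the remaining count is set, one
	@ extra 8-byte store is done.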
	tst	r2,#8
	itt	ne
	subne	r2,r2,#8
	stmiane	r3!,{r1,r5}

5:
	pop	{r4,r5,r6,r7}
	cbz	r2, 10f

	@ Store any last few (< alignment) bytes one at a time
6:
	subs	r2,r2,#1
	strb	r1,[r3],#1
	bne	6b

10:
	bx	lr		@ goodbye
	.size	__memset_arm, . - __memset_arm