/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
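
/* A rough C-level sketch of the structure described above.  This is an
   illustrative assumption, not part of the routine: the three size classes
   are collapsed to a plain byte loop (byte_copy is a stand-in for the
   specialised paths below), and the overlap test matches the unsigned
   comparison performed at L(copy_long).  Unlike this sketch, the real small
   and medium paths need no overlap check because they load all source bytes
   before storing any.

     #include <stddef.h>
     #include <stdint.h>

     static void byte_copy (unsigned char *d, const unsigned char *s, size_t n)
     {
       for (size_t i = 0; i < n; i++)
         d[i] = s[i];
     }

     void *sketch_memmove (void *dstin, const void *src, size_t count)
     {
       unsigned char *d = dstin;
       const unsigned char *s = src;
       if ((uintptr_t) d - (uintptr_t) s < count)
         {
           // Destination starts inside the source: copy backwards.
           for (size_t i = count; i-- > 0; )
             d[i] = s[i];
           return dstin;
         }
       if (count <= 32)            // small copies: 0..32 bytes
         byte_copy (d, s, count);
       else if (count <= 128)      // medium copies: 33..128 bytes
         byte_copy (d, s, count);
       else                        // large copies: 64 bytes per loop iteration
         byte_copy (d, s, count);
       return dstin;
     }
*/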

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
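	/* A hedged C rendering of the trick below, treating the register
	   aliases as byte pointers: the first byte, the byte at count/2 and
	   the last byte are loaded and then stored, which covers every length
	   from 1 to 3 without branching on the exact count:

	     unsigned char a = src[0], b = src[count >> 1], c = srcend[-1];
	     dstin[0] = a;  dstin[count >> 1] = b;  dstend[-1] = c;
	*/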
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
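	/* 32 bytes are loaded from the start and 32 from the end of the source
	   before anything is stored, so overlapping buffers are still copied
	   correctly.  For lengths below 64 the two pairs of stores overlap in
	   the middle, which is harmless since they write identical data;
	   L(copy128) adds another 32 or 64 bytes from the middle for longer
	   copies.  */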
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
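	/* tmp1 = dstin - src as an unsigned value: it is below count exactly
	   when the destination starts inside the source buffer, in which case
	   a forward copy would overwrite source bytes before reading them.  */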
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
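	/* dst is rounded down to a 16-byte boundary and src is moved by the
	   same amount, so the loop's stores are aligned while its loads may
	   remain unaligned.  The first 16 bytes are stored at the original
	   dstin, so the loop can start at dst + 16 without leaving a gap, and
	   count grows by the misalignment so that it measures dstend - dst.  */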

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
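	/* The loop exits with A..D holding the last 64 loaded bytes not yet
	   stored.  Those pending stores are written below, then the final 64
	   bytes are copied from srcend to dstend; the two sets of stores may
	   overlap, which is harmless since both write the same data.  */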
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
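	/* Mirror of the forward prologue: dstend is rounded down to a 16-byte
	   boundary and srcend moves by the same amount, so the loop's stores
	   are aligned.  The last 16 bytes are stored at the original dstend
	   first, covering the bytes above the new dstend, and count shrinks by
	   the misalignment so that it measures the new dstend - dstin.  */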
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)