/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

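/* An illustrative C-level sketch of the case split described above.  The
   function name memcpy_path and its return codes are hypothetical and exist
   only to show the dispatch and overlap logic, not to mirror the assembly
   paths exactly.

	#include <stddef.h>
	#include <stdint.h>

	// Hypothetical sketch: 0 = small (0..32 bytes), 1 = medium (33..128),
	// 2 = large forwards, 3 = large backwards (overlapping).
	static int memcpy_path (const void *dst, const void *src, size_t count)
	{
	  if (count <= 32)
	    return 0;
	  if (count <= 128)
	    return 1;
	  // The unsigned difference is below count only when dst lies inside
	  // [src, src + count), where a forward copy would overwrite source
	  // bytes before reading them.
	  if ((uintptr_t) dst - (uintptr_t) src < count)
	    return 3;
	  return 2;
	}
*/
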
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	cmp	count, 128
	b.hi	L(copy_long)
	add	dstend, dstin, count
	cmp	count, 32
	b.hi	L(copy32_128)
	nop

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
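	/* 16 bytes from the start and 16 bytes from the end; for counts below
	   32 the two stores overlap, which is harmless since both loads are
	   done before either store.  */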
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
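	/* tmp1 = count / 2 picks the "middle" byte: for count 1 all three
	   stores hit byte 0, for count 2 it is the last byte, and for count 3
	   the middle byte, so the three byte stores cover every length
	   without further branches.  */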
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 3
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
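	/* tmp1 = dstin - src is compared as an unsigned value; it is below
	   count only when the destination starts inside the source buffer,
	   in which case a forward copy would overwrite source data before
	   it has been read.  */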
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
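	/* The loop keeps loads one iteration (64 bytes) ahead of the stores,
	   so A_q..D_q always hold data that has not yet been written when the
	   loop exits; L(copy64_from_end) stores them and then covers the tail
	   with 64 bytes loaded from the end of the buffer.  */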
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	.p2align 4
	nop

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
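	/* tmp1 still holds dstin - src from the overlap check; zero means
	   src == dst, so there is nothing to copy.  */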
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)