/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

#ifdef HAVE_SVE

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to two SVE vector
   lengths (at least 32 bytes), medium copies of up to 128 bytes, and large
   copies.  The overhead of the overlap check is negligible since it is only
   required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the end.
*/
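
/* For reference, the size dispatch implemented below corresponds roughly to
   the following C-level sketch.  This is illustrative only: VL stands for the
   SVE vector length in bytes (as read by cntb), and the helper names are
   hypothetical labels for the code paths, not real entry points.

     void *memcpy_sve (void *dstin, const void *src, size_t count)
     {
       if (count > 128)
         copy_long (dstin, src, count);      // forward, or backward on overlap
       else if (count > 2 * VL)
         copy32_128 (dstin, src, count);     // overlapping Q-register copies
       else
         copy_small (dstin, src, count);     // two predicated SVE copies
       return dstin;
     }
*/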

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

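	/* Small copies: at most 2 SVE vector lengths (at least 32 bytes).
	   The two predicates together cover exactly count bytes, so no
	   further branch on count (including count == 0) is needed.  */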
	whilelo p0.b, xzr, count
	whilelo p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap: the unsigned compare of
	   (dstin - src) against count is true only when the destination
	   starts inside the source region.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
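	/* Software-pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous iteration while loading the next 64 bytes, hiding
	   load latency behind the stores.  */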
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

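	/* Backwards variant of the pipelined loop: dstend is decremented by
	   the pre-indexed store writeback, so only srcend needs an explicit
	   update each iteration.  */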
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)

#endif