/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a
   shift-right-and-narrow-by-4 instruction (shrn). Since the bits in the
   nibble mask reflect the order in which things occur in the original
   string, counting leading zeros (after an rbit on little-endian)
   identifies exactly which byte matched.  */
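
/* As an illustrative aid only (not part of the build), the nibble-mask
   computation corresponds roughly to this C/NEON-intrinsics sketch,
   assuming <arm_neon.h> and little-endian; nul_syndrome is a
   hypothetical helper name:

     #include <arm_neon.h>
     #include <stdint.h>

     static inline uint64_t
     nul_syndrome (const uint8_t *chunk16)
     {
       uint8x16_t data = vld1q_u8 (chunk16);
       // 0xff in every lane that holds a NUL byte, 0x00 elsewhere.
       uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
       // shrn: shift each 16-bit lane right by 4 and narrow to 8 bits,
       // compressing the 128-bit compare result into a 64-bit mask
       // with 4 bits per input byte, in string order.
       uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
       return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
     }

   On little-endian, the index of the first NUL in the chunk is then
   __builtin_ctzll (nul_syndrome (p)) >> 2.  */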

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	bic	src, srcin, 15		/* Align the first load down to 16 bytes.  */
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2		/* Syndrome has 4 bits per byte.  */
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bytes before srcin.  */
	cbnz	synd, L(tail)

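/* A minimal C sketch of this first-chunk probe (illustrative only,
   little-endian, reusing the hypothetical nul_syndrome helper above):
   the load is aligned down so it cannot cross a 16-byte granule, which
   keeps it MTE-safe, and the syndrome bits belonging to bytes before
   the real string start are shifted out.

     uint64_t first_probe (const char *srcin)
     {
       const uint8_t *src =
	 (const uint8_t *) ((uintptr_t) srcin & ~(uintptr_t) 15);
       uint64_t synd = nul_syndrome (src);
       // AArch64 register shifts use the amount mod 64, which is what
       // lets the assembly use srcin << 2 directly as the shift count.
       return synd >> (((uintptr_t) srcin & 15) * 4);
     }
 */
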
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2	/* len is the index of the NUL.  */
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy two (possibly overlapping) 16-byte blocks,
	   the second ending exactly at the NUL.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

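/* The tail paths below all use the same overlapping-pair trick at
   smaller widths. A hedged C sketch of the 16-byte case (copy_tail is
   a hypothetical name; len is the index of the NUL, 16 <= len <= 31):

     #include <string.h>

     static void copy_tail (char *dst, const char *src, size_t len)
     {
       memcpy (dst, src, 16);				// head block
       memcpy (dst + len - 15, src + len - 15, 16);	// ends at the NUL
     }

   Bytes in the middle may be written twice, which is cheaper than any
   branchy byte-exact copy.  */
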
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2	/* len is the index of the NUL.  */
L(less16):
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: overlapping 8-byte pair.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 3 <= len <= 7: overlapping 4-byte pair.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	cbz	len, L(zerobyte)
	/* 1 <= len <= 2: copy 2 bytes; the NUL store below fixes up.  */
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

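/* In the main loop the per-chunk NUL test only needs a yes/no answer,
   so instead of the shrn syndrome it uses umaxp to fold the 128-bit
   compare result into 64 bits for a single scalar branch; the precise
   position is recomputed with shrn once the loop exits. Illustrative
   C sketch (chunk_has_nul is a hypothetical name):

     static inline int chunk_has_nul (uint8x16_t data)
     {
       uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
       // umaxp: pairwise max folds 128 bits to 64 meaningful bits.
       uint8x16_t folded = vpmaxq_u8 (cmp, cmp);
       return vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0) != 0;
     }
 */
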
	.p2align 4
L(start_loop):
	sub	tmp, srcin, dstin	/* Delta for the final back-copy.  */
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	/* Unrolled 2x16-byte loop; umaxp gives a cheap any-NUL test.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	/* Final overlapping 16-byte copy whose last byte is the NUL.  */
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret
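
/* The ending sequence above is the overlapping trick once more: when
   the NUL's position is known, one last 16-byte block ending exactly
   at the NUL is copied, with tmp = srcin - dstin rederiving the source
   address from dst. Roughly, in C (illustrative, hypothetical name):

     #include <stddef.h>
     #include <string.h>

     static char *loop_end_copy (char *dst, ptrdiff_t delta)
     {
       // dst points 15 bytes before where the NUL lands in the
       // destination; dst + delta is the matching source address.
       memcpy (dst, dst + delta, 16);	// last byte copied is the NUL
       return dst + 15;	// stpcpy's result; strcpy returns dstin
     }
 */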

END (STRCPY)