/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

#include "asmdefs.h"

#define srcin	x0
#define len	x0

#define src	x1
#define data1	x2
#define data2	x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1	x4
#define tmp2	x5
#define tmp3	x6
#define tmp4	x7
#define zeroones x8

#define maskv	v0
#define maskd	d0
#define dataq1	q1
#define dataq2	q2
#define datav1	v1
#define datav2	v2
#define tmp	x2
#define tmpw	w2
#define synd	x3
#define syndw	w3
#define shift	x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

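/* The byte-wise test written out in C (an illustrative sketch, not part
   of the original source; requires <stdint.h>):

     uint64_t
     has_zero_byte (uint64_t x)
     {
       return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
     }

   Each zero byte of x sets the 0x80 bit of the corresponding result
   byte.  For example, a little-endian load of "abc" followed by NULs
   gives x = 0x0000000000636261, and the result 0x8080808080000000
   flags bytes 3..7 - exactly the zero bytes.  */
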
/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

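/* For example (an illustrative command, not taken from this project's
   build system):

     gcc -c -DTEST_PAGE_CROSS strlen.S

   With MIN_PAGE_SIZE forced to 32, the page cross check below fails for
   every input that is not 32-byte aligned, so L(page_cross) runs on
   almost every call.  */
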
/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length; if not, continue in the main loop.  */

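/* The flow above as rough C pseudocode (illustrative only; helper names
   are hypothetical stand-ins for the assembly sequences below):

     size_t strlen (const char *s)
     {
       if ((uintptr_t) s % MIN_PAGE_SIZE > MIN_PAGE_SIZE - 32)
         goto page_cross;                 // unaligned 32-byte read unsafe
       if (nul_in (s, 0, 16))  return nul_offset (s, 0);
       if (nul_in (s, 16, 16)) return 16 + nul_offset (s, 16);
       p = align_down (s, 32);
     loop:
       do p += 32; while (!nul_in (p, 0, 32));   // SIMD main loop
       return (p - s) + nul_offset (p, 0);
     page_cross:
       read 32 bytes from align_down (s, 32) and mask off bytes before s;
       if a NUL remains, return its offset from s;
       p = align_down (s, 32); goto loop;
     }
*/
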
ENTRY (__strlen_aarch64)
	PTR_ARG (0)
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	rev	has_nul1, has_nul1
	csel	len, xzr, len, cc
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3
	ret

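	/* How the block above converts syndromes to a length (explanatory
	   note): it is entered with C = (has_nul1 == 0), so "cc" (carry
	   clear) selects has_nul1 and a base offset of 0 when the NUL is
	   in the first word, and has_nul2 with a base of 8 otherwise.
	   rev byte-reverses the chosen syndrome so that clz counts 8 bits
	   per NUL-free byte, and "lsr 3" converts bits to bytes.  E.g. a
	   NUL at offset 11 gives base 8 and clz 24: len = 8 + 24/8 = 11.  */
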
	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc
	add	len, len, tmp1, lsr 3
	ret

	nop
L(loop_entry):
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

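	/* Loop NUL detection (explanatory note): uminp pairwise-reduces
	   the 32 loaded bytes to 16 and then 8 byte-minima; a NUL byte
	   anywhere forces the corresponding minimum to zero.  cmeq turns
	   zero minima into 0xff bytes, so after the fmov, synd is non-zero
	   iff the 32-byte block contains a NUL.  */
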
	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2
	ret

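	/* Syndrome format (explanatory note): cmeq leaves 16 bytes of
	   0x00/0xff in maskv.  shrn by 4 shifts each 16-bit lane right by
	   four and narrows it to 8 bits, compressing the mask into a
	   64-bit value with one nibble per data byte.  After rbit (on
	   little-endian), clz returns 4 * (index of the first NUL byte),
	   which the final "lsr 2" scales back down to bytes.  */
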
L(page_cross):
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1
	ret

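	/* Page-cross syndrome (explanatory note): the replicated constant
	   0xc0300c03 gives each byte of a 4-byte group a disjoint 2-bit
	   field (0x03, 0x0c, 0x30, 0xc0).  The two addp steps then fold
	   the 32 masked compare results into a 64-bit syndrome with 2 bits
	   per byte, in string order.  Shifting right by 2 * (srcin % 32)
	   (the lsl by 1; lsr uses only the low 6 bits of shift) discards
	   bytes before the string start, and rbit + clz + "lsr 1" turn the
	   first set bit into a byte offset from srcin.  */
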
END (__strlen_aarch64)