/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define	tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6
#define src_offset	x7
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10
#define chr_match	x11

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit0 for the special case of looking for NUL).  Since the bits
   in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination, and why.  */

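/* Illustrative only (not part of the build): a rough C sketch of the
   per-hunk syndrome described above.  The names here are made up for
   exposition; the real code below keeps the character and NUL syndromes
   in separate 64-bit registers (chr_match and nul_match), built with
   NEON compares, the magic masks and pairwise additions.

	uint64_t syndrome = 0;
	for (int i = 0; i < 32; i++)
	  {
	    if (hunk[i] == c)
	      syndrome |= 1ull << (2 * i);   // bit 0 of the pair: chr match
	    if (hunk[i] == '\0')
	      syndrome |= 2ull << (2 * i);   // bit 1 of the pair: NUL match
	  }

   The answer is the highest character-match bit that sits below the
   lowest NUL bit, which is what the tail code extracts with clz.  */
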
ENTRY (__strrchr_aarch64)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
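	/* After ANDing with the magic constants each comparison byte carries
	   at most one distinct bit, so the pairwise additions below merge
	   bytes without losing information: three addp steps fold the 32+32
	   comparison bytes into two 64-bit syndromes, with the NUL syndrome
	   in d[0] and the character syndrome in d[1].  */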
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
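	/* Build a mask of the syndrome bits belonging to the padding bytes
	   before srcin: tmp1 holds the negated misalignment, so doubling it
	   and shifting ~0 right by it (the shift count is taken modulo 64)
	   leaves just the low 2*misalignment bits set.  */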
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
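	/* If the previous hunk contained any matches for the character,
	   remember the address just past that hunk and its syndrome; we
	   keep scanning until a NUL is seen.  */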
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
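	/* A pairwise unsigned minimum of the 32 data bytes is zero iff the
	   hunk contains a NUL, so a single cmeq against zero on the reduced
	   vector detects end-of-string without computing the full NUL
	   syndrome on every iteration.  */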
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, 0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

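	/* The loop only recorded that a NUL lies somewhere in this hunk; now
	   recompute the proper NUL syndrome for it so the tail code can see
	   exactly where the string ends.  */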
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  */
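	/* nul_match ^ (nul_match - 1) is a mask of the syndrome bits at or
	   below the first NUL bit; ANDing it into chr_match discards any
	   character matches that lie beyond the end of the string.  */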
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.  */
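	/* For example, a match in the last byte of the hunk sets bit 62 as
	   the highest syndrome bit, so clz gives 1, (1 + 2) >> 1 is 1, and
	   the result is src_match - 1.  */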
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)
