/*
 * strchr - find a character in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (the tuple for the least significant byte is
   always in bits 0 and 1, on both big- and little-endian systems).
   For each tuple, bit 0 is set iff the relevant byte matched the
   requested character; bit 1 is set iff the relevant byte matched
   the NUL end of string (we trigger off bit 0 for the special case
   of looking for NUL).  Since the bits in the syndrome reflect
   exactly the order in which things occur in the original string,
   a count_trailing_zeros() operation will identify exactly which
   byte is causing the termination, and why.  */

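/* A minimal scalar C sketch of the same construction, as a reading aid
   only (the names are illustrative, not part of this file); the code
   below derives the same value 32 bytes at a time with NEON:

     #include <stdint.h>

     static uint64_t
     syndrome (const unsigned char *hunk, unsigned char c, int len)
     {
       uint64_t syn = 0;
       for (int i = 0; i < len; i++)        // len <= 32 for a full hunk
         {
           uint64_t match = hunk[i] == c;   // tuple bit 0
           uint64_t nul = hunk[i] == 0;     // tuple bit 1
           syn |= (match | nul << 1) << (2 * i);
         }
       return syn;
     }

   Assuming syn != 0, __builtin_ctzll (syn) >> 1 is the index of the
   terminating byte, and bit 0 of the count says why we stopped: clear
   for a character match, set for end of string.  */
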
/* Locals and temporaries.  */

ENTRY (__strchr_aarch64)
	PTR_ARG (0)
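	/* PTR_ARG comes from asmdefs.h: it expands to nothing for LP64
	   and zero-extends the 32-bit pointer argument for ILP32.  */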
	/* Magic constant 0xc0300c03 to allow us to identify which lane
	   matches the requested byte.  Even bits are set if the character
	   matches, odd bits if either the char is NUL or matches.  */
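	/* Worked layout, per 32-bit lane: 0xc0300c03 has bytes 0x03,
	   0x0c, 0x30, 0xc0, so data byte n owns tuple bits 2n and 2n+1
	   of the folded result, and the doubled mask 0x80601806,
	   intersected with it, marks the odd (NUL) bit of each tuple.
	   This repeating pattern is what lets two addp instructions
	   fold 32 bytes of comparison results into one 64-bit syndrome
	   while keeping the tuples in string order.  */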
	mov	wtmp2, 0x0c03
	movk	wtmp2, 0xc030, lsl 16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
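	/* Only d[0] of the second addp result is ever read, so reusing
	   vend2 to fill the upper half is harmless.  */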
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)
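
	/* The padding mask above is, in illustrative C (with offset =
	   srcin & 31, nonzero on this path):

	     uint64_t mask = ~0ULL >> (64 - 2 * offset);
	     syndrome &= ~mask;

	   Register shifts use only the low 6 bits of the count, so the
	   neg; lsl #1; lsr sequence yields exactly 64 - 2 * offset, and
	   the bic clears one 2-bit tuple per padding byte before the
	   real start of the string.  */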

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
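	/* cmhs (unsigned >=) below is a shortcut: vhas_chr is 0xff on a
	   match, hence >= any data byte, and 0x00 otherwise, hence >=
	   only a NUL byte.  One instruction thus computes "matched or
	   NUL" without a second cmeq against zero.  */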
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
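	/* umaxp folds the 16 bytes pairwise, so vend1.d[0] is nonzero
	   iff any byte of the combined result fired; this keeps the hot
	   loop cheap and defers the full syndrome to the exit path.  */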
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	mov	tmp1, vend1.d[0]
L(tail):
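	/* In illustrative C: with tz = __builtin_ctzll (syndrome), the
	   result is (tz & 1) ? NULL : src - 32 + (tz >> 1).  The
	   rbit + clz pair below is the trailing-zero count, and the
	   final lsr #1 turns a bit-pair index back into a byte index.  */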
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* And counting the leading zeros.  */
	/* Tmp1 is even if the target character was found first.  Otherwise
	   we've found the end of string and we weren't looking for NUL.  */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #1
	csel	result, result, xzr, eq
	ret

END (__strchr_aarch64)
