/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each bit pair, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which things occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */
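
/*
 * As a rough illustration only (not part of the implementation), the
 * per-chunk syndrome computed by the vector code below is equivalent to
 * the following scalar C sketch, assuming <stdint.h> types:
 *
 *	uint64_t
 *	chunk_syndrome (const unsigned char *chunk, unsigned char c)
 *	{
 *		uint64_t synd = 0;
 *		for (int i = 0; i < 32; i++)
 *			if (chunk[i] == c)
 *				synd |= 1ULL << (2 * i);
 *		return synd;
 *	}
 *
 * The index of the first match is then __builtin_ctzll (synd) / 2 when
 * synd is nonzero.
 */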

ENTRY (__memchr_aarch64)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
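	/*
	 * Within each 32-bit lane the constant gives the four bytes the
	 * values 0x01, 0x04, 0x10 and 0x40, so after ANDing it with the
	 * 0xff/0x00 comparison results the two pairwise ADDP reductions
	 * below fold the 32 comparison bytes into the 64-bit syndrome
	 * without any per-byte bits colliding.
	 */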
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * The input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes and
	 * mask off the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
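	/*
	 * cntin -= 32 - soff: drop the buffer bytes covered by this first
	 * block.  The flags set here are consumed by the b.ls below, which
	 * detects when the first block is also the last.
	 */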
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
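	/*
	 * The folded value is nonzero iff some byte in this block matched;
	 * the full syndrome is only computed once we leave the loop.
	 */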
	mov	synd, vend.d[0]
	/* We're not out of data, so loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found; calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
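	/*
	 * The flags still hold the result of the subs above: HS means this
	 * block lies entirely within the buffer, LO means it is the final,
	 * partial block.
	 */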
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp
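	/*
	 * If the buffer ends exactly on a 32-byte boundary the shift amount
	 * is 64; the variable shifts use the amount modulo 64, so nothing
	 * is cleared, which is the desired result in that case.
	 */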

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate for the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
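	/*
	 * synd is now twice the index of the first matching byte in the
	 * block, or 64 if there was no match.
	 */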
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)