/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */
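/* As a rough scalar illustration of the syndrome layout above (not part
   of the build; the helper name and types are hypothetical, and the code
   below computes the same value with NEON rather than a byte loop):

	#include <stdint.h>

	static uint64_t
	chunk_syndrome (const unsigned char *chunk, unsigned char c)
	{
	  uint64_t synd = 0;
	  for (int i = 0; i < 16; i++)
	    {
	      uint64_t bits = 0;
	      if (chunk[i] == c)
		bits |= 0x3;		// bits 0-1: character match
	      if (chunk[i] == '\0')
		bits |= 0xc;		// bits 2-3: NUL terminator
	      synd |= bits << (4 * i);
	    }
	  return synd;
	}

   For example, if byte 1 of the chunk equals the character and byte 2
   is the NUL, the low 12 bits of the syndrome are 0xc30.  */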

ENTRY (__strrchr_aarch64_mte)
	PTR_ARG (0)
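	/* For the first chunk, load from the enclosing 16-byte aligned
	   address so the access cannot cross a 16-byte granule, then
	   shift out the syndrome bits (4 per byte) that belong to bytes
	   before the start of the string.  */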
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2_start)

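	/* Main loop: scan 32 bytes per iteration.  The CMHS against the
	   data merges the two tests: its result is non-zero for bytes
	   that matched the character (vhas_chr == 0xff) and for NUL
	   bytes (vdata == 0).  */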
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16
L(loop1_end):
	add	src, src, 16
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)
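	/* The string ends in the chunk whose syndrome is in synd.  Keep
	   only character matches that precede the first NUL and return
	   the address of the last one, or NULL if there is none.  */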
L(tail):
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

	.p2align 4
	nop
	nop
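	/* The character has been seen but the end of the string has not.
	   Remember the most recent chunk and syndrome that contained a
	   match and keep scanning until a NUL is found.  */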
L(loop2_start):
	add	src, src, 16
	bic	vrepmask.8h, 0xf0

L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

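	/* A NUL was found.  Prefer a character match that precedes it in
	   this final chunk; otherwise fall back to the last match
	   recorded by the loop, then compute its address.  */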
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
