/*
 * strcmp - compare two strings
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */


/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

#define src1		x0
#define src2		x1
#define result		x0

#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define off1		x5
#define syndrome	x6
#define tmp		x6
#define data3		x7
#define zeroones	x8
#define shift		x9
#define off2		x10

/* On big-endian the early bytes of a word are at the MSB end; on
   little-endian they are at the LSB end.  LS_FW means shifting towards
   the early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
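
/* E.g. an all-ones value shifted by 40 with LS_FW becomes
   0x0000000000ffffff on little-endian (lsr) and 0xffffff0000000000 on
   big-endian (lsl); in both cases the set bits cover the three early
   bytes of the word.  */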

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */
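/* Worked example for a single byte X:
     X = 0x00: (0x00 - 1) & ~(0x00 | 0x7f) = 0xff & 0x80 = 0x80  (flagged)
     X = 0x01: (0x01 - 1) & ~(0x01 | 0x7f) = 0x00 & 0x80 = 0x00
     X = 0x80: (0x80 - 1) & ~(0x80 | 0x7f) = 0x7f & 0x00 = 0x00
   Across a word the subtraction's borrow propagates upwards, so a spurious
   flag can only appear in bytes above a genuine NUL: after it in
   little-endian byte order, but before it in big-endian order, hence the
   byte-reverse on big-endian.  */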


ENTRY (__strcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	sub	off2, src2, src1
	mov	zeroones, REP8_01
	and	tmp, src1, 7
	tst	off2, 7
	b.ne	L(misaligned8)
	cbnz	tmp, L(mutual_align)

	.p2align 4

L(loop_aligned):
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
	rev	tmp, data1
	sub	has_nul, tmp, zeroones
	orr	tmp, tmp, REP8_7f
#else
	sub	has_nul, data1, zeroones
	orr	tmp, data1, REP8_7f
#endif
	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_aligned)
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	rev	data2, data2
#endif
	clz	shift, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, shift
	lsl	data2, data2, shift
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
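	/* Only the sign of the result matters.  If the strings differ before
	   either terminates, the first differing bit is now bit 63, so the
	   operand with that bit set yields the larger extracted byte; if
	   DATA1 ends first, its extracted byte is the NUL (zero) and the
	   result is negative or zero, as required.  */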
	lsr	data1, data1, 56
	sub	result, data1, data2, lsr 56
	ret

	.p2align 4

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
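	/* E.g. if the strings start three bytes past an aligned boundary,
	   SHIFT is 40 and LS_FW turns the all-ones TMP into a mask over the
	   three early bytes; ORing it into both words forces those bytes to
	   0xff so they compare equal and cannot be mistaken for a NUL.  */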
	bic	src1, src1, 7
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
	mov	tmp, -1
	LS_FW	tmp, tmp, shift
	orr	data1, data1, tmp
	orr	data2, data2, tmp
	b	L(start_realigned)

L(misaligned8):
	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
	   checking to make sure that we don't access beyond the end of SRC2.  */
	cbz	tmp, L(src1_aligned)
L(do_misaligned):
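	/* Compare one byte at a time until SRC1 is 8-byte aligned.  The CCMP
	   forces Z clear when DATA1W is zero, so the B.NE below exits the
	   loop either at a mismatch or at the NUL terminator.  */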
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	cmp	data1w, 0
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.ne	L(done)
	tst	src1, 7
	b.ne	L(do_misaligned)

L(src1_aligned):
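	/* SRC1 is now 8-byte aligned but SRC2 is not.  Load the first aligned
	   word of SRC2, force each byte that precedes the real start of SRC2
	   to be non-zero (ZEROONES >> SHIFT sets bit 0 of those bytes), and
	   check that word for a NUL before entering the main loop.  */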
	neg	shift, src2, lsl 3
	bic	src2, src2, 7
	ldr	data3, [src2], 8
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	lsr	tmp, zeroones, shift
	orr	data3, data3, tmp
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	bics	has_nul, has_nul, tmp
	b.ne	L(tail)

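	/* OFF1 addresses the next aligned word of SRC2, which overlaps the
	   tail of the unaligned word at [SRC1, OFF2].  Checking DATA3 for a
	   NUL each iteration ensures no aligned word of SRC2 is read beyond
	   the one containing the terminator.  */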
	sub	off1, src2, src1

	.p2align 4

L(loop_unaligned):
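	/* DATA3: next aligned word of SRC2 (NUL check); DATA2: unaligned word
	   of SRC2 that lines up with DATA1.  Exit when DATA3 contains a NUL
	   or DATA1 differs from DATA2.  */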
	ldr	data3, [src1, off1]
	ldr	data2, [src1, off2]
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	ldr	data1, [src1], 8
	bics	has_nul, has_nul, tmp
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_unaligned)

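	/* HAS_NUL flags bytes of DATA3, which runs ahead of DATA1/DATA2.  The
	   shift below discards the flags for bytes beyond the word just
	   compared and lines up the rest with DATA1/DATA2; if the resulting
	   syndrome is clear, the terminator lies further on and is handled
	   by the tail code.  */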
	lsl	tmp, has_nul, shift
#ifdef __AARCH64EB__
	rev	tmp, tmp
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, tmp
	cbnz	syndrome, L(end)
L(tail):
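	/* The terminator lies in DATA3 beyond the SRC2 bytes already compared.
	   Shift the remaining bytes of DATA3 (and their NUL flags) down so
	   they line up with the word at [SRC1] and form one final syndrome.  */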
	ldr	data1, [src1]
	neg	shift, shift
	lsr	data2, data3, shift
	lsr	has_nul, has_nul, shift
#ifdef __AARCH64EB__
	rev	data2, data2
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
	b	L(end)

L(done):
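	/* DATA1W/DATA2W hold the mismatching bytes (or the NUL and its
	   counterpart); both were zero-extended by LDRB, so a plain subtract
	   gives a correctly signed result.  */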
	sub	result, data1, data2
	ret

END (__strcmp_aarch64)
