xref: /aosp_15_r20/external/arm-optimized-routines/string/aarch64/strncmp.S (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li/*
2*412f47f9SXin Li * strncmp - compare two strings
3*412f47f9SXin Li *
4*412f47f9SXin Li * Copyright (c) 2013-2022, Arm Limited.
5*412f47f9SXin Li * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li */
7*412f47f9SXin Li
8*412f47f9SXin Li/* Assumptions:
9*412f47f9SXin Li *
10*412f47f9SXin Li * ARMv8-a, AArch64.
11*412f47f9SXin Li * MTE compatible.
12*412f47f9SXin Li */
13*412f47f9SXin Li
14*412f47f9SXin Li#include "asmdefs.h"
15*412f47f9SXin Li
/* REP8_01/REP8_7f replicate 0x01/0x7f into every byte of a 64-bit
   register; they implement the word-at-a-time NUL test
   (X - 1) & ~(X | 0x7f) used throughout this file.  */
16*412f47f9SXin Li#define REP8_01 0x0101010101010101
17*412f47f9SXin Li#define REP8_7f 0x7f7f7f7f7f7f7f7f
18*412f47f9SXin Li
19*412f47f9SXin Li/* Parameters and result.  */
20*412f47f9SXin Li#define src1		x0
21*412f47f9SXin Li#define src2		x1
22*412f47f9SXin Li#define limit		x2
23*412f47f9SXin Li#define result		x0
24*412f47f9SXin Li
25*412f47f9SXin Li/* Internal variables.  */
26*412f47f9SXin Li#define data1		x3
27*412f47f9SXin Li#define data1w		w3
28*412f47f9SXin Li#define data2		x4
29*412f47f9SXin Li#define data2w		w4
30*412f47f9SXin Li#define has_nul		x5
31*412f47f9SXin Li#define diff		x6
32*412f47f9SXin Li#define syndrome	x7
33*412f47f9SXin Li#define tmp1		x8
34*412f47f9SXin Li#define tmp2		x9
35*412f47f9SXin Li#define tmp3		x10
36*412f47f9SXin Li#define zeroones	x11
37*412f47f9SXin Li#define pos		x12
38*412f47f9SXin Li#define mask		x13
39*412f47f9SXin Li#define endloop		x14
/* count/offset alias mask/pos: the aliased pairs are used in disjoint
   phases of the algorithm, so they can share registers.  */
40*412f47f9SXin Li#define count		mask
41*412f47f9SXin Li#define offset		pos
42*412f47f9SXin Li#define neg_offset	x15
43*412f47f9SXin Li
44*412f47f9SXin Li/* Define endian dependent shift operations.
45*412f47f9SXin Li   On big-endian early bytes are at MSB and on little-endian LSB.
46*412f47f9SXin Li   LS_FW means shifting towards early bytes.
47*412f47f9SXin Li   LS_BK means shifting towards later bytes.
48*412f47f9SXin Li   */
49*412f47f9SXin Li#ifdef __AARCH64EB__
50*412f47f9SXin Li#define LS_FW lsl
51*412f47f9SXin Li#define LS_BK lsr
52*412f47f9SXin Li#else
53*412f47f9SXin Li#define LS_FW lsr
54*412f47f9SXin Li#define LS_BK lsl
55*412f47f9SXin Li#endif
56*412f47f9SXin Li
/*----------------------------------------------------------------------
   int strncmp (const char *s1, const char *s2, size_t n)

   In:       x0 = s1, x1 = s2, x2 = n  (AAPCS64).
   Out:      x0 = 0 if the strings are equal in the first n bytes,
             otherwise the difference of the first mismatching bytes,
             compared as unsigned chars.
   Clobbers: x3-x15 and the condition flags; no stack is used.
  ----------------------------------------------------------------------*/
57*412f47f9SXin LiENTRY (__strncmp_aarch64)
58*412f47f9SXin Li	PTR_ARG (0)
59*412f47f9SXin Li	PTR_ARG (1)
60*412f47f9SXin Li	SIZE_ARG (2)
61*412f47f9SXin Li	cbz	limit, L(ret0)	/* n == 0 compares equal.  */
62*412f47f9SXin Li	eor	tmp1, src1, src2
63*412f47f9SXin Li	mov	zeroones, #REP8_01
64*412f47f9SXin Li	tst	tmp1, #7	/* Are SRC1/SRC2 mutually 8-byte aligned?  */
65*412f47f9SXin Li	and	count, src1, #7	/* Bytes past the last 8-byte boundary.  */
66*412f47f9SXin Li	b.ne	L(misaligned8)
67*412f47f9SXin Li	cbnz	count, L(mutual_align)
68*412f47f9SXin Li
69*412f47f9SXin Li	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
70*412f47f9SXin Li	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
71*412f47f9SXin Li	   can be done in parallel across the entire word.  */
72*412f47f9SXin Li	.p2align 4
73*412f47f9SXin LiL(loop_aligned):
74*412f47f9SXin Li	ldr	data1, [src1], #8
75*412f47f9SXin Li	ldr	data2, [src2], #8
76*412f47f9SXin LiL(start_realigned):
77*412f47f9SXin Li	subs	limit, limit, #8
78*412f47f9SXin Li	sub	tmp1, data1, zeroones
79*412f47f9SXin Li	orr	tmp2, data1, #REP8_7f
80*412f47f9SXin Li	eor	diff, data1, data2	/* Non-zero if differences found.  */
81*412f47f9SXin Li	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
82*412f47f9SXin Li	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
83*412f47f9SXin Li	ccmp	endloop, #0, #0, eq
	/* Loop while limit remains, no NUL was seen and no byte differs.  */
84*412f47f9SXin Li	b.eq	L(loop_aligned)
85*412f47f9SXin Li	/* End of main loop */
86*412f47f9SXin Li
87*412f47f9SXin LiL(full_check):
88*412f47f9SXin Li#ifndef __AARCH64EB__
89*412f47f9SXin Li	orr	syndrome, diff, has_nul
90*412f47f9SXin Li	add	limit, limit, 8	/* Rewind limit to before last subs. */
	/* Also entered directly from the unaligned loops (little-endian
	   only; big-endian has its own L(syndrome_check) below).  */
91*412f47f9SXin LiL(syndrome_check):
92*412f47f9SXin Li	/* Limit was reached. Check if the NUL byte or the difference
93*412f47f9SXin Li	   is before the limit. */
94*412f47f9SXin Li	rev	syndrome, syndrome
95*412f47f9SXin Li	rev	data1, data1
96*412f47f9SXin Li	clz	pos, syndrome	/* Bit index of the first marked byte.  */
97*412f47f9SXin Li	rev	data2, data2
98*412f47f9SXin Li	lsl	data1, data1, pos
99*412f47f9SXin Li	cmp	limit, pos, lsr #3	/* Is that byte within the limit?  */
100*412f47f9SXin Li	lsl	data2, data2, pos
101*412f47f9SXin Li	/* But we need to zero-extend (char is unsigned) the value and then
102*412f47f9SXin Li	   perform a signed 32-bit subtraction.  */
103*412f47f9SXin Li	lsr	data1, data1, #56
104*412f47f9SXin Li	sub	result, data1, data2, lsr #56
105*412f47f9SXin Li	csel result, result, xzr, hi	/* NUL/diff beyond limit: equal.  */
106*412f47f9SXin Li	ret
107*412f47f9SXin Li#else
108*412f47f9SXin Li	/* Not reached the limit, must have found the end or a diff.  */
109*412f47f9SXin Li	tbz	limit, #63, L(not_limit)
	/* limit went negative: only limit + 8 bytes of this dword count.  */
110*412f47f9SXin Li	add	tmp1, limit, 8
111*412f47f9SXin Li	cbz	limit, L(not_limit)	/* Exactly consumed: all 8 valid.  */
112*412f47f9SXin Li
113*412f47f9SXin Li	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
114*412f47f9SXin Li	mov	mask, #~0
115*412f47f9SXin Li	lsr	mask, mask, limit	/* Mask of the bytes past the limit (BE).  */
116*412f47f9SXin Li	bic	data1, data1, mask
117*412f47f9SXin Li	bic	data2, data2, mask
118*412f47f9SXin Li
119*412f47f9SXin Li	/* Make sure that the NUL byte is marked in the syndrome.  */
120*412f47f9SXin Li	orr	has_nul, has_nul, mask
121*412f47f9SXin Li
122*412f47f9SXin LiL(not_limit):
123*412f47f9SXin Li	/* For big-endian we cannot use the trick with the syndrome value
124*412f47f9SXin Li	   as carry-propagation can corrupt the upper bits if the trailing
125*412f47f9SXin Li	   bytes in the string contain 0x01.  */
126*412f47f9SXin Li	/* However, if there is no NUL byte in the dword, we can generate
127*412f47f9SXin Li	   the result directly.  We can't just subtract the bytes as the
128*412f47f9SXin Li	   MSB might be significant.  */
129*412f47f9SXin Li	cbnz	has_nul, 1f
130*412f47f9SXin Li	cmp	data1, data2
131*412f47f9SXin Li	cset	result, ne	/* result = (data1 != data2) ? 1 : 0.  */
132*412f47f9SXin Li	cneg	result, result, lo	/* Negate if data1 < data2 (unsigned).  */
133*412f47f9SXin Li	ret
134*412f47f9SXin Li1:
135*412f47f9SXin Li	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
136*412f47f9SXin Li	rev	tmp3, data1
137*412f47f9SXin Li	sub	tmp1, tmp3, zeroones
138*412f47f9SXin Li	orr	tmp2, tmp3, #REP8_7f
139*412f47f9SXin Li	bic	has_nul, tmp1, tmp2
140*412f47f9SXin Li	rev	has_nul, has_nul
141*412f47f9SXin Li	orr	syndrome, diff, has_nul
142*412f47f9SXin Li	clz	pos, syndrome
143*412f47f9SXin Li	/* The most-significant-non-zero bit of the syndrome marks either the
144*412f47f9SXin Li	   first bit that is different, or the top bit of the first zero byte.
145*412f47f9SXin Li	   Shifting left now will bring the critical information into the
146*412f47f9SXin Li	   top bits.  */
147*412f47f9SXin LiL(end_quick):
148*412f47f9SXin Li	lsl	data1, data1, pos
149*412f47f9SXin Li	lsl	data2, data2, pos
150*412f47f9SXin Li	/* But we need to zero-extend (char is unsigned) the value and then
151*412f47f9SXin Li	   perform a signed 32-bit subtraction.  */
152*412f47f9SXin Li	lsr	data1, data1, #56
153*412f47f9SXin Li	sub	result, data1, data2, lsr #56
154*412f47f9SXin Li	ret
155*412f47f9SXin Li#endif
156*412f47f9SXin Li
157*412f47f9SXin LiL(mutual_align):
158*412f47f9SXin Li	/* Sources are mutually aligned, but are not currently at an
159*412f47f9SXin Li	   alignment boundary.  Round down the addresses and then mask off
160*412f47f9SXin Li	   the bytes that precede the start point.
161*412f47f9SXin Li	   We also need to adjust the limit calculations, but without
162*412f47f9SXin Li	   overflowing if the limit is near ULONG_MAX.  */
163*412f47f9SXin Li	bic	src1, src1, #7
164*412f47f9SXin Li	bic	src2, src2, #7
165*412f47f9SXin Li	ldr	data1, [src1], #8
166*412f47f9SXin Li	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
167*412f47f9SXin Li	ldr	data2, [src2], #8
168*412f47f9SXin Li	mov	tmp2, #~0
169*412f47f9SXin Li	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
170*412f47f9SXin Li	/* Adjust the limit and ensure it doesn't overflow.  */
171*412f47f9SXin Li	adds	limit, limit, count
172*412f47f9SXin Li	csinv	limit, limit, xzr, lo	/* Saturate to all-ones on carry.  */
	/* Force the bytes that precede the start point to be identical and
	   non-NUL in both words, so they cannot trigger an early exit.  */
173*412f47f9SXin Li	orr	data1, data1, tmp2
174*412f47f9SXin Li	orr	data2, data2, tmp2
175*412f47f9SXin Li	b	L(start_realigned)
176*412f47f9SXin Li
177*412f47f9SXin Li	.p2align 4
178*412f47f9SXin Li	/* Don't bother with dwords for up to 16 bytes.  */
179*412f47f9SXin LiL(misaligned8):
180*412f47f9SXin Li	cmp	limit, #16
181*412f47f9SXin Li	b.hs	L(try_misaligned_words)
182*412f47f9SXin Li
183*412f47f9SXin LiL(byte_loop):
184*412f47f9SXin Li	/* Perhaps we can do better than this.  */
185*412f47f9SXin Li	ldrb	data1w, [src1], #1
186*412f47f9SXin Li	ldrb	data2w, [src2], #1
187*412f47f9SXin Li	subs	limit, limit, #1
	/* Continue while limit remains (hi), the SRC1 byte is not NUL
	   (cs, i.e. data1w >= 1) and the bytes are equal (eq).  */
188*412f47f9SXin Li	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
189*412f47f9SXin Li	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
190*412f47f9SXin Li	b.eq	L(byte_loop)
191*412f47f9SXin LiL(done):
	/* Bytes were loaded with ldrb, so this is the unsigned-char diff.  */
192*412f47f9SXin Li	sub	result, data1, data2
193*412f47f9SXin Li	ret
194*412f47f9SXin Li	/* Align the SRC1 to a dword by doing a bytewise compare and then do
195*412f47f9SXin Li	   the dword loop.  */
196*412f47f9SXin LiL(try_misaligned_words):
197*412f47f9SXin Li	cbz	count, L(src1_aligned)
198*412f47f9SXin Li
	/* count = bytes until SRC1 reaches an 8-byte boundary.  */
199*412f47f9SXin Li	neg	count, count
200*412f47f9SXin Li	and	count, count, #7
201*412f47f9SXin Li	sub	limit, limit, count	/* limit >= 16 > count: no underflow.  */
202*412f47f9SXin Li
203*412f47f9SXin LiL(page_end_loop):
204*412f47f9SXin Li	ldrb	data1w, [src1], #1
205*412f47f9SXin Li	ldrb	data2w, [src2], #1
206*412f47f9SXin Li	cmp	data1w, #1	/* cs iff the SRC1 byte is not NUL.  */
207*412f47f9SXin Li	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
208*412f47f9SXin Li	b.ne	L(done)
209*412f47f9SXin Li	subs	count, count, #1
210*412f47f9SXin Li	b.hi	L(page_end_loop)
211*412f47f9SXin Li
212*412f47f9SXin Li	/* The following diagram explains the comparison of misaligned strings.
213*412f47f9SXin Li	   The bytes are shown in natural order. For little-endian, it is
214*412f47f9SXin Li	   reversed in the registers. The "x" bytes are before the string.
215*412f47f9SXin Li	   The "|" separates data that is loaded at one time.
216*412f47f9SXin Li	   src1     | a a a a a a a a | b b b c c c c c | . . .
217*412f47f9SXin Li	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
218*412f47f9SXin Li
219*412f47f9SXin Li	   After shifting in each step, the data looks like this:
220*412f47f9SXin Li	                STEP_A              STEP_B              STEP_C
221*412f47f9SXin Li	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
222*412f47f9SXin Li	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
223*412f47f9SXin Li
224*412f47f9SXin Li	   The bytes with "0" are eliminated from the syndrome via mask.
225*412f47f9SXin Li
226*412f47f9SXin Li	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
227*412f47f9SXin Li	   time from SRC2. The comparison happens in 3 steps. After each step
228*412f47f9SXin Li	   the loop can exit, or read from SRC1 or SRC2. */
229*412f47f9SXin LiL(src1_aligned):
230*412f47f9SXin Li	/* Calculate offset from 8 byte alignment to string start in bits. No
231*412f47f9SXin Li	   need to mask offset since shifts are ignoring upper bits. */
232*412f47f9SXin Li	lsl	offset, src2, #3
233*412f47f9SXin Li	bic	src2, src2, #0xf
234*412f47f9SXin Li	mov	mask, -1
235*412f47f9SXin Li	neg	neg_offset, offset
236*412f47f9SXin Li	ldr	data1, [src1], #8
237*412f47f9SXin Li	ldp	tmp1, tmp2, [src2], #16	/* Full 16-byte window of SRC2.  */
238*412f47f9SXin Li	LS_BK	mask, mask, neg_offset
239*412f47f9SXin Li	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
240*412f47f9SXin Li	/* Skip the first compare if data in tmp1 is irrelevant. */
241*412f47f9SXin Li	tbnz	offset, 6, L(misaligned_mid_loop)
242*412f47f9SXin Li
243*412f47f9SXin LiL(loop_misaligned):
244*412f47f9SXin Li	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
245*412f47f9SXin Li	LS_FW	data2, tmp1, offset
246*412f47f9SXin Li	LS_BK	tmp1, tmp2, neg_offset
247*412f47f9SXin Li	subs	limit, limit, #8
248*412f47f9SXin Li	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
249*412f47f9SXin Li	sub	has_nul, data1, zeroones
250*412f47f9SXin Li	eor	diff, data1, data2	/* Non-zero if differences found.  */
251*412f47f9SXin Li	orr	tmp3, data1, #REP8_7f
252*412f47f9SXin Li	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
253*412f47f9SXin Li	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
254*412f47f9SXin Li	orr	tmp3, endloop, has_nul
255*412f47f9SXin Li	cbnz	tmp3, L(full_check)	/* NUL, diff or limit: resolve result.  */
256*412f47f9SXin Li
257*412f47f9SXin Li	ldr	data1, [src1], #8
258*412f47f9SXin LiL(misaligned_mid_loop):
259*412f47f9SXin Li	/* STEP_B: Compare first part of data1 to second part of tmp2. */
260*412f47f9SXin Li	LS_FW	data2, tmp2, offset
261*412f47f9SXin Li#ifdef __AARCH64EB__
262*412f47f9SXin Li	/* For big-endian we do a byte reverse to avoid carry-propagation
263*412f47f9SXin Li	problem described above. This way we can reuse the has_nul in the
264*412f47f9SXin Li	next step and also use syndrome value trick at the end. */
265*412f47f9SXin Li	rev	tmp3, data1
266*412f47f9SXin Li	#define data1_fixed tmp3
267*412f47f9SXin Li#else
268*412f47f9SXin Li	#define data1_fixed data1
269*412f47f9SXin Li#endif
270*412f47f9SXin Li	sub	has_nul, data1_fixed, zeroones
271*412f47f9SXin Li	orr	tmp3, data1_fixed, #REP8_7f
272*412f47f9SXin Li	eor	diff, data2, data1	/* Non-zero if differences found.  */
273*412f47f9SXin Li	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
274*412f47f9SXin Li#ifdef __AARCH64EB__
275*412f47f9SXin Li	rev	has_nul, has_nul
276*412f47f9SXin Li#endif
277*412f47f9SXin Li	cmp	limit, neg_offset, lsr #3
278*412f47f9SXin Li	orr	syndrome, diff, has_nul
279*412f47f9SXin Li	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
280*412f47f9SXin Li	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
281*412f47f9SXin Li	cbnz	tmp3, L(syndrome_check)
282*412f47f9SXin Li
283*412f47f9SXin Li	/* STEP_C: Compare second part of data1 to first part of tmp1. */
284*412f47f9SXin Li	ldp	tmp1, tmp2, [src2], #16	/* Refill the 16-byte SRC2 window.  */
285*412f47f9SXin Li	cmp	limit, #8
286*412f47f9SXin Li	LS_BK	data2, tmp1, neg_offset
287*412f47f9SXin Li	eor	diff, data2, data1	/* Non-zero if differences found.  */
288*412f47f9SXin Li	orr	syndrome, diff, has_nul
289*412f47f9SXin Li	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
290*412f47f9SXin Li	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
291*412f47f9SXin Li	cbnz	tmp3, L(syndrome_check)
292*412f47f9SXin Li
293*412f47f9SXin Li	ldr	data1, [src1], #8
294*412f47f9SXin Li	sub	limit, limit, #8
295*412f47f9SXin Li	b	L(loop_misaligned)
296*412f47f9SXin Li
297*412f47f9SXin Li#ifdef	__AARCH64EB__
	/* Big-endian entry from the unaligned loops: the earliest byte sits
	   in the most-significant bits, so clz of the syndrome directly
	   yields the bit position of the first NUL/difference.  */
298*412f47f9SXin LiL(syndrome_check):
299*412f47f9SXin Li	clz	pos, syndrome
300*412f47f9SXin Li	cmp	pos, limit, lsl #3
301*412f47f9SXin Li	b.lo	L(end_quick)	/* Within the limit: compute the result.  */
302*412f47f9SXin Li#endif
303*412f47f9SXin Li
	/* Reached when n == 0, and by fall-through above when the first
	   NUL/difference lies beyond the limit.  */
304*412f47f9SXin LiL(ret0):
305*412f47f9SXin Li	mov	result, #0
306*412f47f9SXin Li	ret
307*412f47f9SXin LiEND(__strncmp_aarch64)
308*412f47f9SXin Li
309