/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8

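/* Strategy:
 *
 * Sizes below 16 bytes are handled by L(less16) with overlapping
 * power-of-two loads selected by the bits of limit.  Otherwise the
 * first 16 bytes are compared immediately; sizes up to 32 finish
 * with one overlapping 16-byte compare at L(last_bytes), sizes up
 * to 159 use the 32-byte scalar loop L(loop32), and sizes of 160
 * or more use the 64-byte SIMD loop L(loop64).
 */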

ENTRY (__memcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
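	/* The first ccmp tests the flags of the size check above: for
	   limit != 16 it compares the first 8 bytes, while for
	   limit == 16 it forces the flags to "not equal".  The second
	   ccmp compares the next 8 bytes only if the first 8 matched.
	   So the branch is taken if limit == 16 or the first 16 bytes
	   differ; when limit == 16, L(return2) computes the final
	   result directly from the 16 bytes just loaded.  */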
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

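	/* src1end/src2end point one past the last byte, so the final
	   bytes can be compared with loads that overlap data already
	   checked.  Sizes up to 32 need only one more compare; sizes
	   of 160 and up take the SIMD loop.  */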
	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

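	/* Medium sizes (33-159 bytes): compare 32 bytes per iteration.
	   limit was biased by 32, so at either exit point 1-16 unseen
	   bytes remain for L(last_bytes).  */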
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
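	/* data1/data2 and data3/data4 hold two pairs of words; select
	   the first pair that differs (if both pairs match, either
	   pair yields a zero result).  */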
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  On
	   little-endian the byte reversal puts the lowest-addressed
	   bytes into the most significant bits, so an unsigned compare
	   orders the words by the first differing byte, as memcmp
	   requires.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

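	/* Sizes 0-15: test bit 3 of limit; for 8-15 bytes an 8-byte
	   compare from the start and an overlapping 8-byte compare
	   from the end cover all bytes.  */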
	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

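	/* Sizes 4-7: a 4-byte compare from the start and an overlapping
	   4-byte compare from the end.  The w-register loads zero-extend
	   into the x registers, so the 64-bit compares at L(return2)
	   remain valid.  */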
	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

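	/* Sizes 0-3: compare the first two bytes if bit 1 of limit is
	   set, then the final byte if bit 0 is set; a zero size returns
	   0 immediately.  */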
L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

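	/* Sizes of 160 and up: compare bytes 16-31, then advance both
	   pointers by 1-16 bytes so that src2 becomes 16-byte aligned,
	   reducing limit to match (a few bytes may be compared twice).
	   The extra bias of 64 + 16 makes the loop below exit with a
	   1-64 byte tail for the scalar code at L(last64).  */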
L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

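	/* Compare 64 bytes per iteration: XOR the four pairs of 16-byte
	   chunks and fold the results with three pairwise-max (umaxp)
	   steps, leaving in each byte of d0 the maximum over one 8-byte
	   chunk, so tmp is nonzero iff some byte differed.  The ccmp
	   folds in the loop bound: keep looping only while bytes remain
	   (hi from the subs) and tmp is zero.  */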
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.
	   Each byte of tmp covers one 8-byte chunk of the 64 bytes just
	   compared, which lie at offsets -48 to 15 from the updated
	   pointers.  rev + clz locate the first nonzero byte of tmp;
	   reload the corresponding word from both sources and compare
	   it as in L(return).  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)