1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "asm_amd64.h"
7#include "textflag.h"
8
// func Compare(a, b []byte) int
// Returns -1 if a < b, 0 if a == b, +1 if a > b (lexicographic, unsigned bytes).
// Thin ABIInternal shim: remaps the register-passed slice headers into the
// register convention expected by cmpbody and tail-jumps there.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	// Order matters: SI (b_len) must be copied out to DX before SI is
	// overwritten with a_base.
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)
19
// func cmpstring(a, b string) int
// String comparison entry point for the runtime; same result convention as
// Compare (-1/0/+1). Remaps ABIInternal string-header registers into
// cmpbody's convention and tail-jumps there.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	// Order matters: DI (b_len) must be copied out to DX before DI is
	// overwritten with b_base.
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)
29
// Shared core of Compare and cmpstring.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// identical pointers: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
#ifndef hasAVX2
	// Runtime feature dispatch via the internal/cpu package's X86 flags.
	// BUGFIX: symbol was "internalcpu·X86" (package internalcpu, which does
	// not exist); the Unicode division slash makes it internal/cpu.X86.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
#else
	JMP	big_loop_avx2
#endif

	// SSE2 path: compare 16 bytes per iteration. Handles 8..63 bytes,
	// and the <=64-byte tails handed back by the big loops.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 advance the pointers so that the differing
	// 16-byte chunk found by a 64-byte loop sits at offset 0, then fall
	// into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping 8-byte loads of the tail; safe since R8 >= 8 here.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame		// R8 == 0: nothing to compare, order by length

	// load bytes of a into high bytes of SI
	// If an 8-byte load at a might cross a page boundary (low address
	// byte > 0xf8), read the 8 bytes ending at a+R8 instead and shift.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI	// keep only the first R8 bytes, in the high end of SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	RET

	// All compared bytes were equal: result is ordered by length alone.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
#ifndef hasAVX2
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop
#endif

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
238