1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "asm_amd64.h"
7#include "textflag.h"
8
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Register arguments on entry: AX = a, BX = b, CX = size.
// The result is left in AX. When a and b are the same pointer the
// answer is 1 without looking at memory; otherwise the arguments are
// moved into the registers memeqbody expects (SI = a, DI = b,
// BX = size) and control transfers there with a tail jump.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
	CMPQ	AX, BX
	JEQ	same_ptr
	// Pointers differ. b must leave BX before size is moved into it.
	MOVQ	BX, DI	// DI = b
	MOVQ	CX, BX	// BX = size
	MOVQ	AX, SI	// SI = a
	JMP	memeqbody<>(SB)
same_ptr:
	MOVQ	$1, AX	// return 1
	RET
23
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Register arguments on entry: AX = a, BX = b. The byte count is not
// an argument; it is read from 8(DX).
// The result is left in AX. Identical pointers yield 1 immediately;
// otherwise SI = a, DI = b and BX = size are set up and control
// transfers to memeqbody with a tail jump.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	CMPQ	AX, BX
	JEQ	identical
	// Pointers differ. Save b in DI before BX is reused for the size.
	MOVQ	AX, SI	// SI = a
	MOVQ	BX, DI	// DI = b
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	JMP	memeqbody<>(SB)
identical:
	MOVQ	$1, AX	// return 1
	RET
38
// memeqbody compares count bytes at a and b.
//
// Input:
//   a in SI
//   b in DI
//   count in BX
// Output:
//   result in AX (1 if all count bytes match, 0 otherwise; the
//   SETEQ exits write only the low byte of AX)
//
// Strategy, chosen by count:
//   count < 8        -> small:    one (possibly shifted) 8-byte load per side
//   8 <= count < 64  -> bigloop:  8 bytes per iteration, then leftover
//   count >= 64      -> hugeloop: 64 bytes per iteration (SSE or AVX2),
//                                 then bigloop/leftover for the tail
// Registers written: AX, BX, CX, DX, SI, DI, flags, X0-X7 (SSE path),
// Y0-Y6 (AVX2 path).
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
#ifndef hasAVX2
	// AVX2 is not guaranteed at build time: test the CPU feature flag
	// in package internal/cpu. The import-path slash is written as
	// U+2215 (∕) in assembler symbol names.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
	PCALIGN $16
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	// Byte-wise equality of each 16-byte pair: 0xff where equal.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the four results together; any mismatching byte clears a lane.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX	// one mask bit per byte lane
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set => all 64 bytes equal
	JEQ	hugeloop
	XORQ	AX, AX	// return 0
	RET
#endif

	// 64 bytes at a time using ymm registers
	PCALIGN $16
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	// Byte-wise equality of each 32-byte pair, then combine.
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX	// one mask bit per byte lane
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set => all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER	// clear upper ymm halves before leaving AVX code
	XORQ	AX, AX	// return 0
	RET

bigloop_avx2:
	VZEROUPPER	// clear upper ymm halves before falling into scalar code

	// 8 bytes at a time using 64-bit register
	PCALIGN $16
bigloop:
	// BX <= 8 here means the final (possibly overlapping) 8-byte
	// compare in leftover finishes the job.
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	XORQ	AX, AX	// return 0
	RET

	// remaining 0-8 bytes
	// This path is only reached when the original count was >= 8, so
	// the 8 bytes ending at SI+BX / DI+BX lie inside the buffers. Bytes
	// that overlap the previously compared region were already equal.
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

small:
	// 0 <= BX < 8. For BX == 0 the compare sets ZF, so the SETEQ at
	// equal returns 1.
	CMPQ	BX, $0
	JEQ	equal

	// CX = -(8*BX). 64-bit shifts use only the low 6 bits of the count,
	// so shifting by CX shifts by 64 - 8*BX bits.
	LEAQ	0(BX*8), CX
	NEGQ	CX

	// Low address byte <= 0xf8: an 8-byte load starting at SI stays
	// within the same 256-byte-aligned block.
	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// The low BX bytes of SI and DI hold the data; higher bytes may be
	// unrelated memory. Subtract, then shift the unwanted high bytes
	// out: the result is zero (ZF set) iff the low BX bytes match.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	AX
	RET
166