// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095

// The BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC  4, CR6LT, (LR)
#define BEQLR     BC 12, CR0EQ, (LR)
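// BGELR_CR6 (BC 4, ...) returns via LR when CR6LT is clear; a record-form
// vector compare such as VCMPEQUBCC sets CR6LT only when all elements
// compared equal, so this means "return early on any byte mismatch".
// BEQLR (BC 12, ...) returns via LR when CR0EQ is set.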

// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// R3 = a
	// R4 = b
	// R5 = size
	BR	memeqbody<>(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
	// R3 = a
	// R4 = b
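	// R11 = closure context pointer (the internal ABI's context
	// register on ppc64x), from which the size is loaded below.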
	CMP	R3, R4
	BEQ	eq
	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
	BR	memeqbody<>(SB)
eq:
	MOVD	$1, R3
	RET

// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value
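// Note: memequal and memequal_varlen branch here with BR rather than CALL,
// so LR still holds their caller's return address. The BGELR_CR6/BEQLR
// early exits and the RETs below therefore return directly to that caller.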
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3, R8		// Move s1 into R8
	ADD	R5, R3, R9	// &s1[len(s1)]
	ADD	R5, R4, R10	// &s2[len(s2)]
	MOVD	$1, R11		// R11 = 1, the "equal" result
	CMP	R5, $16		// Use GPR checks for len <= 16
	BLE	check0_16
	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
	BLE	check17_32	// Do a pair of overlapping VSR compares
	CMP	R5, $64
	BLE	check33_64	// Hybrid check + overlap compare.
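	// Note most paths below compare some bytes twice via overlapping
	// loads: re-checking bytes is harmless for an equality test and
	// avoids scalar cleanup loops for odd-sized tails.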

setup64:
	SRD	$6, R5, R6	// number of 64 byte chunks to compare
	MOVD	R6, CTR
	MOVD	$16, R14	// index registers for the VSX loads
	MOVD	$32, R15
	MOVD	$48, R16
	ANDCC	$0x3F, R5, R5	// len%64==0?

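	// Main loop: compare 64 bytes per iteration as four 16-byte VSX
	// chunks. R3 was preset to 0, so any mismatch returns "not equal"
	// through BGELR_CR6.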
	PCALIGN $16
loop64:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2	// compare, setting CR6
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$64, R8		// bump up to next 64
	ADD	$64, R4
	BDNZ	loop64

	ISEL	CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
	BEQLR				// return if no tail.

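	// 1-63 tail bytes remain. Rather than a byte loop, back up to the
	// last 64 bytes of each buffer and compare those in full; the
	// overlap re-checks bytes the loop above already matched.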
	ADD	$-64, R9, R8
	ADD	$-64, R10, R4
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
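	// Final chunk: use ISEL on CR6 instead of a branch so the result
	// (1 if all tail bytes matched, else 0) lands directly in R3.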
	ISEL	CR6LT, R11, R0, R3
	RET

check33_64:
	// Bytes 0-15
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$16, R8
	ADD	$16, R4

	// Bytes 16-31
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6

	// A little tricky, but point R4,R8 to &sx[len-32],
	// and reuse check17_32 to check the next 1-32 bytes (with some overlap)
	ADD	$-32, R9, R8
	ADD	$-32, R10, R4
	// Fallthrough

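	// check17_32 compares the first and last 16 bytes (overlapping when
	// len < 32). The two results are ANDed branch-free: the first ISEL
	// latches 1 or 0 into R5, and the second selects R5 only on a match.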
check17_32:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	CR6LT, R11, R0, R5

	// Load sX[len(sX)-16:len(sX)] and compare.
	ADD	$-16, R9
	ADD	$-16, R10
	LXVD2X	(R9+R0), V0
	LXVD2X	(R10+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	CR6LT, R5, R0, R3
	RET

check0_16:
#ifdef GOPPC64_power10
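	// POWER10's LXVL loads a variable-length value: the byte count is
	// taken from the high-order byte of R7 (hence SLD $56), and the
	// remaining vector bytes are zeroed, so one pair of loads handles
	// any len from 0 through 16 without overreading.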
	SLD	$56, R5, R7
	LXVL	R8, R7, V0
	LXVL	R4, R7, V1
	VCMPEQUDCC	V0, V1, V2
	ISEL	CR6LT, R11, R0, R3
	RET
#else
	CMP	R5, $8
	BLT	check0_7
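	// 8 <= len <= 16: compare the first and last 8 bytes with GPRs.
	// The two loads overlap whenever len < 16, and the chained ISELs
	// AND the results, as in check17_32.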
	// Load sX[0:8] and compare.
	MOVD	(R8), R6
	MOVD	(R4), R7
	CMP	R6, R7
	ISEL	CR0EQ, R11, R0, R5
	// Load sX[len(sX)-8:len(sX)] and compare.
	MOVD	-8(R9), R6
	MOVD	-8(R10), R7
	CMP	R6, R7
	ISEL	CR0EQ, R5, R0, R3
	RET

check0_7:
	CMP	R5, $0
	MOVD	$1, R3
	BEQLR		// return if len == 0

	// Check the remaining 1-7 bytes with a single 8-byte compare, choosing
	// the load addresses so neither load can cross a page boundary: load
	// from a lower address when enough bytes exist below &sX in the same
	// page, otherwise load 8 bytes starting at &sX, picking up extra bytes
	// above. The loaded words are then shifted so the len bytes of interest
	// line up consistently in the registers, since the two sides may choose
	// differing load addresses.
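	// Illustrative example: len=2 with &s1 at page offset 4094 (the last
	// 2 bytes of a page). Then 8-len = 6 and 4094 >= 6, so the 8-byte
	// load comes from &s1-6 and stays within the page; on little-endian
	// the 2 wanted bytes land in the top of the register, and the SRD
	// by (8-len)*8 bits below isolates them.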
	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
	ANDCC	$PAGE_OFFSET, R4, R9	// likewise for s2
	SUBC	R5, $8, R12		// 8-len
	SLD	$3, R12, R14		// (8-len)*8
	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
	CMPU	R9, R12, CR0
	SUB	R12, R8, R6		// compute lower load address
	SUB	R12, R4, R9
	ISEL	CR1LT, R8, R6, R8	// R8 = (&s1 & PAGE_OFFSET) < (8-len) ? R8 (&s1) : R6 (&s1 - (8-len))
	ISEL	CR0LT, R4, R9, R4	// Similar for s2
	MOVD	(R8), R15
	MOVD	(R4), R16
	SLD	R14, R15, R7
	SLD	R14, R16, R17
	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
	SRD	R14, R17, R17
	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
	SRD	R14, R16, R9
#ifdef GOARCH_ppc64le
	ISEL	CR1LT, R7, R6, R8	// Choose the correct len bytes to compare based on alignment
	ISEL	CR0LT, R17, R9, R4
#else
	ISEL	CR1LT, R6, R7, R8
	ISEL	CR0LT, R9, R17, R4
#endif
	CMP	R4, R8
	ISEL	CR0EQ, R11, R0, R3
	RET
#endif	// tail processing if !defined(GOPPC64_power10)