// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"

TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGeneric(SB)
#endif
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	countbody<>(SB)

TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGenericString(SB)
#endif
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	countbody<>(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
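//
// For reference, countbody computes the same result as this rough Go
// sketch (a simplification, not the actual countGeneric fallback):
//
//	n := 0
//	for _, x := range b {
//		if x == c {
//			n++
//		}
//	}
//	return n
//
// The paths below do the same work 16 bytes at a time with SSE, or
// 64 bytes at a time with AVX2.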
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0
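	// After this sequence every byte of X0 is a copy of AL:
	// MOVD puts AL in the low byte, the first PUNPCKLBW makes the low
	// word AL,AL, the second makes the low dword AL,AL,AL,AL, and
	// PSHUFL $0 broadcasts that dword to all four lanes.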

	CMPQ BX, $16
	JLT small

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI

	CMPQ BX, $64
	JAE avx2
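	// For 64 bytes or more, try the AVX2 path below; it falls back to
	// sse when the CPU does not support AVX2.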
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX
	JZ end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ $16, CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10
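	// R10 now has a 1 bit for every byte position the loop has not
	// counted yet. For example, with BX = 3 remaining bytes, CX = 13
	// and R10 = 0xFFFF >> 13 << 13 = 0xE000, so only the last 3 of
	// the final 16 bytes are counted below.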

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)
	RET

// Handle lengths < 16.
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage
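	// The branch is taken when (SI+16) & 0xff0 == 0, i.e. SI lies in
	// the last 16 bytes of a 4KB page. In that case a 16-byte load
	// from SI could spill onto the next page, so endofpage loads the
	// 16 bytes ending at the last byte of the slice instead.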

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10
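	// R10 = (1 << BX) - 1, e.g. BX = 5 gives R10 = 0x1F, keeping only
	// the bits for the first 5 bytes of the 16-byte load below.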

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Return DX directly; we don't need to accumulate
	// since we have < 16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ $0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ $16, CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10
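	// As in the sse tail, R10 keeps only the high BX bits of the
	// 16-bit mask. The backward load below places the slice's BX
	// bytes in the high end of X1, so only bytes that belong to the
	// slice are counted.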

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	// Return DX directly; we don't need to accumulate
	// since we have < 16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	MOVD AX, X0
	LEAQ -64(SI)(BX*1), R11
	LEAQ (SI)(BX*1), R13
	VPBROADCASTB  X0, Y1
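	// R11 is the address of the last 64-byte chunk, R13 the end of
	// the data, and Y1 holds 32 copies of the byte we're counting.
	// Each iteration below compares two 32-byte blocks and adds both
	// POPCNT results to the accumulator in R12.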
	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	POPCNTL DX, DX
	POPCNTL CX, CX
	ADDQ DX, R12
	ADDQ CX, R12
	ADDQ $64, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If last block is already processed,
	// skip to the end.
	//
	// This check is NOT an optimization; if the input length is a
	// multiple of 64, we must not go through the last leg of the
	// function because the bit shift count passed to SALQ below would
	// be 64, which is outside of the 0-63 range supported by those
	// instructions.
	//
	// Tests in the bytes and strings packages with input lengths that
	// are multiples of 64 will break if this condition were removed.
	CMPQ DI, R13
	JEQ endavx

	// Load address of the last 64 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	// Exit AVX mode.
	VZEROUPPER
	SALQ $32, CX
	ORQ CX, DX
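	// DX now holds one bit per byte of the final 64-byte block: the
	// low 32 bits come from Y2/Y3 and the high 32 bits from Y4/Y5.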

	// Create mask to ignore overlap between previous 64 byte block
	// and the next.
	ANDQ $63, BX
	MOVQ $64, CX
	SUBQ BX, CX
	MOVQ $0xFFFFFFFFFFFFFFFF, R10
	SALQ CL, R10
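	// R10 keeps only the bits for bytes the loop has not counted yet.
	// For example, with BX = 10 remaining bytes, CX = 54 and
	// R10 = 0xFFFFFFFFFFFFFFFF << 54, so only the last 10 bytes of
	// the final 64-byte block contribute.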
	// Apply mask
	ANDQ R10, DX
	POPCNTQ DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)
	RET
