// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <[email protected]> found
// at https://github.com/antonblanchard/crc32-vpmsum.  The original
// is dual licensed under GPL and Apache 2.  As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16	R16
#define off32	R17
#define off48	R18
#define off64	R19
#define off80	R20
#define off96	R21
#define	off112	R22

#define const1	V24
#define const2	V25

#define byteswap	V26
#define mask_32bit	V27
#define mask_64bit	V28
#define zeroes		V29

#define MAX_SIZE	32*1024
#define REFLECT

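// ppc64SlicingUpdateBy8 updates the crc using the slicing-by-8
// algorithm: eight bytes are consumed per iteration, with one
// 256-entry table lookup per byte. On the Go side it is declared
// (in crc32_ppc64x.go) roughly as:
//
//	//go:noescape
//	func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
//
// so the $0-44 frame holds crc (offset 0, plus 4 bytes of padding),
// table8 (8), p (16) and the uint32 result (40).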
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVD    table8+8(FP), R4   // *Table
	MOVD    p+16(FP), R5
	MOVD    p_len+24(FP), R6 // p len

	CMP     $0,R6           // len == 0?
	BNE     start
	MOVW    R3,ret+40(FP)   // return crc
	RET

start:
	NOR     R3,R3,R7        // ^crc
	MOVWZ	R7,R7		// 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short
	SRAD    $3,R6,R8        // 8 byte chunks
	MOVD    R8,CTR

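	// The unrolled body below implements one slicing-by-8 step; as
	// a Go sketch (tab stands for the 8x256-entry table at R4):
	//
	//	crc ^= le32(p[0:4])
	//	crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
	//		tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
	//		tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
	//	p = p[8:]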
loop:
	MOVWZ	0(R5),R8	// bytes 0-3 of p (little endian)
	MOVWZ	4(R5),R9	// bytes 4-7 of p
	MOVD	R4,R10		// &tab[0]
	XOR	R7,R8,R7	// crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4
	RLDICL	$40,R7,$56,R8	// crc>>24
	SLD	$2,R8,R8	// (crc>>24)*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	MOVWZ	(R10)(R17),R21	// tab[0][p[7]]
	ADD	$1024,R10,R10	// &tab[1]
	RLDICL	$56,R9,$56,R19	// p[5]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	(R10)(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// &tab[2]
	XOR	R21,R22,R21	// xor done R22
	CLRLSLDI $56,R9,$2,R20	// p[4]*4
	MOVWZ	(R10)(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// &tab[3]
	XOR	R21,R23,R21	// xor done R23
	MOVWZ	(R10)(R20),R24	// tab[3][p[4]]
	ADD	$1024,R10,R10   // &tab[4]
	XOR	R21,R24,R21	// xor done R24
	MOVWZ	(R10)(R8),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
	XOR	R21,R25,R21	// xor done R25
	ADD	$1024,R10,R10	// &tab[5]
	SLD	$2,R24,R24	// (crc>>16&0xFF)*4
	MOVWZ	(R10)(R24),R26	// tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21	// xor done R26
	RLDICL	$56,R7,$56,R25	// crc>>8&0xFF
	ADD	$1024,R10,R10	// &tab[6]
	SLD	$2,R25,R25	// (crc>>8&0xFF)*4
	MOVBZ   R7,R26          // crc&0xFF
	MOVWZ	(R10)(R25),R27	// tab[6][crc>>8&0xFF]
	ADD	$1024,R10,R10   // &tab[7]
	SLD	$2,R26,R26	// (crc&0xFF)*4
	XOR	R21,R27,R21	// xor done R27
	ADD     $8,R5           // p = p[8:]
	MOVWZ	(R10)(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// xor done R28
	MOVWZ	R21,R7		// crc for next round
	BDNZ 	loop
	ANDCC	$7,R6,R8	// any leftover bytes
	BEQ	done		// none --> done
	MOVD	R8,CTR		// byte count
	PCALIGN $16             // align short loop
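	// Remaining bytes are processed one at a time; per iteration,
	// as a Go sketch:
	//
	//	crc = tab[0][byte(crc)^p[0]] ^ (crc >> 8)
	//	p = p[1:]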
short:
	MOVBZ 	0(R5),R8	// get v
	XOR 	R8,R7,R8	// byte(crc)^v -> R8
	RLDIC	$2,R8,$54,R8	// ((crc^v)&0xFF)*4
	SRD 	$8,R7,R14	// crc>>8
	MOVWZ	(R4)(R8),R10	// tab[0][(crc^v)&0xFF]
	ADD	$1,R5
	XOR 	R10,R14,R7	// loop crc in R7
	BDNZ 	short
done:
	NOR     R7,R7,R7        // ^crc
	MOVW    R7,ret+40(FP)   // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

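// vectorCrc32 computes the crc using the POWER8 vpmsum (vector
// polynomial multiply-sum) instructions: the buffer is split into
// eight 16-byte lanes that are repeatedly folded with VPMSUMD
// against precomputed constants, then combined and reduced to 32
// bits with a Barrett reduction. On the Go side it is declared
// (in crc32_ppc64x.go) roughly as:
//
//	//go:noescape
//	func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
//
// where poly selects the constant tables: 1 means IEEE, any other
// value Castagnoli.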
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVWZ	ctab+4(FP), R14   // crc poly id
	MOVD    p+8(FP), R4
	MOVD    p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3  // ^crc
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes  // clear the V reg
	VSPLTISW $-1,V0
	VSLDOI	$4,V29,V0,mask_32bit
	VSLDOI	$8,V29,V0,mask_64bit

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8  // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD	$·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256		// length of bytes
	BLT	short

	RLDICR	$0,R5,$56,R6 // chunk to process = len &^ 127

	// First step for larger sizes
l1:	MOVD	$32768,R7
	MOVD	R7,R9
	CMP	R6,R7   // compare R6, R7 (MAX SIZE)
	BGT	top	// R6 > MAX: process MAX bytes this pass
	MOVD	R6,R7	// R6 <= MAX: process all of it
top:
	SUB	R7,R6,R6	// R6 = bytes left after this pass

	// mainloop does 128 bytes at a time
	SRD	$7,R7	// R7 = number of 128 byte iterations

	// Determine the offset into the constants table to start with:
	// each iteration uses 16 bytes of constants against 128 bytes
	// of data.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

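	// Conceptually, each main loop iteration folds 128 bytes: every
	// 16 byte lane is carry-less multiplied (VPMSUMD) by a constant
	// of the form x^k mod P, where k reflects the lane's distance
	// from the end of the chunk, so the partial products can simply
	// be xored together afterwards.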
	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1		// Identify warm up pass
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4		// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

next:
	BC	18,0,first_warm_up_done	// bdz: CTR--, branch if CTR == 0

	ADD	$16,R3		// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8 // second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2	// nop (ori 2,2,0), here for dispatch scheduling

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4		// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down

cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3		// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4		// bump up buffer pointer
	BDNZ	cool_top	// are we done?
first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	VXOR    V0,V8,V0
	VXOR    V1,V9,V1
	VXOR    V2,V10,V2
	VXOR    V3,V11,V3
	VXOR    V4,V12,V4
	VXOR    V5,V13,V5
	VXOR    V6,V14,V6
	VXOR    V7,V15,V7

#ifdef REFLECT
	VSLDOI  $4,V0,zeroes,V0
	VSLDOI  $4,V1,zeroes,V1
	VSLDOI  $4,V2,zeroes,V2
	VSLDOI  $4,V3,zeroes,V3
	VSLDOI  $4,V4,zeroes,V4
	VSLDOI  $4,V5,zeroes,V5
	VSLDOI  $4,V6,zeroes,V6
	VSLDOI  $4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD    $1,R15		// mark warm up done
	CMP     $0,R6		// more bytes to process?
	ADD     $128,R6

	BNE	l1
	ANDCC   $127,R5		// leftover bytes (len mod 128)
	SUBC	R5,$128,R6	// R6 = 128 - leftover
	ADD	R3,R6,R3	// skip the unused constants

	SRD	$4,R5,R7	// number of 16 byte chunks in tail
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail

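	// Each leftover 16 byte block is multiplied (VPMSUMW) with the
	// constant for its position so the products line up, then xored
	// into V0 for the final reduction.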
	CMP	$0,R7
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

barrett_reduction:

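	// Barrett reduction computes the final 32 bit remainder without
	// a divide. With mu = floor(x^64 / P) in const1 and the
	// polynomial P in const2, the idea is:
	//
	//	q   = (v >> 32) * mu	// estimate the quotient
	//	crc = v ^ (q >> 32) * P	// subtract q*P (xor in GF(2))
	//
	// The REFLECT variant mirrors this, working on the low half and
	// keeping the high bits.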
	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD	$·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0	// xor the two 64 bit results together

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL	V0,V1,V0	// shift left one bit
#endif

	VAND	V0,mask_64bit,V0

#ifndef	REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI  $4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3 // VS32 = V0

	NOR	R3,R3,R3 // return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

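	// Buffers shorter than 256 bytes use per-block constants stored
	// at the end of the table: each 16 byte block is VPMSUMWed with
	// the constant matching its distance from the end of the buffer,
	// and the bdz (BC 18,0) chain drops into the v0..v14 xor ladder
	// below to combine exactly the blocks that were processed.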
short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP     R14,$1
	BNE     castshTable
	MOVD	$·IEEEConst(SB),R3
	ADD	$4080,R3	// short constants are at the end of the table
	BR      startshConst
castshTable:
	MOVD	$·CastConst(SB),R3
	ADD	$4080,R3	// short constants are at the end of the table

startshConst:
	SUBC	R5,$256,R6	// R6 = 256 - len
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7	// number of 16 byte chunks
	MOVD	R7,CTR

	VXOR	V19,V19,V19	// clear accumulators
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0	// xor in the initial crc (V8)
	VPMSUMW	V0,V16,V0
	BC	18,0,v0

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0	// combine the two accumulators

	BR	barrett_reduction


zero:
	// len is 0: return the initial crc unchanged
	MOVW    R10,ret+32(FP)
	RET
