// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <[email protected]> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16	R16
#define off32	R17
#define off48	R18
#define off64	R19
#define off80	R20
#define off96	R21
#define off112	R22

#define const1	V24
#define const2	V25

#define byteswap	V26
#define mask_32bit	V27
#define mask_64bit	V28
#define zeroes		V29

#define MAX_SIZE	32*1024
#define REFLECT

TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3	// incoming crc
	MOVD	table8+8(FP), R4	// *Table
	MOVD	p+16(FP), R5
	MOVD	p_len+24(FP), R6	// p len

	CMP	$0,R6	// len == 0?
	BNE	start
	MOVW	R3,ret+40(FP)	// return crc
	RET

start:
	NOR	R3,R3,R7	// ^crc
	MOVWZ	R7,R7	// 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short
	SRAD	$3,R6,R8	// 8 byte chunks
	MOVD	R8,CTR

loop:
	MOVWZ	0(R5),R8	// 0-3 bytes of p ?Endian?
	MOVWZ	4(R5),R9	// 4-7 bytes of p
	MOVD	R4,R10	// &tab[0]
	XOR	R7,R8,R7	// crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4
	RLDICL	$40,R7,$56,R8	// crc>>24
	SLD	$2,R8,R8	// crc>>24*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	MOVWZ	(R10)(R17),R21	// tab[0][p[7]]
	ADD	$1024,R10,R10	// tab[1]
	RLDICL	$56,R9,$56,R19	// p[5]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	(R10)(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// tab[2]
	XOR	R21,R22,R21	// xor done R22
	CLRLSLDI	$56,R9,$2,R20	// p[4]*4
	MOVWZ	(R10)(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// &tab[3]
	XOR	R21,R23,R21	// xor done R23
	MOVWZ	(R10)(R20),R24	// tab[3][p[4]]
	ADD	$1024,R10,R10	// &tab[4]
	XOR	R21,R24,R21	// xor done R24
	MOVWZ	(R10)(R8),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
	XOR	R21,R25,R21	// xor done R25
	ADD	$1024,R10,R10	// &tab[5]
	SLD	$2,R24,R24	// crc>>16&0xFF*4
	MOVWZ	(R10)(R24),R26	// tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21	// xor done R26
	RLDICL	$56,R7,$56,R25	// crc>>8
	ADD	$1024,R10,R10	// &tab[6]
	SLD	$2,R25,R25	// crc>>8&0xFF*4
	MOVBZ	R7,R26	// crc&0xFF
	MOVWZ	(R10)(R25),R27	// tab[6][crc>>8&0xFF]
	ADD	$1024,R10,R10	// &tab[7]
	SLD	$2,R26,R26	// crc&0xFF*4
	XOR	R21,R27,R21	// xor done R27
	ADD	$8,R5	// p = p[8:]
	MOVWZ	(R10)(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// xor done R28
	MOVWZ	R21,R7	// crc for next round
	BDNZ	loop
	ANDCC	$7,R6,R8	// any leftover bytes
	BEQ	done	// none --> done
	MOVD	R8,CTR	// byte count
	PCALIGN	$16	// align short loop
short:
	MOVBZ	0(R5),R8	// get v
	XOR	R8,R7,R8	// byte(crc)^v -> R8
	RLDIC	$2,R8,$54,R8	// rldicl r8,r8,2,22
	SRD	$8,R7,R14	// crc>>8
	MOVWZ	(R4)(R8),R10
	ADD	$1,R5
	XOR	R10,R14,R7	// loop crc in R7
	BDNZ	short
done:
	NOR	R7,R7,R7	// ^crc
	MOVW	R7,ret+40(FP)	// return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

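// vectorCrc32 computes the CRC for the IEEE or Castagnoli polynomial
// (selected by the ctab argument) using the POWER8 VPMSUMD instruction.
// Buffers of 256 bytes or more are folded 128 bytes at a time against
// precomputed constants and finished with Barrett reduction; shorter
// buffers take the short path below.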
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3	// incoming crc
	MOVWZ	ctab+4(FP), R14	// crc poly id
	MOVD	p+8(FP), R4
	MOVD	p_len+16(FP), R5	// p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3	// ^crc
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes	// clear the V reg
	VSPLTISW	$-1,V0
	VSLDOI	$4,V29,V0,mask_32bit
	VSLDOI	$8,V29,V0,mask_64bit

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8	// or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD	$·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256	// length of bytes
	BLT	short

	RLDICR	$0,R5,$56,R6	// chunk to process

	// First step for larger sizes
l1:	MOVD	$32768,R7
	MOVD	R7,R9
	CMP	R6,R7	// compare R6, R7 (MAX SIZE)
	BGT	top	// R6 > MAX_SIZE: process MAX_SIZE this pass
	MOVD	R6,R7	// R6 <= MAX_SIZE: process all of it

top:
	SUB	R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD	$7,R7

	// determine the offset into the constants table to start with.
	// Each 16-byte constant is used against 128 bytes of data.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1	// Identify warm up pass
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4	// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

next:
	BC	18,0,first_warm_up_done

	ADD	$16,R3	// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8	// second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4	// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down

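	// Main loop: each pass xors the previous VPMSUMD results into the
	// accumulators V0-V7, multiplies the previously loaded 128 bytes by
	// the next constants, and loads the next 128 bytes. The OR $0,R2,R2
	// instructions are no-ops carried over from the original crc32-vpmsum
	// code, apparently to influence instruction scheduling.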
cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3	// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4	// bump up buffer pointer
	BDNZ	cool_top	// are we done?

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD	V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	VXOR	V0,V8,V0
	VXOR	V1,V9,V1
	VXOR	V2,V10,V2
	VXOR	V3,V11,V3
	VXOR	V4,V12,V4
	VXOR	V5,V13,V5
	VXOR	V6,V14,V6
	VXOR	V7,V15,V7

#ifdef REFLECT
	VSLDOI	$4,V0,zeroes,V0
	VSLDOI	$4,V1,zeroes,V1
	VSLDOI	$4,V2,zeroes,V2
	VSLDOI	$4,V3,zeroes,V3
	VSLDOI	$4,V4,zeroes,V4
	VSLDOI	$4,V5,zeroes,V5
	VSLDOI	$4,V6,zeroes,V6
	VSLDOI	$4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD	$1,R15
	CMP	$0,R6
	ADD	$128,R6

	BNE	l1
	ANDCC	$127,R5
	SUBC	R5,$128,R6
	ADD	R3,R6,R3

	SRD	$4,R5,R7
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

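	// Multiply each accumulated 16-byte lane by a constant chosen for its
	// distance from the end of the data so that all lanes can be xored
	// down to a single value; any remaining 16-byte chunks of input are
	// folded in the same way below.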
	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail

	CMP	$0,R7
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

	// Reduce the remainder in V0 to the final 32-bit CRC using the
	// Barrett constants for the selected polynomial.
barrett_reduction:

	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD	$·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0

#ifdef REFLECT
	VSPLTISB	$1,V1
	VSL	V0,V1,V0
#endif

	VAND	V0,mask_64bit,V0

#ifndef REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3	// VS32 = V0

	NOR	R3,R3,R3	// return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

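	// short: process a buffer of fewer than 256 bytes. Each 16-byte
	// chunk is multiplied by a constant selected by its distance from
	// the end of the buffer, the products are xored together in V19/V20,
	// and the result goes through the same Barrett reduction. A zero
	// length returns the original crc unchanged.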
short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP	R14,$1
	BNE	castshTable
	MOVD	$·IEEEConst(SB),R3
	ADD	$4080,R3
	BR	startshConst
castshTable:
	MOVD	$·CastConst(SB),R3
	ADD	$4080,R3

startshConst:
	SUBC	R5,$256,R6	// sub from 256
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7
	MOVD	R7,CTR

	VXOR	V19,V19,V19
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0
	VPMSUMW	V0,V16,V0
	BC	18,0,v0

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0

	BR	barrett_reduction

zero:
	// This case is the original crc, so just return it
	MOVW	R10,ret+32(FP)
	RET