// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 folds the bytes of p into the (non-inverted) crc with the
// CRC32B/W/L/Q instructions and returns the updated value.
//
// Register roles:
//   AX = running crc
//   SI = read cursor into p
//   CX = number of bytes not yet consumed
//   BX = scratch used only while aligning the cursor
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX
	MOVQ p+8(FP), SI
	MOVQ p_len+16(FP), CX

	// Inputs shorter than 8 bytes skip the alignment step entirely.
	CMPQ CX, $8
	JLT  tail

	// BX = cursor modulo 8; zero means the cursor is already aligned.
	MOVQ SI, BX
	ANDQ $7, BX
	JEQ  quads

	// BX = (-BX) & 7, i.e. 8 - BX for BX in 1..7: the number of head bytes
	// that must be consumed to reach an 8-byte boundary.
	NEGQ BX
	ANDQ $7, BX

	// Consume 1, then 2, then 4 head bytes as selected by the bits of BX.
	TESTQ $1, BX
	JEQ   head_2
	CRC32B (SI), AX
	ADDQ $1, SI
	SUBQ $1, CX

head_2:
	TESTQ $2, BX
	JEQ   head_4
	CRC32W (SI), AX
	ADDQ $2, SI
	SUBQ $2, CX

head_4:
	TESTQ $4, BX
	JEQ   quads
	CRC32L (SI), AX
	ADDQ $4, SI
	SUBQ $4, CX

quads:
	// The cursor is 8-byte aligned from here on. Consume whole quadwords
	// for as long as at least 8 bytes remain.
	CMPQ CX, $8
	JLT  tail

quad_loop:
	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	CMPQ CX, $8
	JGE  quad_loop

tail:
	// Fewer than 8 bytes remain. Bits 2, 1 and 0 of CX select a 4-byte,
	// a 2-byte and a 1-byte step respectively.
	TESTQ $4, CX
	JEQ   tail_2
	CRC32L (SI), AX
	ADDQ $4, SI

tail_2:
	TESTQ $2, CX
	JEQ   tail_1
	CRC32W (SI), AX
	ADDQ $2, SI

tail_1:
	TESTQ $1, CX
	JEQ   out
	CRC32B (SI), AX

out:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// If rounds is zero, no memory is read and the three crcs are returned
// unchanged.
//
// Register roles:
//   AX, CX, DX  = running crcs for a, b and c
//   R8, R9, R10 = read cursors into a, b and c
//   R11         = rounds still to run
//
// func castagnoliSSE42Triple(
//     crcA, crcB, crcC uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

	// The loop below is bottom-tested: entered with R11 == 0 it would read
	// 24 bytes from each buffer and then DECQ would wrap to a non-zero
	// value, so the loop would keep running. Skip it when there is nothing
	// to do.
	TESTL R11, R11
	JZ    done

loop:
	// Three independent crc chains are interleaved, 8 bytes at a time.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ  loop

done:
	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
// ieeeCLMUL folds the buffer p into crc using carry-less multiplication
// (PCLMULQDQ) and returns the resulting 32-bit value.
//
// len(p) must be at least 64 and a multiple of 16: the first 64 bytes are
// loaded unconditionally and the remainder is consumed in 16-byte steps.
//
// Register roles:
//   SI       = read cursor into p
//   CX       = bytes not yet consumed
//   X0       = current folding constants
//   X1..X4   = four 128-bit accumulators (only X1 after remain64)
//   X5..X8   = scratch copies for the high-half products
//   X11..X14 = next 64 bytes of input, loaded ahead of use
//
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL   crc+0(FP), X0             // Initial CRC value
	MOVQ   p+8(FP), SI               // data pointer
	MOVQ   p_len+16(FP), CX          // len(p)

	MOVOU  (SI), X1
	MOVOU  16(SI), X2
	MOVOU  32(SI), X3
	MOVOU  48(SI), X4
	PXOR   X0, X1                    // mix the initial crc into the first block
	ADDQ   $64, SI                   // buf+=64
	SUBQ   $64, CX                   // len-=64
	CMPQ   CX, $64                   // Less than 64 bytes left
	JB     remain64

	MOVOA  r2r1<>+0(SB), X0
loopback64:
	MOVOA  X1, X5
	MOVOA  X2, X6
	MOVOA  X3, X7
	MOVOA  X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU  (SI), X11
	MOVOU  16(SI), X12
	MOVOU  32(SI), X13
	MOVOU  48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR   X5, X1
	PXOR   X6, X2
	PXOR   X7, X3
	PXOR   X8, X4

	PXOR   X11, X1
	PXOR   X12, X2
	PXOR   X13, X3
	PXOR   X14, X4

	ADDQ   $64, SI                   // buf+=64
	SUBQ   $64, CX                   // len-=64
	CMPQ   CX, $64                   // At least 64 bytes left?
	JAE    loopback64                // unsigned, matching the JB checks on CX

	/* Fold result into a single register (X1) */
remain64:
	MOVOA  r4r3<>+0(SB), X0

	MOVOA  X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR   X5, X1
	PXOR   X2, X1

	MOVOA  X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR   X5, X1
	PXOR   X3, X1

	MOVOA  X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR   X5, X1
	PXOR   X4, X1

	/* If there is less than 16 bytes left we are done */
	CMPQ   CX, $16
	JB     finish

	/* Encode 16 bytes */
remain16:
	MOVOU  (SI), X10
	MOVOA  X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR   X5, X1
	PXOR   X10, X1
	SUBQ   $16, CX
	ADDQ   $16, SI
	CMPQ   CX, $16
	JAE    remain16                  // unsigned, matching the JB checks on CX

finish:
	/* Fold final result into 32 bits and return it */
	PCMPEQB X3, X3                   // X3 = all ones; shifted into a mask below
	PCLMULQDQ $1, X1, X0             // X0 still holds r4r3 here
	PSRLDQ $8, X1
	PXOR   X0, X1

	MOVOA  X1, X2
	MOVQ   r5<>+0(SB), X0

	/* Creates 32 bit mask. Note that we don't care about upper half. */
	PSRLQ  $32, X3

	PSRLDQ $4, X2
	PAND   X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR   X2, X1

	MOVOA  rupoly<>+0(SB), X0

	MOVOA  X1, X2
	PAND   X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND   X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR   X2, X1

	// The result is dword 1 of X1.
	PEXTRD $1, X1, AX
	MOVL   AX, ret+32(FP)

	RET