Lines Matching +full:0 +full:- +full:128

2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
73 pclmulqdq $0x00, FOLD_CONSTS, \reg1
74 pclmulqdq $0x11, FOLD_CONSTS, %xmm8
75 pclmulqdq $0x00, FOLD_CONSTS, \reg2
76 pclmulqdq $0x11, FOLD_CONSTS, %xmm13
86 pclmulqdq $0x11, FOLD_CONSTS, \src_reg
87 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
101 # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
105 # Load the first 128 data bytes. Byte swapping is necessary to make the
107 movdqu 16*0(buf), %xmm0
115 add $128, buf
132 # Subtract 128 for the 128 data bytes just consumed. Subtract another
133 # 128 to simplify the termination condition of the following loop.
136 # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 # bytes xmm0-7 into them, storing the result back into xmm0-7.
139 fold_32_bytes 0, %xmm0, %xmm1
143 add $128, buf
144 sub $128, len
147 # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
163 # Add 128 to get the correct number of data bytes remaining in 0...127
164 # (not counting xmm7), following the previous extra subtraction by 128.
167 add $128-16, len
174 pclmulqdq $0x11, FOLD_CONSTS, %xmm7
175 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
185 # Add 16 to get the correct number of data bytes remaining in 0...15
200 movdqu -16(buf, len), %xmm1
203 # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
209 # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 # then '16-len' bytes from xmm2 (high-order bytes).
219 pclmulqdq $0x11, FOLD_CONSTS, %xmm7
220 pclmulqdq $0x00, FOLD_CONSTS, %xmm8
225 # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
231 # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
232 # whose low 48 bits are 0.
234 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
238 # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
239 # value congruent to x^64 * M(x) and whose low 48 bits are 0.
243 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
251 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
253 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
258 pextrw $0, %xmm0, %eax
287 # Fold constants precomputed from the polynomial 0x18bb7
288 # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
290 .quad 0x0000000000006123 # x^(8*128) mod G(x)
291 .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
293 .quad 0x0000000000001069 # x^(4*128) mod G(x)
294 .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
296 .quad 0x000000000000857d # x^(2*128) mod G(x)
297 .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
299 .quad 0x000000000000a010 # x^(1*128) mod G(x)
300 .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
302 .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
303 .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
305 .quad 0x0000000000018bb7 # G(x)
306 .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
311 .octa 0x80808080808080808080808080808080
316 .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
321 .octa 0x000102030405060708090A0B0C0D0E0F
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
326 # is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.
329 .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
330 .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
331 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
332 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0