crct10dif-pcl-asm_64.S - OpenGrok cross reference for /linux-6.14.4/arch/x86/lib/crct10dif-pcl-asm

Lines Matching +full:0 +full:- +full:128
2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 #  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
73 	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
74 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
75 	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
76 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
86 	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
87 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
101 	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
105 	# Load the first 128 data bytes.  Byte swapping is necessary to make the
107 	movdqu	16*0(buf), %xmm0
115 	add	$128, buf
132 	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
133 	# 128 to simplify the termination condition of the following loop.
136 	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 	# bytes xmm0-7 into them, storing the result back into xmm0-7.
139 	fold_32_bytes	0, %xmm0, %xmm1
143 	add	$128, buf
144 	sub	$128, len
147 	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
163 	# Add 128 to get the correct number of data bytes remaining in 0...127
164 	# (not counting xmm7), following the previous extra subtraction by 128.
167 	add	$128-16, len
174 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
175 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
185 	# Add 16 to get the correct number of data bytes remaining in 0...15
200 	movdqu	-16(buf, len), %xmm1
203 	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
209 	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 	# then '16-len' bytes from xmm2 (high-order bytes).
219 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
220 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
225 	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
231 	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
232 	# whose low 48 bits are 0.
234 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
238 	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
239 	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
243 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
251 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
253 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
258 	pextrw	$0, %xmm0, %eax
287 # Fold constants precomputed from the polynomial 0x18bb7
288 # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
290 	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
291 	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
293 	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
294 	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
296 	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
297 	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
299 	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
300 	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
302 	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
303 	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
305 	.quad		0x0000000000018bb7	# G(x)
306 	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))
311 	.octa	0x80808080808080808080808080808080
316 	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
321 	.octa	0x000102030405060708090A0B0C0D0E0F
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
326 # is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.
329 	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
330 	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
331 	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
332 	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0