// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8
 * Pairwise long polynomial multiplication of two 16-bit values
 * by two 64-bit values
 * significant. The resulting 80-bit vectors are XOR'ed together.
 *    0            w0*x0 ^              |        y0*z0 ^
 * and after performing 8x8->16 bit long polynomial multiplication of
 * we obtain the following four vectors of 16-bit elements:
 * final 80-bit result.
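 *
 * A rough C model of this decomposition (a sketch, not the NEON register
 * layout; clmul8() stands in for one lane of vmull.p8, and all names
 * here are illustrative rather than symbols from this file; needs
 * <stdint.h> and <string.h>):
 *
 *      // 8x8 -> 16 bit long polynomial (carryless) multiply of one lane.
 *      static uint16_t clmul8(uint8_t a, uint8_t b)
 *      {
 *              uint16_t r = 0;
 *
 *              for (int i = 0; i < 8; i++)
 *                      if ((b >> i) & 1)
 *                              r ^= (uint16_t)a << i;
 *              return r;
 *      }
 *
 *      // 16 x 64 -> 80 bit multiply by rank: each partial product of
 *      // rank r = i + j is XOR-accumulated at byte offset r, with its
 *      // high byte carrying into offset r + 1.
 *      static void pmull16x64_model(const uint8_t w[2], const uint8_t x[8],
 *                                   uint8_t out[10])
 *      {
 *              memset(out, 0, 10);
 *              for (int i = 0; i < 2; i++)
 *                      for (int j = 0; j < 8; j++) {
 *                              uint16_t p = clmul8(w[i], x[j]);
 *
 *                              out[i + j] ^= p & 0xff;
 *                              out[i + j + 1] ^= p >> 8;
 *                      }
 *      }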
        vld1.64         {q12}, [r4, :128]
        vtbl.8          d24, {\v16\()_L-\v16\()_H}, d24
        vtbl.8          d25, {\v16\()_L-\v16\()_H}, d25

        vmov.i32        d25, #0
        vmov.i32        d29, #0

        vld1.64         {q8-q9}, [buf]!
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.

        // Load the first 128 data bytes. Byte swapping is necessary to make
        // the bit order match the polynomial coefficient order.
        vld1.64         {q0-q1}, [buf]!
        vld1.64         {q2-q3}, [buf]!
        vld1.64         {q4-q5}, [buf]!
        vld1.64         {q6-q7}, [buf]!
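        // (CRC-T10DIF is a non-reflected, MSB-first CRC, so the message is
        // read as a polynomial whose earliest bytes are its highest-degree
        // coefficients; roughly speaking, after the swap the first byte of
        // a 16-byte block supplies the x^127...x^120 coefficients of that
        // block's polynomial.)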
        vmov.i8         q8h, #0

        // Load the constants for folding across 128 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Subtract 128 for the 128 data bytes just consumed. Subtract another
        // 128 to simplify the termination condition of the following loop.

        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
        subs            len, len, #128
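        // A sketch in C of the congruence one trip around this loop
        // exploits (clmul64() models vmull.p64 bit by bit; u128, fold128()
        // and k_lo/k_hi are illustrative names, not symbols from this
        // file). Each 128-bit word V in q0-q7 sits 128 bytes, i.e. 1024
        // bits, ahead of the data word D it is folded into, so its
        // contribution is V * x^1024; the 'x^(8*128) mod G' and
        // 'x^(8*128+64) mod G' constants keep both products within 80 bits:
        //
        //      static void clmul64(uint64_t a, uint64_t b,
        //                          uint64_t *lo, uint64_t *hi)
        //      {
        //              *lo = *hi = 0;
        //              for (int i = 0; i < 64; i++)
        //                      if ((b >> i) & 1) {
        //                              *lo ^= a << i;
        //                              if (i)
        //                                      *hi ^= a >> (64 - i);
        //                      }
        //      }
        //
        //      typedef struct { uint64_t lo, hi; } u128;
        //
        //      static u128 fold128(u128 v, u128 d,
        //                          uint64_t k_lo,  /* x^(8*128) mod G    */
        //                          uint64_t k_hi)  /* x^(8*128+64) mod G */
        //      {
        //              uint64_t alo, ahi, blo, bhi;
        //
        //              clmul64(v.lo, k_lo, &alo, &ahi);
        //              clmul64(v.hi, k_hi, &blo, &bhi);
        //              return (u128){ alo ^ blo ^ d.lo, ahi ^ bhi ^ d.hi };
        //      }
        //
        // The double subtraction above makes the loop's exit test cheap:
        // e.g. for len = 384, len is 128 on loop entry; the loop runs with
        // len = 128 and len = 0 and exits at len = -128, having folded the
        // remaining 256 bytes.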
        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
        adds            len, len, #(128-16)
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
        vtbl.8          q1l, {q7l-q7h}, q2l
        vtbl.8          q1h, {q7l-q7h}, q2h

        // q3 = first chunk: q7 right-shifted by '16-len' bytes.
        vmov.i8         q3, #0x80
        vtbl.8          q3l, {q7l-q7h}, q2l
        vtbl.8          q3h, {q7l-q7h}, q2h

        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.

        // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
        // then '16-len' bytes from q1 (high-order bytes).
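        // A byte-level C model of the vtbl shifts used here (a sketch;
        // vtbl16() and its parameter names are illustrative). vtbl.8
        // yields 0x00 for any out-of-range index, which is exactly what
        // makes the 0x8x entries of byteshift_table act as zero-fill:
        //
        //      static void vtbl16(uint8_t out[16], const uint8_t tbl[16],
        //                         const uint8_t idx[16])
        //      {
        //              for (int i = 0; i < 16; i++)
        //                      out[i] = idx[i] < 16 ? tbl[idx[i]] : 0;
        //      }
        //
        // With idx = &byteshift_table[16 - len], this shifts q7 left by
        // 'len' bytes; with each index XOR'ed with 0x80 (the vmov.i8 q3,
        // #0x80 above), the same lookup shifts q7 right by '16-len' bytes.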
        vmov.i8         q0h, #0

        // Load the fold-across-16-bytes constants.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Fold the high 64 bits into the low 64 bits, while also multiplying by
        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
        // whose low 48 bits are 0.

        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
        vmov.i8         q1, #0

        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]
        vmov.u16        r0, q0l[0]
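        // The CRC extracted into r0 above comes out of a Barrett reduction
        // driven by the G(x) and floor(x^48 / G(x)) constants just loaded.
        // A C model of the principle (a sketch reusing the clmul64() model
        // from the folding-loop comment; barrett16() is an illustrative
        // name): for a remainder R of degree < 48, the quotient
        // q = (R * floor(x^48/G(x))) / x^48 equals R / G(x) exactly, so
        // R ^ q*G(x) is R mod G(x), the 16-bit CRC.
        //
        //      static uint16_t barrett16(uint64_t r)
        //      {
        //              uint64_t lo, hi, q;
        //
        //              clmul64(r, 0x1f65a57f8, &lo, &hi); /* R * floor(x^48/G) */
        //              q = (lo >> 48) | (hi << 16);       /* / x^48 */
        //              clmul64(q, 0x18bb7, &lo, &hi);     /* q * G(x) */
        //              return (uint16_t)(r ^ lo);         /* R mod G(x) */
        //      }
        //
        // The instruction sequence in this file interleaves the same idea
        // with the x^48-scaled fold constants rather than computing it this
        // literally.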
        vst1.64         {q7}, [r3, :128]
// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
        .quad           0x0000000000006123      // x^(8*128) mod G(x)
        .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
        .quad           0x0000000000001069      // x^(4*128) mod G(x)
        .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
        .quad           0x000000000000857d      // x^(2*128) mod G(x)
        .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
        .quad           0x000000000000a010      // x^(1*128) mod G(x)
        .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
        .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
        .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
        .quad           0x0000000000018bb7      // G(x)
        .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
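// Any 'x^N mod G(x)' entry above can be reproduced with a few lines of C
// (a sketch; xpow_mod_g() is an illustrative name, with the polynomial
// kept MSB-first to match the byte-swapped data):
//
//      static uint32_t xpow_mod_g(unsigned int n)
//      {
//              uint32_t r = 1;                 /* the polynomial x^0 */
//
//              while (n--) {
//                      r <<= 1;                /* multiply by x */
//                      if (r & 0x10000)        /* degree reached 16: reduce */
//                              r ^= 0x18bb7;
//              }
//              return r;
//      }
//
// e.g. xpow_mod_g(8*128) should give the 0x6123 above, and xpow_mod_g(48)
// the 0x1368 that appears scaled by x^48 in the reduction constants.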
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
        .byte           0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte           0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
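// Worked example for len = 3: the window starts at &byteshift_table[13],
// i.e. {0x8d, 0x8e, 0x8f, 0x0, 0x1, ..., 0xc}. As vtbl indices, the three
// 0x8x entries select nothing (yielding 0x00) and the rest pick bytes
// 0...12: a left shift by 3 bytes. XOR'ing every index with 0x80 gives
// {0xd, 0xe, 0xf, 0x8d, ...}, which picks bytes 13...15 followed by
// zeroes: a right shift by 16 - 3 = 13 bytes.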
        .quad           0x0808080800000000, 0x0909090901010101