// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8
 * Pairwise long polynomial multiplication of two 16-bit values
 * by two 64-bit values
 * significant. The resulting 80-bit vectors are XOR'ed together.
 *    0            w0*x0 ^              |        y0*z0 ^
 * and after performing 8x8->16 bit long polynomial multiplication of
 * we obtain the following four vectors of 16-bit elements:
 * final 80-bit result.
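 *
 * A rough C model of this decomposition (a sketch, not the NEON register
 * layout; clmul8() stands in for one lane of vmull.p8, and all names
 * here are illustrative rather than symbols from this file; needs
 * <stdint.h> and <string.h>):
 *
 *      // 8x8 -> 16 bit long polynomial (carryless) multiply of one lane.
 *      static uint16_t clmul8(uint8_t a, uint8_t b)
 *      {
 *              uint16_t r = 0;
 *
 *              for (int i = 0; i < 8; i++)
 *                      if ((b >> i) & 1)
 *                              r ^= (uint16_t)a << i;
 *              return r;
 *      }
 *
 *      // 16 x 64 -> 80 bit multiply by rank: each partial product of
 *      // rank r = i + j is XOR-accumulated at byte offset r, with its
 *      // high byte carrying into offset r + 1.
 *      static void pmull16x64_model(const uint8_t w[2], const uint8_t x[8],
 *                                   uint8_t out[10])
 *      {
 *              memset(out, 0, 10);
 *              for (int i = 0; i < 2; i++)
 *                      for (int j = 0; j < 8; j++) {
 *                              uint16_t p = clmul8(w[i], x[j]);
 *
 *                              out[i + j] ^= p & 0xff;
 *                              out[i + j + 1] ^= p >> 8;
 *                      }
 *      }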
        vld1.64         {q12}, [r4, :128]
        vtbl.8          d24, {\v16\()_L-\v16\()_H}, d24
        vtbl.8          d25, {\v16\()_L-\v16\()_H}, d25

        vmov.i32        d25, #0
        vmov.i32        d29, #0

        vld1.64         {q8-q9}, [buf]!
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.

        // Load the first 128 data bytes. Byte swapping is necessary to make
        // the bit order match the polynomial coefficient order.
        vld1.64         {q0-q1}, [buf]!
        vld1.64         {q2-q3}, [buf]!
        vld1.64         {q4-q5}, [buf]!
        vld1.64         {q6-q7}, [buf]!
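        // (CRC-T10DIF is a non-reflected, MSB-first CRC, so the message is
        // read as a polynomial whose earliest bytes are its highest-degree
        // coefficients; roughly speaking, after the swap the first byte of
        // a 16-byte block supplies the x^127...x^120 coefficients of that
        // block's polynomial.)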
        vmov.i8         q8h, #0

        // Load the constants for folding across 128 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Subtract 128 for the 128 data bytes just consumed. Subtract another
        // 128 to simplify the termination condition of the following loop.

        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
        subs            len, len, #128
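        // A sketch in C of the congruence one trip around this loop
        // exploits (clmul64() models vmull.p64 bit by bit; u128, fold128()
        // and k_lo/k_hi are illustrative names, not symbols from this
        // file). Each 128-bit word V in q0-q7 sits 128 bytes, i.e. 1024
        // bits, ahead of the data word D it is folded into, so its
        // contribution is V * x^1024; the 'x^(8*128) mod G' and
        // 'x^(8*128+64) mod G' constants keep both products within 80 bits:
        //
        //      static void clmul64(uint64_t a, uint64_t b,
        //                          uint64_t *lo, uint64_t *hi)
        //      {
        //              *lo = *hi = 0;
        //              for (int i = 0; i < 64; i++)
        //                      if ((b >> i) & 1) {
        //                              *lo ^= a << i;
        //                              if (i)
        //                                      *hi ^= a >> (64 - i);
        //                      }
        //      }
        //
        //      typedef struct { uint64_t lo, hi; } u128;
        //
        //      static u128 fold128(u128 v, u128 d,
        //                          uint64_t k_lo,  /* x^(8*128) mod G    */
        //                          uint64_t k_hi)  /* x^(8*128+64) mod G */
        //      {
        //              uint64_t alo, ahi, blo, bhi;
        //
        //              clmul64(v.lo, k_lo, &alo, &ahi);
        //              clmul64(v.hi, k_hi, &blo, &bhi);
        //              return (u128){ alo ^ blo ^ d.lo, ahi ^ bhi ^ d.hi };
        //      }
        //
        // The double subtraction above makes the loop's exit test cheap:
        // e.g. for len = 384, len is 128 on loop entry; the loop runs with
        // len = 128 and len = 0 and exits at len = -128, having folded the
        // remaining 256 bytes.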
        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
        adds            len, len, #(128-16)
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
        vtbl.8          q1l, {q7l-q7h}, q2l
        vtbl.8          q1h, {q7l-q7h}, q2h

        // q3 = first chunk: q7 right-shifted by '16-len' bytes.
        vmov.i8         q3, #0x80
        vtbl.8          q3l, {q7l-q7h}, q2l
        vtbl.8          q3h, {q7l-q7h}, q2h

        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.

        // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
        // then '16-len' bytes from q1 (high-order bytes).
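        // A byte-level C model of the vtbl shifts used here (a sketch;
        // vtbl16() and its parameter names are illustrative). vtbl.8
        // yields 0x00 for any out-of-range index, which is exactly what
        // makes the 0x8x entries of byteshift_table act as zero-fill:
        //
        //      static void vtbl16(uint8_t out[16], const uint8_t tbl[16],
        //                         const uint8_t idx[16])
        //      {
        //              for (int i = 0; i < 16; i++)
        //                      out[i] = idx[i] < 16 ? tbl[idx[i]] : 0;
        //      }
        //
        // With idx = &byteshift_table[16 - len], this shifts q7 left by
        // 'len' bytes; with each index XOR'ed with 0x80 (the vmov.i8 q3,
        // #0x80 above), the same lookup shifts q7 right by '16-len' bytes.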
        vmov.i8         q0h, #0

        // Load the fold-across-16-bytes constants.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Fold the high 64 bits into the low 64 bits, while also multiplying by
        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
        // whose low 48 bits are 0.

        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
        vmov.i8         q1, #0

        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]
        vmov.u16        r0, q0l[0]
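        // The CRC extracted into r0 above comes out of a Barrett reduction
        // driven by the G(x) and floor(x^48 / G(x)) constants just loaded.
        // A C model of the principle (a sketch reusing the clmul64() model
        // from the folding-loop comment; barrett16() is an illustrative
        // name): for a remainder R of degree < 48, the quotient
        // q = (R * floor(x^48/G(x))) / x^48 equals R / G(x) exactly, so
        // R ^ q*G(x) is R mod G(x), the 16-bit CRC.
        //
        //      static uint16_t barrett16(uint64_t r)
        //      {
        //              uint64_t lo, hi, q;
        //
        //              clmul64(r, 0x1f65a57f8, &lo, &hi); /* R * floor(x^48/G) */
        //              q = (lo >> 48) | (hi << 16);       /* / x^48 */
        //              clmul64(q, 0x18bb7, &lo, &hi);     /* q * G(x) */
        //              return (uint16_t)(r ^ lo);         /* R mod G(x) */
        //      }
        //
        // The instruction sequence in this file interleaves the same idea
        // with the x^48-scaled fold constants rather than computing it this
        // literally.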
        vst1.64         {q7}, [r3, :128]
// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
        .quad           0x0000000000006123      // x^(8*128) mod G(x)
        .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
        .quad           0x0000000000001069      // x^(4*128) mod G(x)
        .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
        .quad           0x000000000000857d      // x^(2*128) mod G(x)
        .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
        .quad           0x000000000000a010      // x^(1*128) mod G(x)
        .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
        .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
        .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
        .quad           0x0000000000018bb7      // G(x)
        .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
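// Any 'x^N mod G(x)' entry above can be reproduced with a few lines of C
// (a sketch; xpow_mod_g() is an illustrative name, with the polynomial
// kept MSB-first to match the byte-swapped data):
//
//      static uint32_t xpow_mod_g(unsigned int n)
//      {
//              uint32_t r = 1;                 /* the polynomial x^0 */
//
//              while (n--) {
//                      r <<= 1;                /* multiply by x */
//                      if (r & 0x10000)        /* degree reached 16: reduce */
//                              r ^= 0x18bb7;
//              }
//              return r;
//      }
//
// e.g. xpow_mod_g(8*128) should give the 0x6123 above, and xpow_mod_g(48)
// the 0x1368 that appears scaled by x^48 in the reduction constants.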
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
        .byte           0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte           0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
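// Worked example for len = 3: the window starts at &byteshift_table[13],
// i.e. {0x8d, 0x8e, 0x8f, 0x0, 0x1, ..., 0xc}. As vtbl indices, the three
// 0x8x entries select nothing (yielding 0x00) and the rest pick bytes
// 0...12: a left shift by 3 bytes. XOR'ing every index with 0x80 gives
// {0xd, 0xe, 0xf, 0x8d, ...}, which picks bytes 13...15 followed by
// zeroes: a right shift by 16 - 3 = 13 bytes.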
        .quad           0x0808080800000000, 0x0909090901010101