Lines Matching +full:16 +full:- +full:byte
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
4 * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
14 * Version licensed under 2-clause BSD License is available at:
15 * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
31 16-way camellia
44 * x0..x7: byte-sliced AB state
48 * x0..x7: new byte-sliced CD state
53 * S-function with AES subbytes \
132 * P-function \
159 vpxor 0 * 16(mem_cd), x4, x4; \
162 vpxor 1 * 16(mem_cd), x5, x5; \
169 vpxor 2 * 16(mem_cd), x6, x6; \
172 vpxor 3 * 16(mem_cd), x7, x7; \
175 vpxor 4 * 16(mem_cd), x0, x0; \
178 vpxor 5 * 16(mem_cd), x1, x1; \
181 vpxor 6 * 16(mem_cd), x2, x2; \
184 vpxor 7 * 16(mem_cd), x3, x3;
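The round macro above works on a byte-sliced half-state: each register x0..x7 holds one byte position of the 64-bit AB half across all 16 blocks. It computes the Camellia F-function on that half (key XOR, then the "S-function with AES subbytes" via affine pre/post filters around vaesenclast, then the P-function's byte-wise XOR diffusion), and the vpxor N * 16(mem_cd) lines fold the result into the other Feistel half kept in memory. A minimal C sketch of that last step on a byte-sliced state, with illustrative names only:

#include <stdint.h>

/* slice[i][j] = byte i of block j's 64-bit half, 16 blocks in parallel
 * (hypothetical layout for illustration). */
typedef uint8_t sliced_half[8][16];

/* Feistel update: XOR the F-function output computed from AB into CD,
 * which is what the vpxor N*16(mem_cd) lines do one 16-byte slice at a time. */
static void feistel_xor(sliced_half cd, const sliced_half f_out)
{
	for (int i = 0; i < 8; i++)
		for (int j = 0; j < 16; j++)
			cd[i][j] ^= f_out[i][j];
}

With AVX, each inner row of this loop is a single vpxor against the slice stored at mem_cd.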
188 * larger and would only be 0.5% faster (on sandy-bridge).
208 * x0..x7: byte-sliced AB state preloaded
209 * mem_ab: byte-sliced AB state in memory
210 * mem_cd: byte-sliced CD state in memory
217 vmovdqu x4, 0 * 16(mem_cd); \
218 vmovdqu x5, 1 * 16(mem_cd); \
219 vmovdqu x6, 2 * 16(mem_cd); \
220 vmovdqu x7, 3 * 16(mem_cd); \
221 vmovdqu x0, 4 * 16(mem_cd); \
222 vmovdqu x1, 5 * 16(mem_cd); \
223 vmovdqu x2, 6 * 16(mem_cd); \
224 vmovdqu x3, 7 * 16(mem_cd); \
235 vmovdqu x0, 0 * 16(mem_ab); \
236 vmovdqu x1, 1 * 16(mem_ab); \
237 vmovdqu x2, 2 * 16(mem_ab); \
238 vmovdqu x3, 3 * 16(mem_ab); \
239 vmovdqu x4, 4 * 16(mem_ab); \
240 vmovdqu x5, 5 * 16(mem_ab); \
241 vmovdqu x6, 6 * 16(mem_ab); \
242 vmovdqu x7, 7 * 16(mem_ab);
256 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
258 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
260 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
264 * v0..3: byte-sliced 32-bit integers
293 * r: byte-sliced AB state in memory
294 * l: byte-sliced CD state in memory
296 * x0..x7: new byte-sliced CD state
323 vmovdqu l4, 4 * 16(l); \
325 vmovdqu l5, 5 * 16(l); \
327 vmovdqu l6, 6 * 16(l); \
329 vmovdqu l7, 7 * 16(l); \
346 vpor 4 * 16(r), t0, t0; \
347 vpor 5 * 16(r), t1, t1; \
348 vpor 6 * 16(r), t2, t2; \
349 vpor 7 * 16(r), t3, t3; \
351 vpxor 0 * 16(r), t0, t0; \
352 vpxor 1 * 16(r), t1, t1; \
353 vpxor 2 * 16(r), t2, t2; \
354 vpxor 3 * 16(r), t3, t3; \
355 vmovdqu t0, 0 * 16(r); \
356 vmovdqu t1, 1 * 16(r); \
357 vmovdqu t2, 2 * 16(r); \
358 vmovdqu t3, 3 * 16(r); \
374 vpand 0 * 16(r), t0, t0; \
375 vpand 1 * 16(r), t1, t1; \
376 vpand 2 * 16(r), t2, t2; \
377 vpand 3 * 16(r), t3, t3; \
381 vpxor 4 * 16(r), t0, t0; \
382 vpxor 5 * 16(r), t1, t1; \
383 vpxor 6 * 16(r), t2, t2; \
384 vpxor 7 * 16(r), t3, t3; \
385 vmovdqu t0, 4 * 16(r); \
386 vmovdqu t1, 5 * 16(r); \
387 vmovdqu t2, 6 * 16(r); \
388 vmovdqu t3, 7 * 16(r); \
411 vmovdqu l0, 0 * 16(l); \
413 vmovdqu l1, 1 * 16(l); \
415 vmovdqu l2, 2 * 16(l); \
417 vmovdqu l3, 3 * 16(l);
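The rol32-1 and fls macros implement Camellia's FL and FL⁻¹ layers on the byte-sliced state; per 32-bit lane they need only AND, OR, XOR and a rotate-left-by-one, and the rotate is the part that forces carries to be propagated between byte slices ("byte-sliced 32-bit integers" above). For reference, a scalar sketch of the functions these vector operations implement, following the RFC 3713 definition on one 64-bit half:

#include <stdint.h>

static inline uint32_t rol32_sketch(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* FL: split the data half and subkey ke into upper/lower 32-bit words. */
static uint64_t camellia_fl(uint64_t x, uint64_t ke)
{
	uint32_t xl = x >> 32, xr = (uint32_t)x;
	uint32_t kl = ke >> 32, kr = (uint32_t)ke;

	xr ^= rol32_sketch(xl & kl, 1);
	xl ^= (xr | kr);
	return ((uint64_t)xl << 32) | xr;
}

/* FL^-1: the same operations applied in the opposite order. */
static uint64_t camellia_fl_inv(uint64_t y, uint64_t ke)
{
	uint32_t yl = y >> 32, yr = (uint32_t)y;
	uint32_t kl = ke >> 32, kr = (uint32_t)ke;

	yl ^= (yr | kr);
	yr ^= rol32_sketch(yl & kl, 1);
	return ((uint64_t)yl << 32) | yr;
}

The vpand/vpor lines in the listing are these AND/OR steps applied to 16 blocks at once, with the l and r halves held partly in registers and partly in memory.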
481 /* load blocks to registers and apply pre-whitening */
487 vpxor 0 * 16(rio), x0, y7; \
488 vpxor 1 * 16(rio), x0, y6; \
489 vpxor 2 * 16(rio), x0, y5; \
490 vpxor 3 * 16(rio), x0, y4; \
491 vpxor 4 * 16(rio), x0, y3; \
492 vpxor 5 * 16(rio), x0, y2; \
493 vpxor 6 * 16(rio), x0, y1; \
494 vpxor 7 * 16(rio), x0, y0; \
495 vpxor 8 * 16(rio), x0, x7; \
496 vpxor 9 * 16(rio), x0, x6; \
497 vpxor 10 * 16(rio), x0, x5; \
498 vpxor 11 * 16(rio), x0, x4; \
499 vpxor 12 * 16(rio), x0, x3; \
500 vpxor 13 * 16(rio), x0, x2; \
501 vpxor 14 * 16(rio), x0, x1; \
502 vpxor 15 * 16(rio), x0, x0;
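The pre-whitening macro broadcasts the input whitening subkey into x0 and XORs it into every one of the 16 input blocks while they are loaded from rio. A simplified sketch of the idea, ignoring byte order and the 64-bit vmovq/vpshufb broadcast the macro actually performs:

#include <stdint.h>
#include <stddef.h>

/* XOR the same whitening value into each of the 16 input blocks
 * (illustrative names; the asm does this with one broadcast and 16 vpxors). */
static void prewhiten16(uint8_t blocks[16][16], const uint8_t wkey[16])
{
	for (size_t b = 0; b < 16; b++)
		for (size_t i = 0; i < 16; i++)
			blocks[b][i] ^= wkey[i];
}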
504 /* byteslice pre-whitened blocks and store to temporary memory */
510 vmovdqu x0, 0 * 16(mem_ab); \
511 vmovdqu x1, 1 * 16(mem_ab); \
512 vmovdqu x2, 2 * 16(mem_ab); \
513 vmovdqu x3, 3 * 16(mem_ab); \
514 vmovdqu x4, 4 * 16(mem_ab); \
515 vmovdqu x5, 5 * 16(mem_ab); \
516 vmovdqu x6, 6 * 16(mem_ab); \
517 vmovdqu x7, 7 * 16(mem_ab); \
518 vmovdqu y0, 0 * 16(mem_cd); \
519 vmovdqu y1, 1 * 16(mem_cd); \
520 vmovdqu y2, 2 * 16(mem_cd); \
521 vmovdqu y3, 3 * 16(mem_cd); \
522 vmovdqu y4, 4 * 16(mem_cd); \
523 vmovdqu y5, 5 * 16(mem_cd); \
524 vmovdqu y6, 6 * 16(mem_cd); \
525 vmovdqu y7, 7 * 16(mem_cd);
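Byte-slicing rearranges the 16 pre-whitened blocks so that slot i holds byte i of every block, letting one SIMD instruction process the same byte position of all 16 blocks at once. Conceptually it is a 16x16 byte transpose (the asm builds it from vpunpck/vpshufb steps); a sketch with illustrative names:

#include <stdint.h>
#include <stddef.h>

/* Byte-slice 16 blocks of 16 bytes: sliced[i][b] = blocks[b][i]. */
static void byteslice_16x16(uint8_t sliced[16][16], const uint8_t blocks[16][16])
{
	for (size_t i = 0; i < 16; i++)
		for (size_t b = 0; b < 16; b++)
			sliced[i][b] = blocks[b][i];
}

Since the transpose is its own inverse, the de-byteslice/post-whitening path below is the same permutation applied again, followed by the output whitening XOR and the stores to rio.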
527 /* de-byteslice, apply post-whitening and store blocks */
557 vmovdqu x0, 0 * 16(rio); \
558 vmovdqu x1, 1 * 16(rio); \
559 vmovdqu x2, 2 * 16(rio); \
560 vmovdqu x3, 3 * 16(rio); \
561 vmovdqu x4, 4 * 16(rio); \
562 vmovdqu x5, 5 * 16(rio); \
563 vmovdqu x6, 6 * 16(rio); \
564 vmovdqu x7, 7 * 16(rio); \
565 vmovdqu y0, 8 * 16(rio); \
566 vmovdqu y1, 9 * 16(rio); \
567 vmovdqu y2, 10 * 16(rio); \
568 vmovdqu y3, 11 * 16(rio); \
569 vmovdqu y4, 12 * 16(rio); \
570 vmovdqu y5, 13 * 16(rio); \
571 vmovdqu y6, 14 * 16(rio); \
572 vmovdqu y7, 15 * 16(rio);
575 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
576 .section .rodata.cst16, "aM", @progbits, 16
577 .align 16
583 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
592 * pre-SubByte transform
594 * pre-lookup for sbox1, sbox2, sbox3:
606 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
607 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
609 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
610 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
613 * pre-SubByte transform
615 * pre-lookup for sbox4:
627 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
628 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
630 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
631 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
634 * post-SubByte transform
636 * post-lookup for sbox1, sbox4:
650 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
651 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
653 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
654 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
657 * post-SubByte transform
659 * post-lookup for sbox2:
673 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
674 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
676 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
677 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
680 * post-SubByte transform
682 * post-lookup for sbox3:
696 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
697 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
699 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
700 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
704 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
705 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
707 /* 4-bit mask */
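Each pre-/post-lookup above is stored as a pair of 16-entry tables, one indexed by the low nibble and one by the high nibble of each byte; with the 4-bit mask, two vpshufb lookups plus a vpxor evaluate the full 8-bit filter on 16 bytes at once. Splitting by nibbles is valid because these transforms are affine over GF(2), so the two partial lookups combine with XOR (the affine constant folded into one of the tables). Scalar sketch for a single byte:

#include <stdint.h>

/* Evaluate an 8-bit affine filter from a pair of 16-entry nibble tables:
 * f(x) = lo_table[x & 0x0f] ^ hi_table[x >> 4].  With SSE/AVX this is two
 * vpshufb lookups and a vpxor, using the 4-bit mask to split the nibbles. */
static inline uint8_t filter_8bit_sketch(uint8_t x,
					 const uint8_t lo_table[16],
					 const uint8_t hi_table[16])
{
	return lo_table[x & 0x0f] ^ hi_table[x >> 4];
}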
719 * %xmm0..%xmm15: 16 plaintext blocks
721 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
726 leaq 8 * 16(%rax), %rcx;
751 ((key_table + (16) * 8) + 0)(CTX),
752 ((key_table + (16) * 8) + 4)(CTX),
753 ((key_table + (16) * 8) + 8)(CTX),
754 ((key_table + (16) * 8) + 12)(CTX));
758 %xmm15, %rax, %rcx, 16);
761 cmpl $16, key_length(CTX);
766 vmovdqu 0 * 16(%rcx), %xmm8;
767 vmovdqu 1 * 16(%rcx), %xmm9;
768 vmovdqu 2 * 16(%rcx), %xmm10;
769 vmovdqu 3 * 16(%rcx), %xmm11;
770 vmovdqu 4 * 16(%rcx), %xmm12;
771 vmovdqu 5 * 16(%rcx), %xmm13;
772 vmovdqu 6 * 16(%rcx), %xmm14;
773 vmovdqu 7 * 16(%rcx), %xmm15;
777 %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
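The encryption core runs the schedule common to all key sizes first (6-round groups with FL/FL⁻¹ layers in between; the key_table + 16*8 + 0/4/8/12 operands fetch one FL/FL⁻¹ subkey pair as four 32-bit words), then checks key_length: 16-byte keys are done after 18 rounds, while 24/32-byte keys continue with one more FL/FL⁻¹ layer and six more rounds for 24 in total, which is why the output whitening key is addressed as key_table(CTX, %r8, 8). A hedged sketch of that decision, with an illustrative context layout:

#include <stdint.h>

/* Illustrative context layout (field names assumed, not the kernel's). */
struct camellia_ctx_sketch {
	uint64_t key_table[34];	/* enough subkey slots for the 24-round variant */
	uint32_t key_length;	/* 16, 24 or 32 bytes */
};

/* Camellia-128 uses 18 rounds (three 6-round groups); Camellia-192/256 use
 * 24 rounds (four groups), with an FL/FL^-1 layer between adjacent groups.
 * The cmpl $16, key_length(CTX) above makes exactly this decision. */
static inline int camellia_nrounds(const struct camellia_ctx_sketch *ctx)
{
	return ctx->key_length == 16 ? 18 : 24;
}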
805 * %r8d: 24 for 16 byte key, 32 for larger
806 * %xmm0..%xmm15: 16 encrypted blocks
808 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
813 leaq 8 * 16(%rax), %rcx;
825 %xmm15, %rax, %rcx, 16);
830 ((key_table + (16) * 8) + 8)(CTX),
831 ((key_table + (16) * 8) + 12)(CTX),
832 ((key_table + (16) * 8) + 0)(CTX),
833 ((key_table + (16) * 8) + 4)(CTX));
852 vmovdqu 0 * 16(%rcx), %xmm8;
853 vmovdqu 1 * 16(%rcx), %xmm9;
854 vmovdqu 2 * 16(%rcx), %xmm10;
855 vmovdqu 3 * 16(%rcx), %xmm11;
856 vmovdqu 4 * 16(%rcx), %xmm12;
857 vmovdqu 5 * 16(%rcx), %xmm13;
858 vmovdqu 6 * 16(%rcx), %xmm14;
859 vmovdqu 7 * 16(%rcx), %xmm15;
863 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
888 * %rsi: dst (16 blocks)
889 * %rdx: src (16 blocks)
913 * %rsi: dst (16 blocks)
914 * %rdx: src (16 blocks)
918 cmpl $16, key_length(CTX);
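Each of these entry points consumes exactly 16 blocks per call, so a caller is expected to walk the buffer in 16-block chunks and hand any shorter tail to a narrower fallback. A hypothetical glue-style loop, with the function type standing in for one of the 16-way routines (name and calling convention assumed for illustration):

#include <stddef.h>
#include <stdint.h>

#define CAMELLIA_BLOCK_SIZE	16

/* Stand-in for a 16-way routine such as the ECB helpers above. */
typedef void (*camellia_blk16_fn)(const void *ctx, uint8_t *dst,
				  const uint8_t *src);

/* Process as many full 16-block chunks as possible; the remainder would be
 * handled by a 2-way or single-block fallback path (not shown). */
static size_t camellia_bulk_16way(camellia_blk16_fn fn16, const void *ctx,
				  uint8_t *dst, const uint8_t *src,
				  size_t nbytes)
{
	const size_t chunk = 16 * CAMELLIA_BLOCK_SIZE;

	while (nbytes >= chunk) {
		fn16(ctx, dst, src);
		src += chunk;
		dst += chunk;
		nbytes -= chunk;
	}
	return nbytes;	/* bytes left over for the fallback */
}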
943 * %rsi: dst (16 blocks)
944 * %rdx: src (16 blocks)
948 cmpl $16, key_length(CTX);
958 * dst might still be in-use (in case dst == src), so use stack for
961 subq $(16 * 16), %rsp;
966 addq $(16 * 16), %rsp;
968 vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
969 vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
970 vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
971 vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
972 vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
973 vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
974 vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
975 vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
976 vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
977 vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
978 vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
979 vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
980 vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
981 vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
982 vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
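The CBC decryption path decrypts the 16 ciphertext blocks into a stack buffer first, because dst may alias src and the chaining step still needs the original ciphertext; it then XORs ciphertext block i-1 into decrypted block i (the vpxor (N * 16)(%rdx) lines), leaving the first block's IV XOR to the caller. A conceptual C sketch, with the function pointer standing in for the 16-way decryption core:

#include <stdint.h>
#include <string.h>

#define BLK 16

/* CBC chaining for one 16-block chunk: decrypt into a temporary (dst may
 * alias src), XOR the previous ciphertext block into each plaintext block,
 * then copy out.  Block 0 is XOR-ed with the IV by the caller, as in the
 * asm above. */
static void cbc_dec_chunk_sketch(void (*decrypt16)(const void *ctx,
						    uint8_t *out,
						    const uint8_t *in),
				 const void *ctx, uint8_t *dst,
				 const uint8_t *src)
{
	uint8_t tmp[16 * BLK];

	decrypt16(ctx, tmp, src);

	for (int i = 15; i >= 1; i--)
		for (int j = 0; j < BLK; j++)
			tmp[i * BLK + j] ^= src[(i - 1) * BLK + j];

	memcpy(dst, tmp, sizeof(tmp));	/* safe even when dst == src */
}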