Lines Matching "0 - 32"
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * x86_64/AVX2/AES-NI assembler implementation of Camellia
14 #define key_table 0
51 32-way camellia
56 * x0..x7: byte-sliced AB state
60 * x0..x7: new byte-sliced CD state
65 * S-function with AES subbytes \
147 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
159 /* P-function */ \
193 vpxor 5 * 32(mem_cd), x1, x1; \
200 vpxor 4 * 32(mem_cd), x0, x0; \
203 vpxor 6 * 32(mem_cd), x2, x2; \
206 vpxor 7 * 32(mem_cd), x3, x3; \
209 vpxor 0 * 32(mem_cd), x4, x4; \
212 vpxor 1 * 32(mem_cd), x5, x5; \
215 vpxor 2 * 32(mem_cd), x6, x6; \
218 vpxor 3 * 32(mem_cd), x7, x7;
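The run of vpxor instructions against mem_cd is the Feistel XOR of the round-function output into the other half of the state. The round function itself is the s-box layer (the "S-function with AES subbytes" above, done with AESENCLAST plus the pre/post filter tables further down) followed by the P-function. For reference, a minimal scalar sketch of the standard Camellia P-function (RFC 3713), which the byte-sliced vpxor pattern computes with each ymm register standing in for one byte position of 32 blocks; z[0] is the most significant byte of the 64-bit half:

#include <stdint.h>

static uint64_t camellia_p(uint64_t zz)
{
	uint8_t z[8], y[8];
	int i;

	for (i = 0; i < 8; i++)		/* z[0] = z1 = most significant byte */
		z[i] = zz >> (56 - 8 * i);

	y[0] = z[0] ^ z[2] ^ z[3] ^ z[5] ^ z[6] ^ z[7];
	y[1] = z[0] ^ z[1] ^ z[3] ^ z[4] ^ z[6] ^ z[7];
	y[2] = z[0] ^ z[1] ^ z[2] ^ z[4] ^ z[5] ^ z[7];
	y[3] = z[1] ^ z[2] ^ z[3] ^ z[4] ^ z[5] ^ z[6];
	y[4] = z[0] ^ z[1] ^ z[5] ^ z[6] ^ z[7];
	y[5] = z[1] ^ z[2] ^ z[4] ^ z[6] ^ z[7];
	y[6] = z[2] ^ z[3] ^ z[4] ^ z[5] ^ z[7];
	y[7] = z[0] ^ z[3] ^ z[4] ^ z[5] ^ z[6];

	for (i = 0, zz = 0; i < 8; i++)
		zz = (zz << 8) | y[i];
	return zz;
}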
240 * x0..x7: byte-sliced AB state preloaded
241 * mem_ab: byte-sliced AB state in memory
242 * mem_cd: byte-sliced CD state in memory
249 vmovdqu x0, 4 * 32(mem_cd); \
250 vmovdqu x1, 5 * 32(mem_cd); \
251 vmovdqu x2, 6 * 32(mem_cd); \
252 vmovdqu x3, 7 * 32(mem_cd); \
253 vmovdqu x4, 0 * 32(mem_cd); \
254 vmovdqu x5, 1 * 32(mem_cd); \
255 vmovdqu x6, 2 * 32(mem_cd); \
256 vmovdqu x7, 3 * 32(mem_cd); \
267 vmovdqu x4, 4 * 32(mem_ab); \
268 vmovdqu x5, 5 * 32(mem_ab); \
269 vmovdqu x6, 6 * 32(mem_ab); \
270 vmovdqu x7, 7 * 32(mem_ab); \
271 vmovdqu x0, 0 * 32(mem_ab); \
272 vmovdqu x1, 1 * 32(mem_ab); \
273 vmovdqu x2, 2 * 32(mem_ab); \
274 vmovdqu x3, 3 * 32(mem_ab);
288 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
290 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
292 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
296 * v0..3: byte-sliced 32-bit integers
325 * r: byte-sliced AB state in memory
326 * l: byte-sliced CD state in memory
328 * x0..x7: new byte-sliced CD state
337 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
355 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
356 vmovdqu l4, 4 * 32(l); \
358 vmovdqu l5, 5 * 32(l); \
360 vmovdqu l6, 6 * 32(l); \
362 vmovdqu l7, 7 * 32(l); \
378 vpor 4 * 32(r), t0, t0; \
379 vpor 5 * 32(r), t1, t1; \
380 vpor 6 * 32(r), t2, t2; \
381 vpor 7 * 32(r), t3, t3; \
383 vpxor 0 * 32(r), t0, t0; \
384 vpxor 1 * 32(r), t1, t1; \
385 vpxor 2 * 32(r), t2, t2; \
386 vpxor 3 * 32(r), t3, t3; \
387 vmovdqu t0, 0 * 32(r); \
388 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
389 vmovdqu t1, 1 * 32(r); \
390 vmovdqu t2, 2 * 32(r); \
391 vmovdqu t3, 3 * 32(r); \
406 vpand 0 * 32(r), t0, t0; \
407 vpand 1 * 32(r), t1, t1; \
408 vpand 2 * 32(r), t2, t2; \
409 vpand 3 * 32(r), t3, t3; \
413 vpxor 4 * 32(r), t0, t0; \
414 vpxor 5 * 32(r), t1, t1; \
415 vpxor 6 * 32(r), t2, t2; \
416 vpxor 7 * 32(r), t3, t3; \
417 vmovdqu t0, 4 * 32(r); \
418 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
419 vmovdqu t1, 5 * 32(r); \
420 vmovdqu t2, 6 * 32(r); \
421 vmovdqu t3, 7 * 32(r); \
443 vmovdqu l0, 0 * 32(l); \
445 vmovdqu l1, 1 * 32(l); \
447 vmovdqu l2, 2 * 32(l); \
449 vmovdqu l3, 3 * 32(l);
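The kll/klr and krl/krr broadcasts drive the byte-sliced form of Camellia's FL and FL⁻¹ functions, which are inserted between the 6-round groups. A scalar sketch of the two functions as defined in RFC 3713; each takes a 64-bit half of the state and a 64-bit subkey split into 32-bit words kl and kr, which is why only the lowest 32 bits of each broadcast are used:

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* FL: AND with kl, rotate left by one, XOR into the low word; then OR
 * with kr and XOR into the high word. */
static uint64_t camellia_fl(uint64_t x, uint32_t kl, uint32_t kr)
{
	uint32_t xl = x >> 32, xr = (uint32_t)x;

	xr ^= rol32(xl & kl, 1);
	xl ^= (xr | kr);
	return ((uint64_t)xl << 32) | xr;
}

/* FL^-1: the same two steps in the opposite order. */
static uint64_t camellia_flinv(uint64_t y, uint32_t kl, uint32_t kr)
{
	uint32_t yl = y >> 32, yr = (uint32_t)y;

	yl ^= (yr | kr);
	yr ^= rol32(yl & kl, 1);
	return ((uint64_t)yl << 32) | yr;
}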
513 /* load blocks to registers and apply pre-whitening */
519 vpxor 0 * 32(rio), x0, y7; \
520 vpxor 1 * 32(rio), x0, y6; \
521 vpxor 2 * 32(rio), x0, y5; \
522 vpxor 3 * 32(rio), x0, y4; \
523 vpxor 4 * 32(rio), x0, y3; \
524 vpxor 5 * 32(rio), x0, y2; \
525 vpxor 6 * 32(rio), x0, y1; \
526 vpxor 7 * 32(rio), x0, y0; \
527 vpxor 8 * 32(rio), x0, x7; \
528 vpxor 9 * 32(rio), x0, x6; \
529 vpxor 10 * 32(rio), x0, x5; \
530 vpxor 11 * 32(rio), x0, x4; \
531 vpxor 12 * 32(rio), x0, x3; \
532 vpxor 13 * 32(rio), x0, x2; \
533 vpxor 14 * 32(rio), x0, x1; \
534 vpxor 15 * 32(rio), x0, x0;
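Only the whitening XORs are matched here; earlier in the macro (not shown) x0 is filled with a 32-byte whitening pattern derived from the key_table entry passed in. Conceptually, the sixteen loads read 16 chunks of 32 bytes (32 blocks, two per ymm register) and fold the same pattern into each chunk, with the register assignment reversed: chunk 0 lands in y7, chunk 15 in x0. A sketch:

#include <stddef.h>
#include <stdint.h>

/* dst exists only to keep the sketch self-contained; the asm keeps the
 * whitened chunks in ymm registers for the byteslicing step below. */
static void prewhiten(uint8_t *dst, const uint8_t *src,
		      const uint8_t white[32], size_t nchunks)
{
	size_t i;
	int j;

	for (i = 0; i < nchunks; i++)
		for (j = 0; j < 32; j++)
			dst[i * 32 + j] = src[i * 32 + j] ^ white[j];
}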
536 /* byteslice pre-whitened blocks and store to temporary memory */
542 vmovdqu x0, 0 * 32(mem_ab); \
543 vmovdqu x1, 1 * 32(mem_ab); \
544 vmovdqu x2, 2 * 32(mem_ab); \
545 vmovdqu x3, 3 * 32(mem_ab); \
546 vmovdqu x4, 4 * 32(mem_ab); \
547 vmovdqu x5, 5 * 32(mem_ab); \
548 vmovdqu x6, 6 * 32(mem_ab); \
549 vmovdqu x7, 7 * 32(mem_ab); \
550 vmovdqu y0, 0 * 32(mem_cd); \
551 vmovdqu y1, 1 * 32(mem_cd); \
552 vmovdqu y2, 2 * 32(mem_cd); \
553 vmovdqu y3, 3 * 32(mem_cd); \
554 vmovdqu y4, 4 * 32(mem_cd); \
555 vmovdqu y5, 5 * 32(mem_cd); \
556 vmovdqu y6, 6 * 32(mem_cd); \
557 vmovdqu y7, 7 * 32(mem_cd);
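Between the whitening above and these stores, a 16x16 byte transpose (done elsewhere in the file with vpshufb/vpunpck shuffles built on the shufb_16x16b constant below) rearranges the data so that each register holds one byte position of all 32 blocks; x0..x7 then carry the AB half of every block into mem_ab and y0..y7 the CD half into mem_cd. A scalar sketch of the layout, ignoring the exact lane ordering the shuffles impose:

#include <stdint.h>

#define NBLOCKS 32

/* blocks[i][b] is byte b of block i; sliced[b][i] gathers byte position
 * b of every block, i.e. the 32 bytes one ymm register holds after
 * byteslicing. */
static void byteslice(uint8_t sliced[16][NBLOCKS],
		      const uint8_t blocks[NBLOCKS][16])
{
	int i, b;

	for (i = 0; i < NBLOCKS; i++)
		for (b = 0; b < 16; b++)
			sliced[b][i] = blocks[i][b];
}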
559 /* de-byteslice, apply post-whitening and store blocks */
589 vmovdqu x0, 0 * 32(rio); \
590 vmovdqu x1, 1 * 32(rio); \
591 vmovdqu x2, 2 * 32(rio); \
592 vmovdqu x3, 3 * 32(rio); \
593 vmovdqu x4, 4 * 32(rio); \
594 vmovdqu x5, 5 * 32(rio); \
595 vmovdqu x6, 6 * 32(rio); \
596 vmovdqu x7, 7 * 32(rio); \
597 vmovdqu y0, 8 * 32(rio); \
598 vmovdqu y1, 9 * 32(rio); \
599 vmovdqu y2, 10 * 32(rio); \
600 vmovdqu y3, 11 * 32(rio); \
601 vmovdqu y4, 12 * 32(rio); \
602 vmovdqu y5, 13 * 32(rio); \
603 vmovdqu y6, 14 * 32(rio); \
604 vmovdqu y7, 15 * 32(rio);
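This is the inverse of the byteslicing sketch above: the de-byteslice step XORs the output whitening into the byte-sliced registers, transposes back to contiguous blocks and stores them, with x0..x7 providing the first eight 32-byte chunks of output and y0..y7 the last eight. A sketch of the inverse layout change:

#include <stdint.h>

/* Gather byte position b of block i back into contiguous 16-byte
 * blocks; the whitening XOR happens in the registers beforehand. */
static void unbyteslice(uint8_t blocks[32][16], const uint8_t sliced[16][32])
{
	int i, b;

	for (i = 0; i < 32; i++)
		for (b = 0; b < 16; b++)
			blocks[i][b] = sliced[b][i];
}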
607 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
608 .align 32
610 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
612 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
613 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
615 .section .rodata.cst32.pack_bswap, "aM", @progbits, 32
616 .align 32
618 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
619 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
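Both constants are vpshufb control masks. shufb_16x16b, built from SHUFB_BYTES(idx) = {idx, 4+idx, 8+idx, 12+idx}, gathers byte idx of each 32-bit word, so the full lane pattern {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15} transposes the 4x4 byte matrix inside each 128-bit lane; it is a building block of the byteslicing transpose. pack_bswap byte-reverses the two low 32-bit words of a lane while its 0x80 entries zero the upper eight bytes (used when shaping a broadcast key_table entry into the whitening pattern). A scalar model of the per-lane vpshufb semantics:

#include <stdint.h>

/* Each mask byte selects a source byte by its low 4 bits; a set high
 * bit (0x80) zeroes that output byte.  ymm vpshufb does this
 * independently in each 128-bit lane. */
static void pshufb_lane(uint8_t out[16], const uint8_t in[16],
			const uint8_t mask[16])
{
	int i;

	for (i = 0; i < 16; i++)
		out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
}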
621 /* NB: section is mergeable, all elements must be 16-byte-aligned blocks */
626 * pre-SubByte transform
628 * pre-lookup for sbox1, sbox2, sbox3:
637 * (note: '⊕ 0xc5' inside camellia_f())
640 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
641 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
643 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
644 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
647 * pre-SubByte transform
649 * pre-lookup for sbox4:
658 * (note: '⊕ 0xc5' inside camellia_f())
661 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
662 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
664 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
665 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
668 * post-SubByte transform
670 * post-lookup for sbox1, sbox4:
681 * (note: '⊕ 0x6e' inside camellia_h())
684 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
685 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
687 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
688 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
691 * post-SubByte transform
693 * post-lookup for sbox2:
704 * (note: '⊕ 0x6e' inside camellia_h())
707 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
708 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
710 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
711 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
714 * post-SubByte transform
716 * post-lookup for sbox3:
727 * (note: '⊕ 0x6e' inside camellia_h())
730 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
731 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
733 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
734 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
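Each pre/post transform is stored as a pair of 16-byte tables (the two .byte lines per label: a low-nibble table whose first entry carries the affine constant, and a high-nibble table starting at 0x00). They are applied with the filter_8bit idiom used in this file: isolate the low nibble with the 4-bit mask below, shift to get the high nibble, do one vpshufb lookup per nibble and XOR the results. This works because the transforms are affine over GF(2), so they decompose across the two nibbles. A scalar sketch with illustrative table names (tf_lo/tf_hi stand for any one of the pairs above):

#include <stdint.h>

/* tf_lo[n] = transform of the low nibble n, plus the affine constant;
 * tf_hi[n] = transform of the high nibble n << 4. */
static uint8_t filter_8bit(uint8_t x,
			   const uint8_t tf_lo[16], const uint8_t tf_hi[16])
{
	return tf_lo[x & 0x0f] ^ tf_hi[x >> 4];
}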
738 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
739 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
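These 16 bytes are a vpshufb mask for the inverse of AES ShiftRows on the column-major AES state. The AES-NI trick behind the "S-function with AES subbytes" above is that AESENCLAST with an all-zero round key leaves only SubBytes and ShiftRows; shuffling the result with this mask cancels the ShiftRows, leaving a pure S-box lookup to wrap with the pre/post filters. A quick check that the permutation really inverts ShiftRows:

#include <assert.h>
#include <stdint.h>

/* AES ShiftRows as a byte gather on a column-major state:
 * out[i] = in[shift_rows[i]]. */
static const uint8_t shift_rows[16] = {
	0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};

/* The constant above. */
static const uint8_t inv_shift_row[16] = {
	0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
	0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
};

int main(void)
{
	int i;

	/* Gathering with shift_rows and then with inv_shift_row must be
	 * the identity permutation. */
	for (i = 0; i < 16; i++)
		assert(shift_rows[inv_shift_row[i]] == i);
	return 0;
}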
743 /* 4-bit mask */
745 .long 0x0f0f0f0f
753 * %ymm0..%ymm15: 32 plaintext blocks
755 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
756 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
760 leaq 8 * 32(%rax), %rcx;
768 %ymm15, %rax, %rcx, 0);
773 ((key_table + (8) * 8) + 0)(CTX),
785 ((key_table + (16) * 8) + 0)(CTX),
800 vmovdqu 0 * 32(%rcx), %ymm8;
801 vmovdqu 1 * 32(%rcx), %ymm9;
802 vmovdqu 2 * 32(%rcx), %ymm10;
803 vmovdqu 3 * 32(%rcx), %ymm11;
804 vmovdqu 4 * 32(%rcx), %ymm12;
805 vmovdqu 5 * 32(%rcx), %ymm13;
806 vmovdqu 6 * 32(%rcx), %ymm14;
807 vmovdqu 7 * 32(%rcx), %ymm15;
811 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
818 movl $32, %r8d;
823 ((key_table + (24) * 8) + 0)(CTX),
839 * %r8d: 24 for a 16-byte key, 32 for larger keys
843 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
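The %r8d convention (and the movl $32, %r8d seen in the encryption path) selects how far the key schedule extends: it is the index, in 8-byte key_table entries, of the final whitening key, matching the (key_table)(CTX, %r8, 8) operand at line 811. A hypothetical helper spelling out that dispatch (the name is illustrative, not from the kernel):

/* 24 for a 16-byte (128-bit) key, 32 for 24/32-byte keys; indexes the
 * key table in 8-byte units. */
static unsigned int camellia_key_table_end(unsigned int key_len_bytes)
{
	return key_len_bytes == 16 ? 24 : 32;
}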
847 leaq 8 * 32(%rax), %rcx;
853 cmpl $32, %r8d;
866 ((key_table + (16) * 8) + 0)(CTX),
878 ((key_table + (8) * 8) + 0)(CTX),
883 %ymm15, %rax, %rcx, 0);
886 vmovdqu 0 * 32(%rcx), %ymm8;
887 vmovdqu 1 * 32(%rcx), %ymm9;
888 vmovdqu 2 * 32(%rcx), %ymm10;
889 vmovdqu 3 * 32(%rcx), %ymm11;
890 vmovdqu 4 * 32(%rcx), %ymm12;
891 vmovdqu 5 * 32(%rcx), %ymm13;
892 vmovdqu 6 * 32(%rcx), %ymm14;
893 vmovdqu 7 * 32(%rcx), %ymm15;
897 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
913 ((key_table + (24) * 8) + 0)(CTX),
922 * %rsi: dst (32 blocks)
923 * %rdx: src (32 blocks)
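This entry point (and the ECB/CBC ones below) processes exactly 32 blocks per call. A hypothetical caller-side loop, just to show how a 32-way primitive is driven; the real kernel glue code additionally handles FPU context and falls back to narrower code paths for the tail:

#include <stddef.h>
#include <stdint.h>

typedef void (*blk32_fn)(const void *ctx, uint8_t *dst, const uint8_t *src);

static void ecb_bulk(const void *ctx, uint8_t *dst, const uint8_t *src,
		     size_t nbytes, blk32_fn crypt32)
{
	while (nbytes >= 32 * 16) {
		crypt32(ctx, dst, src);	/* one call = 32 blocks of 16 bytes */
		dst += 32 * 16;
		src += 32 * 16;
		nbytes -= 32 * 16;
	}
	/* any remaining blocks go to a narrower implementation */
}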
951 * %rsi: dst (32 blocks)
952 * %rdx: src (32 blocks)
959 movl $32, %r8d;
985 * %rsi: dst (32 blocks)
986 * %rdx: src (32 blocks)
989 subq $(16 * 32), %rsp;
994 movl $32, %r8d;
1011 * dst still in use (because dst == src), so use the stack for temporary storage
1023 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1024 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1025 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1026 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1027 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1028 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1029 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1030 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1031 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1032 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1033 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1034 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1035 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1036 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1037 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1044 addq $(16 * 32), %rsp;
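The vpxor run against (n * 32 + 16)(%rdx) is the CBC chaining step: after the cipher core has decrypted all 32 blocks (via the 16 * 32 bytes of stack allocated above when dst aliases src, per the comment at line 1011), each plaintext block is XORed with the previous ciphertext block, which sits 16 bytes earlier in src, hence the constant +16 in every displacement; the first block's XOR with the IV is not part of these matched lines. A scalar sketch of the same chaining:

#include <stdint.h>

/* P[i] = D(C[i]) ^ C[i-1], with P[0] = D(C[0]) ^ iv.  decrypted[] plays
 * the role of the register/stack contents after the cipher core; src
 * still holds the untouched ciphertext. */
static void cbc_chain(uint8_t *dst, const uint8_t *decrypted,
		      const uint8_t *src, const uint8_t iv[16],
		      unsigned int nblocks)
{
	unsigned int i;
	int j;

	for (i = 0; i < nblocks; i++) {
		const uint8_t *prev = i ? src + (i - 1) * 16 : iv;

		for (j = 0; j < 16; j++)
			dst[i * 16 + j] = decrypted[i * 16 + j] ^ prev[j];
	}
}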