1// Original source:
2//	http://www.zorinaq.com/papers/md5-amd64.html
3//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
4//
5// MD5 optimized for ppc64le using Go's assembler for
6// ppc64le, based on md5block_amd64.s implementation by
7// the Go authors.
8//
9// Author: Marc Bevand <bevand_m (at) epita.fr>
10// Licence: I hereby disclaim the copyright on this code and place it
11// in the public domain.
12
13//go:build (ppc64 || ppc64le) && !purego
14
15#include "textflag.h"
16
17// ENDIAN_MOVE generates the appropriate
18// 4 byte load for big or little endian.
19// The 4 bytes at ptr+off is loaded into dst.
20// The idx reg is only needed for big endian
21// and is clobbered when used.
22#ifdef GOARCH_ppc64le
23#define ENDIAN_MOVE(off, ptr, dst, idx) \
24	MOVWZ	off(ptr),dst
25#else
26#define ENDIAN_MOVE(off, ptr, dst, idx) \
27	MOVD	$off,idx; \
28	MOVWBR	(idx)(ptr), dst
29#endif
30
31#define M00 R18
32#define M01 R19
33#define M02 R20
34#define M03 R24
35#define M04 R25
36#define M05 R26
37#define M06 R27
38#define M07 R28
39#define M08 R29
40#define M09 R21
41#define M10 R11
42#define M11 R8
43#define M12 R7
44#define M13 R12
45#define M14 R23
46#define M15 R10
47
48#define ROUND1(a, b, c, d, index, const, shift) \
49	ADD	$const, index, R9; \
50	ADD	R9, a; \
51	AND     b, c, R9; \
52	ANDN    b, d, R31; \
53	OR	R9, R31, R9; \
54	ADD	R9, a; \
55	ROTLW	$shift, a; \
56	ADD	b, a;
57
58#define ROUND2(a, b, c, d, index, const, shift) \
59	ADD	$const, index, R9; \
60	ADD	R9, a; \
61	AND	b, d, R31; \
62	ANDN	d, c, R9; \
63	OR	R9, R31; \
64	ADD	R31, a; \
65	ROTLW	$shift, a; \
66	ADD	b, a;
67
68#define ROUND3(a, b, c, d, index, const, shift) \
69	ADD	$const, index, R9; \
70	ADD	R9, a; \
71	XOR	d, c, R31; \
72	XOR	b, R31; \
73	ADD	R31, a; \
74	ROTLW	$shift, a; \
75	ADD	b, a;
76
77#define ROUND4(a, b, c, d, index, const, shift) \
78	ADD	$const, index, R9; \
79	ADD	R9, a; \
80	ORN     d, b, R31; \
81	XOR	c, R31; \
82	ADD	R31, a; \
83	ROTLW	$shift, a; \
84	ADD	b, a;
85
86
87TEXT ·block(SB),NOSPLIT,$0-32
88	MOVD	dig+0(FP), R10
89	MOVD	p+8(FP), R6
90	MOVD	p_len+16(FP), R5
91
92	// We assume p_len >= 64
93	SRD 	$6, R5
94	MOVD	R5, CTR
95
96	MOVWZ	0(R10), R22
97	MOVWZ	4(R10), R3
98	MOVWZ	8(R10), R4
99	MOVWZ	12(R10), R5
100
101loop:
102	MOVD	R22, R14
103	MOVD	R3, R15
104	MOVD	R4, R16
105	MOVD	R5, R17
106
107	ENDIAN_MOVE( 0,R6,M00,M15)
108	ENDIAN_MOVE( 4,R6,M01,M15)
109	ENDIAN_MOVE( 8,R6,M02,M15)
110	ENDIAN_MOVE(12,R6,M03,M15)
111
112	ROUND1(R22,R3,R4,R5,M00,0xd76aa478, 7);
113	ROUND1(R5,R22,R3,R4,M01,0xe8c7b756,12);
114	ROUND1(R4,R5,R22,R3,M02,0x242070db,17);
115	ROUND1(R3,R4,R5,R22,M03,0xc1bdceee,22);
116
117	ENDIAN_MOVE(16,R6,M04,M15)
118	ENDIAN_MOVE(20,R6,M05,M15)
119	ENDIAN_MOVE(24,R6,M06,M15)
120	ENDIAN_MOVE(28,R6,M07,M15)
121
122	ROUND1(R22,R3,R4,R5,M04,0xf57c0faf, 7);
123	ROUND1(R5,R22,R3,R4,M05,0x4787c62a,12);
124	ROUND1(R4,R5,R22,R3,M06,0xa8304613,17);
125	ROUND1(R3,R4,R5,R22,M07,0xfd469501,22);
126
127	ENDIAN_MOVE(32,R6,M08,M15)
128	ENDIAN_MOVE(36,R6,M09,M15)
129	ENDIAN_MOVE(40,R6,M10,M15)
130	ENDIAN_MOVE(44,R6,M11,M15)
131
132	ROUND1(R22,R3,R4,R5,M08,0x698098d8, 7);
133	ROUND1(R5,R22,R3,R4,M09,0x8b44f7af,12);
134	ROUND1(R4,R5,R22,R3,M10,0xffff5bb1,17);
135	ROUND1(R3,R4,R5,R22,M11,0x895cd7be,22);
136
137	ENDIAN_MOVE(48,R6,M12,M15)
138	ENDIAN_MOVE(52,R6,M13,M15)
139	ENDIAN_MOVE(56,R6,M14,M15)
140	ENDIAN_MOVE(60,R6,M15,M15)
141
142	ROUND1(R22,R3,R4,R5,M12,0x6b901122, 7);
143	ROUND1(R5,R22,R3,R4,M13,0xfd987193,12);
144	ROUND1(R4,R5,R22,R3,M14,0xa679438e,17);
145	ROUND1(R3,R4,R5,R22,M15,0x49b40821,22);
146
147	ROUND2(R22,R3,R4,R5,M01,0xf61e2562, 5);
148	ROUND2(R5,R22,R3,R4,M06,0xc040b340, 9);
149	ROUND2(R4,R5,R22,R3,M11,0x265e5a51,14);
150	ROUND2(R3,R4,R5,R22,M00,0xe9b6c7aa,20);
151	ROUND2(R22,R3,R4,R5,M05,0xd62f105d, 5);
152	ROUND2(R5,R22,R3,R4,M10, 0x2441453, 9);
153	ROUND2(R4,R5,R22,R3,M15,0xd8a1e681,14);
154	ROUND2(R3,R4,R5,R22,M04,0xe7d3fbc8,20);
155	ROUND2(R22,R3,R4,R5,M09,0x21e1cde6, 5);
156	ROUND2(R5,R22,R3,R4,M14,0xc33707d6, 9);
157	ROUND2(R4,R5,R22,R3,M03,0xf4d50d87,14);
158	ROUND2(R3,R4,R5,R22,M08,0x455a14ed,20);
159	ROUND2(R22,R3,R4,R5,M13,0xa9e3e905, 5);
160	ROUND2(R5,R22,R3,R4,M02,0xfcefa3f8, 9);
161	ROUND2(R4,R5,R22,R3,M07,0x676f02d9,14);
162	ROUND2(R3,R4,R5,R22,M12,0x8d2a4c8a,20);
163
164	ROUND3(R22,R3,R4,R5,M05,0xfffa3942, 4);
165	ROUND3(R5,R22,R3,R4,M08,0x8771f681,11);
166	ROUND3(R4,R5,R22,R3,M11,0x6d9d6122,16);
167	ROUND3(R3,R4,R5,R22,M14,0xfde5380c,23);
168	ROUND3(R22,R3,R4,R5,M01,0xa4beea44, 4);
169	ROUND3(R5,R22,R3,R4,M04,0x4bdecfa9,11);
170	ROUND3(R4,R5,R22,R3,M07,0xf6bb4b60,16);
171	ROUND3(R3,R4,R5,R22,M10,0xbebfbc70,23);
172	ROUND3(R22,R3,R4,R5,M13,0x289b7ec6, 4);
173	ROUND3(R5,R22,R3,R4,M00,0xeaa127fa,11);
174	ROUND3(R4,R5,R22,R3,M03,0xd4ef3085,16);
175	ROUND3(R3,R4,R5,R22,M06, 0x4881d05,23);
176	ROUND3(R22,R3,R4,R5,M09,0xd9d4d039, 4);
177	ROUND3(R5,R22,R3,R4,M12,0xe6db99e5,11);
178	ROUND3(R4,R5,R22,R3,M15,0x1fa27cf8,16);
179	ROUND3(R3,R4,R5,R22,M02,0xc4ac5665,23);
180
181	ROUND4(R22,R3,R4,R5,M00,0xf4292244, 6);
182	ROUND4(R5,R22,R3,R4,M07,0x432aff97,10);
183	ROUND4(R4,R5,R22,R3,M14,0xab9423a7,15);
184	ROUND4(R3,R4,R5,R22,M05,0xfc93a039,21);
185	ROUND4(R22,R3,R4,R5,M12,0x655b59c3, 6);
186	ROUND4(R5,R22,R3,R4,M03,0x8f0ccc92,10);
187	ROUND4(R4,R5,R22,R3,M10,0xffeff47d,15);
188	ROUND4(R3,R4,R5,R22,M01,0x85845dd1,21);
189	ROUND4(R22,R3,R4,R5,M08,0x6fa87e4f, 6);
190	ROUND4(R5,R22,R3,R4,M15,0xfe2ce6e0,10);
191	ROUND4(R4,R5,R22,R3,M06,0xa3014314,15);
192	ROUND4(R3,R4,R5,R22,M13,0x4e0811a1,21);
193	ROUND4(R22,R3,R4,R5,M04,0xf7537e82, 6);
194	ROUND4(R5,R22,R3,R4,M11,0xbd3af235,10);
195	ROUND4(R4,R5,R22,R3,M02,0x2ad7d2bb,15);
196	ROUND4(R3,R4,R5,R22,M09,0xeb86d391,21);
197
198	ADD	R14, R22
199	ADD	R15, R3
200	ADD	R16, R4
201	ADD	R17, R5
202	ADD	$64, R6
203	BDNZ	loop
204
205end:
206	MOVD	dig+0(FP), R10
207	MOVWZ	R22, 0(R10)
208	MOVWZ	R3, 4(R10)
209	MOVWZ	R4, 8(R10)
210	MOVWZ	R5, 12(R10)
211
212	RET
213