// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// General-purpose register assignments used by ·block.
#define CTX	R3 // dig argument; the first 32 bytes it points to are loaded as a..h
#define INP	R4 // p_base argument; advanced by 0x40 per processed block
#define END	R5 // INP + LEN, i.e. one past the last complete 64-byte block
#define TBL	R6 // Pointer into kcon table
#define LEN	R9 // p_len argument, rounded down to a multiple of 64
#define TEMP	R12 // scratch (LVSL address, loop-count staging for CTR)

#define TBL_STRT	R7 // Pointer to start of kcon table.

// Index registers for indexed vector loads/stores: R_xNNN is loaded with the
// byte offset 0xNNN at the start of ·block and then left unchanged.
// R_x000 is R0, which this file never writes; the code relies on it supplying
// a zero offset (NOTE(review): Go's ppc64 toolchain reserves R0 as the zero
// register - confirm if this file is ever built elsewhere).
#define R_x000	R0
#define R_x010	R8
#define R_x020	R10
#define R_x030	R11
#define R_x040	R14
#define R_x050	R15
#define R_x060	R16
#define R_x070	R17
#define R_x080	R18
#define R_x090	R19
#define R_x0a0	R20
#define R_x0b0	R21
#define R_x0c0	R22
#define R_x0d0	R23
#define R_x0e0	R24
#define R_x0f0	R25
#define R_x100	R26
#define R_x110	R27


// V0-V7 are A-H
// V8-V23 are used for the message schedule
#define KI	V24 // round constant most recently loaded from kcon
#define FUNC	V25 // scratch: Ch(e,f,g), then Maj(a,b,c)
#define S0	V26 // BIGSIGMA0(a), then BIGSIGMA0(a) + Maj(a,b,c)
#define S1	V27 // BIGSIGMA1(e)
#define s0	V28 // SIGMA0 term of the message schedule
#define s1	V29 // SIGMA1 term of the message schedule
#define LEMASK	V31 // Permutation control register for little endian

// ·kcon is the read-only constant table used by ·block. Layout (byte offsets):
//
//   0x000-0x3ff  the 64 SHA-256 round constants Kt, 16 bytes per constant
//   0x400-0x40f  an all-zero entry: each round macro adds the constant of the
//                following round in advance, so the final round adds this
//                entry, which changes nothing
//   0x410-0x43f  three permutation control vectors, used after the last block
//                to repack a..d and e..h into two vectors before the state is
//                stored; one set for each byte order
//
// 4 copies of each Kt, to fill all 4 words of a vector register
DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
DATA  ·kcon+0x010(SB)/8, $0x7137449171374491
DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1
DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01
DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be
DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da
DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152
DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d
DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967
DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13
DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354
DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85
DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08
DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c
DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
DATA  ·kcon+0x400(SB)/8, $0x0000000000000000
DATA  ·kcon+0x408(SB)/8, $0x0000000000000000

#ifdef GOARCH_ppc64le
DATA  ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
DATA  ·kcon+0x418(SB)/8, $0x1011121300010203
DATA  ·kcon+0x420(SB)/8, $0x1011121310111213
DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b
DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
#else
DATA  ·kcon+0x410(SB)/8, $0x1011121300010203
DATA  ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
DATA  ·kcon+0x420(SB)/8, $0x0405060700010203
DATA  ·kcon+0x428(SB)/8, $0x1011121310111213
DATA  ·kcon+0x430(SB)/8, $0x0001020304050607
DATA  ·kcon+0x438(SB)/8, $0x08090a0b10111213
#endif

// 1088 = 0x440 bytes: 64 constants + zero entry + 3 control vectors, 16 bytes each.
GLOBL ·kcon(SB), RODATA, $1088

// SHA256ROUND0 performs one SHA-256 round without touching the message
// schedule. All arithmetic is word-wise (VADDUWM) on vector registers.
//
//   a..h  vector registers currently holding the working variables
//   xi    message word Wt for this round
//   idx   index register; (TBL)(idx) is the constant loaded into KI at the end
//
// On entry h must already include this round's Kt. The macro computes
//
//   h += xi + Ch(e,f,g) + BIGSIGMA1(e)      (T1)
//   g += KI                                 (KI holds the NEXT round's K; g is
//                                            passed as h to the next round)
//   d += h                                  (d becomes the new e)
//   h += BIGSIGMA0(a) + Maj(a,b,c)          (h becomes the new a)
//
// The a..h rotation of the algorithm is done by the caller, which shifts the
// register arguments by one position on each successive invocation.
// Clobbers FUNC, S0, S1 and KI.
#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	LVX		(TBL)(idx), KI; \
	VADDUWM		S0, h, h

// SHA256ROUND1 performs the same round computation as SHA256ROUND0 on
// a..h, xi and idx, and interleaves with it one step of the message schedule:
//
//   xj += xj_9 + SIGMA0(xj_1) + SIGMA1(xj_14)
//
// where, relative to the schedule word xj held on entry, xj_1, xj_9 and xj_14
// are the words 1, 9 and 14 positions later. xj is overwritten in place with
// the word 16 positions later, i.e. for the new word Wt:
// xj = Wt-16, xj_1 = Wt-15, xj_9 = Wt-7, xj_14 = Wt-2.
//
// On entry h must already include this round's Kt, as for SHA256ROUND0.
// Clobbers FUNC, S0, S1, s0, s1 and KI.
#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
	VSHASIGMAW	$0, xj_1, $0, s0; \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VSHASIGMAW	$15, xj_14, $0, s1; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		xj_9, xj, xj; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	VADDUWM		s0, xj, xj; \
	LVX		(TBL)(idx), KI; \
	VADDUWM		S0, h, h; \
	VADDUWM		s1, xj, xj

// VPERMLE is a VPERM that is only emitted on little endian (ppc64le); on big
// endian it expands to nothing. ·block applies it, with LEMASK as the control
// vector, to each message vector right after it is loaded.
#ifdef GOARCH_ppc64le
#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
#else
#define VPERMLE(va,vb,vc,vt)
#endif

// func block(dig *digest, p []byte)
//
// Runs the SHA-256 compression function over every complete 64-byte block of
// p and updates the 32 bytes of state that dig points to. A trailing partial
// block (len(p) mod 64 bytes) is ignored; if p holds no complete block the
// routine returns without touching the state.
//
// For each block, rounds 0-15 are straight-line code that also loads and
// unpacks the message words; rounds 16-63 run as three passes of the
// 16-round loop at L16_xx.
TEXT ·block(SB),0,$0-32
	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN

	// Round the length down to a multiple of 64 and form the end pointer.
	SRD	$6, LEN
	SLD	$6, LEN
	ADD	INP, LEN, END

	// Nothing to do when there is no complete block.
	CMP	INP, END
	BEQ	end

	// TBL_STRT = address of the kcon table.
	MOVD	$·kcon(SB), TBL_STRT
	MOVD	$0x10, R_x010

#ifdef GOARCH_ppc64le
	// Build LEMASK, the control vector used by VPERMLE: LVSL on address 8
	// yields the bytes 0x08..0x17, which are then XORed with 0x0F.
	MOVWZ	$8, TEMP
	LVSL	(TEMP)(R0), LEMASK
	VSPLTISB	$0x0F, KI
	VXOR	KI, LEMASK, LEMASK
#endif

	// Load the 32 bytes of state: V0 <- bytes 0-15, V4 <- bytes 16-31.
	LXVW4X	(CTX)(R_x000), V0
	LXVW4X	(CTX)(R_x010), V4

	// unpack the input values into vector registers
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$8, V0, V0, V2
	VSLDOI	$12, V0, V0, V3
	VSLDOI	$4, V4, V4, V5
	VSLDOI	$8, V4, V4, V6
	VSLDOI	$12, V4, V4, V7

	// Preload the remaining byte offsets used as index registers.
	MOVD	$0x020, R_x020
	MOVD	$0x030, R_x030
	MOVD	$0x040, R_x040
	MOVD	$0x050, R_x050
	MOVD	$0x060, R_x060
	MOVD	$0x070, R_x070
	MOVD	$0x080, R_x080
	MOVD	$0x090, R_x090
	MOVD	$0x0a0, R_x0a0
	MOVD	$0x0b0, R_x0b0
	MOVD	$0x0c0, R_x0c0
	MOVD	$0x0d0, R_x0d0
	MOVD	$0x0e0, R_x0e0
	MOVD	$0x0f0, R_x0f0
	MOVD	$0x100, R_x100
	MOVD	$0x110, R_x110

loop:
	// Start of one 64-byte block: rewind the table pointer and fetch K[0].
	MOVD	TBL_STRT, TBL
	LVX	(TBL)(R_x000), KI

	LXVD2X	(INP)(R_x000), V8 // load v8 in advance

	// Offload to VSR24-31 (aka FPR24-31)
	// These copies of a..h are added back in after round 63.
	XXLOR	V0, V0, VS24
	XXLOR	V1, V1, VS25
	XXLOR	V2, V2, VS26
	XXLOR	V3, V3, VS27
	XXLOR	V4, V4, VS28
	XXLOR	V5, V5, VS29
	XXLOR	V6, V6, VS30
	XXLOR	V7, V7, VS31

	// The round macros expect h to already contain K for the current round
	// and KI to hold K for the next one.
	VADDUWM	KI, V7, V7        // h+K[i]
	LVX	(TBL)(R_x010), KI

	// Rounds 0-15. Each 16-byte load supplies four message words; VSLDOI
	// rotates the vector by one word to produce the following three.
	VPERMLE(V8, V8, LEMASK, V8)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
	VSLDOI	$4, V8, V8, V9
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
	VSLDOI	$4, V9, V9, V10
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
	LXVD2X	(INP)(R_x010), V12 // load v12 in advance
	VSLDOI	$4, V10, V10, V11
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
	VPERMLE(V12, V12, LEMASK, V12)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
	VSLDOI	$4, V12, V12, V13
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
	VSLDOI	$4, V13, V13, V14
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
	LXVD2X	(INP)(R_x020), V16 // load v16 in advance
	VSLDOI	$4, V14, V14, V15
	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
	VPERMLE(V16, V16, LEMASK, V16)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
	VSLDOI	$4, V16, V16, V17
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
	VSLDOI	$4, V17, V17, V18
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
	VSLDOI	$4, V18, V18, V19
	LXVD2X	(INP)(R_x030), V20 // load v20 in advance
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
	VPERMLE(V20, V20, LEMASK, V20)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
	VSLDOI	$4, V20, V20, V21
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
	VSLDOI	$4, V21, V21, V22
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
	VSLDOI	$4, V22, V22, V23
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)

	// CTR = 3 passes of the 16-round loop (rounds 16-63). TBL is advanced
	// past the 0x120 bytes of constants already loaded, and INP past the
	// 64 input bytes, all of which are in V8-V23 by now.
	MOVD	$3, TEMP
	MOVD	TEMP, CTR
	ADD	$0x120, TBL
	ADD	$0x40, INP

L16_xx:
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
	ADD	$0x100, TBL

	BDNZ	L16_xx

	// Add the state saved in VS24-VS31 at the top of the block back into a..h.
	XXLOR	VS24, VS24, V10

	XXLOR	VS25, VS25, V11
	VADDUWM	V10, V0, V0
	XXLOR	VS26, VS26, V12
	VADDUWM	V11, V1, V1
	XXLOR	VS27, VS27, V13
	VADDUWM	V12, V2, V2
	XXLOR	VS28, VS28, V14
	VADDUWM	V13, V3, V3
	XXLOR	VS29, VS29, V15
	VADDUWM	V14, V4, V4
	XXLOR	VS30, VS30, V16
	VADDUWM	V15, V5, V5
	XXLOR	VS31, VS31, V17
	VADDUWM	V16, V6, V6
	VADDUWM	V17, V7, V7

	// Process the next block while input remains.
	CMPU	INP, END
	BLT	loop

	// Repack a..d into V0 and e..h into V4 with the three permutation control
	// vectors stored after the constants in kcon. The last round left the
	// first of them in KI and TBL pointing at the second. Then store the
	// state back through CTX.
	LVX	(TBL)(R_x000), V8
	VPERM	V0, V1, KI, V0
	LVX	(TBL)(R_x010), V9
	VPERM	V4, V5, KI, V4
	VPERM	V0, V2, V8, V0
	VPERM	V4, V6, V8, V4
	VPERM	V0, V3, V9, V0
	VPERM	V4, V7, V9, V4
	STXVD2X	V0, (CTX+R_x000)
	STXVD2X	V4, (CTX+R_x010)

end:
	RET

