xref: /aosp_15_r20/external/boringssl/src/gen/bcm/vpaes-armv7-linux.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1*8fb009dcSAndroid Build Coastguard Worker// This file is generated from a similarly-named Perl script in the BoringSSL
2*8fb009dcSAndroid Build Coastguard Worker// source tree. Do not edit by hand.
3*8fb009dcSAndroid Build Coastguard Worker
4*8fb009dcSAndroid Build Coastguard Worker#include <openssl/asm_base.h>
5*8fb009dcSAndroid Build Coastguard Worker
6*8fb009dcSAndroid Build Coastguard Worker#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
7*8fb009dcSAndroid Build Coastguard Worker.syntax	unified
8*8fb009dcSAndroid Build Coastguard Worker
9*8fb009dcSAndroid Build Coastguard Worker.arch	armv7-a
10*8fb009dcSAndroid Build Coastguard Worker.fpu	neon
11*8fb009dcSAndroid Build Coastguard Worker
12*8fb009dcSAndroid Build Coastguard Worker#if defined(__thumb2__)
13*8fb009dcSAndroid Build Coastguard Worker.thumb
14*8fb009dcSAndroid Build Coastguard Worker#else
15*8fb009dcSAndroid Build Coastguard Worker.code	32
16*8fb009dcSAndroid Build Coastguard Worker#endif
17*8fb009dcSAndroid Build Coastguard Worker
18*8fb009dcSAndroid Build Coastguard Worker.text
19*8fb009dcSAndroid Build Coastguard Worker
@ Lookup tables shared by the vpaes routines. They are emitted in .text so the
@ code below can address them with pc-relative adr instructions.
@
@ Layout constraints imposed by the code that reads these tables:
@  - .Lk_mc_forward, .Lk_mc_backward and .Lk_sr are 64 bytes each and must
@    stay contiguous and in this order: _vpaes_encrypt_core reaches the other
@    two by adding 0x40 and 0x80 to a pointer into .Lk_mc_forward.
@  - _vpaes_encrypt_core wraps that pointer by clearing bit 6. This only works
@    because the 128-byte alignment below keeps bit 6 of .Lk_mc_forward clear.
@  - .Lk_inv, .Lk_ipt, .Lk_sbo, .Lk_sb1 and .Lk_sb2 are 32 bytes each and must
@    stay contiguous and in this order: _vpaes_preheat loads .Lk_inv and then
@    skips 64 bytes to reach .Lk_sb1 and .Lk_sb2.
20*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_consts,%object
21*8fb009dcSAndroid Build Coastguard Worker.align	7	@ totally strategic alignment (128 bytes, see the bit-6 wrap note above)
22*8fb009dcSAndroid Build Coastguard Worker_vpaes_consts:
23*8fb009dcSAndroid Build Coastguard Worker.Lk_mc_forward:@ mc_forward
24*8fb009dcSAndroid Build Coastguard Worker.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
25*8fb009dcSAndroid Build Coastguard Worker.quad	0x080B0A0904070605, 0x000302010C0F0E0D
26*8fb009dcSAndroid Build Coastguard Worker.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
27*8fb009dcSAndroid Build Coastguard Worker.quad	0x000302010C0F0E0D, 0x080B0A0904070605
28*8fb009dcSAndroid Build Coastguard Worker.Lk_mc_backward:@ mc_backward
29*8fb009dcSAndroid Build Coastguard Worker.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
30*8fb009dcSAndroid Build Coastguard Worker.quad	0x020100030E0D0C0F, 0x0A09080B06050407
31*8fb009dcSAndroid Build Coastguard Worker.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
32*8fb009dcSAndroid Build Coastguard Worker.quad	0x0A09080B06050407, 0x020100030E0D0C0F
33*8fb009dcSAndroid Build Coastguard Worker.Lk_sr:@ sr
34*8fb009dcSAndroid Build Coastguard Worker.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
35*8fb009dcSAndroid Build Coastguard Worker.quad	0x030E09040F0A0500, 0x0B06010C07020D08
36*8fb009dcSAndroid Build Coastguard Worker.quad	0x0F060D040B020900, 0x070E050C030A0108
37*8fb009dcSAndroid Build Coastguard Worker.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
38*8fb009dcSAndroid Build Coastguard Worker
39*8fb009dcSAndroid Build Coastguard Worker@
40*8fb009dcSAndroid Build Coastguard Worker@ "Hot" constants
41*8fb009dcSAndroid Build Coastguard Worker@
42*8fb009dcSAndroid Build Coastguard Worker.Lk_inv:@ inv, inva
43*8fb009dcSAndroid Build Coastguard Worker.quad	0x0E05060F0D080180, 0x040703090A0B0C02
44*8fb009dcSAndroid Build Coastguard Worker.quad	0x01040A060F0B0780, 0x030D0E0C02050809
45*8fb009dcSAndroid Build Coastguard Worker.Lk_ipt:@ input transform (lo, hi)
46*8fb009dcSAndroid Build Coastguard Worker.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
47*8fb009dcSAndroid Build Coastguard Worker.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
48*8fb009dcSAndroid Build Coastguard Worker.Lk_sbo:@ sbou, sbot
49*8fb009dcSAndroid Build Coastguard Worker.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
50*8fb009dcSAndroid Build Coastguard Worker.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
51*8fb009dcSAndroid Build Coastguard Worker.Lk_sb1:@ sb1u, sb1t
52*8fb009dcSAndroid Build Coastguard Worker.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
53*8fb009dcSAndroid Build Coastguard Worker.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
54*8fb009dcSAndroid Build Coastguard Worker.Lk_sb2:@ sb2u, sb2t
55*8fb009dcSAndroid Build Coastguard Worker.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
56*8fb009dcSAndroid Build Coastguard Worker.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
57*8fb009dcSAndroid Build Coastguard Worker
@ NUL-terminated ASCII string:
@ "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
58*8fb009dcSAndroid Build Coastguard Worker.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
59*8fb009dcSAndroid Build Coastguard Worker.align	2	@ pad past the string to a 4-byte boundary
60*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_consts,.-_vpaes_consts
61*8fb009dcSAndroid Build Coastguard Worker.align	6	@ align the location counter to 64 bytes
62*8fb009dcSAndroid Build Coastguard Worker@@
63*8fb009dcSAndroid Build Coastguard Worker@@  _vpaes_preheat
64*8fb009dcSAndroid Build Coastguard Worker@@
65*8fb009dcSAndroid Build Coastguard Worker@@  Fills q9-q15 as specified below.
66*8fb009dcSAndroid Build Coastguard Worker@@
@@  Outputs:
@@     q9       = 0x0f in every byte (used by the cores as a low-nibble mask)
@@     q10, q11 = the two 16-byte entries at .Lk_inv
@@     q12, q13 = the two 16-byte entries at .Lk_sb1
@@     q14, q15 = the two 16-byte entries at .Lk_sb2
@@  Clobbers r10.
@@
@@  The address arithmetic below depends on .Lk_inv, .Lk_ipt, .Lk_sbo,
@@  .Lk_sb1 and .Lk_sb2 being laid out back to back, 32 bytes each.
@@
67*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_preheat,%function
68*8fb009dcSAndroid Build Coastguard Worker.align	4
69*8fb009dcSAndroid Build Coastguard Worker_vpaes_preheat:
70*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_inv
71*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q9, #0x0f		@ .Lk_s0F
72*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv; writeback leaves r10 at .Lk_ipt
73*8fb009dcSAndroid Build Coastguard Worker	add	r10, r10, #64		@ Skip .Lk_ipt, .Lk_sbo
74*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
75*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
76*8fb009dcSAndroid Build Coastguard Worker	bx	lr
77*8fb009dcSAndroid Build Coastguard Worker
78*8fb009dcSAndroid Build Coastguard Worker@@
79*8fb009dcSAndroid Build Coastguard Worker@@  _vpaes_encrypt_core
80*8fb009dcSAndroid Build Coastguard Worker@@
81*8fb009dcSAndroid Build Coastguard Worker@@  AES-encrypt q0.
82*8fb009dcSAndroid Build Coastguard Worker@@
83*8fb009dcSAndroid Build Coastguard Worker@@  Inputs:
84*8fb009dcSAndroid Build Coastguard Worker@@     q0 = input
85*8fb009dcSAndroid Build Coastguard Worker@@     q9-q15 as in _vpaes_preheat
86*8fb009dcSAndroid Build Coastguard Worker@@    [r2] = scheduled keys
87*8fb009dcSAndroid Build Coastguard Worker@@
88*8fb009dcSAndroid Build Coastguard Worker@@  Output in q0
89*8fb009dcSAndroid Build Coastguard Worker@@  Clobbers  q1-q5, r8-r11
90*8fb009dcSAndroid Build Coastguard Worker@@  Preserves q6-q8 so you get some local vectors
91*8fb009dcSAndroid Build Coastguard Worker@@
@@  Register roles inside the function:
@@     r8  = remaining round counter, loaded from [r2,#240]
@@     r9  = running pointer into the round keys, starts at r2
@@     r11 = running pointer into .Lk_mc_forward (16-byte steps, wraps at 64)
@@     r10 = scratch pointer derived from r11
@@
92*8fb009dcSAndroid Build Coastguard Worker@@
93*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_encrypt_core,%function
94*8fb009dcSAndroid Build Coastguard Worker.align	4
95*8fb009dcSAndroid Build Coastguard Worker_vpaes_encrypt_core:
96*8fb009dcSAndroid Build Coastguard Worker	mov	r9, r2
97*8fb009dcSAndroid Build Coastguard Worker	ldr	r8, [r2,#240]		@ pull rounds
98*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_ipt
99*8fb009dcSAndroid Build Coastguard Worker	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
100*8fb009dcSAndroid Build Coastguard Worker	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
101*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q2, q3}, [r11]
102*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_mc_forward+16
103*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
104*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
105*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
	@ Each 128-bit x86 vpshufb is emitted as two 64-bit vtbl.8 instructions,
	@ one per d-register half of the destination.
105*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q2}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm1
106*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q2}, d3
107*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q3}, d0	@ vpshufb	%xmm0,	%xmm3,	%xmm2
108*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q3}, d1
109*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
110*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
111*8fb009dcSAndroid Build Coastguard Worker
112*8fb009dcSAndroid Build Coastguard Worker	@ .Lenc_entry ends with a bne instruction which is normally paired with
113*8fb009dcSAndroid Build Coastguard Worker	@ subs in .Lenc_loop. On this first pass the flags come from tst instead.
114*8fb009dcSAndroid Build Coastguard Worker	tst	r8, r8
115*8fb009dcSAndroid Build Coastguard Worker	b	.Lenc_entry
116*8fb009dcSAndroid Build Coastguard Worker
117*8fb009dcSAndroid Build Coastguard Worker.align	4
118*8fb009dcSAndroid Build Coastguard Worker.Lenc_loop:
119*8fb009dcSAndroid Build Coastguard Worker	@ middle of middle round
	@ r10 = .Lk_mc_backward entry at the same index as r11 in .Lk_mc_forward
	@ (the two tables are 0x40 bytes apart).
120*8fb009dcSAndroid Build Coastguard Worker	add	r10, r11, #0x40
121*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q13}, d4	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
122*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q13}, d5
123*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
124*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q12}, d6	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
125*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q12}, d7
126*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
127*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d10, {q15}, d4	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
128*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d11, {q15}, d5
129*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
130*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d6	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
131*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d7
132*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
133*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q0}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
134*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q0}, d3
135*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
136*8fb009dcSAndroid Build Coastguard Worker	@ Write to q5 instead of q0, so the table and destination registers do
137*8fb009dcSAndroid Build Coastguard Worker	@ not overlap.
138*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d10, {q0}, d8	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
139*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d11, {q0}, d9
140*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
141*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
142*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q3}, d3
143*8fb009dcSAndroid Build Coastguard Worker	@ Here we restore the original q0/q5 usage.
144*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	@ Clearing bit 6 wraps r11 from .Lk_mc_forward+64 back to .Lk_mc_forward.
	@ This relies on the 128-byte alignment of _vpaes_consts.
145*8fb009dcSAndroid Build Coastguard Worker	and	r11, r11, #~(1<<6)	@ and		$0x30,	%r11		# ... mod 4
146*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
147*8fb009dcSAndroid Build Coastguard Worker	subs	r8, r8, #1		@ nr--
148*8fb009dcSAndroid Build Coastguard Worker
149*8fb009dcSAndroid Build Coastguard Worker.Lenc_entry:
150*8fb009dcSAndroid Build Coastguard Worker	@ top of round
151*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
152*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
153*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d10, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
154*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d11, {q11}, d3
155*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
156*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
157*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d1
158*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
159*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q10}, d3
160*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
161*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
162*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
163*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q10}, d7
164*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
165*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d9
166*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
167*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
168*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
	@ Consumes the flags set by tst (first pass) or subs (later passes).
169*8fb009dcSAndroid Build Coastguard Worker	bne	.Lenc_loop
170*8fb009dcSAndroid Build Coastguard Worker
171*8fb009dcSAndroid Build Coastguard Worker	@ middle of last round
	@ r10 = .Lk_sr entry at the same index as r11 in .Lk_mc_forward
	@ (.Lk_sr starts 0x80 bytes after .Lk_mc_forward).
172*8fb009dcSAndroid Build Coastguard Worker	add	r10, r11, #0x80
173*8fb009dcSAndroid Build Coastguard Worker
174*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_sbo
175*8fb009dcSAndroid Build Coastguard Worker	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
176*8fb009dcSAndroid Build Coastguard Worker	@ overlap table and destination registers.
177*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
178*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
179*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
180*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q1}, d5
181*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
182*8fb009dcSAndroid Build Coastguard Worker	@ Write to q2 instead of q0 below, to avoid overlapping table and
183*8fb009dcSAndroid Build Coastguard Worker	@ destination registers.
184*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q0}, d6	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
185*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q0}, d7
186*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
187*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
188*8fb009dcSAndroid Build Coastguard Worker	@ Here we restore the original q0/q2 usage.
189*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0
190*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q2}, d3
191*8fb009dcSAndroid Build Coastguard Worker	bx	lr
192*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
194*8fb009dcSAndroid Build Coastguard Worker
@
@ vpaes_encrypt: encrypt a single 16-byte block.
@
@   r0 = address the 16-byte input block is loaded from
@   r1 = address the 16-byte result is stored to
@   r2 = key schedule, passed through unchanged to _vpaes_encrypt_core
@        (round keys read from [r2] onward, round count from [r2,#240])
@
@ Saves and restores r7-r11, lr and d8-d11 around the core.
@ NOTE(review): the C prototype is not visible in this file; confirm the
@ argument types against the declaring header.
@
195*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_encrypt
196*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_encrypt
197*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_encrypt,%function
198*8fb009dcSAndroid Build Coastguard Worker.align	4
199*8fb009dcSAndroid Build Coastguard Workervpaes_encrypt:
200*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
201*8fb009dcSAndroid Build Coastguard Worker	@ alignment.
202*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
203*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
204*8fb009dcSAndroid Build Coastguard Worker	vstmdb	sp!, {d8,d9,d10,d11}
205*8fb009dcSAndroid Build Coastguard Worker
206*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r0]		@ q0 = input block
207*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_preheat		@ set up q9-q15 for the core
208*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_encrypt_core	@ q0 = encrypted block
209*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r1]
210*8fb009dcSAndroid Build Coastguard Worker
211*8fb009dcSAndroid Build Coastguard Worker	vldmia	sp!, {d8,d9,d10,d11}
212*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
213*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_encrypt,.-vpaes_encrypt
214*8fb009dcSAndroid Build Coastguard Worker
215*8fb009dcSAndroid Build Coastguard Worker@
216*8fb009dcSAndroid Build Coastguard Worker@  Decryption stuff
217*8fb009dcSAndroid Build Coastguard Worker@
@ Lookup tables used only by _vpaes_decrypt_core. Each table is 32 bytes.
@ .Lk_dsb9, .Lk_dsbd, .Lk_dsbb and .Lk_dsbe must stay contiguous and in this
@ order: the decryption loop takes the address of .Lk_dsb9 once per round and
@ walks the other three with post-incrementing loads. .Lk_dipt and .Lk_dsbo
@ are addressed by their own labels.
218*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_decrypt_consts,%object
219*8fb009dcSAndroid Build Coastguard Worker.align	4
220*8fb009dcSAndroid Build Coastguard Worker_vpaes_decrypt_consts:
221*8fb009dcSAndroid Build Coastguard Worker.Lk_dipt:@ decryption input transform
222*8fb009dcSAndroid Build Coastguard Worker.quad	0x0F505B040B545F00, 0x154A411E114E451A
223*8fb009dcSAndroid Build Coastguard Worker.quad	0x86E383E660056500, 0x12771772F491F194
224*8fb009dcSAndroid Build Coastguard Worker.Lk_dsbo:@ decryption sbox final output
225*8fb009dcSAndroid Build Coastguard Worker.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
226*8fb009dcSAndroid Build Coastguard Worker.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
227*8fb009dcSAndroid Build Coastguard Worker.Lk_dsb9:@ decryption sbox output *9*u, *9*t
228*8fb009dcSAndroid Build Coastguard Worker.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
229*8fb009dcSAndroid Build Coastguard Worker.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
230*8fb009dcSAndroid Build Coastguard Worker.Lk_dsbd:@ decryption sbox output *D*u, *D*t
231*8fb009dcSAndroid Build Coastguard Worker.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
232*8fb009dcSAndroid Build Coastguard Worker.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
233*8fb009dcSAndroid Build Coastguard Worker.Lk_dsbb:@ decryption sbox output *B*u, *B*t
234*8fb009dcSAndroid Build Coastguard Worker.quad	0xD022649296B44200, 0x602646F6B0F2D404
235*8fb009dcSAndroid Build Coastguard Worker.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
236*8fb009dcSAndroid Build Coastguard Worker.Lk_dsbe:@ decryption sbox output *E*u, *E*t
237*8fb009dcSAndroid Build Coastguard Worker.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
238*8fb009dcSAndroid Build Coastguard Worker.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
239*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_decrypt_consts,.-_vpaes_decrypt_consts
240*8fb009dcSAndroid Build Coastguard Worker
241*8fb009dcSAndroid Build Coastguard Worker@@
242*8fb009dcSAndroid Build Coastguard Worker@@  Decryption core
243*8fb009dcSAndroid Build Coastguard Worker@@
244*8fb009dcSAndroid Build Coastguard Worker@@  Same API as encryption core, except it clobbers q12-q15 rather than using
245*8fb009dcSAndroid Build Coastguard Worker@@  the values from _vpaes_preheat. q9-q11 must still be set from
246*8fb009dcSAndroid Build Coastguard Worker@@  _vpaes_preheat.
@@
@@  Unlike the encryption core it also clobbers r7.
@@
@@  Register roles inside the function:
@@     r8  = remaining round counter, loaded from [r2,#240]
@@     r9  = running pointer into the round keys, starts at r2
@@     r11 = address of the .Lk_sr entry used by the final shuffle
@@     r10 = scratch pointer into the constant tables
@@     q5  = .Lk_mc_forward+48 entry, rotated by 12 bytes once per round
247*8fb009dcSAndroid Build Coastguard Worker@@
248*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_decrypt_core,%function
249*8fb009dcSAndroid Build Coastguard Worker.align	4
250*8fb009dcSAndroid Build Coastguard Worker_vpaes_decrypt_core:
251*8fb009dcSAndroid Build Coastguard Worker	mov	r9, r2
252*8fb009dcSAndroid Build Coastguard Worker	ldr	r8, [r2,#240]		@ pull rounds
253*8fb009dcSAndroid Build Coastguard Worker
254*8fb009dcSAndroid Build Coastguard Worker	@ This function performs shuffles with various constants. The x86_64
255*8fb009dcSAndroid Build Coastguard Worker	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
256*8fb009dcSAndroid Build Coastguard Worker	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
257*8fb009dcSAndroid Build Coastguard Worker	@ version preloads those constants into registers, but ARMv7 has half
258*8fb009dcSAndroid Build Coastguard Worker	@ the registers to work with. Instead, we load them on-demand into
259*8fb009dcSAndroid Build Coastguard Worker	@ q12-q15, registers normally used for preloaded constants. This is fine
260*8fb009dcSAndroid Build Coastguard Worker	@ because decryption doesn't use those constants. The values are
261*8fb009dcSAndroid Build Coastguard Worker	@ constant, so this does not interfere with potential 2x optimizations.
262*8fb009dcSAndroid Build Coastguard Worker	adr	r7, .Lk_dipt
263*8fb009dcSAndroid Build Coastguard Worker
264*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12,q13}, [r7]		@ vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	@ r11 = .Lk_sr + (((rounds << 4) ^ 0x30) & 0x30), i.e. one of the four
	@ 16-byte .Lk_sr entries, selected by the round count.
265*8fb009dcSAndroid Build Coastguard Worker	lsl	r11, r8, #4		@ mov		%rax,	%r11;	shl	$4, %r11
266*8fb009dcSAndroid Build Coastguard Worker	eor	r11, r11, #0x30		@ xor		$0x30,	%r11
267*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_sr
268*8fb009dcSAndroid Build Coastguard Worker	and	r11, r11, #0x30		@ and		$0x30,	%r11
269*8fb009dcSAndroid Build Coastguard Worker	add	r11, r11, r10
270*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_mc_forward+48
271*8fb009dcSAndroid Build Coastguard Worker
272*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q4}, [r9]!		@ vmovdqu	(%r9),	%xmm4		# round0 key
273*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1
274*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
275*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q12}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
276*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q12}, d3
277*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q5}, [r10]		@ vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
278*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
279*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q13}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
280*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q13}, d1
281*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q4		@ vpxor		%xmm4,	%xmm2,	%xmm2
282*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q2		@ vpxor		%xmm2,	%xmm0,	%xmm0
283*8fb009dcSAndroid Build Coastguard Worker
284*8fb009dcSAndroid Build Coastguard Worker	@ .Ldec_entry ends with a bne instruction which is normally paired with
285*8fb009dcSAndroid Build Coastguard Worker	@ subs in .Ldec_loop. On this first pass the flags come from tst instead.
286*8fb009dcSAndroid Build Coastguard Worker	tst	r8, r8
287*8fb009dcSAndroid Build Coastguard Worker	b	.Ldec_entry
288*8fb009dcSAndroid Build Coastguard Worker
289*8fb009dcSAndroid Build Coastguard Worker.align	4
290*8fb009dcSAndroid Build Coastguard Worker.Ldec_loop:
291*8fb009dcSAndroid Build Coastguard Worker@
292*8fb009dcSAndroid Build Coastguard Worker@  Inverse mix columns
293*8fb009dcSAndroid Build Coastguard Worker@
294*8fb009dcSAndroid Build Coastguard Worker
295*8fb009dcSAndroid Build Coastguard Worker	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
296*8fb009dcSAndroid Build Coastguard Worker	@ the function.
	@ r10 walks .Lk_dsb9 -> .Lk_dsbd -> .Lk_dsbb -> .Lk_dsbe through the
	@ post-incrementing loads below, so those tables must be contiguous.
297*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_dsb9
298*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
299*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
300*8fb009dcSAndroid Build Coastguard Worker	@ Load sbd* ahead of time.
301*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
302*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
303*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
304*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q12}, d5
305*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
306*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q13}, d7
307*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q4, q0		@ vpxor		%xmm4,	%xmm0,	%xmm0
308*8fb009dcSAndroid Build Coastguard Worker
309*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
310*8fb009dcSAndroid Build Coastguard Worker
311*8fb009dcSAndroid Build Coastguard Worker	@ Load sbb* ahead of time.
312*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	0x20(%r10),%xmm4		# 4 : sbbu
313*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x30(%r10),%xmm1		# 0 : sbbt
314*8fb009dcSAndroid Build Coastguard Worker
315*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
316*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q14}, d5
317*8fb009dcSAndroid Build Coastguard Worker	@ Write to q1 instead of q0, so the table and destination registers do
318*8fb009dcSAndroid Build Coastguard Worker	@ not overlap.
319*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
320*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q0}, d11
321*8fb009dcSAndroid Build Coastguard Worker	@ Here we restore the original q0/q1 usage. This instruction is
322*8fb009dcSAndroid Build Coastguard Worker	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
323*8fb009dcSAndroid Build Coastguard Worker	@ below.
324*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
325*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
326*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q15}, d7
327*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
328*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
329*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
330*8fb009dcSAndroid Build Coastguard Worker
331*8fb009dcSAndroid Build Coastguard Worker	@ Load sbe* ahead of time.
332*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x40(%r10),%xmm4		# 4 : sbeu
333*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x50(%r10),%xmm1		# 0 : sbet
334*8fb009dcSAndroid Build Coastguard Worker
335*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
336*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q12}, d5
337*8fb009dcSAndroid Build Coastguard Worker	@ Write to q1 instead of q0, so the table and destination registers do
338*8fb009dcSAndroid Build Coastguard Worker	@ not overlap.
339*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
340*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q0}, d11
341*8fb009dcSAndroid Build Coastguard Worker	@ Here we restore the original q0/q1 usage. This instruction is
342*8fb009dcSAndroid Build Coastguard Worker	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
343*8fb009dcSAndroid Build Coastguard Worker	@ below.
344*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
345*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
346*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q13}, d7
347*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
348*8fb009dcSAndroid Build Coastguard Worker
349*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
350*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q14}, d5
351*8fb009dcSAndroid Build Coastguard Worker	@ Write to q1 instead of q0, so the table and destination registers do
352*8fb009dcSAndroid Build Coastguard Worker	@ not overlap.
353*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
354*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q0}, d11
355*8fb009dcSAndroid Build Coastguard Worker	@ Here we restore the original q0/q1 usage. This instruction is
356*8fb009dcSAndroid Build Coastguard Worker	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
357*8fb009dcSAndroid Build Coastguard Worker	@ below.
358*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
359*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
360*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q15}, d7
361*8fb009dcSAndroid Build Coastguard Worker	vext.8	q5, q5, q5, #12		@ vpalignr 	$12,	%xmm5,	%xmm5,	%xmm5
362*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
363*8fb009dcSAndroid Build Coastguard Worker	subs	r8, r8, #1		@ sub		$1,%rax			# nr--
364*8fb009dcSAndroid Build Coastguard Worker
365*8fb009dcSAndroid Build Coastguard Worker.Ldec_entry:
366*8fb009dcSAndroid Build Coastguard Worker	@ top of round
367*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1	# 0 = k
368*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
369*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
370*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q11}, d3
371*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
372*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
373*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d1
374*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
375*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q10}, d3
376*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
377*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q2		@ vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
378*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
379*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q10}, d7
380*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
381*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d9
382*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
383*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q0		@ vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
384*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r9]!		@ vmovdqu	(%r9),	%xmm0
	@ Consumes the flags set by tst (first pass) or subs (later passes).
385*8fb009dcSAndroid Build Coastguard Worker	bne	.Ldec_loop
386*8fb009dcSAndroid Build Coastguard Worker
387*8fb009dcSAndroid Build Coastguard Worker	@ middle of last round
388*8fb009dcSAndroid Build Coastguard Worker
389*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_dsbo
390*8fb009dcSAndroid Build Coastguard Worker
391*8fb009dcSAndroid Build Coastguard Worker	@ Write to q1 rather than q4 to avoid overlapping table and destination.
392*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r10]!		@ vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
393*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
394*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q1}, d5
395*8fb009dcSAndroid Build Coastguard Worker	@ Write to q2 rather than q1 to avoid overlapping table and destination.
396*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q2}, [r10]		@ vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
397*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q2}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
398*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q2}, d7
	@ r11 still holds the .Lk_sr entry address computed at function entry.
399*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q2}, [r11]		@ vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
400*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q0		@ vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
401*8fb009dcSAndroid Build Coastguard Worker	@ Write to q1 rather than q0 so the table and destination registers
402*8fb009dcSAndroid Build Coastguard Worker	@ below do not overlap.
403*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q4		@ vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
404*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q1}, d4	@ vpshufb	%xmm2,	%xmm0,	%xmm0
405*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q1}, d5
406*8fb009dcSAndroid Build Coastguard Worker	bx	lr
407*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
408*8fb009dcSAndroid Build Coastguard Worker
409*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_decrypt
410*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_decrypt
411*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_decrypt,%function
412*8fb009dcSAndroid Build Coastguard Worker.align	4
413*8fb009dcSAndroid Build Coastguard Workervpaes_decrypt:
	@ Processes one 16-byte block: loads it from [r0] into q0, runs
	@ _vpaes_preheat and _vpaes_decrypt_core, then stores q0 to [r1].
	@ NOTE(review): any further arguments (e.g. a key schedule pointer) are
	@ consumed by the callees, which are not fully visible here -- confirm.
	@ All registers saved below are restored before returning.
414*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_decrypt_core uses r7-r11.
415*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r7,r8,r9,r10,r11,lr}	@ six words (24 bytes); lr is saved because of the bl calls below
416*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
417*8fb009dcSAndroid Build Coastguard Worker	vstmdb	sp!, {d8,d9,d10,d11}	@ 32 more bytes pushed
418*8fb009dcSAndroid Build Coastguard Worker
419*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r0]	@ q0 = 16 input bytes at [r0]
420*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_preheat	@ defined outside this view; presumably loads pinned constants -- confirm
421*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_decrypt_core	@ leaves its result in q0
422*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r1]	@ write the 16 result bytes to [r1]
423*8fb009dcSAndroid Build Coastguard Worker
424*8fb009dcSAndroid Build Coastguard Worker	vldmia	sp!, {d8,d9,d10,d11}	@ restore in the reverse order of the pushes above
425*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return: the saved lr is popped straight into pc
426*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_decrypt,.-vpaes_decrypt
427*8fb009dcSAndroid Build Coastguard Worker@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
428*8fb009dcSAndroid Build Coastguard Worker@@                                                    @@
429*8fb009dcSAndroid Build Coastguard Worker@@                  AES key schedule                  @@
430*8fb009dcSAndroid Build Coastguard Worker@@                                                    @@
431*8fb009dcSAndroid Build Coastguard Worker@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
432*8fb009dcSAndroid Build Coastguard Worker
433*8fb009dcSAndroid Build Coastguard Worker@ This function diverges from both x86_64 and aarch64 in which constants are
434*8fb009dcSAndroid Build Coastguard Worker@ pinned. x86_64 has a common preheat function for all operations. aarch64
435*8fb009dcSAndroid Build Coastguard Worker@ separates them because it has enough registers to pin nearly all constants.
436*8fb009dcSAndroid Build Coastguard Worker@ armv7 does not have enough registers, but needing explicit loads and stores
437*8fb009dcSAndroid Build Coastguard Worker@ also complicates using x86_64's register allocation directly.
438*8fb009dcSAndroid Build Coastguard Worker@
439*8fb009dcSAndroid Build Coastguard Worker@ We pin some constants for convenience and leave q14 and q15 free to load
440*8fb009dcSAndroid Build Coastguard Worker@ others on demand.
441*8fb009dcSAndroid Build Coastguard Worker
442*8fb009dcSAndroid Build Coastguard Worker@
443*8fb009dcSAndroid Build Coastguard Worker@  Key schedule constants
444*8fb009dcSAndroid Build Coastguard Worker@
445*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_key_consts,%object
446*8fb009dcSAndroid Build Coastguard Worker.align	4	@ 2^4 = 16-byte alignment for the tables below
447*8fb009dcSAndroid Build Coastguard Worker_vpaes_key_consts:	@ each .quad pair below is one 16-byte row, loaded into a q register with vld1.64
448*8fb009dcSAndroid Build Coastguard Worker.Lk_dksd:@ decryption key schedule: invskew x*D (row 0 is indexed by low nibbles, row 1 by high nibbles in .Lschedule_mangle_dec)
449*8fb009dcSAndroid Build Coastguard Worker.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
450*8fb009dcSAndroid Build Coastguard Worker.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
451*8fb009dcSAndroid Build Coastguard Worker.Lk_dksb:@ decryption key schedule: invskew x*B (reached by post-incrementing r11 past .Lk_dksd, so it must stay directly after it)
452*8fb009dcSAndroid Build Coastguard Worker.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
453*8fb009dcSAndroid Build Coastguard Worker.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
454*8fb009dcSAndroid Build Coastguard Worker.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
455*8fb009dcSAndroid Build Coastguard Worker.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
456*8fb009dcSAndroid Build Coastguard Worker.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
457*8fb009dcSAndroid Build Coastguard Worker.Lk_dks9:@ decryption key schedule: invskew x*9
458*8fb009dcSAndroid Build Coastguard Worker.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
459*8fb009dcSAndroid Build Coastguard Worker.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
460*8fb009dcSAndroid Build Coastguard Worker
461*8fb009dcSAndroid Build Coastguard Worker.Lk_rcon:@ rcon: a single 16-byte row, loaded into q8 by _vpaes_key_preheat
462*8fb009dcSAndroid Build Coastguard Worker.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
463*8fb009dcSAndroid Build Coastguard Worker
464*8fb009dcSAndroid Build Coastguard Worker.Lk_opt:@ output transform: table pair passed in r11 to _vpaes_schedule_transform by .Lschedule_mangle_last when encrypting
465*8fb009dcSAndroid Build Coastguard Worker.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
466*8fb009dcSAndroid Build Coastguard Worker.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
467*8fb009dcSAndroid Build Coastguard Worker.Lk_deskew:@ deskew tables: inverts the sbox's "skew"; passed in r11 by .Lschedule_mangle_last when decrypting
468*8fb009dcSAndroid Build Coastguard Worker.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
469*8fb009dcSAndroid Build Coastguard Worker.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
470*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_key_consts,.-_vpaes_key_consts
471*8fb009dcSAndroid Build Coastguard Worker
472*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_key_preheat,%function
473*8fb009dcSAndroid Build Coastguard Worker.align	4
474*8fb009dcSAndroid Build Coastguard Worker_vpaes_key_preheat:
	@ Loads the constants the key-schedule routines expect to find pinned:
	@   q8       = the 16 bytes at .Lk_rcon
	@   q9       = 0x0f in every byte
	@   q10, q11 = the 32 bytes at .Lk_inv
	@   q12      = 0x5b in every byte
	@ Overwrites r10 and r11. Makes no calls and touches no stack.
475*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_rcon
476*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q12, #0x5b			@ .Lk_s63 (built from an immediate, not loaded from memory)
477*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_inv			@ Must be aligned to 8 mod 16.
478*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q9, #0x0f			@ .Lk_s0F (built from an immediate, not loaded from memory)
479*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q10,q11}, [r10]		@ .Lk_inv
480*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q8}, [r11]			@ .Lk_rcon
481*8fb009dcSAndroid Build Coastguard Worker	bx	lr
482*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_key_preheat,.-_vpaes_key_preheat
483*8fb009dcSAndroid Build Coastguard Worker
484*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_schedule_core,%function
485*8fb009dcSAndroid Build Coastguard Worker.align	4
486*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_core:
	@ Register usage, as read by the code in this function:
	@   r0 = pointer to the raw key bytes; advanced by 16 by the first load
	@        and later reused as a round counter
	@   r1 = key size in bits (only compared against 192)
	@   r2 = pointer to where round keys are written
	@   r3 = zero when encrypting, non-zero when decrypting
	@   r8 = byte offset into .Lk_sr on entry; converted to an absolute
	@        pointer below
	@ r10 and r11 are overwritten.
487*8fb009dcSAndroid Build Coastguard Worker	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
488*8fb009dcSAndroid Build Coastguard Worker	@ so save an extra register.
489*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r3,lr}
490*8fb009dcSAndroid Build Coastguard Worker
491*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_key_preheat	@ load the tables
492*8fb009dcSAndroid Build Coastguard Worker
493*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_ipt		@ Must be aligned to 8 mod 16.
494*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r0]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
495*8fb009dcSAndroid Build Coastguard Worker
496*8fb009dcSAndroid Build Coastguard Worker	@ input transform
497*8fb009dcSAndroid Build Coastguard Worker	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
498*8fb009dcSAndroid Build Coastguard Worker	@ overlap table and destination.
499*8fb009dcSAndroid Build Coastguard Worker	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
500*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform
501*8fb009dcSAndroid Build Coastguard Worker	adr	r10, .Lk_sr		@ Must be aligned to 8 mod 16.
502*8fb009dcSAndroid Build Coastguard Worker	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
503*8fb009dcSAndroid Build Coastguard Worker
504*8fb009dcSAndroid Build Coastguard Worker	add	r8, r8, r10		@ r8 = .Lk_sr + incoming offset; an absolute pointer from here on
505*8fb009dcSAndroid Build Coastguard Worker	tst	r3, r3
506*8fb009dcSAndroid Build Coastguard Worker	bne	.Lschedule_am_decrypting
507*8fb009dcSAndroid Build Coastguard Worker
508*8fb009dcSAndroid Build Coastguard Worker	@ encrypting, output zeroth round key after transform
509*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r2]		@ vmovdqu	%xmm0,	(%rdx)
510*8fb009dcSAndroid Build Coastguard Worker	b	.Lschedule_go
511*8fb009dcSAndroid Build Coastguard Worker
512*8fb009dcSAndroid Build Coastguard Worker.Lschedule_am_decrypting:
513*8fb009dcSAndroid Build Coastguard Worker	@ decrypting, output zeroth round key after shiftrows
514*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
515*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q4}, d2	@ vpshufb  	%xmm1,	%xmm3,	%xmm3
516*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q4}, d3
517*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q3}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
518*8fb009dcSAndroid Build Coastguard Worker	@ NOTE(review): the eor below toggles bits 4-5 of the absolute pointer in
	@ r8, so it relies on the alignment of .Lk_sr, which is defined outside
	@ this view -- confirm.
	eor	r8, r8, #0x30		@ xor	$0x30, %r8
519*8fb009dcSAndroid Build Coastguard Worker
520*8fb009dcSAndroid Build Coastguard Worker.Lschedule_go:
521*8fb009dcSAndroid Build Coastguard Worker	cmp	r1, #192		@ cmp	$192,	%esi
522*8fb009dcSAndroid Build Coastguard Worker	bhi	.Lschedule_256		@ unsigned r1 > 192
523*8fb009dcSAndroid Build Coastguard Worker	beq	.Lschedule_192		@ r1 == 192
524*8fb009dcSAndroid Build Coastguard Worker	@ 128: fall through (taken for every r1 below 192)
525*8fb009dcSAndroid Build Coastguard Worker
526*8fb009dcSAndroid Build Coastguard Worker@@
527*8fb009dcSAndroid Build Coastguard Worker@@  .schedule_128
528*8fb009dcSAndroid Build Coastguard Worker@@
529*8fb009dcSAndroid Build Coastguard Worker@@  128-bit specific part of key schedule.
530*8fb009dcSAndroid Build Coastguard Worker@@
531*8fb009dcSAndroid Build Coastguard Worker@@  This schedule is really simple, because all its parts
532*8fb009dcSAndroid Build Coastguard Worker@@  are accomplished by the subroutines.
533*8fb009dcSAndroid Build Coastguard Worker@@
534*8fb009dcSAndroid Build Coastguard Worker.Lschedule_128:
535*8fb009dcSAndroid Build Coastguard Worker	mov	r0, #10		@ mov	$10, %esi	(r0 is now a round counter; the key pointer it held is overwritten)
536*8fb009dcSAndroid Build Coastguard Worker
537*8fb009dcSAndroid Build Coastguard Worker.Loop_schedule_128:
538*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_round
539*8fb009dcSAndroid Build Coastguard Worker	subs	r0, r0, #1		@ dec	%esi
540*8fb009dcSAndroid Build Coastguard Worker	beq	.Lschedule_mangle_last	@ tenth round: the last key is written by .Lschedule_mangle_last instead
541*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle	@ write output
542*8fb009dcSAndroid Build Coastguard Worker	b	.Loop_schedule_128
543*8fb009dcSAndroid Build Coastguard Worker
544*8fb009dcSAndroid Build Coastguard Worker@@
545*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_192
546*8fb009dcSAndroid Build Coastguard Worker@@
547*8fb009dcSAndroid Build Coastguard Worker@@  192-bit specific part of key schedule.
548*8fb009dcSAndroid Build Coastguard Worker@@
549*8fb009dcSAndroid Build Coastguard Worker@@  The main body of this schedule is the same as the 128-bit
550*8fb009dcSAndroid Build Coastguard Worker@@  schedule, but with more smearing.  The long, high side is
551*8fb009dcSAndroid Build Coastguard Worker@@  stored in q7 as before, and the short, low side is in
552*8fb009dcSAndroid Build Coastguard Worker@@  the high bits of q6.
553*8fb009dcSAndroid Build Coastguard Worker@@
554*8fb009dcSAndroid Build Coastguard Worker@@  This schedule is somewhat nastier, however, because each
555*8fb009dcSAndroid Build Coastguard Worker@@  round produces 192 bits of key material, or 1.5 round keys.
556*8fb009dcSAndroid Build Coastguard Worker@@  Therefore, on each cycle we do 2 rounds and produce 3 round
557*8fb009dcSAndroid Build Coastguard Worker@@  keys.
558*8fb009dcSAndroid Build Coastguard Worker@@
559*8fb009dcSAndroid Build Coastguard Worker.align	4	@ padding here is never executed: the instruction before it is an unconditional branch
560*8fb009dcSAndroid Build Coastguard Worker.Lschedule_192:
561*8fb009dcSAndroid Build Coastguard Worker	sub	r0, r0, #8			@ r0 was advanced by 16 by the first key load, so this points at key byte 8
562*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r0]			@ vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
563*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform	@ input transform
564*8fb009dcSAndroid Build Coastguard Worker	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save short part
565*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	d12, #0			@ vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
566*8fb009dcSAndroid Build Coastguard Worker						@ vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
						@ (d12 is the low half of q6, so the single vmov.i8 above does both x86_64 steps)
567*8fb009dcSAndroid Build Coastguard Worker	mov	r0, #4			@ mov	$4,	%esi	(r0 becomes the loop counter)
568*8fb009dcSAndroid Build Coastguard Worker
569*8fb009dcSAndroid Build Coastguard Worker.Loop_schedule_192:
570*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_round
571*8fb009dcSAndroid Build Coastguard Worker	vext.8	q0, q6, q0, #8			@ vpalignr	$8,%xmm6,%xmm0,%xmm0	(low half = high half of q6, high half = low half of old q0)
572*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle		@ save key n
573*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_192_smear
574*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle		@ save key n+1
575*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_round
576*8fb009dcSAndroid Build Coastguard Worker	subs	r0, r0, #1			@ dec	%esi
577*8fb009dcSAndroid Build Coastguard Worker	beq	.Lschedule_mangle_last	@ fourth pass: the final key is written by .Lschedule_mangle_last instead
578*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle		@ save key n+2
579*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_192_smear
580*8fb009dcSAndroid Build Coastguard Worker	b	.Loop_schedule_192
581*8fb009dcSAndroid Build Coastguard Worker
582*8fb009dcSAndroid Build Coastguard Worker@@
583*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_256
584*8fb009dcSAndroid Build Coastguard Worker@@
585*8fb009dcSAndroid Build Coastguard Worker@@  256-bit specific part of key schedule.
586*8fb009dcSAndroid Build Coastguard Worker@@
587*8fb009dcSAndroid Build Coastguard Worker@@  The structure here is very similar to the 128-bit
588*8fb009dcSAndroid Build Coastguard Worker@@  schedule, but with an additional "low side" in
589*8fb009dcSAndroid Build Coastguard Worker@@  q6.  The low side's rounds are the same as the
590*8fb009dcSAndroid Build Coastguard Worker@@  high side's, except no rcon and no rotation.
591*8fb009dcSAndroid Build Coastguard Worker@@
592*8fb009dcSAndroid Build Coastguard Worker.align	4	@ padding here is never executed: the instruction before it is an unconditional branch
593*8fb009dcSAndroid Build Coastguard Worker.Lschedule_256:
594*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r0]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
595*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform	@ input transform
596*8fb009dcSAndroid Build Coastguard Worker	mov	r0, #7			@ mov	$7, %esi	(r0 becomes the loop counter)
597*8fb009dcSAndroid Build Coastguard Worker
598*8fb009dcSAndroid Build Coastguard Worker.Loop_schedule_256:
599*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle		@ output low result
600*8fb009dcSAndroid Build Coastguard Worker	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
601*8fb009dcSAndroid Build Coastguard Worker
602*8fb009dcSAndroid Build Coastguard Worker	@ high round
603*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_round
604*8fb009dcSAndroid Build Coastguard Worker	subs	r0, r0, #1			@ dec	%esi
605*8fb009dcSAndroid Build Coastguard Worker	beq	.Lschedule_mangle_last	@ seventh pass: the final key is written by .Lschedule_mangle_last instead
606*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_mangle
607*8fb009dcSAndroid Build Coastguard Worker
608*8fb009dcSAndroid Build Coastguard Worker	@ low round. swap q7 and q6 (xmm7 and xmm6 in the x86_64 comments), using q5 as the temporary
609*8fb009dcSAndroid Build Coastguard Worker	vdup.32	q0, d1[1]		@ vpshufd	$0xFF,	%xmm0,	%xmm0
610*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q4, #0			@ _vpaes_schedule_low_round uses q4 as the zero source for its vext-based shifts
611*8fb009dcSAndroid Build Coastguard Worker	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
612*8fb009dcSAndroid Build Coastguard Worker	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
613*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_low_round
614*8fb009dcSAndroid Build Coastguard Worker	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
615*8fb009dcSAndroid Build Coastguard Worker
616*8fb009dcSAndroid Build Coastguard Worker	b	.Loop_schedule_256
617*8fb009dcSAndroid Build Coastguard Worker
618*8fb009dcSAndroid Build Coastguard Worker@@
619*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_mangle_last
620*8fb009dcSAndroid Build Coastguard Worker@@
621*8fb009dcSAndroid Build Coastguard Worker@@  Mangler for last round of key schedule
622*8fb009dcSAndroid Build Coastguard Worker@@  Mangles q0
623*8fb009dcSAndroid Build Coastguard Worker@@    when encrypting, outputs out(q0) ^ 63
624*8fb009dcSAndroid Build Coastguard Worker@@    when decrypting, outputs unskew(q0)
625*8fb009dcSAndroid Build Coastguard Worker@@
626*8fb009dcSAndroid Build Coastguard Worker@@  Always called right before return... jumps to cleanup and exits
627*8fb009dcSAndroid Build Coastguard Worker@@
628*8fb009dcSAndroid Build Coastguard Worker.align	4	@ padding here is never executed: the instruction before it is an unconditional branch
629*8fb009dcSAndroid Build Coastguard Worker.Lschedule_mangle_last:
630*8fb009dcSAndroid Build Coastguard Worker	@ schedule last round key from q0 (xmm0 in the x86_64 comments)
631*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_deskew			@ lea	.Lk_deskew(%rip),%r11	# prepare to deskew
632*8fb009dcSAndroid Build Coastguard Worker	tst	r3, r3
633*8fb009dcSAndroid Build Coastguard Worker	bne	.Lschedule_mangle_last_dec	@ decrypting: keep r11 = .Lk_deskew and skip the permute
634*8fb009dcSAndroid Build Coastguard Worker
635*8fb009dcSAndroid Build Coastguard Worker	@ encrypting
636*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
637*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_opt		@ lea		.Lk_opt(%rip),	%r11		# prepare to output transform
638*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, #32		@ add		$32,	%rdx	(net +16 once the shared sub below has run)
639*8fb009dcSAndroid Build Coastguard Worker	vmov	q2, q0			@ copy so the vtbl table (q2) and destination (q0) do not overlap
640*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
641*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q2}, d3
642*8fb009dcSAndroid Build Coastguard Worker
643*8fb009dcSAndroid Build Coastguard Worker.Lschedule_mangle_last_dec:	@ shared tail: the encrypt path falls through into it
644*8fb009dcSAndroid Build Coastguard Worker	sub	r2, r2, #16			@ add	$-16,	%rdx
645*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q12			@ vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
646*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform	@ output transform
647*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r2]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
648*8fb009dcSAndroid Build Coastguard Worker
649*8fb009dcSAndroid Build Coastguard Worker	@ cleanup: zero q0-q7 (presumably so no key material is left in registers).
	@ NOTE(review): q4-q7 (d8-d15) are zeroed, not restored; callers are
	@ assumed to save and restore them -- confirm against the call sites.
650*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
651*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
652*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
653*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
654*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
655*8fb009dcSAndroid Build Coastguard Worker	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
656*8fb009dcSAndroid Build Coastguard Worker	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
657*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
658*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r3,pc}		@ return (pops the r3/lr pair pushed at function entry; lr goes into pc)
659*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_schedule_core,.-_vpaes_schedule_core
660*8fb009dcSAndroid Build Coastguard Worker
661*8fb009dcSAndroid Build Coastguard Worker@@
662*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_192_smear
663*8fb009dcSAndroid Build Coastguard Worker@@
664*8fb009dcSAndroid Build Coastguard Worker@@  Smear the short, low side in the 192-bit key schedule.
665*8fb009dcSAndroid Build Coastguard Worker@@
666*8fb009dcSAndroid Build Coastguard Worker@@  Inputs:
667*8fb009dcSAndroid Build Coastguard Worker@@    q7: high side, b  a  x  y
668*8fb009dcSAndroid Build Coastguard Worker@@    q6:  low side, d  c  0  0
669*8fb009dcSAndroid Build Coastguard Worker@@
670*8fb009dcSAndroid Build Coastguard Worker@@  Outputs:
671*8fb009dcSAndroid Build Coastguard Worker@@    q6: b+c+d  b+c  0  0
672*8fb009dcSAndroid Build Coastguard Worker@@    q0: b+c+d  b+c  b  a
673*8fb009dcSAndroid Build Coastguard Worker@@
674*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_schedule_192_smear,%function
675*8fb009dcSAndroid Build Coastguard Worker.align	4
676*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_192_smear:	@ makes no calls; reads q6 and q7, writes q0, q1 and q6
677*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q1, #0			@ q1 = 0; fully overwritten by the vshl two lines below
678*8fb009dcSAndroid Build Coastguard Worker	vdup.32	q0, d15[1]		@ broadcast the top 32-bit word of q7 to all four lanes of q0
679*8fb009dcSAndroid Build Coastguard Worker	vshl.i64	q1, q6, #32		@ vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
680*8fb009dcSAndroid Build Coastguard Worker	vmov	d0, d15		@ vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
681*8fb009dcSAndroid Build Coastguard Worker	veor	q6, q6, q1		@ vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
682*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
683*8fb009dcSAndroid Build Coastguard Worker	veor	q6, q6, q0		@ vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
684*8fb009dcSAndroid Build Coastguard Worker	vmov	q0, q6			@ vmovdqa	%xmm6,	%xmm0
685*8fb009dcSAndroid Build Coastguard Worker	vmov	d12, d2		@ vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
686*8fb009dcSAndroid Build Coastguard Worker	bx	lr
687*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
688*8fb009dcSAndroid Build Coastguard Worker
689*8fb009dcSAndroid Build Coastguard Worker@@
690*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_round
691*8fb009dcSAndroid Build Coastguard Worker@@
692*8fb009dcSAndroid Build Coastguard Worker@@  Runs one main round of the key schedule on q0, q7
693*8fb009dcSAndroid Build Coastguard Worker@@
694*8fb009dcSAndroid Build Coastguard Worker@@  Specifically, runs subbytes on the high dword of q0
695*8fb009dcSAndroid Build Coastguard Worker@@  then rotates it by one byte and xors into the low dword of
696*8fb009dcSAndroid Build Coastguard Worker@@  q7.
697*8fb009dcSAndroid Build Coastguard Worker@@
698*8fb009dcSAndroid Build Coastguard Worker@@  Adds rcon from low byte of q8, then rotates q8 for
699*8fb009dcSAndroid Build Coastguard Worker@@  next rcon.
700*8fb009dcSAndroid Build Coastguard Worker@@
701*8fb009dcSAndroid Build Coastguard Worker@@  Smears the dwords of q7 by xoring the low into the
702*8fb009dcSAndroid Build Coastguard Worker@@  second low, result into third, result into highest.
703*8fb009dcSAndroid Build Coastguard Worker@@
704*8fb009dcSAndroid Build Coastguard Worker@@  Returns results in q7 = q0.
705*8fb009dcSAndroid Build Coastguard Worker@@  Clobbers q1-q4, q14-q15, r11.
706*8fb009dcSAndroid Build Coastguard Worker@@
707*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_schedule_round,%function
708*8fb009dcSAndroid Build Coastguard Worker.align	4
709*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_round:
	@ High-round entry: XORs one byte taken from q8 into q7, rotates q8 by
	@ one byte for the next call, broadcasts the top word of q0 and rotates
	@ it by one byte, then falls through into the low round.
710*8fb009dcSAndroid Build Coastguard Worker	@ extract rcon from q8 (xmm8 in the x86_64 comments)
711*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
712*8fb009dcSAndroid Build Coastguard Worker	vext.8	q1, q8, q4, #15		@ vpalignr	$15,	%xmm8,	%xmm4,	%xmm1	(q1 = byte 15 of q8 in byte 0, zeros above)
713*8fb009dcSAndroid Build Coastguard Worker	vext.8	q8, q8, q8, #15	@ vpalignr	$15,	%xmm8,	%xmm8,	%xmm8	(rotate q8 by one byte)
714*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
715*8fb009dcSAndroid Build Coastguard Worker
716*8fb009dcSAndroid Build Coastguard Worker	@ rotate
717*8fb009dcSAndroid Build Coastguard Worker	vdup.32	q0, d1[1]			@ vpshufd	$0xFF,	%xmm0,	%xmm0
718*8fb009dcSAndroid Build Coastguard Worker	vext.8	q0, q0, q0, #1			@ vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
719*8fb009dcSAndroid Build Coastguard Worker
720*8fb009dcSAndroid Build Coastguard Worker	@ fall through...
721*8fb009dcSAndroid Build Coastguard Worker
722*8fb009dcSAndroid Build Coastguard Worker	@ low round: same as high round, but no rotation and no rcon.
	@ Entry requirement: q4 must already be zero, because the vext-based
	@ shifts below use it as the source of zero bytes. The high-round path
	@ zeroes it above; direct callers must do so themselves.
723*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_low_round:
724*8fb009dcSAndroid Build Coastguard Worker	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
725*8fb009dcSAndroid Build Coastguard Worker	@ We pin other values in _vpaes_key_preheat, so load them now.
726*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_sb1
727*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]	@ q14 = 16 bytes at .Lk_sb1, q15 = the next 16 bytes
728*8fb009dcSAndroid Build Coastguard Worker
729*8fb009dcSAndroid Build Coastguard Worker	@ smear q7 (xmm7 in the x86_64 comments)
730*8fb009dcSAndroid Build Coastguard Worker	vext.8	q1, q4, q7, #12			@ vpslldq	$4,	%xmm7,	%xmm1
731*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
732*8fb009dcSAndroid Build Coastguard Worker	vext.8	q4, q4, q7, #8			@ vpslldq	$8,	%xmm7,	%xmm4
733*8fb009dcSAndroid Build Coastguard Worker
734*8fb009dcSAndroid Build Coastguard Worker	@ subbytes
735*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
736*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4			@ vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
737*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
738*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q11}, d2		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
739*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q11}, d3
740*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
741*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d0		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
742*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d1
743*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
744*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q10}, d2		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
745*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q10}, d3
746*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q7, q12			@ vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
747*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q10}, d6		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
748*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q10}, d7
749*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
750*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q10}, d8		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
751*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q10}, d9
752*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
753*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
754*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q15}, d6		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
755*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q15}, d7
756*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q14}, d4		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
757*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q14}, d5
758*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
759*8fb009dcSAndroid Build Coastguard Worker
760*8fb009dcSAndroid Build Coastguard Worker	@ add in smeared stuff
	@ Both q0 and q7 receive the same q1 ^ q7; the second XOR is computed
	@ independently rather than copying q0.
761*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
762*8fb009dcSAndroid Build Coastguard Worker	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
763*8fb009dcSAndroid Build Coastguard Worker	bx	lr
764*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_schedule_round,.-_vpaes_schedule_round
765*8fb009dcSAndroid Build Coastguard Worker
766*8fb009dcSAndroid Build Coastguard Worker@@
767*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_transform
768*8fb009dcSAndroid Build Coastguard Worker@@
769*8fb009dcSAndroid Build Coastguard Worker@@  Linear-transform q0 according to tables at [r11]
770*8fb009dcSAndroid Build Coastguard Worker@@
771*8fb009dcSAndroid Build Coastguard Worker@@  Requires that q9 = 0x0F0F... as in preheat
772*8fb009dcSAndroid Build Coastguard Worker@@  Output in q0
773*8fb009dcSAndroid Build Coastguard Worker@@  Clobbers q1, q2, q14, q15
774*8fb009dcSAndroid Build Coastguard Worker@@
775*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_schedule_transform,%function
776*8fb009dcSAndroid Build Coastguard Worker.align	4
777*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_transform:
	@ Splits every byte of q0 into its low and high nibble, looks each up in
	@ its own 16-byte table ([r11] for low nibbles, [r11+16] for high
	@ nibbles) and XORs the two results into q0. r11 is left unchanged
	@ (the load uses no writeback).
778*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
779*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	16(%r11),	%xmm1 # hi
780*8fb009dcSAndroid Build Coastguard Worker	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1	(q1 = low nibbles)
781*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	(q0 = high nibbles)
782*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
783*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d3
784*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d0, {q15}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
785*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d1, {q15}, d1
786*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
787*8fb009dcSAndroid Build Coastguard Worker	bx	lr
788*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
789*8fb009dcSAndroid Build Coastguard Worker
790*8fb009dcSAndroid Build Coastguard Worker@@
791*8fb009dcSAndroid Build Coastguard Worker@@  .aes_schedule_mangle
792*8fb009dcSAndroid Build Coastguard Worker@@
793*8fb009dcSAndroid Build Coastguard Worker@@  Mangles q0 from (basis-transformed) standard version
794*8fb009dcSAndroid Build Coastguard Worker@@  to our version.
795*8fb009dcSAndroid Build Coastguard Worker@@
796*8fb009dcSAndroid Build Coastguard Worker@@  On encrypt,
797*8fb009dcSAndroid Build Coastguard Worker@@    xor with 0x63
798*8fb009dcSAndroid Build Coastguard Worker@@    multiply by circulant 0,1,1,1
799*8fb009dcSAndroid Build Coastguard Worker@@    apply shiftrows transform
800*8fb009dcSAndroid Build Coastguard Worker@@
801*8fb009dcSAndroid Build Coastguard Worker@@  On decrypt,
802*8fb009dcSAndroid Build Coastguard Worker@@    xor with 0x63
803*8fb009dcSAndroid Build Coastguard Worker@@    multiply by "inverse mixcolumns" circulant E,B,D,9
804*8fb009dcSAndroid Build Coastguard Worker@@    deskew
805*8fb009dcSAndroid Build Coastguard Worker@@    apply shiftrows transform
806*8fb009dcSAndroid Build Coastguard Worker@@
807*8fb009dcSAndroid Build Coastguard Worker@@
808*8fb009dcSAndroid Build Coastguard Worker@@  Writes out to [r2], and increments or decrements it
809*8fb009dcSAndroid Build Coastguard Worker@@  Keeps track of round number mod 4 in r8
810*8fb009dcSAndroid Build Coastguard Worker@@  Preserves q0
811*8fb009dcSAndroid Build Coastguard Worker@@  Clobbers q1-q5
812*8fb009dcSAndroid Build Coastguard Worker@@
813*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_schedule_mangle,%function
814*8fb009dcSAndroid Build Coastguard Worker.align	4
815*8fb009dcSAndroid Build Coastguard Worker_vpaes_schedule_mangle:
@ Mangles one round key (q0) and stores the result through r2.
@
@ Register roles, as used by the code below:
@   r3       direction flag: zero takes the encrypt path, non-zero branches
@            to .Lschedule_mangle_dec.
@   r2       output pointer; moved by +16 (encrypt) or -16 (decrypt) before
@            the single store at the end.
@   r8       address of the permutation loaded into q1 and applied in
@            .Lschedule_mangle_both; stepped afterwards (see there).
@   q0       input round key; copied to q4 and never written here.
@   q9, q12  read as constants that are set up outside this routine
@            (presumably the 0x0f nibble mask and the s63 constant, going by
@            the x86 annotations -- confirm against the caller).
@   r11, q1-q5 are scratch; the decrypt path also overwrites q13-q15.
816*8fb009dcSAndroid Build Coastguard Worker	tst	r3, r3			@ Z set when r3 == 0; consumed by the bne below (nothing in between sets flags)
817*8fb009dcSAndroid Build Coastguard Worker	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
818*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
819*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q5}, [r11]		@ vmovdqa	.Lk_mc_forward(%rip),%xmm5
820*8fb009dcSAndroid Build Coastguard Worker	bne	.Lschedule_mangle_dec
821*8fb009dcSAndroid Build Coastguard Worker
822*8fb009dcSAndroid Build Coastguard Worker	@ encrypting
	@ With k = q0 ^ q12 and P = the q5 byte permutation, the sequence below
	@ leaves q3 = P(k) ^ P(P(k)) ^ P(P(P(k))).
823*8fb009dcSAndroid Build Coastguard Worker	@ Write to q2 so we do not overlap table and destination below.
824*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q0, q12		@ vpxor		.Lk_s63(%rip),	%xmm0,	%xmm4
825*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, #16		@ add		$16,	%rdx
826*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q2}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm4
827*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q2}, d11
828*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q4}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm1
829*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q4}, d11
830*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q1}, d10	@ vpshufb	%xmm5,	%xmm1,	%xmm3
831*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q1}, d11
832*8fb009dcSAndroid Build Coastguard Worker	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
833*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
834*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
835*8fb009dcSAndroid Build Coastguard Worker
836*8fb009dcSAndroid Build Coastguard Worker	b	.Lschedule_mangle_both
837*8fb009dcSAndroid Build Coastguard Worker.align	4
838*8fb009dcSAndroid Build Coastguard Worker.Lschedule_mangle_dec:
839*8fb009dcSAndroid Build Coastguard Worker	@ inverse mix columns
	@ Split every key byte into nibbles: q1 = byte >> 4, q4 = byte & q9.
	@ Four 32-byte table pairs are then read in sequence from .Lk_dksd via
	@ the post-incremented r11; in each pair q14 is indexed by q4 (low
	@ nibble) and q15 by q1 (high nibble). Between pairs the running value
	@ is permuted by q5 and XORed with the next lookup.
840*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_dksd 		@ lea		.Lk_dksd(%rip),%r11
841*8fb009dcSAndroid Build Coastguard Worker	vshr.u8	q1, q4, #4		@ vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
842*8fb009dcSAndroid Build Coastguard Worker	vand	q4, q4, q9		@ vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
843*8fb009dcSAndroid Build Coastguard Worker
844*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x00(%r11),	%xmm2
845*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x10(%r11),	%xmm3
846*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
847*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d9
848*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
849*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q15}, d3
850*8fb009dcSAndroid Build Coastguard Worker	@ Load .Lk_dksb ahead of time.
851*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x20(%r11),	%xmm2
852*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x30(%r11),	%xmm3
853*8fb009dcSAndroid Build Coastguard Worker	@ Write to q13 so we do not overlap table and destination.
854*8fb009dcSAndroid Build Coastguard Worker	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
855*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
856*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q13}, d11
857*8fb009dcSAndroid Build Coastguard Worker
858*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
859*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d9
860*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
861*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
862*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q15}, d3
863*8fb009dcSAndroid Build Coastguard Worker	@ Load .Lk_dkse ahead of time.
864*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x40(%r11),	%xmm2
865*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x50(%r11),	%xmm3
866*8fb009dcSAndroid Build Coastguard Worker	@ Write to q13 so we do not overlap table and destination.
867*8fb009dcSAndroid Build Coastguard Worker	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
868*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
869*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q13}, d11
870*8fb009dcSAndroid Build Coastguard Worker
871*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
872*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d9
873*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
874*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
875*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q15}, d3
876*8fb009dcSAndroid Build Coastguard Worker	@ Load .Lk_dks9 ahead of time (the fourth and last table pair).
877*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x60(%r11),	%xmm2
878*8fb009dcSAndroid Build Coastguard Worker					@ vmovdqa	0x70(%r11),	%xmm4
879*8fb009dcSAndroid Build Coastguard Worker	@ Write to q13 so we do not overlap table and destination.
880*8fb009dcSAndroid Build Coastguard Worker	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
881*8fb009dcSAndroid Build Coastguard Worker
882*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
883*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q14}, d9
884*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
885*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d7, {q13}, d11
	@ The last high-nibble lookup lands in q4; the low-nibble indices it
	@ held were last needed by the q14 lookup just above.
886*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d8, {q15}, d2	@ vpshufb	%xmm1,	%xmm4,	%xmm4
887*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d9, {q15}, d3
888*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
889*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q3		@ vpxor	%xmm3,	%xmm2,	%xmm2
890*8fb009dcSAndroid Build Coastguard Worker	veor	q3, q4, q2		@ vpxor	%xmm2,	%xmm4,	%xmm3
891*8fb009dcSAndroid Build Coastguard Worker
892*8fb009dcSAndroid Build Coastguard Worker	sub	r2, r2, #16		@ add	$-16,	%rdx
893*8fb009dcSAndroid Build Coastguard Worker
894*8fb009dcSAndroid Build Coastguard Worker.Lschedule_mangle_both:
	@ Both paths arrive with the mangled key in q3, the permutation loaded
	@ from [r8] in q1, and r2 already moved to the destination slot.
895*8fb009dcSAndroid Build Coastguard Worker	@ Write to q2 so table and destination do not overlap.
896*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
897*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q3}, d3
	@ Step r8 for the next call: add 48, then clear bit 6. This equals
	@ "subtract 16 modulo 64" only if the table r8 points into starts at an
	@ address whose low seven bits are zero (presumably .Lk_sr inside the
	@ 128-byte-aligned _vpaes_consts -- confirm).
898*8fb009dcSAndroid Build Coastguard Worker	add	r8, r8, #64-16		@ add	$-16,	%r8
899*8fb009dcSAndroid Build Coastguard Worker	and	r8, r8, #~(1<<6)	@ and	$0x30,	%r8
900*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q2}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
901*8fb009dcSAndroid Build Coastguard Worker	bx	lr
902*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
903*8fb009dcSAndroid Build Coastguard Worker
@ vpaes_set_encrypt_key
@   In:   r1 = key size in bits; r2 = key schedule to fill. r0 is not read
@         here and reaches _vpaes_schedule_core unchanged (presumably the
@         user key pointer -- confirm against the C prototype).
@   Out:  r0 = 0. The round count r1/32 + 5 is stored at byte offset 240
@         of the schedule.
@   Saves and restores r7-r11 and d8-d15 around the call.
904*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_set_encrypt_key
905*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_set_encrypt_key
906*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_set_encrypt_key,%function
907*8fb009dcSAndroid Build Coastguard Worker.align	4
908*8fb009dcSAndroid Build Coastguard Workervpaes_set_encrypt_key:
909*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
910*8fb009dcSAndroid Build Coastguard Worker	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
911*8fb009dcSAndroid Build Coastguard Worker
912*8fb009dcSAndroid Build Coastguard Worker	lsr	r9, r1, #5		@ shr	$5,%eax
913*8fb009dcSAndroid Build Coastguard Worker	add	r9, r9, #5		@ $5,%eax
914*8fb009dcSAndroid Build Coastguard Worker	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
915*8fb009dcSAndroid Build Coastguard Worker
	@ r3 = 0 selects the encrypt direction (see the r3 test in
	@ _vpaes_schedule_mangle); r8 = 0x30 is the initial value of the
	@ rotating index consumed by the schedule code.
916*8fb009dcSAndroid Build Coastguard Worker	mov	r3, #0		@ mov	$0,%ecx
917*8fb009dcSAndroid Build Coastguard Worker	mov	r8, #0x30		@ mov	$0x30,%r8d
918*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_core
919*8fb009dcSAndroid Build Coastguard Worker	eor	r0, r0, r0		@ return 0
920*8fb009dcSAndroid Build Coastguard Worker
921*8fb009dcSAndroid Build Coastguard Worker	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
922*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
923*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
924*8fb009dcSAndroid Build Coastguard Worker
@ vpaes_set_decrypt_key
@   In:   r1 = key size in bits; r2 = key schedule to fill. r0 is not read
@         here and reaches _vpaes_schedule_core unchanged (presumably the
@         user key pointer -- confirm against the C prototype).
@   Out:  r0 = 0. The round count r1/32 + 5 is stored at byte offset 240
@         of the schedule.
@   Saves and restores r7-r11 and d8-d15 around the call.
925*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_set_decrypt_key
926*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_set_decrypt_key
927*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_set_decrypt_key,%function
928*8fb009dcSAndroid Build Coastguard Worker.align	4
929*8fb009dcSAndroid Build Coastguard Workervpaes_set_decrypt_key:
930*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
931*8fb009dcSAndroid Build Coastguard Worker	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
932*8fb009dcSAndroid Build Coastguard Worker
933*8fb009dcSAndroid Build Coastguard Worker	lsr	r9, r1, #5		@ shr	$5,%eax
934*8fb009dcSAndroid Build Coastguard Worker	add	r9, r9, #5		@ $5,%eax
935*8fb009dcSAndroid Build Coastguard Worker	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	@ Start r2 at schedule + 16 + 16*rounds: the decrypt direction of
	@ _vpaes_schedule_mangle moves the output pointer down by 16 per key.
936*8fb009dcSAndroid Build Coastguard Worker	lsl	r9, r9, #4		@ shl	$4,%eax
937*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, #16		@ lea	16(%rdx,%rax),%rdx
938*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, r9
939*8fb009dcSAndroid Build Coastguard Worker
	@ r3 = 1 selects the decrypt direction. r8 = ((bits >> 1) & 32) ^ 32,
	@ i.e. 0 for 192-bit keys and 32 for 128- and 256-bit keys.
940*8fb009dcSAndroid Build Coastguard Worker	mov	r3, #1		@ mov	$1,%ecx
941*8fb009dcSAndroid Build Coastguard Worker	lsr	r8, r1, #1		@ shr	$1,%r8d
942*8fb009dcSAndroid Build Coastguard Worker	and	r8, r8, #32		@ and	$32,%r8d
943*8fb009dcSAndroid Build Coastguard Worker	eor	r8, r8, #32		@ xor	$32,%r8d	# nbits==192?0:32
944*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_core
	@ Return 0, exactly as vpaes_set_encrypt_key does. Without this, r0
	@ would carry whatever _vpaes_schedule_core left in it back to the
	@ caller as the result.
	eor	r0, r0, r0
945*8fb009dcSAndroid Build Coastguard Worker
946*8fb009dcSAndroid Build Coastguard Worker	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
947*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
948*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
949*8fb009dcSAndroid Build Coastguard Worker
950*8fb009dcSAndroid Build Coastguard Worker@ Additional constants for converting to bsaes.
951*8fb009dcSAndroid Build Coastguard Worker.type	_vpaes_convert_consts,%object
952*8fb009dcSAndroid Build Coastguard Worker.align	4
953*8fb009dcSAndroid Build Coastguard Worker_vpaes_convert_consts:
954*8fb009dcSAndroid Build Coastguard Worker@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
955*8fb009dcSAndroid Build Coastguard Worker@ transform in the AES S-box. 0x63 is incorporated into the low half of the
956*8fb009dcSAndroid Build Coastguard Worker@ table. This was computed with the following script:
957*8fb009dcSAndroid Build Coastguard Worker@
958*8fb009dcSAndroid Build Coastguard Worker@   def u64s_to_u128(x, y):
959*8fb009dcSAndroid Build Coastguard Worker@       return x | (y << 64)
960*8fb009dcSAndroid Build Coastguard Worker@   def u128_to_u64s(w):
961*8fb009dcSAndroid Build Coastguard Worker@       return w & ((1<<64)-1), w >> 64
962*8fb009dcSAndroid Build Coastguard Worker@   def get_byte(w, i):
963*8fb009dcSAndroid Build Coastguard Worker@       return (w >> (i*8)) & 0xff
964*8fb009dcSAndroid Build Coastguard Worker@   def apply_table(table, b):
965*8fb009dcSAndroid Build Coastguard Worker@       lo = b & 0xf
966*8fb009dcSAndroid Build Coastguard Worker@       hi = b >> 4
967*8fb009dcSAndroid Build Coastguard Worker@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
968*8fb009dcSAndroid Build Coastguard Worker@   def opt(b):
969*8fb009dcSAndroid Build Coastguard Worker@       table = [
970*8fb009dcSAndroid Build Coastguard Worker@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
971*8fb009dcSAndroid Build Coastguard Worker@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
972*8fb009dcSAndroid Build Coastguard Worker@       ]
973*8fb009dcSAndroid Build Coastguard Worker@       return apply_table(table, b)
974*8fb009dcSAndroid Build Coastguard Worker@   def rot_byte(b, n):
975*8fb009dcSAndroid Build Coastguard Worker@       return 0xff & ((b << n) | (b >> (8-n)))
976*8fb009dcSAndroid Build Coastguard Worker@   def skew(x):
977*8fb009dcSAndroid Build Coastguard Worker@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
978*8fb009dcSAndroid Build Coastguard Worker@               rot_byte(x, 4))
979*8fb009dcSAndroid Build Coastguard Worker@   table = [0, 0]
980*8fb009dcSAndroid Build Coastguard Worker@   for i in range(16):
981*8fb009dcSAndroid Build Coastguard Worker@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
982*8fb009dcSAndroid Build Coastguard Worker@       table[1] |= skew(opt(i<<4)) << (i*8)
983*8fb009dcSAndroid Build Coastguard Worker@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[0]))
984*8fb009dcSAndroid Build Coastguard Worker@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[1]))
@
@ Layout, per the script: the first 16 bytes are indexed by the low nibble
@ of the input byte (and carry the XOR of 0x63), the second 16 bytes by the
@ high nibble; the two looked-up bytes are XORed together.
985*8fb009dcSAndroid Build Coastguard Worker.Lk_opt_then_skew:
986*8fb009dcSAndroid Build Coastguard Worker.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
987*8fb009dcSAndroid Build Coastguard Worker.quad	0x1f30062936192f00, 0xb49bad829db284ab
988*8fb009dcSAndroid Build Coastguard Worker
989*8fb009dcSAndroid Build Coastguard Worker@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
990*8fb009dcSAndroid Build Coastguard Worker@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
991*8fb009dcSAndroid Build Coastguard Worker@ becomes 0x22334411 and then 0x11443322.
@ As stored, the per-word byte indices are 2,1,0,3 (lowest address first):
@ output byte i of each word is input byte index[i] of the same word.
992*8fb009dcSAndroid Build Coastguard Worker.Lk_decrypt_transform:
993*8fb009dcSAndroid Build Coastguard Worker.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
994*8fb009dcSAndroid Build Coastguard Worker.size	_vpaes_convert_consts,.-_vpaes_convert_consts
995*8fb009dcSAndroid Build Coastguard Worker
996*8fb009dcSAndroid Build Coastguard Worker@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
@
@ Converts a vpaes encryption key schedule into the representation that
@ bsaes expects.
@   In:   r0 = bsaes (destination), r1 = vpaes (source).
@   Uses: r2 (table address, then remaining-key counter), r3 (rotating
@         address of the ShiftRows-inverting permutation), r11 (table
@         pointer for _vpaes_schedule_transform), q0-q2 (key material,
@         zeroed before returning), q9-q12 (constants).
@   r11 and lr are saved and restored on the stack.
997*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_encrypt_key_to_bsaes
998*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_encrypt_key_to_bsaes
999*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_encrypt_key_to_bsaes,%function
1000*8fb009dcSAndroid Build Coastguard Worker.align	4
1001*8fb009dcSAndroid Build Coastguard Workervpaes_encrypt_key_to_bsaes:
1002*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r11, lr}
1003*8fb009dcSAndroid Build Coastguard Worker
1004*8fb009dcSAndroid Build Coastguard Worker	@ See _vpaes_schedule_core for the key schedule logic. In particular,
1005*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
1006*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
1007*8fb009dcSAndroid Build Coastguard Worker	@ contain the transformations not in the bsaes representation. This
1008*8fb009dcSAndroid Build Coastguard Worker	@ function inverts those transforms.
1009*8fb009dcSAndroid Build Coastguard Worker	@
1010*8fb009dcSAndroid Build Coastguard Worker	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
1011*8fb009dcSAndroid Build Coastguard Worker	@ representation, which does not match the other aes_nohw_*
1012*8fb009dcSAndroid Build Coastguard Worker	@ implementations. The ARM aes_nohw_* stores each 32-bit word
1013*8fb009dcSAndroid Build Coastguard Worker	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
1014*8fb009dcSAndroid Build Coastguard Worker	@ cost of extra REV and VREV32 operations in little-endian ARM.
1015*8fb009dcSAndroid Build Coastguard Worker
1016*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
1017*8fb009dcSAndroid Build Coastguard Worker	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
1018*8fb009dcSAndroid Build Coastguard Worker	add	r3, r2, 0x90		@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
1019*8fb009dcSAndroid Build Coastguard Worker
1020*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12}, [r2]		@ q12 = first row of .Lk_mc_forward, the rotation used for the circulant below
1021*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q10, #0x5b		@ .Lk_s63 from vpaes-x86_64
1022*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_opt		@ Must be aligned to 8 mod 16.
1023*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q11, #0x63		@ .Lk_s63 without .Lk_ipt applied
1024*8fb009dcSAndroid Build Coastguard Worker
1025*8fb009dcSAndroid Build Coastguard Worker	@ vpaes stores one fewer round count than bsaes, but the number of keys
1026*8fb009dcSAndroid Build Coastguard Worker	@ is the same.
	@ From here on r2 doubles as the number of keys left after the first.
1027*8fb009dcSAndroid Build Coastguard Worker	ldr	r2, [r1,#240]
1028*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, #1
1029*8fb009dcSAndroid Build Coastguard Worker	str	r2, [r0,#240]
1030*8fb009dcSAndroid Build Coastguard Worker
1031*8fb009dcSAndroid Build Coastguard Worker	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
1032*8fb009dcSAndroid Build Coastguard Worker	@ Invert this with .Lk_opt.
1033*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r1]!
1034*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform
1035*8fb009dcSAndroid Build Coastguard Worker	vrev32.8	q0, q0
1036*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r0]!
1037*8fb009dcSAndroid Build Coastguard Worker
1038*8fb009dcSAndroid Build Coastguard Worker	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
1039*8fb009dcSAndroid Build Coastguard Worker	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
1040*8fb009dcSAndroid Build Coastguard Worker	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
1041*8fb009dcSAndroid Build Coastguard Worker.Loop_enc_key_to_bsaes:
1042*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r1]!
1043*8fb009dcSAndroid Build Coastguard Worker
1044*8fb009dcSAndroid Build Coastguard Worker	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
1045*8fb009dcSAndroid Build Coastguard Worker	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
1046*8fb009dcSAndroid Build Coastguard Worker	@ We use r3 rather than r8 to avoid a callee-saved register.
1047*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r3]
1048*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q0}, d2
1049*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q0}, d3
1050*8fb009dcSAndroid Build Coastguard Worker	add	r3, r3, #16
1051*8fb009dcSAndroid Build Coastguard Worker	and	r3, r3, #~(1<<6)
1052*8fb009dcSAndroid Build Coastguard Worker	vmov	q0, q2
1053*8fb009dcSAndroid Build Coastguard Worker
1054*8fb009dcSAndroid Build Coastguard Worker	@ Handle the last key differently.
1055*8fb009dcSAndroid Build Coastguard Worker	subs	r2, r2, #1
1056*8fb009dcSAndroid Build Coastguard Worker	beq	.Loop_enc_key_to_bsaes_last
1057*8fb009dcSAndroid Build Coastguard Worker
1058*8fb009dcSAndroid Build Coastguard Worker	@ Multiply by the circulant. This is its own inverse.
	@ With R = the q12 permutation and k = q0 on entry, the sequence
	@ leaves q0 = R(k) ^ R(R(k)) ^ R(R(R(k))).
1059*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q0}, d24
1060*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q0}, d25
1061*8fb009dcSAndroid Build Coastguard Worker	vmov	q0, q1
1062*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q1}, d24
1063*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q1}, d25
1064*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q2
1065*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q2}, d24
1066*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q2}, d25
1067*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q1
1068*8fb009dcSAndroid Build Coastguard Worker
1069*8fb009dcSAndroid Build Coastguard Worker	@ XOR and finish.
1070*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q10
1071*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform
1072*8fb009dcSAndroid Build Coastguard Worker	vrev32.8	q0, q0
1073*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r0]!
1074*8fb009dcSAndroid Build Coastguard Worker	b	.Loop_enc_key_to_bsaes
1075*8fb009dcSAndroid Build Coastguard Worker
1076*8fb009dcSAndroid Build Coastguard Worker.Loop_enc_key_to_bsaes_last:
1077*8fb009dcSAndroid Build Coastguard Worker	@ The final key does not have a basis transform (note
1078*8fb009dcSAndroid Build Coastguard Worker	@ .Lschedule_mangle_last inverts the original transform). It only XORs
1079*8fb009dcSAndroid Build Coastguard Worker	@ 0x63 and applies ShiftRows. The latter was already inverted in the
1080*8fb009dcSAndroid Build Coastguard Worker	@ loop. Note that, because we act on the original representation, we use
1081*8fb009dcSAndroid Build Coastguard Worker	@ q11, not q10.
1082*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q11
1083*8fb009dcSAndroid Build Coastguard Worker	vrev32.8	q0, q0
1084*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r0]
1085*8fb009dcSAndroid Build Coastguard Worker
1086*8fb009dcSAndroid Build Coastguard Worker	@ Wipe registers which contained key material.
1087*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q0
1088*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q1
1089*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q2
1090*8fb009dcSAndroid Build Coastguard Worker
1091*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r11, pc}	@ return
1092*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
1093*8fb009dcSAndroid Build Coastguard Worker
1094*8fb009dcSAndroid Build Coastguard Worker@ void vpaes_decrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
@
@ Converts a vpaes decryption key schedule into the representation that
@ bsaes expects.
@   In:   r0 = bsaes (destination; every store goes through r0),
@         r1 = vpaes (source; every load of key data goes through r1).
@   Uses: r2 (table address, then remaining-key counter), r3 (rotating
@         address of the ShiftRows-inverting permutation), r11 (table
@         pointer for _vpaes_schedule_transform), q0-q2 (key material,
@         zeroed before returning), q9 and q12 (constants).
@   r11 and lr are saved and restored on the stack.
1095*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_decrypt_key_to_bsaes
1096*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_decrypt_key_to_bsaes
1097*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_decrypt_key_to_bsaes,%function
1098*8fb009dcSAndroid Build Coastguard Worker.align	4
1099*8fb009dcSAndroid Build Coastguard Workervpaes_decrypt_key_to_bsaes:
1100*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r11, lr}
1101*8fb009dcSAndroid Build Coastguard Worker
1102*8fb009dcSAndroid Build Coastguard Worker	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
1103*8fb009dcSAndroid Build Coastguard Worker	@ computes the decryption key schedule in reverse. Additionally,
1104*8fb009dcSAndroid Build Coastguard Worker	@ aes-x86_64.pl shares some transformations, so we must only partially
1105*8fb009dcSAndroid Build Coastguard Worker	@ invert vpaes's transformations. In general, vpaes computes in a
1106*8fb009dcSAndroid Build Coastguard Worker	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
1107*8fb009dcSAndroid Build Coastguard Worker	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
1108*8fb009dcSAndroid Build Coastguard Worker	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
1109*8fb009dcSAndroid Build Coastguard Worker	@
1110*8fb009dcSAndroid Build Coastguard Worker	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
1111*8fb009dcSAndroid Build Coastguard Worker	@ representation, which does not match the other aes_nohw_*
1112*8fb009dcSAndroid Build Coastguard Worker	@ implementations. The ARM aes_nohw_* stores each 32-bit word
1113*8fb009dcSAndroid Build Coastguard Worker	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
1114*8fb009dcSAndroid Build Coastguard Worker	@ cost of extra REV and VREV32 operations in little-endian ARM.
1115*8fb009dcSAndroid Build Coastguard Worker
1116*8fb009dcSAndroid Build Coastguard Worker	adr	r2, .Lk_decrypt_transform
1117*8fb009dcSAndroid Build Coastguard Worker	adr	r3, .Lk_sr+0x30
1118*8fb009dcSAndroid Build Coastguard Worker	adr	r11, .Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
1119*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q12}, [r2]	@ Reuse q12 from encryption.
1120*8fb009dcSAndroid Build Coastguard Worker	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
1121*8fb009dcSAndroid Build Coastguard Worker
1122*8fb009dcSAndroid Build Coastguard Worker	@ vpaes stores one fewer round count than bsaes, but the number of keys
1123*8fb009dcSAndroid Build Coastguard Worker	@ is the same.
	@ From here on r2 doubles as the number of keys left after the first.
1124*8fb009dcSAndroid Build Coastguard Worker	ldr	r2, [r1,#240]
1125*8fb009dcSAndroid Build Coastguard Worker	add	r2, r2, #1
1126*8fb009dcSAndroid Build Coastguard Worker	str	r2, [r0,#240]
1127*8fb009dcSAndroid Build Coastguard Worker
1128*8fb009dcSAndroid Build Coastguard Worker	@ Undo the basis change and reapply the S-box affine transform. See
1129*8fb009dcSAndroid Build Coastguard Worker	@ .Lschedule_mangle_last.
1130*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r1]!
1131*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform
1132*8fb009dcSAndroid Build Coastguard Worker	vrev32.8	q0, q0
1133*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r0]!
1134*8fb009dcSAndroid Build Coastguard Worker
1135*8fb009dcSAndroid Build Coastguard Worker	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
1136*8fb009dcSAndroid Build Coastguard Worker	@ it simultaneously inverts MixColumns and the S-box affine transform.
1137*8fb009dcSAndroid Build Coastguard Worker	@ See .Lk_dksd through .Lk_dks9.
1138*8fb009dcSAndroid Build Coastguard Worker.Loop_dec_key_to_bsaes:
1139*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q0}, [r1]!
1140*8fb009dcSAndroid Build Coastguard Worker
1141*8fb009dcSAndroid Build Coastguard Worker	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going
1142*8fb009dcSAndroid Build Coastguard Worker	@ forwards cancels inverting for which direction we cycle r3. We use r3
1143*8fb009dcSAndroid Build Coastguard Worker	@ rather than r8 to avoid a callee-saved register.
1144*8fb009dcSAndroid Build Coastguard Worker	vld1.64	{q1}, [r3]
1145*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d4, {q0}, d2
1146*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d5, {q0}, d3
1147*8fb009dcSAndroid Build Coastguard Worker	add	r3, r3, #64-16
1148*8fb009dcSAndroid Build Coastguard Worker	and	r3, r3, #~(1<<6)
1149*8fb009dcSAndroid Build Coastguard Worker	vmov	q0, q2
1150*8fb009dcSAndroid Build Coastguard Worker
1151*8fb009dcSAndroid Build Coastguard Worker	@ Handle the last key differently.
1152*8fb009dcSAndroid Build Coastguard Worker	subs	r2, r2, #1
1153*8fb009dcSAndroid Build Coastguard Worker	beq	.Loop_dec_key_to_bsaes_last
1154*8fb009dcSAndroid Build Coastguard Worker
1155*8fb009dcSAndroid Build Coastguard Worker	@ Undo the basis change and reapply the S-box affine transform.
1156*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_schedule_transform
1157*8fb009dcSAndroid Build Coastguard Worker
1158*8fb009dcSAndroid Build Coastguard Worker	@ Rotate each word by 8 bits (cycle the rows) and then byte-swap. We
1159*8fb009dcSAndroid Build Coastguard Worker	@ combine the two operations in .Lk_decrypt_transform.
1160*8fb009dcSAndroid Build Coastguard Worker	@
1161*8fb009dcSAndroid Build Coastguard Worker	@ TODO(davidben): Where does the rotation come from?
1162*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d2, {q0}, d24
1163*8fb009dcSAndroid Build Coastguard Worker	vtbl.8	d3, {q0}, d25
1164*8fb009dcSAndroid Build Coastguard Worker
1165*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q1}, [r0]!
1166*8fb009dcSAndroid Build Coastguard Worker	b	.Loop_dec_key_to_bsaes
1167*8fb009dcSAndroid Build Coastguard Worker
1168*8fb009dcSAndroid Build Coastguard Worker.Loop_dec_key_to_bsaes_last:
1169*8fb009dcSAndroid Build Coastguard Worker	@ The final key only inverts ShiftRows (already done in the loop). See
1170*8fb009dcSAndroid Build Coastguard Worker	@ .Lschedule_am_decrypting. Its basis is not transformed.
1171*8fb009dcSAndroid Build Coastguard Worker	vrev32.8	q0, q0
1172*8fb009dcSAndroid Build Coastguard Worker	vst1.64	{q0}, [r0]!
1173*8fb009dcSAndroid Build Coastguard Worker
1174*8fb009dcSAndroid Build Coastguard Worker	@ Wipe registers which contained key material.
1175*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q0
1176*8fb009dcSAndroid Build Coastguard Worker	veor	q1, q1, q1
1177*8fb009dcSAndroid Build Coastguard Worker	veor	q2, q2, q2
1178*8fb009dcSAndroid Build Coastguard Worker
1179*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r11, pc}	@ return
1180*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
@ vpaes_ctr32_encrypt_blocks
@   In:   r0 = input (read 16 bytes per block), r1 = output (written 16
@         bytes per block), r2 = number of blocks (0 returns immediately),
@         r3 = key, [sp] = pointer to the 16-byte IV/counter block.
@   The counter is the last four bytes of that block, read big-endian and
@   incremented once per block; the block in memory is never written back.
@   Saves and restores r7-r11 and d8-d15.
1181*8fb009dcSAndroid Build Coastguard Worker.globl	vpaes_ctr32_encrypt_blocks
1182*8fb009dcSAndroid Build Coastguard Worker.hidden	vpaes_ctr32_encrypt_blocks
1183*8fb009dcSAndroid Build Coastguard Worker.type	vpaes_ctr32_encrypt_blocks,%function
1184*8fb009dcSAndroid Build Coastguard Worker.align	4
1185*8fb009dcSAndroid Build Coastguard Workervpaes_ctr32_encrypt_blocks:
1186*8fb009dcSAndroid Build Coastguard Worker	mov	ip, sp			@ remember the entry sp so the stack argument can be read after the pushes
1187*8fb009dcSAndroid Build Coastguard Worker	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
1188*8fb009dcSAndroid Build Coastguard Worker	@ This function uses q4-q7 (d8-d15), which are callee-saved.
1189*8fb009dcSAndroid Build Coastguard Worker	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
1190*8fb009dcSAndroid Build Coastguard Worker
1191*8fb009dcSAndroid Build Coastguard Worker	cmp	r2, #0
1192*8fb009dcSAndroid Build Coastguard Worker	@ r8 is passed on the stack.
1193*8fb009dcSAndroid Build Coastguard Worker	ldr	r8, [ip]		@ ldr leaves the flags from the cmp above intact
1194*8fb009dcSAndroid Build Coastguard Worker	beq	.Lctr32_done
1195*8fb009dcSAndroid Build Coastguard Worker
1196*8fb009dcSAndroid Build Coastguard Worker	@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
	@ After the swap r3 is the remaining block count.
1197*8fb009dcSAndroid Build Coastguard Worker	mov	r9, r3
1198*8fb009dcSAndroid Build Coastguard Worker	mov	r3, r2
1199*8fb009dcSAndroid Build Coastguard Worker	mov	r2, r9
1200*8fb009dcSAndroid Build Coastguard Worker
1201*8fb009dcSAndroid Build Coastguard Worker	@ Load the IV and counter portion.
	@ q7 holds the whole 16-byte block; r7 holds its last word.
1202*8fb009dcSAndroid Build Coastguard Worker	ldr	r7, [r8, #12]
1203*8fb009dcSAndroid Build Coastguard Worker	vld1.8	{q7}, [r8]
1204*8fb009dcSAndroid Build Coastguard Worker
1205*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_preheat
1206*8fb009dcSAndroid Build Coastguard Worker	rev	r7, r7		@ The counter is big-endian.
1207*8fb009dcSAndroid Build Coastguard Worker
1208*8fb009dcSAndroid Build Coastguard Worker.Lctr32_loop:
1209*8fb009dcSAndroid Build Coastguard Worker	vmov	q0, q7
1210*8fb009dcSAndroid Build Coastguard Worker	vld1.8	{q6}, [r0]!		@ Load input ahead of time
1211*8fb009dcSAndroid Build Coastguard Worker	bl	_vpaes_encrypt_core
1212*8fb009dcSAndroid Build Coastguard Worker	veor	q0, q0, q6		@ XOR input and result
1213*8fb009dcSAndroid Build Coastguard Worker	vst1.8	{q0}, [r1]!
1214*8fb009dcSAndroid Build Coastguard Worker	subs	r3, r3, #1		@ sets the flags for the bne below; the add, rev and vmov in between leave them alone
1215*8fb009dcSAndroid Build Coastguard Worker	@ Update the counter.
	@ r7 is the counter in native byte order; it is byte-reversed into r9
	@ and written into the top 32-bit lane of q7 (d15[1]).
1216*8fb009dcSAndroid Build Coastguard Worker	add	r7, r7, #1
1217*8fb009dcSAndroid Build Coastguard Worker	rev	r9, r7
1218*8fb009dcSAndroid Build Coastguard Worker	vmov.32	d15[1], r9
1219*8fb009dcSAndroid Build Coastguard Worker	bne	.Lctr32_loop
1220*8fb009dcSAndroid Build Coastguard Worker
1221*8fb009dcSAndroid Build Coastguard Worker.Lctr32_done:
1222*8fb009dcSAndroid Build Coastguard Worker	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
1223*8fb009dcSAndroid Build Coastguard Worker	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
1224*8fb009dcSAndroid Build Coastguard Worker.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
1225*8fb009dcSAndroid Build Coastguard Worker#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
1226