xref: /aosp_15_r20/external/boringssl/src/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1#! /usr/bin/env perl
2
3# Copyright (c) 2022, ARM Inc.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
16
17#========================================================================
18# Written by Fangming Fang <[email protected]> for the OpenSSL project,
19# derived from https://github.com/ARM-software/AArch64cryptolib, original
20# author Samuel Lee <[email protected]>.
21#========================================================================
22#
23# Approach - assume we don't want to reload constants, so reserve ~half of
24# vector register file for constants
25#
26# main loop to act on 4 16B blocks per iteration, and then do modulo of the
27# accumulated intermediate hashes from the 4 blocks
28#
29#  ____________________________________________________
30# |                                                    |
31# | PRE                                                |
32# |____________________________________________________|
33# |                |                |                  |
34# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
35# |________________|________________|__________________|
36# |                |                |                  |
37# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
38# |________________|________________|__________________|
39# |                |                |                  |
40# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
41# |________________|________________|__________________|
42# |                |                |                  |
43# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
44# |________________|____(mostly)____|__________________|
45# |                                                    |
46# | MODULO                                             |
47# |____________________________________________________|
48#
49# PRE: Ensure previous generated intermediate hash is aligned and merged with
50# result for GHASH 4k+0
51#
52# EXT low_acc, low_acc, low_acc, #8
53# EOR res_curr (4k+0), res_curr (4k+0), low_acc
54#
55# CTR block: Increment and byte reverse counter in scalar registers and transfer
56# to SIMD registers
57#
58# REV     ctr32, rev_ctr32
59# ORR     ctr64, constctr96_top32, ctr32, LSL #32
60# // Keeping this in scalar registers to free up space in SIMD RF
61# INS     ctr_next.d[0], constctr96_bottom64
62# INS     ctr_next.d[1], ctr64X
63# ADD     rev_ctr32, #1
64#
65# AES block:
66#
67# Do AES encryption/decryption on CTR block X and EOR it with input block X.
68# Take a 256-bit key below as the example. Small trick here: load the input
69# into scalar registers, EOR it with the last round key, then transfer it to
70# SIMD. We are very constrained in ASIMD registers, so this is quite important.
71#
72#     Encrypt:
73# LDR     input_low, [ input_ptr  ], #8
74# LDR     input_high, [ input_ptr  ], #8
75# EOR     input_low, k14_low
76# EOR     input_high, k14_high
77# INS     res_curr.d[0], input_low
78# INS     res_curr.d[1], input_high
79# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
80# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
81# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
82# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
83# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
84# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
85# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
86# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
87# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
88# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
89# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
90# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
91# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
92# AESE    ctr_curr, k13
93# EOR     res_curr, res_curr, ctr_curr
94# ST1     { res_curr.16b  }, [ output_ptr  ], #16
95#
96#     Decrypt:
97# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
98# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
99# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
100# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
101# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
102# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
103# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
104# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
105# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
106# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
107# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
108# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
109# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
110# AESE    ctr_curr, k13
111# LDR     res_curr, [ input_ptr  ], #16
112# EOR     res_curr, res_curr, ctr_curr
113# MOV     output_low, res_curr.d[0]
114# MOV     output_high, res_curr.d[1]
115# EOR     output_low, k14_low
116# EOR     output_high, k14_high
117# STP     output_low, output_high, [ output_ptr  ], #16
118#
119# GHASH block X:
120#     Do 128b Karatsuba polynomial multiplication on the block. We only have
121#     64b->128b polynomial multipliers, so naively we would need to do four 64b
122#     multiplies to generate one 128b x 128b product.
123#
124# multiplication:
125#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
126#                   (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
127#
128#     The idea behind Karatsuba multiplication is that we can do just 3 64b
129#     multiplies:
130#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
131#                   (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^
132#                   Pmull(Al,Bl))<<64
133#
134#     There is some complication here because the bit order of GHASH's PMULL is
135#     reversed compared to elsewhere, so we are multiplying with "twisted"
136#     powers of H
137#
138# Note: We can PMULL directly into the acc_x in first GHASH of the loop
139#
140# Note: For scheduling big cores we want to split the processing to happen over
141#       two loop iterations - otherwise the critical path latency dominates the
142#       performance.
143#
144#       This has a knock on effect on register pressure, so we have to be a bit
145#       more clever with our temporary registers than indicated here
146#
147# REV64   res_curr, res_curr
148# INS     t_m.d[0], res_curr.d[1]
149# EOR     t_m.8B, t_m.8B, res_curr.8B
150# PMULL2  t_h, res_curr, HX
151# PMULL   t_l, res_curr, HX
152# PMULL   t_m, t_m, HX_k
153# EOR     acc_h, acc_h, t_h
154# EOR     acc_l, acc_l, t_l
155# EOR     acc_m, acc_m, t_m
156#
157# MODULO: take the partial accumulators (~representing sum of 256b
158#         multiplication results), from GHASH and do modulo reduction on them
159#         There is some complication here because the bit order of GHASH's
160#         PMULL is reversed compared to elsewhere, so we are doing modulo with
161#         a reversed constant
162#
163# EOR     acc_m, acc_m, acc_h
164# EOR     acc_m, acc_m, acc_l                // Finish off karatsuba processing
165# PMULL   t_mod, acc_h, mod_constant
166# EXT     acc_h, acc_h, acc_h, #8
167# EOR     acc_m, acc_m, acc_h
168# EOR     acc_m, acc_m, t_mod
169# PMULL   acc_h, acc_m, mod_constant
170# EXT     acc_m, acc_m, acc_m, #8
171# EOR     acc_l, acc_l, acc_h
172# EOR     acc_l, acc_l, acc_m
173#
174# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and
175# AES-256-GCM implementations into a single function to reduce size. We move the
176# last two round keys into consistent registers across all sizes, as they're
177# treated special. Then, after rounds 0 through 8, we added some branches to
178# conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before
179# merging back into code which finishes up the last two rounds.
180#
181# There is a decision to be made around how much parallel work goes
182# before or after the conditional part. We attempted to preserve the original
183# scheduling where possible, but it's possible other schedulings are more
184# optimal with the current ordering.
185
# Command-line arguments: the perlasm flavour and the output file name. Both are
# passed through unchanged to arm-xlate.pl below.
186$flavour = shift;
187$output  = shift;
188
# Locate arm-xlate.pl: first next to this script, then in the shared perlasm
# directory. $dir is the directory part of $0, including its trailing path
# separator, as captured by the regex.
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
191( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
192die "can't locate arm-xlate.pl";
193
# Route everything printed to STDOUT through arm-xlate.pl. The translator path
# is quoted so that a directory containing spaces does not split the command,
# and a failure to start the pipe is reported instead of being ignored.
194open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
195*STDOUT=*OUT;
196
# Start of the generated assembly text. Everything between the heredoc markers
# is emitted verbatim: it includes openssl/arm_arch.h, opens an
# `#if __ARM_MAX_ARCH__ >= 8` guard (the matching #endif is not in this part of
# the file), selects the armv8-a+crypto architecture for the assembler and
# switches to the text section.
197$code=<<___;
198#include <openssl/arm_arch.h>
199#if __ARM_MAX_ARCH__ >= 8
200
201.arch armv8-a+crypto
202.text
203___
204
# Register names for the kernel arguments. Per the signature comment on
# aes_gcm_enc_kernel the arguments are in, len_bits, out, Xi, ivec, key and
# Htable, which arrive in x0-x6. x4 (ivec) and x5 (key) are copied to x16 and x8
# in the prologue because x4/x5 are reused as the end-of-input pointers.
205$input_ptr="x0";  # in: advanced as input blocks are consumed
206$bit_length="x1"; # len_bits: shifted right by 3 in the prologue to get the byte length
207$output_ptr="x2"; # out: advanced by the post-indexed stores
208$current_tag="x3"; # Xi: loaded into the low GHASH accumulator on entry
209$Htable="x6"; # Htable: x6 is also $input_l0/$output_l0, so in the encrypt kernel it is only usable until the first plaintext load
210$counter="x16"; # copy of x4 (ivec) made in the prologue
211$cc="x8"; # copy of x5 (key): round keys are loaded relative to it, the round count from offset 240
212
213{
# General-purpose register allocation. x4/x5 hold the end-of-input pointers
# (the incoming ivec/key arguments are moved to $counter/$cc in the prologue
# before these are written). x6/x7 and x19-x24 hold the low/high 64-bit halves
# of four 16-byte blocks. x19-x24 are saved on the stack by the prologue.
214my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
215my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# The output_* names are aliases for exactly the same registers as the
# corresponding input_* names (x19-x24 and x6/x7).
216my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
217my ($output_l0,$output_h0)=map("x$_",(6..7));
218
219# rkN_l and rkN_h store the final round key, which is handled slightly
220# differently because it is EORed through general-purpose registers.
# x9..x15 in order: ctr32 (scratch for the byte-reversed counter word, with w9
# as its 32-bit view), the low 64 bits and the top word of the 96-bit counter
# prefix, the incrementing counter (rctr32), rkN_l, rkN_h, and the byte length.
221my $ctr32w="w9";
222my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15));
# 32-bit views of x11 and x12.
223my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
224
# Round count, loaded from offset 240 of the key structure in the prologue and
# compared against 12 to choose between the AES-128/192/256 paths.
225my $rounds="x17";
226my $roundsw="w17";
227
# v0-v3 are the four counter blocks being encrypted; v4-v7 are the four result
# blocks. The lists below are the .16b, bare, d and q spellings of the same
# registers.
228my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
229my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
230my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
231my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
232
# v9-v11: GHASH accumulators (high, middle, low). d9-d11 are among the d8-d15
# registers saved by the prologue.
233my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
234my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
235my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
236
# v12-v15: the hash key powers h1..h4 loaded from Htable; v16/v17: h12k/h34k,
# built in the prologue with trn1/trn2 + eor for the Karatsuba middle term.
237my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
238my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
239my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
240
# GHASH temporaries. These are only names: t0..t9 map onto just four physical
# registers (v4, v5, v6, v8), and v4-v6 are also res0-res2, so several names
# below refer to the same register.
241my $t0="v8";
242my $t0d="d8";
243my $t1="v4";
244my $t1d="d4";
245my $t2="v8";
246my $t2d="d8";
247my $t3="v4";
248my $t3d="d4";
249my $t4="v4";
250my $t4d="d4";
251my $t5="v5";
252my $t5d="d5";
253my $t6="v8";
254my $t6d="d8";
255my $t7="v5";
256my $t7d="d5";
257my $t8="v6";
258my $t8d="d6";
259my $t9="v4";
260my $t9d="d4";
261
# Staging registers used to move the plaintext (already EORed with the final
# round key in scalar registers) into SIMD; same registers as res0-res3 (v4-v7).
262my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
263my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
264my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
265
# Reduction constant and temporary; v8 is shared with t0/t2/t6 and v7 with
# res3/ctr_t3.
266my $mod_constantd="d8";
267my $mod_constant="v8";
268my $mod_t="v7";
269
270# rkNm1 stores the second-to-last round key, which is handled slightly
271# differently because it uses plain AESE instead of an AESE + AESMC macro-op.
# v18-v30 hold rk0..rk12 and v31 holds rkNm1; the second list gives the q-register
# spellings used by the ldr instructions.
272my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31));
273my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31));
# Alternate operand spellings of the rk2, rk3 and rk4 registers (v20, v21, v22);
# their uses are not in this part of the file.
274my $rk2q1="v20.1q";
275my $rk3q1="v21.1q";
276my $rk4v="v22";
277my $rk4d="d22";
278
279################################################################################
280# size_t aes_gcm_enc_kernel(const uint8_t *in,
281#                           size_t len_bits,
282#                           uint8_t *out,
283#                           u64 *Xi,
284#                           uint8_t ivec[16],
285#                           const void *key,
286#                           const void *Htable);
287#
288$code.=<<___;
289.global aes_gcm_enc_kernel
290.type   aes_gcm_enc_kernel,%function
291.align  4
292aes_gcm_enc_kernel:
293	AARCH64_SIGN_LINK_REGISTER
294	stp	x29, x30, [sp, #-128]!
295	mov	x29, sp
296	stp     x19, x20, [sp, #16]
297	mov     $counter, x4
298	mov     $cc, x5
299	stp     x21, x22, [sp, #32]
300	stp     x23, x24, [sp, #48]
301	stp     d8, d9, [sp, #64]
302	stp     d10, d11, [sp, #80]
303	stp     d12, d13, [sp, #96]
304	stp     d14, d15, [sp, #112]
305	ldr	$roundsw, [$cc, #240]
306	add	$input_l1, $cc, $rounds, lsl #4                   // borrow input_l1 for last key
307	ldp     $rkN_l, $rkN_h, [$input_l1]                       // load round N keys
308	ldr     $rkNm1q, [$input_l1, #-16]                        // load round N-1 keys
309	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   // end_input_ptr
310	lsr     $main_end_input_ptr, $bit_length, #3              // byte_len
311	mov     $len, $main_end_input_ptr
312	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              // ctr96_b64, ctr96_t32
313	ld1     { $ctr0b}, [$counter]                             // special case vector load initial counter so we can start first AES block as quickly as possible
314	sub     $main_end_input_ptr, $main_end_input_ptr, #1      // byte_len - 1
315	ldr     $rk0q, [$cc, #0]                                  // load rk0
316	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
317	ldr     $rk7q, [$cc, #112]                                // load rk7
318	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
319	lsr     $rctr32x, $ctr96_t32x, #32
320	fmov    $ctr2d, $ctr96_b64x                               // CTR block 2
321	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
322	rev     $rctr32w, $rctr32w                                // rev_ctr32
323	fmov    $ctr1d, $ctr96_b64x                               // CTR block 1
324	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 0
325	add     $rctr32w, $rctr32w, #1                            // increment rev_ctr32
326	rev     $ctr32w, $rctr32w                                 // CTR block 1
327	fmov    $ctr3d, $ctr96_b64x                               // CTR block 3
328	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 1
329	add     $rctr32w, $rctr32w, #1                            // CTR block 1
330	ldr     $rk1q, [$cc, #16]                                 // load rk1
331	fmov    $ctr1.d[1], $ctr32x                               // CTR block 1
332	rev     $ctr32w, $rctr32w                                 // CTR block 2
333	add     $rctr32w, $rctr32w, #1                            // CTR block 2
334	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 2
335	ldr     $rk2q, [$cc, #32]                                 // load rk2
336	fmov    $ctr2.d[1], $ctr32x                               // CTR block 2
337	rev     $ctr32w, $rctr32w                                 // CTR block 3
338	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 1
339	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 3
340	fmov    $ctr3.d[1], $ctr32x                               // CTR block 3
341	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 0
342	ldr     $rk3q, [$cc, #48]                                 // load rk3
343	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 2
344	ldr     $rk6q, [$cc, #96]                                 // load rk6
345	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 0
346	ldr     $rk5q, [$cc, #80]                                 // load rk5
347	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 1
348	ldr     $h3q, [$Htable, #48]                              // load h3l | h3h
349	ext     $h3b, $h3b, $h3b, #8
350	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 0
351	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 1
352	ldr     $rk4q, [$cc, #64]                                 // load rk4
353	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 2
354	ldr     $h2q, [$Htable, #32]                              // load h2l | h2h
355	ext     $h2b, $h2b, $h2b, #8
356	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 1
357	ldr     $rk12q, [$cc, #192]                               // load rk12
358	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 2
359	ldr     $h4q, [$Htable, #80]                              // load h4l | h4h
360	ext     $h4b, $h4b, $h4b, #8
361	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 3
362	ldr     $rk11q, [$cc, #176]                               // load rk11
363	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 2
364	ldr     $rk8q, [$cc, #128]                                // load rk8
365	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 3
366	add     $rctr32w, $rctr32w, #1                            // CTR block 3
367	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 3
368	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 3
369	ld1     { $acc_lb}, [$current_tag]
370	ext     $acc_lb, $acc_lb, $acc_lb, #8
371	rev64   $acc_lb, $acc_lb
372	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 4
373	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 4
374	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 4
375	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 4
376	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
377	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 5
378	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 5
379	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 5
380	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 5
381	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 6
382	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      // h4l | h3l
383	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 6
384	ldr     $rk9q, [$cc, #144]                                // load rk9
385	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 6
386	ldr     $h1q, [$Htable]                                   // load h1l | h1h
387	ext     $h1b, $h1b, $h1b, #8
388	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 6
389	ldr     $rk10q, [$cc, #160]                               // load rk10
390	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 7
391	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      // h4h | h3h
392	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 7
393	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 7
394	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 7
395	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      // h2l | h1l
396	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 8
397	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 8
398	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 8
399	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 8
400	b.lt	.Lenc_finish_first_blocks                         // branch if AES-128
401
402	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 9
403	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 9
404	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 9
405	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 9
406	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 10
407	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 10
408	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 10
409	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 10
410	b.eq	.Lenc_finish_first_blocks                         // branch if AES-192
411
412	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 11
413	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 11
414	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 11
415	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 11
416	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 12
417	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 12
418	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 12
419	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 12
420
421.Lenc_finish_first_blocks:
422	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 4 blocks
423	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  // h4k | h3k
424	aese    $ctr2b, $rkNm1                                    // AES block 2 - round N-1
425	trn1    $t0.2d,    $h1.2d,    $h2.2d                      // h2h | h1h
426	aese    $ctr1b, $rkNm1                                    // AES block 1 - round N-1
427	aese    $ctr0b, $rkNm1                                    // AES block 0 - round N-1
428	aese    $ctr3b, $rkNm1                                    // AES block 3 - round N-1
429	eor     $h12k.16b, $h12k.16b, $t0.16b                     // h2k | h1k
430	b.ge    .Lenc_tail                                        // handle tail
431
432	ldp     $input_l1, $input_h1, [$input_ptr, #16]           // AES block 1 - load plaintext
433	rev     $ctr32w, $rctr32w                                 // CTR block 4
434	ldp     $input_l0, $input_h0, [$input_ptr, #0]            // AES block 0 - load plaintext
435	ldp     $input_l3, $input_h3, [$input_ptr, #48]           // AES block 3 - load plaintext
436	ldp     $input_l2, $input_h2, [$input_ptr, #32]           // AES block 2 - load plaintext
437	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
438	eor     $input_l1, $input_l1, $rkN_l                      // AES block 1 - round N low
439	eor     $input_h1, $input_h1, $rkN_h                      // AES block 1 - round N high
440	fmov    $ctr_t1d, $input_l1                               // AES block 1 - mov low
441	eor     $input_l0, $input_l0, $rkN_l                      // AES block 0 - round N low
442	eor     $input_h0, $input_h0, $rkN_h                      // AES block 0 - round N high
443	eor     $input_h3, $input_h3, $rkN_h                      // AES block 3 - round N high
444	fmov    $ctr_t0d, $input_l0                               // AES block 0 - mov low
445	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 8 blocks
446	fmov    $ctr_t0.d[1], $input_h0                           // AES block 0 - mov high
447	eor     $input_l3, $input_l3, $rkN_l                      // AES block 3 - round N low
448	eor     $input_l2, $input_l2, $rkN_l                      // AES block 2 - round N low
449	fmov    $ctr_t1.d[1], $input_h1                           // AES block 1 - mov high
450	fmov    $ctr_t2d, $input_l2                               // AES block 2 - mov low
451	add     $rctr32w, $rctr32w, #1                            // CTR block 4
452	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4
453	fmov    $ctr_t3d, $input_l3                               // AES block 3 - mov low
454	eor     $input_h2, $input_h2, $rkN_h                      // AES block 2 - round N high
455	fmov    $ctr_t2.d[1], $input_h2                           // AES block 2 - mov high
456	eor     $res0b, $ctr_t0b, $ctr0b                          // AES block 0 - result
457	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4
458	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4
459	rev     $ctr32w, $rctr32w                                 // CTR block 5
460	add     $rctr32w, $rctr32w, #1                            // CTR block 5
461	eor     $res1b, $ctr_t1b, $ctr1b                          // AES block 1 - result
462	fmov    $ctr1d, $ctr96_b64x                               // CTR block 5
463	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 5
464	fmov    $ctr1.d[1], $ctr32x                               // CTR block 5
465	rev     $ctr32w, $rctr32w                                 // CTR block 6
466	st1     { $res0b}, [$output_ptr], #16                     // AES block 0 - store result
467	fmov    $ctr_t3.d[1], $input_h3                           // AES block 3 - mov high
468	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 6
469	eor     $res2b, $ctr_t2b, $ctr2b                          // AES block 2 - result
470	st1     { $res1b}, [$output_ptr], #16                     // AES block 1 - store result
471	add     $rctr32w, $rctr32w, #1                            // CTR block 6
472	fmov    $ctr2d, $ctr96_b64x                               // CTR block 6
473	fmov    $ctr2.d[1], $ctr32x                               // CTR block 6
474	st1     { $res2b}, [$output_ptr], #16                     // AES block 2 - store result
475	rev     $ctr32w, $rctr32w                                 // CTR block 7
476	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 7
477	eor     $res3b, $ctr_t3b, $ctr3b                          // AES block 3 - result
478	st1     { $res3b}, [$output_ptr], #16                     // AES block 3 - store result
479	b.ge    .Lenc_prepretail                                  // do prepretail
480
481.Lenc_main_loop:                                                  // main loop start
482	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
483	rev64   $res0b, $res0b                                    // GHASH block 4k (only t0 is free)
484	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
485	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+3
486	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
487	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
488	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
489	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+3
490	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
491	ldp     $input_l3, $input_h3, [$input_ptr, #48]           // AES block 4k+7 - load plaintext
492	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
493	ldp     $input_l2, $input_h2, [$input_ptr, #32]           // AES block 4k+6 - load plaintext
494	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
495	eor     $res0b, $res0b, $acc_lb                           // PRE 1
496	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
497	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
498	eor     $input_l3, $input_l3, $rkN_l                      // AES block 4k+7 - round N low
499	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
500	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
501	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
502	eor     $input_h2, $input_h2, $rkN_h                      // AES block 4k+6 - round N high
503	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
504	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
505	rev64   $res1b, $res1b                                    // GHASH block 4k+1 (t0 and t1 free)
506	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
507	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
508	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
509	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
510	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
511	rev64   $res3b, $res3b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
512	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
513	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
514	rev64   $res2b, $res2b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
515	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
516	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
517	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
518	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
519	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
520	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
521	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
522	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
523	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
524	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
525	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
526	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
527	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
528	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
529	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
530	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
531	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
532	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
533	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
534	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
535	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
536	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
537	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
538	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
539	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
540	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
541	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
542	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
543	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
544	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
545	ldp     $input_l1, $input_h1, [$input_ptr, #16]           // AES block 4k+5 - load plaintext
546	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
547	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
548	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
549	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
550	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
551	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
552	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
553	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
554	eor     $input_l1, $input_l1, $rkN_l                      // AES block 4k+5 - round N low
555	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
556	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
557	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
558	eor     $input_l2, $input_l2, $rkN_l                      // AES block 4k+6 - round N low
559	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
560	movi    $mod_constant.8b, #0xc2
561	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
562	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
563	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
564	fmov    $ctr_t1d, $input_l1                               // AES block 4k+5 - mov low
565	ldp     $input_l0, $input_h0, [$input_ptr, #0]            // AES block 4k+4 - load plaintext
566	b.lt	.Lenc_main_loop_continue                          // branch if AES-128
567
568	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
569	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
570	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
571	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
572	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
573	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
574	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
575	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
576	b.eq	.Lenc_main_loop_continue                          // branch if AES-192
577
578	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
579	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
580	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
581	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
582	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
583	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
584	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
585	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
586
587.Lenc_main_loop_continue:
588	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
589	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
590	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
591	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+3
592	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
593	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
594	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
595	rev     $ctr32w, $rctr32w                                 // CTR block 4k+8
596	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
597	eor     $input_l0, $input_l0, $rkN_l                      // AES block 4k+4 - round N low
598	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
599	eor     $input_h0, $input_h0, $rkN_h                      // AES block 4k+4 - round N high
600	fmov    $ctr_t0d, $input_l0                               // AES block 4k+4 - mov low
601	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+8
602	eor     $mod_t.16b, $acc_hb, $mod_t.16b                   // MODULO - fold into mid
603	eor     $input_h1, $input_h1, $rkN_h                      // AES block 4k+5 - round N high
604	eor     $input_h3, $input_h3, $rkN_h                      // AES block 4k+7 - round N high
605	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+8
606	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
607	fmov    $ctr_t0.d[1], $input_h0                           // AES block 4k+4 - mov high
608	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
609	fmov    $ctr_t3d, $input_l3                               // AES block 4k+7 - mov low
610	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
611	fmov    $ctr_t1.d[1], $input_h1                           // AES block 4k+5 - mov high
612	fmov    $ctr_t2d, $input_l2                               // AES block 4k+6 - mov low
613	cmp     $input_ptr, $main_end_input_ptr                   // LOOP CONTROL
614	fmov    $ctr_t2.d[1], $input_h2                           // AES block 4k+6 - mov high
615	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            // MODULO - mid 64b align with low
616	eor     $res0b, $ctr_t0b, $ctr0b                          // AES block 4k+4 - result
617	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4k+8
618	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4k+8
619	rev     $ctr32w, $rctr32w                                 // CTR block 4k+9
620	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+9
621	eor     $res1b, $ctr_t1b, $ctr1b                          // AES block 4k+5 - result
622	fmov    $ctr1d, $ctr96_b64x                               // CTR block 4k+9
623	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+9
624	fmov    $ctr1.d[1], $ctr32x                               // CTR block 4k+9
625	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
626	rev     $ctr32w, $rctr32w                                 // CTR block 4k+10
627	st1     { $res0b}, [$output_ptr], #16                     // AES block 4k+4 - store result
628	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+10
629	eor     $acc_lb, $acc_lb, $acc_hb                         // MODULO - fold into low
630	fmov    $ctr_t3.d[1], $input_h3                           // AES block 4k+7 - mov high
631	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
632	st1     { $res1b}, [$output_ptr], #16                     // AES block 4k+5 - store result
633	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+10
634	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
635	eor     $res2b, $ctr_t2b, $ctr2b                          // AES block 4k+6 - result
636	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+10
637	st1     { $res2b}, [$output_ptr], #16                     // AES block 4k+6 - store result
638	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+10
639	rev     $ctr32w, $rctr32w                                 // CTR block 4k+11
640	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
641	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+11
642	eor     $res3b, $ctr_t3b, $ctr3b                          // AES block 4k+7 - result
643	st1     { $res3b}, [$output_ptr], #16                     // AES block 4k+7 - store result
644	b.lt    .Lenc_main_loop
645
646.Lenc_prepretail:                                                 // PREPRETAIL
647	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
648	rev64   $res2b, $res2b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
649	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
650	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+3
651	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
652	rev64   $res0b, $res0b                                    // GHASH block 4k (only t0 is free)
653	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+3
654	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
655	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
656	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
657	eor     $res0b, $res0b, $acc_lb                           // PRE 1
658	rev64   $res1b, $res1b                                    // GHASH block 4k+1 (t0 and t1 free)
659	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
660	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
661	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
662	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
663	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
664	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
665	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
666	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
667	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
668	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
669	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
670	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
671	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
672	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
673	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
674	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
675	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
676	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
677	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
678	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
679	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
680	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
681	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
682	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
683	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
684	rev64   $res3b, $res3b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
685	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
686	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
687	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
688	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+3
689	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
690	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
691	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
692	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
693	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
694	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
695	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
696	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
697	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
698	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
699	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
700	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
701	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
702	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
703	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
704	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
705	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
706	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
707	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
708	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
709	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
710	movi    $mod_constant.8b, #0xc2
711	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
712	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
713	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
714	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
715	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
716	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
717	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
718	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
719	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
720	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
721	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
722	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
723	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
724	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
725	eor     $acc_mb, $acc_mb, $acc_hb                         // karatsuba tidy up
726	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
727	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
728	ext     $acc_hb, $acc_hb, $acc_hb, #8
729	eor     $acc_mb, $acc_mb, $acc_lb
730	b.lt	.Lenc_finish_prepretail                           // branch if AES-128
731
732	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
733	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
734	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
735	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
736	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
737	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
738	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
739	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
740	b.eq	.Lenc_finish_prepretail                           // branch if AES-192
741
742	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
743	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
744	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
745	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
746	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
747	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
748	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
749	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
750
751.Lenc_finish_prepretail:                                       // AES-128/192/256 paths rejoin here
752	eor     $acc_mb, $acc_mb, $t1.16b                         // MODULO - fold into mid (t1 = acc_h * mod_constant from PREPRETAIL)
753	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
754	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d               // MODULO - mid 64b align with low
755	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
756	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
757	eor     $acc_lb, $acc_lb, $t1.16b                         // MODULO - fold into low
758	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
759	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
760	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
761	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
762
763.Lenc_tail:                                                       // TAIL
764	ext     $t0.16b, $acc_lb, $acc_lb, #8                     // prepare final partial tag
765	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   // main_end_input_ptr is number of bytes left to process
766	ldp     $input_l0, $input_h0, [$input_ptr], #16           // AES block 4k+4 - load plaintext
767	eor     $input_l0, $input_l0, $rkN_l                      // AES block 4k+4 - round N low
768	eor     $input_h0, $input_h0, $rkN_h                      // AES block 4k+4 - round N high
769	cmp     $main_end_input_ptr, #48                          // more than 3 blocks (48 bytes) left?
770	fmov    $ctr_t0d, $input_l0                               // AES block 4k+4 - mov low
771	fmov    $ctr_t0.d[1], $input_h0                           // AES block 4k+4 - mov high
772	eor     $res1b, $ctr_t0b, $ctr0b                          // AES block 4k+4 - result
773	b.gt    .Lenc_blocks_more_than_3                          // yes - all four keystream blocks are used as they are
774	cmp     $main_end_input_ptr, #32                          // more than 2 blocks (32 bytes) left?
775	mov     $ctr3b, $ctr2b                                    // final block is XORed with ctr3b: move keystream down one slot
776	movi    $acc_l.8b, #0                                     // clear GHASH low accumulator (previous value is kept in t0 as partial tag)
777	movi    $acc_h.8b, #0                                     // clear GHASH high accumulator
778	sub     $rctr32w, $rctr32w, #1                            // one keystream block fewer is used - step the counter back
779	mov     $ctr2b, $ctr1b                                    // final-1 block is XORed with ctr2b: move keystream down one slot
780	movi    $acc_m.8b, #0                                     // clear GHASH mid accumulator
781	b.gt    .Lenc_blocks_more_than_2                          // 3 blocks left
782	mov     $ctr3b, $ctr1b                                    // only ctr0b and ctr3b are used from here: final block takes ctr1b's keystream
783	sub     $rctr32w, $rctr32w, #1                            // another keystream block unused - step the counter back
784	cmp     $main_end_input_ptr, #16                          // more than 1 block (16 bytes) left?
785	b.gt    .Lenc_blocks_more_than_1                          // 2 blocks left
786	sub     $rctr32w, $rctr32w, #1                            // single (possibly partial) block - step the counter back once more
787	b       .Lenc_blocks_less_than_1                          // at most 1 block left
788.Lenc_blocks_more_than_3:                                        // blocks left >  3
789	st1     { $res1b}, [$output_ptr], #16                    // AES final-3 block  - store result
790	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final-2 block - load input low & high
791	rev64   $res0b, $res1b                                   // GHASH final-3 block
792	eor     $input_l0, $input_l0, $rkN_l                     // AES final-2 block - round N low
793	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
794	eor     $input_h0, $input_h0, $rkN_h                     // AES final-2 block - round N high
795	mov     $rk4d, $res0.d[1]                                // GHASH final-3 block - mid
796	fmov    $res1d, $input_l0                                // AES final-2 block - mov low
797	fmov    $res1.d[1], $input_h0                            // AES final-2 block - mov high
798	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-3 block - mid
799	movi    $t0.8b, #0                                       // suppress further partial tag feed in
800	mov     $acc_md, $h34k.d[1]                              // GHASH final-3 block - mid
801	pmull   $acc_l.1q, $res0.1d, $h4.1d                      // GHASH final-3 block - low
802	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      // GHASH final-3 block - high
803	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   // GHASH final-3 block - mid
804	eor     $res1b, $res1b, $ctr1b                           // AES final-2 block - result
805.Lenc_blocks_more_than_2:                                        // blocks left >  2
806	st1     { $res1b}, [$output_ptr], #16                    // AES final-2 block - store result
807	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final-1 block - load input low & high
808	rev64   $res0b, $res1b                                   // GHASH final-2 block
809	eor     $input_l0, $input_l0, $rkN_l                     // AES final-1 block - round N low
810	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
811	fmov    $res1d, $input_l0                                // AES final-1 block - mov low
812	eor     $input_h0, $input_h0, $rkN_h                     // AES final-1 block - round N high
813	fmov    $res1.d[1], $input_h0                            // AES final-1 block - mov high
814	movi    $t0.8b, #0                                       // suppress further partial tag feed in
815	pmull2  $rk2q1, $res0.2d, $h3.2d                         // GHASH final-2 block - high
816	mov     $rk4d, $res0.d[1]                                // GHASH final-2 block - mid
817	pmull   $rk3q1, $res0.1d, $h3.1d                         // GHASH final-2 block - low
818	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-2 block - mid
819	eor     $res1b, $res1b, $ctr2b                           // AES final-1 block - result
820	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-2 block - high
821	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     // GHASH final-2 block - mid
822	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-2 block - low
823	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-2 block - mid
824.Lenc_blocks_more_than_1:                                        // blocks left >  1
825	st1     { $res1b}, [$output_ptr], #16                    // AES final-1 block - store result
826	rev64   $res0b, $res1b                                   // GHASH final-1 block
827	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final block - load input low & high
828	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
829	movi    $t0.8b, #0                                       // suppress further partial tag feed in
830	eor     $input_l0, $input_l0, $rkN_l                     // AES final block - round N low
831	mov     $rk4d, $res0.d[1]                                // GHASH final-1 block - mid
832	pmull2  $rk2q1, $res0.2d, $h2.2d                         // GHASH final-1 block - high
833	eor     $input_h0, $input_h0, $rkN_h                     // AES final block - round N high
834	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-1 block - mid
835	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-1 block - high
836	ins     $rk4v.d[1], $rk4v.d[0]                           // GHASH final-1 block - mid
837	fmov    $res1d, $input_l0                                // AES final block - mov low
838	fmov    $res1.d[1], $input_h0                            // AES final block - mov high
839	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     // GHASH final-1 block - mid
840	pmull   $rk3q1, $res0.1d, $h2.1d                         // GHASH final-1 block - low
841	eor     $res1b, $res1b, $ctr3b                           // AES final block - result
842	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-1 block - mid
843	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-1 block - low
844.Lenc_blocks_less_than_1:                                        // blocks left <= 1
845	and     $bit_length, $bit_length, #127                   // bit_length %= 128
846	mvn     $rkN_l, xzr                                      // rkN_l = 0xffffffffffffffff
847	sub     $bit_length, $bit_length, #128                   // bit_length -= 128
848	neg     $bit_length, $bit_length                         // bit_length = 128 - #bits in input (in range [1,128])
849	ld1     { $rk0}, [$output_ptr]                           // load existing bytes where the possibly partial last block is to be stored
850	mvn     $rkN_h, xzr                                      // rkN_h = 0xffffffffffffffff
851	and     $bit_length, $bit_length, #127                   // bit_length %= 128
852	lsr     $rkN_h, $rkN_h, $bit_length                      // rkN_h is mask for top 64b of last block
853	cmp     $bit_length, #64                                 // fewer than 64 bits to mask off?
854	csel    $input_l0, $rkN_l, $rkN_h, lt                    // low 64b of mask: all ones if so, otherwise the shifted mask
855	csel    $input_h0, $rkN_h, xzr, lt                       // high 64b of mask: the shifted mask if so, otherwise zero
856	fmov    $ctr0d, $input_l0                                // ctr0b is mask for last block
857	fmov    $ctr0.d[1], $input_h0                            // ctr0b upper 64b of the mask
858	and     $res1b, $res1b, $ctr0b                           // possibly partial last block has zeroes in highest bits
859	rev64   $res0b, $res1b                                   // GHASH final block
860	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
861	bif     $res1b, $rk0, $ctr0b                             // insert existing bytes in top end of result before storing
862	pmull2  $rk2q1, $res0.2d, $h1.2d                         // GHASH final block - high
863	mov     $t0d, $res0.d[1]                                 // GHASH final block - mid
864	rev     $ctr32w, $rctr32w                                // byte-reverse the counter word for the store below
865	pmull   $rk3q1, $res0.1d, $h1.1d                         // GHASH final block - low
866	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final block - high
867	eor     $t0.8b, $t0.8b, $res0.8b                         // GHASH final block - mid
868	pmull   $t0.1q, $t0.1d, $h12k.1d                         // GHASH final block - mid
869	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final block - low
870	eor     $acc_mb, $acc_mb, $t0.16b                        // GHASH final block - mid
871	movi    $mod_constant.8b, #0xc2                          // 0xc2 in every byte; shifted into mod_constant below
872	eor     $t9.16b, $acc_lb, $acc_hb                        // MODULO - karatsuba tidy up
873	shl     $mod_constantd, $mod_constantd, #56              // mod_constant
874	eor     $acc_mb, $acc_mb, $t9.16b                        // MODULO - karatsuba tidy up
875	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           // MODULO - top 64b align with mid
876	ext     $acc_hb, $acc_hb, $acc_hb, #8                    // MODULO - other top alignment
877	eor     $acc_mb, $acc_mb, $mod_t.16b                     // MODULO - fold into mid
878	eor     $acc_mb, $acc_mb, $acc_hb                        // MODULO - fold into mid
879	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           // MODULO - mid 64b align with low
880	ext     $acc_mb, $acc_mb, $acc_mb, #8                    // MODULO - other mid alignment
881	str     $ctr32w, [$counter, #12]                         // store the updated counter
882	st1     { $res1b}, [$output_ptr]                         // store all 16B
883	eor     $acc_lb, $acc_lb, $acc_hb                        // MODULO - fold into low
884	eor     $acc_lb, $acc_lb, $acc_mb                        // MODULO - fold into low
885	ext     $acc_lb, $acc_lb, $acc_lb, #8                    // swap the two 64b halves of the tag
886	rev64   $acc_lb, $acc_lb                                 // byte-reverse each 64b half of the tag
887	mov     x0, $len                                         // return value
888	st1     { $acc_l.16b }, [$current_tag]                   // write the tag back to [$current_tag]
889	ldp     x19, x20, [sp, #16]                              // restore callee-saved general registers
890	ldp     x21, x22, [sp, #32]
891	ldp     x23, x24, [sp, #48]
892	ldp     d8, d9, [sp, #64]                                // restore callee-saved low halves of v8-v15
893	ldp     d10, d11, [sp, #80]
894	ldp     d12, d13, [sp, #96]
895	ldp     d14, d15, [sp, #112]
896	ldp     x29, x30, [sp], #128                             // restore fp/lr and release the 128-byte frame
897	AARCH64_VALIDATE_LINK_REGISTER
898	ret
899.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
900___
901
902{
903my $t8="v4";
904my $t8d="d4";
905my $t9="v6";
906my $t9d="d6";
907################################################################################
908# size_t aes_gcm_dec_kernel(const uint8_t *in,
909#                           size_t len_bits,
910#                           uint8_t *out,
911#                           u64 *Xi,
912#                           uint8_t ivec[16],
913#                           const void *key, const void *Htable);  # Htable: base of the [$Htable, #N] loads of h1..h4 below
914#
915$code.=<<___;
916.global aes_gcm_dec_kernel
917.type   aes_gcm_dec_kernel,%function
918.align  4
919aes_gcm_dec_kernel:
920	AARCH64_SIGN_LINK_REGISTER
921	stp	x29, x30, [sp, #-128]!
922	mov	x29, sp
923	stp     x19, x20, [sp, #16]
924	mov     $counter, x4
925	mov     $cc, x5
926	stp     x21, x22, [sp, #32]
927	stp     x23, x24, [sp, #48]
928	stp     d8, d9, [sp, #64]
929	stp     d10, d11, [sp, #80]
930	stp     d12, d13, [sp, #96]
931	stp     d14, d15, [sp, #112]
932	ldr	$roundsw, [$cc, #240]
933	add	$input_l1, $cc, $rounds, lsl #4                   // borrow input_l1 for last key
934	ldp     $rkN_l, $rkN_h, [$input_l1]                       // load round N keys
935	ldr     $rkNm1q, [$input_l1, #-16]                        // load round N-1 keys
936	lsr     $main_end_input_ptr, $bit_length, #3              // byte_len
937	mov     $len, $main_end_input_ptr
938	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              // ctr96_b64, ctr96_t32
939	ldr     $rk8q, [$cc, #128]                                // load rk8
940	sub     $main_end_input_ptr, $main_end_input_ptr, #1      // byte_len - 1
941	ldr     $rk7q, [$cc, #112]                                // load rk7
942	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
943	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   // end_input_ptr
944	ldr     $rk6q, [$cc, #96]                                 // load rk6
945	lsr     $rctr32x, $ctr96_t32x, #32
946	ldr     $rk5q, [$cc, #80]                                 // load rk5
947	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
948	ldr     $rk3q, [$cc, #48]                                 // load rk3
949	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
950	rev     $rctr32w, $rctr32w                                // rev_ctr32
951	add     $rctr32w, $rctr32w, #1                            // increment rev_ctr32
952	fmov    $ctr3d, $ctr96_b64x                               // CTR block 3
953	rev     $ctr32w, $rctr32w                                 // CTR block 1
954	add     $rctr32w, $rctr32w, #1                            // CTR block 1
955	fmov    $ctr1d, $ctr96_b64x                               // CTR block 1
956	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 1
957	ld1     { $ctr0b}, [$counter]                             // special case vector load initial counter so we can start first AES block as quickly as possible
958	fmov    $ctr1.d[1], $ctr32x                               // CTR block 1
959	rev     $ctr32w, $rctr32w                                 // CTR block 2
960	add     $rctr32w, $rctr32w, #1                            // CTR block 2
961	fmov    $ctr2d, $ctr96_b64x                               // CTR block 2
962	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 2
963	fmov    $ctr2.d[1], $ctr32x                               // CTR block 2
964	rev     $ctr32w, $rctr32w                                 // CTR block 3
965	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 3
966	ldr     $rk0q, [$cc, #0]                                  // load rk0
967	fmov    $ctr3.d[1], $ctr32x                               // CTR block 3
968	add     $rctr32w, $rctr32w, #1                            // CTR block 3
969	ldr     $rk4q, [$cc, #64]                                 // load rk4
970	ldr     $rk1q, [$cc, #16]                                 // load rk1
971	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 0
972	ldr     $h3q, [$Htable, #48]                              // load h3l | h3h
973	ext     $h3b, $h3b, $h3b, #8
974	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 0
975	ldr     $h4q, [$Htable, #80]                              // load h4l | h4h
976	ext     $h4b, $h4b, $h4b, #8
977	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 0
978	ldr     $h2q, [$Htable, #32]                              // load h2l | h2h
979	ext     $h2b, $h2b, $h2b, #8
980	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 0
981	ldr     $rk2q, [$cc, #32]                                 // load rk2
982	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 1
983	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 1
984	ld1     { $acc_lb}, [$current_tag]
985	ext     $acc_lb, $acc_lb, $acc_lb, #8
986	rev64   $acc_lb, $acc_lb
987	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 1
988	ldr     $rk9q, [$cc, #144]                                // load rk9
989	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 1
990	ldr     $rk12q, [$cc, #192]                               // load rk12
991	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 2
992	ldr     $h1q, [$Htable]                                   // load h1l | h1h
993	ext     $h1b, $h1b, $h1b, #8
994	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 2
995	ldr     $rk10q, [$cc, #160]                               // load rk10
996	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 2
997	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 3
998	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 2
999	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 3
1000	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 4
1001	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 3
1002	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 3
1003	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 4
1004	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 4
1005	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 4
1006	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 5
1007	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 5
1008	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 5
1009	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 5
1010	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 6
1011	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 6
1012	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
1013	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 6
1014	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 6
1015	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 7
1016	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 7
1017	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 7
1018	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 8
1019	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 7
1020	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 8
1021	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 8
1022	ldr     $rk11q, [$cc, #176]                               // load rk11
1023	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 8
1024	b.lt	.Ldec_finish_first_blocks                         // branch if AES-128
1025
1026	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 9
1027	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 9
1028	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 9
1029	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 9
1030	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 10
1031	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 10
1032	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 10
1033	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 10
1034	b.eq	.Ldec_finish_first_blocks                         // branch if AES-192
1035
1036	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 11
1037	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 11
1038	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 11
1039	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 11
1040	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 12
1041	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 12
1042	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 12
1043	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 12
1044
1045.Ldec_finish_first_blocks:
	// Join point for the three key-size paths above. This section:
	//  - applies the round N-1 aese to the first four keystream blocks,
	//  - XORs the two 64-bit halves of each hash key together into h34k and
	//    h12k, which later serve as multipliers for the GHASH "mid" products,
	//  - unless the branch to .Ldec_tail is taken, decrypts and stores blocks
	//    0 and 1 and reloads ctr0 / ctr1 with counter blocks 4 and 5.
1046	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 4 blocks
1047	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      // h4h | h3h
1048	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      // h4l | h3l
1049	trn1    $t0.2d,    $h1.2d,    $h2.2d                      // h2h | h1h
1050	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      // h2l | h1l
1051	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  // h4k | h3k
1052	aese    $ctr1b, $rkNm1                                    // AES block 1 - round N-1
1053	aese    $ctr2b, $rkNm1                                    // AES block 2 - round N-1
1054	eor     $h12k.16b, $h12k.16b, $t0.16b                     // h2k | h1k
1055	aese    $ctr3b, $rkNm1                                    // AES block 3 - round N-1
1056	aese    $ctr0b, $rkNm1                                    // AES block 0 - round N-1
	// The branch below consumes the flags of the cmp at the top of this
	// section; the trn1/trn2/eor/aese in between work on vector registers
	// and leave the condition flags alone.
1057	b.ge    .Ldec_tail                                        // handle tail
1058
	// Blocks 0 and 1: the ciphertext is XORed into the AES state in the
	// vector register, both 64-bit halves are moved to general-purpose
	// registers, and the last round key (rkN_l / rkN_h) is XORed in there
	// right before the store. The loaded ciphertext is also byte-reversed
	// with rev64 for GHASH. Interleaved with that, ctr0 / ctr1 are rebuilt
	// as counter blocks 4 and 5, and the value for counter block 6 is left
	// in ctr32x.
1059	ldr     $res0q, [$input_ptr, #0]                          // AES block 0 - load ciphertext
1060	ldr     $res1q, [$input_ptr, #16]                         // AES block 1 - load ciphertext
1061	rev     $ctr32w, $rctr32w                                 // CTR block 4
1062	eor     $ctr0b, $res0b, $ctr0b                            // AES block 0 - result
1063	eor     $ctr1b, $res1b, $ctr1b                            // AES block 1 - result
1064	rev64   $res1b, $res1b                                    // GHASH block 1
1065	ldr     $res3q, [$input_ptr, #48]                         // AES block 3 - load ciphertext
1066	mov     $output_h0, $ctr0.d[1]                            // AES block 0 - mov high
1067	mov     $output_l0, $ctr0.d[0]                            // AES block 0 - mov low
1068	rev64   $res0b, $res0b                                    // GHASH block 0
1069	add     $rctr32w, $rctr32w, #1                            // CTR block 4
1070	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4
1071	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4
1072	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4
1073	rev     $ctr32w, $rctr32w                                 // CTR block 5
1074	add     $rctr32w, $rctr32w, #1                            // CTR block 5
1075	mov     $output_l1, $ctr1.d[0]                            // AES block 1 - mov low
1076	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 5
1077	mov     $output_h1, $ctr1.d[1]                            // AES block 1 - mov high
1078	eor     $output_h0, $output_h0, $rkN_h                    // AES block 0 - round N high
1079	eor     $output_l0, $output_l0, $rkN_l                    // AES block 0 - round N low
1080	stp     $output_l0, $output_h0, [$output_ptr], #16        // AES block 0 - store result
1081	fmov    $ctr1d, $ctr96_b64x                               // CTR block 5
1082	ldr     $res2q, [$input_ptr, #32]                         // AES block 2 - load ciphertext
1083	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
1084	fmov    $ctr1.d[1], $ctr32x                               // CTR block 5
1085	rev     $ctr32w, $rctr32w                                 // CTR block 6
1086	add     $rctr32w, $rctr32w, #1                            // CTR block 6
1087	eor     $output_l1, $output_l1, $rkN_l                    // AES block 1 - round N low
1088	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 6
1089	eor     $output_h1, $output_h1, $rkN_h                    // AES block 1 - round N high
1090	stp     $output_l1, $output_h1, [$output_ptr], #16        // AES block 1 - store result
	// Block 2 is only XORed with its ciphertext here and block 3 has only
	// had its ciphertext loaded; both are finished at .Ldec_main_loop or
	// .Ldec_prepretail, whichever runs next.
1091	eor     $ctr2b, $res2b, $ctr2b                            // AES block 2 - result
1092	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 8 blocks
1093	b.ge    .Ldec_prepretail                                  // do prepretail
1094
1095.Ldec_main_loop:                                                  // main loop start
	// One pass, as visible in this section, interleaves three streams of work:
	//  - finishing plaintext blocks 4k+2 and 4k+3: the halves are moved to
	//    general-purpose registers and XORed with rkN_l / rkN_h; block 4k+2
	//    is stored here, block 4k+3 is stored at .Ldec_main_loop_continue,
	//  - AES rounds 0-8 on counter blocks 4k+4 .. 4k+7, plus rounds 9-10 or
	//    9-12 for the larger key sizes,
	//  - the GHASH partial products of ciphertext blocks 4k .. 4k+3 with
	//    h4 .. h1, accumulated unreduced in acc_h / acc_m / acc_l.
	// ctr2 and ctr3 are also reloaded with counter blocks 4k+6 and 4k+7.
	//
	// PRE 0 / PRE 1: the running hash in acc_l has its 64-bit halves swapped
	// and is XORed into ciphertext block 4k, so it is multiplied together
	// with that block.
1096	mov     $output_l2, $ctr2.d[0]                            // AES block 4k+2 - mov low
1097	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
1098	eor     $ctr3b, $res3b, $ctr3b                            // AES block 4k+3 - result
1099	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
1100	mov     $output_h2, $ctr2.d[1]                            // AES block 4k+2 - mov high
1101	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
1102	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+6
1103	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+6
1104	eor     $res0b, $res0b, $acc_lb                           // PRE 1
1105	rev     $ctr32w, $rctr32w                                 // CTR block 4k+7
1106	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
1107	mov     $output_h3, $ctr3.d[1]                            // AES block 4k+3 - mov high
1108	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
1109	mov     $output_l3, $ctr3.d[0]                            // AES block 4k+3 - mov low
1110	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
1111	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
1112	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+7
1113	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
1114	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+7
1115	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
1116	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+7
1117	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
1118	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
1119	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
1120	eor     $output_h2, $output_h2, $rkN_h                    // AES block 4k+2 - round N high
1121	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
	// acc_m briefly holds the upper half of h34k; it is the multiplier of the
	// block 4k mid pmull further down, which then overwrites acc_m with the
	// product.
1122	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
1123	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
1124	rev64   $res2b, $res2b                                    // GHASH block 4k+2
1125	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
1126	eor     $output_l2, $output_l2, $rkN_l                    // AES block 4k+2 - round N low
1127	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
1128	stp     $output_l2, $output_h2, [$output_ptr], #16        // AES block 4k+2 - store result
1129	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
1130	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
1131	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
1132	rev64   $res3b, $res3b                                    // GHASH block 4k+3
1133	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
1134	eor     $output_l3, $output_l3, $rkN_l                    // AES block 4k+3 - round N low
1135	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
1136	eor     $output_h3, $output_h3, $rkN_h                    // AES block 4k+3 - round N high
1137	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
1138	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
1139	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
1140	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
1141	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
1142	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
1143	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
1144	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+7
1145	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
1146	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
1147	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
1148	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
1149	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
1150	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
1151	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
1152	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
1153	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
1154	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
1155	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
1156	rev     $ctr32w, $rctr32w                                 // CTR block 4k+8
1157	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
	// The mid term of block 4k+2 is copied into the upper lane so that the
	// pmull2 below can multiply it by the upper half of h12k.
1158	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
1159	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
1160	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+8
1161	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
1162	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
1163	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
1164	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
1165	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
1166	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
1167	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
1168	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
1169	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
1170	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
1171	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
1172	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
1173	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+8
1174	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
1175	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
1176	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
1177	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
1178	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
1179	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
1180	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
1181	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
	// mod_constant is rebuilt on every pass: movi puts 0xc2 in each of the
	// low eight bytes and the shl by 56 further down leaves
	// 0xc200000000000000 in the low 64 bits. The rebuild is needed because
	// the reduction at .Ldec_main_loop_continue writes a product into this
	// same register.
1182	movi    $mod_constant.8b, #0xc2
1183	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
1184	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
1185	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
1186	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
1187	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
1188	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
1189	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
	// Both branches below use the flags of the single cmp of rounds against
	// 12 above; none of the instructions in between writes the condition
	// flags.
1190	b.lt	.Ldec_main_loop_continue                          // branch if AES-128
1191
1192	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
1193	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
1194	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
1195	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
1196	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
1197	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
1198	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
1199	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
1200	b.eq	.Ldec_main_loop_continue                          // branch if AES-192
1201
1202	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
1203	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
1204	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
1205	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
1206	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
1207	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
1208	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
1209	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
1210
1211.Ldec_main_loop_continue:
	// Join point of the main loop for all key sizes. This section:
	//  - applies the round N-1 aese to blocks 4k+4 .. 4k+7,
	//  - loads the next four ciphertext blocks and advances input_ptr by 64,
	//  - stores plaintext block 4k+3 (its halves were already XORed with the
	//    last round key before arriving here) and blocks 4k+4 / 4k+5,
	//  - reduces the GHASH accumulators into acc_l (the MODULO steps),
	//  - reloads ctr0 / ctr1 with counter blocks 4k+8 / 4k+9 and leaves the
	//    value for counter block 4k+10 in ctr32x.
	//
	// MODULO: acc_m first absorbs acc_l xor acc_h (the karatsuba tidy up);
	// acc_h is then folded into acc_m and acc_m into acc_l, each fold using a
	// pmull by mod_constant plus an ext by 8 bytes. The second pmull writes
	// its product into the mod_constant register itself, so the constant is
	// gone after this section.
1212	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
1213	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
1214	ldr     $res0q, [$input_ptr, #0]                          // AES block 4k+4 - load ciphertext
1215	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
1216	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
1217	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
1218	ldr     $res1q, [$input_ptr, #16]                         // AES block 4k+5 - load ciphertext
1219	eor     $ctr0b, $res0b, $ctr0b                            // AES block 4k+4 - result
1220	stp     $output_l3, $output_h3, [$output_ptr], #16        // AES block 4k+3 - store result
1221	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
1222	ldr     $res3q, [$input_ptr, #48]                         // AES block 4k+7 - load ciphertext
1223	ldr     $res2q, [$input_ptr, #32]                         // AES block 4k+6 - load ciphertext
1224	mov     $output_h0, $ctr0.d[1]                            // AES block 4k+4 - mov high
1225	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
1226	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
1227	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
1228	mov     $output_l0, $ctr0.d[0]                            // AES block 4k+4 - mov low
1229	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4k+8
1230	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4k+8
1231	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
1232	eor     $ctr1b, $res1b, $ctr1b                            // AES block 4k+5 - result
1233	rev     $ctr32w, $rctr32w                                 // CTR block 4k+9
1234	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
1235	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+9
1236	cmp     $input_ptr, $main_end_input_ptr                   // LOOP CONTROL
1237	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+9
1238	eor     $output_l0, $output_l0, $rkN_l                    // AES block 4k+4 - round N low
1239	eor     $output_h0, $output_h0, $rkN_h                    // AES block 4k+4 - round N high
1240	mov     $output_h1, $ctr1.d[1]                            // AES block 4k+5 - mov high
1241	eor     $ctr2b, $res2b, $ctr2b                            // AES block 4k+6 - result
1242	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
1243	mov     $output_l1, $ctr1.d[0]                            // AES block 4k+5 - mov low
1244	fmov    $ctr1d, $ctr96_b64x                               // CTR block 4k+9
1245	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
1246	fmov    $ctr1.d[1], $ctr32x                               // CTR block 4k+9
1247	rev     $ctr32w, $rctr32w                                 // CTR block 4k+10
1248	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+10
1249	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
1250	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+10
1251	rev64   $res1b, $res1b                                    // GHASH block 4k+5
1252	eor     $output_h1, $output_h1, $rkN_h                    // AES block 4k+5 - round N high
1253	stp     $output_l0, $output_h0, [$output_ptr], #16        // AES block 4k+4 - store result
1254	eor     $output_l1, $output_l1, $rkN_l                    // AES block 4k+5 - round N low
1255	stp     $output_l1, $output_h1, [$output_ptr], #16        // AES block 4k+5 - store result
1256	rev64   $res0b, $res0b                                    // GHASH block 4k+4
1257	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
	// Loop back while input_ptr is below main_end_input_ptr. The flags were
	// set by the LOOP CONTROL cmp above; every add / eor / orr since then is
	// the non-flag-setting form.
	// NOTE(review): b.lt is a signed condition applied to addresses - confirm
	// that this is intended.
1258	b.lt    .Ldec_main_loop
1259
1260.Ldec_prepretail:                                                 // PREPRETAIL
	// Reached by the branch at the end of .Ldec_finish_first_blocks or by
	// falling out of the main loop. It does the same GHASH partial products
	// for ciphertext blocks 4k .. 4k+3 and the same AES rounds on counter
	// blocks 4k+4 .. 4k+7 as a main-loop pass, and reloads ctr2 / ctr3 with
	// counter blocks 4k+6 / 4k+7.
	// Unlike the main loop, this section loads no ciphertext and stores
	// nothing: the halves of blocks 4k+2 / 4k+3 are only moved into
	// general-purpose registers.
	// NOTE(review): the last-round-key XOR, the stores and the rest of the
	// reduction presumably follow at .Ldec_finish_prepretail - confirm there.
1261	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
1262	mov     $output_l2, $ctr2.d[0]                            // AES block 4k+2 - mov low
1263	eor     $ctr3b, $res3b, $ctr3b                            // AES block 4k+3 - result
1264	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
1265	mov     $output_h2, $ctr2.d[1]                            // AES block 4k+2 - mov high
1266	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
1267	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+6
1268	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+6
1269	rev     $ctr32w, $rctr32w                                 // CTR block 4k+7
1270	eor     $res0b, $res0b, $acc_lb                           // PRE 1
1271	rev64   $res2b, $res2b                                    // GHASH block 4k+2
1272	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+7
1273	mov     $output_l3, $ctr3.d[0]                            // AES block 4k+3 - mov low
1274	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
1275	mov     $output_h3, $ctr3.d[1]                            // AES block 4k+3 - mov high
1276	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
1277	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
1278	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+7
1279	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
1280	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+7
1281	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
	// acc_m briefly holds the upper half of h34k as the multiplier for the
	// block 4k mid pmull below, which then overwrites it with the product.
1282	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
1283	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
1284	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
1285	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
1286	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
1287	rev64   $res3b, $res3b                                    // GHASH block 4k+3
1288	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
1289	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
1290	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
1291	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
1292	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
1293	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
1294	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
1295	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
1296	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
1297	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
1298	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
1299	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
1300	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
1301	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
1302	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
1303	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
1304	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
1305	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
1306	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
1307	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
1308	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
1309	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
1310	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
1311	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
1312	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
1313	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
	// The mid term of block 4k+2 is copied into the upper lane so that the
	// pmull2 further down can multiply it by the upper half of h12k.
1314	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
1315	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
1316	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
1317	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
1318	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
1319	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
1320	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
1321	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
1322	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
1323	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
1324	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
1325	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
1326	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
1327	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
1328	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
1329	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
	// mod_constant: movi puts 0xc2 in each of the low eight bytes; the shl by
	// 56 further down leaves 0xc200000000000000 in the low 64 bits.
1330	movi    $mod_constant.8b, #0xc2
1331	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
1332	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
1333	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
1334	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
1335	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
1336	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
1337	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
1338	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
1339	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
1340	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
1341	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
	// t9 is free again after the mid accumulation above and is reused here for
	// the first step of the reduction (acc_l xor acc_h).
1342	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
1343	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
1344	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
1345	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
1346	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
	// Both branches below use the flags of the single cmp of rounds against
	// 12 above; none of the instructions in between writes the condition
	// flags.
1347	b.lt	.Ldec_finish_prepretail                           // branch if AES-128
1348
1349	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
1350	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
1351	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
1352	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
1353	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
1354	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
1355	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
1356	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
1357	b.eq	.Ldec_finish_prepretail                           // branch if AES-192
1358
1359	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
1360	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
1361	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
1362	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
1363	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
1364	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
1365	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
1366	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
1367
1368.Ldec_finish_prepretail:
1369	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
1370	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
1371	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
1372	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
1373	eor     $output_h2, $output_h2, $rkN_h                    // AES block 4k+2 - round N high
1374	eor     $output_l3, $output_l3, $rkN_l                    // AES block 4k+3 - round N low
1375	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
1376	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+7
1377	eor     $output_l2, $output_l2, $rkN_l                    // AES block 4k+2 - round N low
1378	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
1379	eor     $output_h3, $output_h3, $rkN_h                    // AES block 4k+3 - round N high
1380	stp     $output_l2, $output_h2, [$output_ptr], #16        // AES block 4k+2 - store result
1381	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
1382	stp     $output_l3, $output_h3, [$output_ptr], #16        // AES block 4k+3 - store result
1383
1384	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
1385	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
1386	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
1387	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
1388	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
1389	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
1390
1391.Ldec_tail:                                                       // TAIL
1392	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   // main_end_input_ptr is number of bytes left to process
1393	ld1     { $res1b}, [$input_ptr], #16                      // AES block 4k+4 - load ciphertext
1394	eor     $ctr0b, $res1b, $ctr0b                            // AES block 4k+4 - result
1395	mov     $output_l0, $ctr0.d[0]                            // AES block 4k+4 - mov low
1396	mov     $output_h0, $ctr0.d[1]                            // AES block 4k+4 - mov high
1397	ext     $t0.16b, $acc_lb, $acc_lb, #8                     // prepare final partial tag
1398	cmp     $main_end_input_ptr, #48
1399	eor     $output_l0, $output_l0, $rkN_l                    // AES block 4k+4 - round N low
1400	eor     $output_h0, $output_h0, $rkN_h                    // AES block 4k+4 - round N high
1401	b.gt    .Ldec_blocks_more_than_3
1402	sub     $rctr32w, $rctr32w, #1
1403	mov     $ctr3b, $ctr2b
1404	movi    $acc_m.8b, #0
1405	movi    $acc_l.8b, #0
1406	cmp     $main_end_input_ptr, #32
1407	movi    $acc_h.8b, #0
1408	mov     $ctr2b, $ctr1b
1409	b.gt    .Ldec_blocks_more_than_2
1410	sub     $rctr32w, $rctr32w, #1
1411	mov     $ctr3b, $ctr1b
1412	cmp     $main_end_input_ptr, #16
1413	b.gt    .Ldec_blocks_more_than_1
1414	sub     $rctr32w, $rctr32w, #1
1415	b       .Ldec_blocks_less_than_1
1416.Ldec_blocks_more_than_3:                                    // blocks left >  3
1417	rev64   $res0b, $res1b                                   // GHASH final-3 block
1418	ld1     { $res1b}, [$input_ptr], #16                     // AES final-2 block - load ciphertext
1419	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-3 block  - store result
1420	mov     $acc_md, $h34k.d[1]                              // GHASH final-3 block - mid
1421	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
1422	eor     $ctr0b, $res1b, $ctr1b                           // AES final-2 block - result
1423	mov     $rk4d, $res0.d[1]                                // GHASH final-3 block - mid
1424	mov     $output_l0, $ctr0.d[0]                           // AES final-2 block - mov low
1425	mov     $output_h0, $ctr0.d[1]                           // AES final-2 block - mov high
1426	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-3 block - mid
1427	movi    $t0.8b, #0                                       // suppress further partial tag feed in
1428	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      // GHASH final-3 block - high
1429	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   // GHASH final-3 block - mid
1430	eor     $output_l0, $output_l0, $rkN_l                   // AES final-2 block - round N low
1431	pmull   $acc_l.1q, $res0.1d, $h4.1d                      // GHASH final-3 block - low
1432	eor     $output_h0, $output_h0, $rkN_h                   // AES final-2 block - round N high
1433.Ldec_blocks_more_than_2:                                    // blocks left >  2
1434	rev64   $res0b, $res1b                                   // GHASH final-2 block
1435	ld1     { $res1b}, [$input_ptr], #16                     // AES final-1 block - load ciphertext
1436	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
1437	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-2 block  - store result
1438	eor     $ctr0b, $res1b, $ctr2b                           // AES final-1 block - result
1439	mov     $rk4d, $res0.d[1]                                // GHASH final-2 block - mid
1440	pmull   $rk3q1, $res0.1d, $h3.1d                         // GHASH final-2 block - low
1441	pmull2  $rk2q1, $res0.2d, $h3.2d                         // GHASH final-2 block - high
1442	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-2 block - mid
1443	mov     $output_l0, $ctr0.d[0]                           // AES final-1 block - mov low
1444	mov     $output_h0, $ctr0.d[1]                           // AES final-1 block - mov high
1445	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-2 block - low
1446	movi    $t0.8b, #0                                       // suppress further partial tag feed in
1447	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     // GHASH final-2 block - mid
1448	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-2 block - high
1449	eor     $output_l0, $output_l0, $rkN_l                   // AES final-1 block - round N low
1450	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-2 block - mid
1451	eor     $output_h0, $output_h0, $rkN_h                   // AES final-1 block - round N high
1452.Ldec_blocks_more_than_1:                                        // blocks left >  1
1453	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-1 block  - store result
1454	rev64   $res0b, $res1b                                   // GHASH final-1 block
1455	ld1     { $res1b}, [$input_ptr], #16                     // AES final block - load ciphertext
1456	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
1457	movi    $t0.8b, #0                                       // suppress further partial tag feed in
1458	mov     $rk4d, $res0.d[1]                                // GHASH final-1 block - mid
1459	eor     $ctr0b, $res1b, $ctr3b                           // AES final block - result
1460	pmull2  $rk2q1, $res0.2d, $h2.2d                         // GHASH final-1 block - high
1461	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-1 block - mid
1462	pmull   $rk3q1, $res0.1d, $h2.1d                         // GHASH final-1 block - low
1463	mov     $output_l0, $ctr0.d[0]                           // AES final block - mov low
1464	ins     $rk4v.d[1], $rk4v.d[0]                           // GHASH final-1 block - mid
1465	mov     $output_h0, $ctr0.d[1]                           // AES final block - mov high
1466	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     // GHASH final-1 block - mid
1467	eor     $output_l0, $output_l0, $rkN_l                   // AES final block - round N low
1468	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-1 block - low
1469	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-1 block - high
1470	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-1 block - mid
1471	eor     $output_h0, $output_h0, $rkN_h                   // AES final block - round N high
1472.Ldec_blocks_less_than_1:                                        // blocks left <= 1
1473	and     $bit_length, $bit_length, #127                   // bit_length %= 128
1474	mvn     $rkN_h, xzr                                      // rkN_h = 0xffffffffffffffff
1475	sub     $bit_length, $bit_length, #128                   // bit_length -= 128
1476	mvn     $rkN_l, xzr                                      // rkN_l = 0xffffffffffffffff
1477	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite
1478	neg     $bit_length, $bit_length                         // bit_length = 128 - #bits in input (in range [1,128])
1479	and     $bit_length, $bit_length, #127                   // bit_length %= 128
1480	lsr     $rkN_h, $rkN_h, $bit_length                      // rkN_h is mask for top 64b of last block
1481	cmp     $bit_length, #64
1482	csel    $ctr32x, $rkN_l, $rkN_h, lt
1483	csel    $ctr96_b64x, $rkN_h, xzr, lt
1484	fmov    $ctr0d, $ctr32x                                  // ctr0b is mask for last block
1485	and     $output_l0, $output_l0, $ctr32x
1486	mov     $ctr0.d[1], $ctr96_b64x
1487	bic     $end_input_ptr, $end_input_ptr, $ctr32x          // mask out low existing bytes
1488	rev     $ctr32w, $rctr32w
1489	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x      // mask out high existing bytes
1490	orr     $output_l0, $output_l0, $end_input_ptr
1491	and     $output_h0, $output_h0, $ctr96_b64x
1492	orr     $output_h0, $output_h0, $main_end_input_ptr
1493	and     $res1b, $res1b, $ctr0b                            // possibly partial last block has zeroes in highest bits
1494	rev64   $res0b, $res1b                                    // GHASH final block
1495	eor     $res0b, $res0b, $t0.16b                           // feed in partial tag
1496	pmull   $rk3q1, $res0.1d, $h1.1d                          // GHASH final block - low
1497	mov     $t0d, $res0.d[1]                                  // GHASH final block - mid
1498	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH final block - mid
1499	pmull2  $rk2q1, $res0.2d, $h1.2d                          // GHASH final block - high
1500	pmull   $t0.1q, $t0.1d, $h12k.1d                          // GHASH final block - mid
1501	eor     $acc_hb, $acc_hb, $rk2                            // GHASH final block - high
1502	eor     $acc_lb, $acc_lb, $rk3                            // GHASH final block - low
1503	eor     $acc_mb, $acc_mb, $t0.16b                         // GHASH final block - mid
1504	movi    $mod_constant.8b, #0xc2
1505	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
1506	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
1507	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
1508	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
1509	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
1510	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
1511	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
1512	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
1513	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
1514	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
1515	stp     $output_l0, $output_h0, [$output_ptr]
1516	str     $ctr32w, [$counter, #12]                          // store the updated counter
1517	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
1518	ext     $acc_lb, $acc_lb, $acc_lb, #8
1519	rev64   $acc_lb, $acc_lb
1520	mov     x0, $len
1521	st1     { $acc_l.16b }, [$current_tag]
1522	ldp     x19, x20, [sp, #16]
1523	ldp     x21, x22, [sp, #32]
1524	ldp     x23, x24, [sp, #48]
1525	ldp     d8, d9, [sp, #64]
1526	ldp     d10, d11, [sp, #80]
1527	ldp     d12, d13, [sp, #96]
1528	ldp     d14, d15, [sp, #112]
1529	ldp     x29, x30, [sp], #128
1530	AARCH64_VALIDATE_LINK_REGISTER
1531	ret
1532.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
1533___
1534}
1535}
1536
# Append the closing "#endif" line to the generated assembly text.
# NOTE(review): presumably this balances a preprocessor conditional emitted
# near the start of the generated output - confirm against the file preamble.
$code .= "#endif\n";

# Emit everything accumulated in $code, then close STDOUT explicitly so that a
# failed flush of the output is reported as a fatal error instead of being lost.
print $code;
close(STDOUT) || die "error closing STDOUT: $!";
1543