#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# Adapted from the original x86_64 version and <[email protected]>'s ARMv8
# version.
#
# armv7, aarch64, and x86_64 differ in several ways:
#
# * x86_64 SSSE3 instructions are two-address (destination operand is also a
#   source), while NEON is three-address (destination operand is separate from
#   two sources).
#
# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16.
#
# * x86_64 instructions can take memory references, while ARM is a load/store
#   architecture. This means we sometimes need a spare register.
#
# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb),
#   while armv7 only has a 64-bit byte shuffle (vtbl).
#
# This means this armv7 version must be a mix of both aarch64 and x86_64
# implementations. armv7 and aarch64 have analogous SIMD instructions, so we
# base the instructions on aarch64. However, we cannot use aarch64's register
# allocation. x86_64's register count matches, but x86_64 is two-address.
# vpaes-armv8.pl already accounts for this in the comments, which use
# three-address AVX instructions instead of the original SSSE3 ones. We base
# register usage on these comments, which are preserved in this file.
#
# This means we do not use separate input and output registers as in aarch64 and
# cannot pin as many constants in the preheat functions. However, the load/store
# architecture means we must still deviate from x86_64 in places.
#
# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source
# and destination and 128-bit table. Fortunately, armv7 also allows addressing
# upper and lower halves of each 128-bit register. The lower half of q{N} is
# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent
# instruction,
#
#   vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0.
#
# we write:
#
#   vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0.
#   vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1.
#
# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively, and
# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note,
# however, that destination (q0) and table (q1) registers may no longer match.
# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
# two-address pshufb always matched these operands, so this is common.)
#
# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
# expands to an ADD or SUB of the pc register to find an address. That immediate
# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
# This means larger values must be more aligned.
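#
# For example, an ARM-mode immediate is an 8-bit value rotated right by an
# even amount, so an ADR expanding to "add r11, pc, #0x100" or
# "add r11, pc, #0x104" (0x41 rotated by an even amount) is encodable, but one
# needing "add r11, pc, #0x102" is not; such a constant must be moved or
# padded until its displacement becomes encodable.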
#
# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
# use either encoding (do we actually need to support this?). In ARM mode, the
# distances get large enough to require 16-byte alignment. Moving constants
# closer to their use resolves most of this, but common constants in
# _vpaes_consts are used by the whole file. Affected ADR instructions must be
# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
# constraint have been commented.
#
# For details on ARM's immediate value encoding scheme, see
# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
#
# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
#
# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
#
# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones).
#   aarch64 names registers like v0, and denotes half-width operations in an
#   instruction suffix (see below).
#
# * aarch64 embeds size and lane information in register suffixes. v0.16b is
#   16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s.
#   armv7 embeds the total size in the register name (see above) and the size
#   of each element in an instruction suffix, which may look like vmov.i8,
#   vshr.u8, or vtbl.8, depending on instruction.

use strict;

my $flavour = shift;
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir=$1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my $code = "";

$code.=<<___;
.syntax	unified

.arch	armv7-a
.fpu	neon

#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

.type	_vpaes_consts,%object
.align	7	@ totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	@ mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:@ mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		@ sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

@
@ "Hot" constants
@
.Lk_inv:	@ inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	@ input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	@ sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	@ sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	@ sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

.asciz  "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
.align	6
___

{
my ($inp,$out,$key) = map("r$_", (0..2));
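
# The q<N>#lo / q<N>#hi notation used below is rewritten by the foreach loop at
# the bottom of this file into the corresponding d registers. For example,
# "vtbl.8 q1#lo, {q2}, q1#lo" is emitted as "vtbl.8 d2, {q2}, d2", and the #hi
# form maps to d3.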

my ($invlo,$invhi) = map("q$_", (10..11));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15));

$code.=<<___;
@@
@@  _aes_preheat
@@
@@  Fills q9-q15 as specified below.
@@
.type	_vpaes_preheat,%function
.align	4
_vpaes_preheat:
	adr	r10, .Lk_inv
	vmov.i8	q9, #0x0f	@ .Lk_s0F
	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv
	add	r10, r10, #64	@ Skip .Lk_ipt, .Lk_sbo
	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
	bx	lr

@@
@@  _aes_encrypt_core
@@
@@  AES-encrypt q0.
@@
@@  Inputs:
@@     q0 = input
@@     q9-q15 as in _vpaes_preheat
@@    [$key] = scheduled keys
@@
@@  Output in q0
@@  Clobbers q1-q5, r8-r11
@@  Preserves q6-q8 so you get some local vectors
@@
@@
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	r9, $key
	ldr	r8, [$key,#240]	@ pull rounds
	adr	r11, .Lk_ipt
	@ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	@ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	vld1.64	{q2, q3}, [r11]
	adr	r11, .Lk_mc_forward+16
	vld1.64	{q5}, [r9]!	@ vmovdqu (%r9), %xmm5 # round0 key
	vand	q1, q0, q9	@ vpand %xmm9, %xmm0, %xmm1
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0
	vtbl.8	q1#lo, {q2}, q1#lo	@ vpshufb %xmm1, %xmm2, %xmm1
	vtbl.8	q1#hi, {q2}, q1#hi
	vtbl.8	q2#lo, {q3}, q0#lo	@ vpshufb %xmm0, %xmm3, %xmm2
	vtbl.8	q2#hi, {q3}, q0#hi
	veor	q0, q1, q5	@ vpxor %xmm5, %xmm1, %xmm0
	veor	q0, q0, q2	@ vpxor %xmm2, %xmm0, %xmm0

	@ .Lenc_entry ends with a bne instruction which is normally paired with
	@ subs in .Lenc_loop.
	tst	r8, r8
	b	.Lenc_entry

.align	4
.Lenc_loop:
	@ middle of middle round
	add	r10, r11, #0x40
	vtbl.8	q4#lo, {$sb1t}, q2#lo	@ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	vtbl.8	q4#hi, {$sb1t}, q2#hi
	vld1.64	{q1}, [r11]!	@ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	vtbl.8	q0#lo, {$sb1u}, q3#lo	@ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	vtbl.8	q0#hi, {$sb1u}, q3#hi
	veor	q4, q4, q5	@ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	vtbl.8	q5#lo, {$sb2t}, q2#lo	@ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	vtbl.8	q5#hi, {$sb2t}, q2#hi
	veor	q0, q0, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	vtbl.8	q2#lo, {$sb2u}, q3#lo	@ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	vtbl.8	q2#hi, {$sb2u}, q3#hi
	vld1.64	{q4}, [r10]	@ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	vtbl.8	q3#lo, {q0}, q1#lo	@ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	vtbl.8	q3#hi, {q0}, q1#hi
	veor	q2, q2, q5	@ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	@ Write to q5 instead of q0, so the table and destination registers do
	@ not overlap.
	vtbl.8	q5#lo, {q0}, q4#lo	@ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	vtbl.8	q5#hi, {q0}, q4#hi
	veor	q3, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	vtbl.8	q4#lo, {q3}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	vtbl.8	q4#hi, {q3}, q1#hi
	@ Here we restore the original q0/q5 usage.
	veor	q0, q5, q3	@ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	and	r11, r11, #~(1<<6)	@ and \$0x30, %r11 # ... mod 4
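	@ The "and" above: r11 steps through .Lk_mc_forward in 16-byte rows via
	@ the post-increment load earlier in the loop. _vpaes_consts is 128-byte
	@ aligned and .Lk_mc_forward is its first 64 bytes, so clearing bit 6
	@ wraps r11 back to the start of the table, the pointer analogue of
	@ x86_64 masking a table index with \$0x30.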
	veor	q0, q0, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	subs	r8, r8, #1	@ nr--

.Lenc_entry:
	@ top of round
	vand	q1, q0, q9	@ vpand %xmm0, %xmm9, %xmm1 # 0 = k
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vtbl.8	q5#lo, {$invhi}, q1#lo	@ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	vtbl.8	q5#hi, {$invhi}, q1#hi
	veor	q1, q1, q0	@ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vtbl.8	q3#lo, {$invlo}, q0#lo	@ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vtbl.8	q3#hi, {$invlo}, q0#hi
	vtbl.8	q4#lo, {$invlo}, q1#lo	@ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vtbl.8	q4#hi, {$invlo}, q1#hi
	veor	q3, q3, q5	@ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	veor	q4, q4, q5	@ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vtbl.8	q2#lo, {$invlo}, q3#lo	@ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	vtbl.8	q2#hi, {$invlo}, q3#hi
	vtbl.8	q3#lo, {$invlo}, q4#lo	@ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	vtbl.8	q3#hi, {$invlo}, q4#hi
	veor	q2, q2, q1	@ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	veor	q3, q3, q0	@ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	vld1.64	{q5}, [r9]!	@ vmovdqu (%r9), %xmm5
	bne	.Lenc_loop

	@ middle of last round
	add	r10, r11, #0x80

	adr	r11, .Lk_sbo
	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
	@ overlap table and destination registers.
	vld1.64	{q1}, [r11]!	@ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
	vld1.64	{q0}, [r11]	@ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	vtbl.8	q4#lo, {q1}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	vtbl.8	q4#hi, {q1}, q2#hi
	vld1.64	{q1}, [r10]	@ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	@ Write to q2 instead of q0 below, to avoid overlapping table and
	@ destination registers.
	vtbl.8	q2#lo, {q0}, q3#lo	@ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	vtbl.8	q2#hi, {q0}, q3#hi
	veor	q4, q4, q5	@ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	veor	q2, q2, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	@ Here we restore the original q0/q2 usage.
	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb %xmm1, %xmm0, %xmm0
	vtbl.8	q0#hi, {q2}, q1#hi
	bx	lr
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
	@ alignment.
	stmdb	sp!, {r7-r11,lr}
	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
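	@ (AAPCS makes d8-d15 callee-saved, so they are spilled before use.)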
	vstmdb	sp!, {d8-d11}

	vld1.64	{q0}, [$inp]
	bl	_vpaes_preheat
	bl	_vpaes_encrypt_core
	vst1.64	{q0}, [$out]

	vldmia	sp!, {d8-d11}
	ldmia	sp!, {r7-r11, pc}	@ return
.size	vpaes_encrypt,.-vpaes_encrypt

@
@  Decryption stuff
@
.type	_vpaes_decrypt_consts,%object
.align	4
_vpaes_decrypt_consts:
.Lk_dipt:	@ decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	@ decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	@ decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	@ decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	@ decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	@ decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.size	_vpaes_decrypt_consts,.-_vpaes_decrypt_consts

@@
@@  Decryption core
@@
@@  Same API as encryption core, except it clobbers q12-q15 rather than using
@@  the values from _vpaes_preheat. q9-q11 must still be set from
@@  _vpaes_preheat.
@@
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	r9, $key
	ldr	r8, [$key,#240]	@ pull rounds

	@ This function performs shuffles with various constants. The x86_64
	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
	@ version preloads those constants into registers, but ARMv7 has half
	@ the registers to work with. Instead, we load them on-demand into
	@ q12-q15, the registers normally used for preloaded constants. This is
	@ fine because decryption doesn't use those constants. The values are
	@ constant, so this does not interfere with potential 2x optimizations.
	adr	r7, .Lk_dipt

	vld1.64	{q12,q13}, [r7]	@ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl	r11, r8, #4	@ mov %rax, %r11; shl \$4, %r11
	eor	r11, r11, #0x30	@ xor \$0x30, %r11
	adr	r10, .Lk_sr
	and	r11, r11, #0x30	@ and \$0x30, %r11
	add	r11, r11, r10
	adr	r10, .Lk_mc_forward+48

	vld1.64	{q4}, [r9]!	@ vmovdqu (%r9), %xmm4 # round0 key
	vand	q1, q0, q9	@ vpand %xmm9, %xmm0, %xmm1
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0
	vtbl.8	q2#lo, {q12}, q1#lo	@ vpshufb %xmm1, %xmm2, %xmm2
	vtbl.8	q2#hi, {q12}, q1#hi
	vld1.64	{q5}, [r10]	@ vmovdqa .Lk_mc_forward+48(%rip), %xmm5
					@ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	vtbl.8	q0#lo, {q13}, q0#lo	@ vpshufb %xmm0, %xmm1, %xmm0
	vtbl.8	q0#hi, {q13}, q0#hi
	veor	q2, q2, q4	@ vpxor %xmm4, %xmm2, %xmm2
	veor	q0, q0, q2	@ vpxor %xmm2, %xmm0, %xmm0

	@ .Ldec_entry ends with a bne instruction which is normally paired with
	@ subs in .Ldec_loop.
	tst	r8, r8
	b	.Ldec_entry

.align	4
.Ldec_loop:
@
@  Inverse mix columns
@

	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
	@ the function.
	adr	r10, .Lk_dsb9
	vld1.64	{q12,q13}, [r10]!	@ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
					@ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	@ Load sbd* ahead of time.
	vld1.64	{q14,q15}, [r10]!	@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
					@ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
	vtbl.8	q4#lo, {q12}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	vtbl.8	q4#hi, {q12}, q2#hi
	vtbl.8	q1#lo, {q13}, q3#lo	@ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	vtbl.8	q1#hi, {q13}, q3#hi
	veor	q0, q4, q0	@ vpxor %xmm4, %xmm0, %xmm0

	veor	q0, q0, q1	@ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch

	@ Load sbb* ahead of time.
	vld1.64	{q12,q13}, [r10]!	@ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
					@ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt

	vtbl.8	q4#lo, {q14}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	vtbl.8	q4#hi, {q14}, q2#hi
	@ Write to q1 instead of q0, so the table and destination registers do
	@ not overlap.
	vtbl.8	q1#lo, {q0}, q5#lo	@ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vtbl.8	q1#hi, {q0}, q5#hi
	@ Here we restore the original q0/q1 usage. This instruction is
	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
	@ below.
	veor	q0, q1, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	vtbl.8	q1#lo, {q15}, q3#lo	@ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	vtbl.8	q1#hi, {q15}, q3#hi
					@ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	veor	q0, q0, q1	@ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
					@ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	@ Load sbe* ahead of time.
	vld1.64	{q14,q15}, [r10]!	@ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
					@ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet

	vtbl.8	q4#lo, {q12}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	vtbl.8	q4#hi, {q12}, q2#hi
	@ Write to q1 instead of q0, so the table and destination registers do
	@ not overlap.
	vtbl.8	q1#lo, {q0}, q5#lo	@ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vtbl.8	q1#hi, {q0}, q5#hi
	@ Here we restore the original q0/q1 usage. This instruction is
	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
	@ below.
	veor	q0, q1, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	vtbl.8	q1#lo, {q13}, q3#lo	@ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	vtbl.8	q1#hi, {q13}, q3#hi
	veor	q0, q0, q1	@ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch

	vtbl.8	q4#lo, {q14}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	vtbl.8	q4#hi, {q14}, q2#hi
	@ Write to q1 instead of q0, so the table and destination registers do
	@ not overlap.
	vtbl.8	q1#lo, {q0}, q5#lo	@ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vtbl.8	q1#hi, {q0}, q5#hi
	@ Here we restore the original q0/q1 usage. This instruction is
	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
	@ below.
	veor	q0, q1, q4	@ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	vtbl.8	q1#lo, {q15}, q3#lo	@ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	vtbl.8	q1#hi, {q15}, q3#hi
	vext.8	q5, q5, q5, #12	@ vpalignr \$12, %xmm5, %xmm5, %xmm5
	veor	q0, q0, q1	@ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	subs	r8, r8, #1	@ sub \$1,%rax # nr--

.Ldec_entry:
	@ top of round
	vand	q1, q0, q9	@ vpand %xmm9, %xmm0, %xmm1 # 0 = k
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vtbl.8	q2#lo, {$invhi}, q1#lo	@ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vtbl.8	q2#hi, {$invhi}, q1#hi
	veor	q1, q1, q0	@ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vtbl.8	q3#lo, {$invlo}, q0#lo	@ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vtbl.8	q3#hi, {$invlo}, q0#hi
	vtbl.8	q4#lo, {$invlo}, q1#lo	@ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vtbl.8	q4#hi, {$invlo}, q1#hi
	veor	q3, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	veor	q4, q4, q2	@ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vtbl.8	q2#lo, {$invlo}, q3#lo	@ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	vtbl.8	q2#hi, {$invlo}, q3#hi
	vtbl.8	q3#lo, {$invlo}, q4#lo	@ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	vtbl.8	q3#hi, {$invlo}, q4#hi
	veor	q2, q2, q1	@ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	veor	q3, q3, q0	@ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	vld1.64	{q0}, [r9]!	@ vmovdqu (%r9), %xmm0
	bne	.Ldec_loop

	@ middle of last round

	adr	r10, .Lk_dsbo

	@ Write to q1 rather than q4 to avoid overlapping table and destination.
	vld1.64	{q1}, [r10]!	@ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	vtbl.8	q4#lo, {q1}, q2#lo	@ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	vtbl.8	q4#hi, {q1}, q2#hi
	@ Write to q2 rather than q1 to avoid overlapping table and destination.
	vld1.64	{q2}, [r10]	@ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	vtbl.8	q1#lo, {q2}, q3#lo	@ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	vtbl.8	q1#hi, {q2}, q3#hi
	vld1.64	{q2}, [r11]	@ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	veor	q4, q4, q0	@ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	@ Write to q1 rather than q0 so the table and destination registers
	@ below do not overlap.
	veor	q1, q1, q4	@ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	vtbl.8	q0#lo, {q1}, q2#lo	@ vpshufb %xmm2, %xmm0, %xmm0
	vtbl.8	q0#hi, {q1}, q2#hi
	bx	lr
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	@ _vpaes_decrypt_core uses r7-r11.
	stmdb	sp!, {r7-r11,lr}
	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
	vstmdb	sp!, {d8-d11}

	vld1.64	{q0}, [$inp]
	bl	_vpaes_preheat
	bl	_vpaes_decrypt_core
	vst1.64	{q0}, [$out]

	vldmia	sp!, {d8-d11}
	ldmia	sp!, {r7-r11, pc}	@ return
.size	vpaes_decrypt,.-vpaes_decrypt
___
}
{
my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3");
my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12));

$code.=<<___;
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@                                                      @@
@@                  AES key schedule                    @@
@@                                                      @@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

@ This function diverges from both x86_64 and aarch64 in which constants are
@ pinned. x86_64 has a common preheat function for all operations. aarch64
@ separates them because it has enough registers to pin nearly all constants.
@ armv7 does not have enough registers, but needing explicit loads and stores
@ also complicates using x86_64's register allocation directly.
@
@ We pin some constants for convenience and leave q14 and q15 free to load
@ others on demand.

@
@  Key schedule constants
@
.type	_vpaes_key_consts,%object
.align	4
_vpaes_key_consts:
.Lk_dksd:	@ decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	@ decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	@ decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	@ decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	@ rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	@ output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	@ deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.size	_vpaes_key_consts,.-_vpaes_key_consts

.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	r11, .Lk_rcon
	vmov.i8	$s63, #0x5b	@ .Lk_s63
	adr	r10, .Lk_inv	@ Must be aligned to 8 mod 16.
	vmov.i8	$s0F, #0x0f	@ .Lk_s0F
	vld1.64	{$invlo,$invhi}, [r10]	@ .Lk_inv
	vld1.64	{$rcon}, [r11]	@ .Lk_rcon
	bx	lr
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
	@ so save an extra register.
	stmdb	sp!, {r3,lr}

	bl	_vpaes_key_preheat	@ load the tables

	adr	r11, .Lk_ipt	@ Must be aligned to 8 mod 16.
	vld1.64	{q0}, [$inp]!	@ vmovdqu (%rdi), %xmm0 # load key (unaligned)

	@ input transform
	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
	@ overlap table and destination.
	vmov	q4, q0	@ vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	adr	r10, .Lk_sr	@ Must be aligned to 8 mod 16.
	vmov	q7, q0	@ vmovdqa %xmm0, %xmm7

	add	r8, r8, r10
	tst	$dir, $dir
	bne	.Lschedule_am_decrypting

	@ encrypting, output zeroth round key after transform
	vst1.64	{q0}, [$out]	@ vmovdqu %xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	@ decrypting, output zeroth round key after shiftrows
	vld1.64	{q1}, [r8]	@ vmovdqa (%r8,%r10), %xmm1
	vtbl.8	q3#lo, {q4}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm3
	vtbl.8	q3#hi, {q4}, q1#hi
	vst1.64	{q3}, [$out]	@ vmovdqu %xmm3, (%rdx)
	eor	r8, r8, #0x30	@ xor \$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192	@ cmp \$192, %esi
	bhi	.Lschedule_256
	beq	.Lschedule_192
	@ 128: fall through

@@
@@  .aes_schedule_128
@@
@@  128-bit specific part of key schedule.
@@
@@  This schedule is really simple, because all its parts
@@  are accomplished by the subroutines.
@@
.Lschedule_128:
	mov	$inp, #10	@ mov \$10, %esi

.Loop_schedule_128:
	bl	_vpaes_schedule_round
	subs	$inp, $inp, #1	@ dec %esi
	beq	.Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	@ write output
	b	.Loop_schedule_128

@@
@@  .aes_schedule_192
@@
@@  192-bit specific part of key schedule.
@@
@@  The main body of this schedule is the same as the 128-bit
@@  schedule, but with more smearing. The long, high side is
@@  stored in q7 as before, and the short, low side is in
@@  the high bits of q6.
@@
@@  This schedule is somewhat nastier, however, because each
@@  round produces 192 bits of key material, or 1.5 round keys.
@@  Therefore, on each cycle we do 2 rounds and produce 3 round
@@  keys.
@@
.align	4
.Lschedule_192:
	sub	$inp, $inp, #8
	vld1.64	{q0}, [$inp]	@ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	@ input transform
	vmov	q6, q0	@ vmovdqa %xmm0, %xmm6 # save short part
	vmov.i8	q6#lo, #0	@ vpxor %xmm4, %xmm4, %xmm4 # clear 4
					@ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov	$inp, #4	@ mov \$4, %esi

.Loop_schedule_192:
	bl	_vpaes_schedule_round
	vext.8	q0, q6, q0, #8	@ vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	@ save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	@ save key n+1
	bl	_vpaes_schedule_round
	subs	$inp, $inp, #1	@ dec %esi
	beq	.Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	@ save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

@@
@@  .aes_schedule_256
@@
@@  256-bit specific part of key schedule.
@@
@@  The structure here is very similar to the 128-bit
@@  schedule, but with an additional "low side" in
@@  q6. The low side's rounds are the same as the
@@  high side's, except no rcon and no rotation.
@@
.align	4
.Lschedule_256:
	vld1.64	{q0}, [$inp]	@ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	@ input transform
	mov	$inp, #7	@ mov \$7, %esi

.Loop_schedule_256:
	bl	_vpaes_schedule_mangle	@ output low result
	vmov	q6, q0	@ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	@ high round
	bl	_vpaes_schedule_round
	subs	$inp, $inp, #1	@ dec %esi
	beq	.Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	@ low round. swap xmm7 and xmm6
	vdup.32	q0, q0#hi[1]	@ vpshufd \$0xFF, %xmm0, %xmm0
	vmov.i8	q4, #0
	vmov	q5, q7	@ vmovdqa %xmm7, %xmm5
	vmov	q7, q6	@ vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmov	q7, q5	@ vmovdqa %xmm5, %xmm7

	b	.Loop_schedule_256

@@
@@  .aes_schedule_mangle_last
@@
@@  Mangler for last round of key schedule
@@  Mangles q0
@@    when encrypting, outputs out(q0) ^ 63
@@    when decrypting, outputs unskew(q0)
@@
@@  Always called right before return... jumps to cleanup and exits
@@
.align	4
.Lschedule_mangle_last:
	@ schedule last round key from xmm0
	adr	r11, .Lk_deskew	@ lea .Lk_deskew(%rip),%r11 # prepare to deskew
	tst	$dir, $dir
	bne	.Lschedule_mangle_last_dec

	@ encrypting
	vld1.64	{q1}, [r8]	@ vmovdqa (%r8,%r10),%xmm1
	adr	r11, .Lk_opt	@ lea .Lk_opt(%rip), %r11 # prepare to output transform
	add	$out, $out, #32	@ add \$32, %rdx
	vmov	q2, q0
	vtbl.8	q0#lo, {q2}, q1#lo	@ vpshufb %xmm1, %xmm0, %xmm0 # output permute
	vtbl.8	q0#hi, {q2}, q1#hi

.Lschedule_mangle_last_dec:
	sub	$out, $out, #16	@ add \$-16, %rdx
	veor	q0, q0, $s63	@ vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	@ output transform
	vst1.64	{q0}, [$out]	@ vmovdqu %xmm0, (%rdx) # save last key

	@ cleanup
	veor	q0, q0, q0	@ vpxor %xmm0, %xmm0, %xmm0
	veor	q1, q1, q1	@ vpxor %xmm1, %xmm1, %xmm1
	veor	q2, q2, q2	@ vpxor %xmm2, %xmm2, %xmm2
	veor	q3, q3, q3	@ vpxor %xmm3, %xmm3, %xmm3
	veor	q4, q4, q4	@ vpxor %xmm4, %xmm4, %xmm4
	veor	q5, q5, q5	@ vpxor %xmm5, %xmm5, %xmm5
	veor	q6, q6, q6	@ vpxor %xmm6, %xmm6, %xmm6
	veor	q7, q7, q7	@ vpxor %xmm7, %xmm7, %xmm7
	ldmia	sp!, {r3,pc}	@ return
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

@@
@@  .aes_schedule_192_smear
@@
@@  Smear the short, low side in the 192-bit key schedule.
@@
@@  Inputs:
@@    q7: high side, b  a  x  y
@@    q6: low side, d  c  0  0
@@
@@  Outputs:
@@    q6: b+c+d  b+c  0  0
@@    q0: b+c+d  b+c  b  a
@@
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	vmov.i8	q1, #0
	vdup.32	q0, q7#hi[1]
	vshl.i64	q1, q6, #32	@ vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	vmov	q0#lo, q7#hi	@ vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	veor	q6, q6, q1	@ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	veor	q1, q1, q1	@ vpxor %xmm1, %xmm1, %xmm1
	veor	q6, q6, q0	@ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	vmov	q0, q6	@ vmovdqa %xmm6, %xmm0
	vmov	q6#lo, q1#lo	@ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
	bx	lr
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

@@
@@  .aes_schedule_round
@@
@@  Runs one main round of the key schedule on q0, q7
@@
@@  Specifically, runs subbytes on the high dword of q0
@@  then rotates it by one byte and xors into the low dword of
@@  q7.
@@
@@  Adds rcon from low byte of q8, then rotates q8 for
@@  next rcon.
@@
@@  Smears the dwords of q7 by xoring the low into the
@@  second low, result into third, result into highest.
@@
@@  Returns results in q7 = q0.
@@  Clobbers q1-q4, r11.
@@
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	@ extract rcon from xmm8
	vmov.i8	q4, #0	@ vpxor %xmm4, %xmm4, %xmm4
	vext.8	q1, $rcon, q4, #15	@ vpalignr \$15, %xmm8, %xmm4, %xmm1
	vext.8	$rcon, $rcon, $rcon, #15	@ vpalignr \$15, %xmm8, %xmm8, %xmm8
	veor	q7, q7, q1	@ vpxor %xmm1, %xmm7, %xmm7

	@ rotate
	vdup.32	q0, q0#hi[1]	@ vpshufd \$0xFF, %xmm0, %xmm0
	vext.8	q0, q0, q0, #1	@ vpalignr \$1, %xmm0, %xmm0, %xmm0

	@ fall through...

	@ low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
	@ We pin other values in _vpaes_key_preheat, so load them now.
	adr	r11, .Lk_sb1
	vld1.64	{q14,q15}, [r11]

	@ smear xmm7
	vext.8	q1, q4, q7, #12	@ vpslldq \$4, %xmm7, %xmm1
	veor	q7, q7, q1	@ vpxor %xmm1, %xmm7, %xmm7
	vext.8	q4, q4, q7, #8	@ vpslldq \$8, %xmm7, %xmm4

	@ subbytes
	vand	q1, q0, $s0F	@ vpand %xmm9, %xmm0, %xmm1 # 0 = k
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	veor	q7, q7, q4	@ vpxor %xmm4, %xmm7, %xmm7
	vtbl.8	q2#lo, {$invhi}, q1#lo	@ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vtbl.8	q2#hi, {$invhi}, q1#hi
	veor	q1, q1, q0	@ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vtbl.8	q3#lo, {$invlo}, q0#lo	@ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vtbl.8	q3#hi, {$invlo}, q0#hi
	veor	q3, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vtbl.8	q4#lo, {$invlo}, q1#lo	@ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vtbl.8	q4#hi, {$invlo}, q1#hi
	veor	q7, q7, $s63	@ vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vtbl.8	q3#lo, {$invlo}, q3#lo	@ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	vtbl.8	q3#hi, {$invlo}, q3#hi
	veor	q4, q4, q2	@ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vtbl.8	q2#lo, {$invlo}, q4#lo	@ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	vtbl.8	q2#hi, {$invlo}, q4#hi
	veor	q3, q3, q1	@ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	veor	q2, q2, q0	@ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	vtbl.8	q4#lo, {q15}, q3#lo	@ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	vtbl.8	q4#hi, {q15}, q3#hi
	vtbl.8	q1#lo, {q14}, q2#lo	@ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	vtbl.8	q1#hi, {q14}, q2#hi
	veor	q1, q1, q4	@ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	@ add in smeared stuff
	veor	q0, q1, q7	@ vpxor %xmm7, %xmm1, %xmm0
	veor	q7, q1, q7	@ vmovdqa %xmm0, %xmm7
	bx	lr
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

@@
@@  .aes_schedule_transform
@@
@@  Linear-transform q0 according to tables at [r11]
@@
@@  Requires that q9 = 0x0F0F... as in preheat
@@  Output in q0
@@  Clobbers q1, q2, q14, q15
@@
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	vld1.64	{q14,q15}, [r11]	@ vmovdqa (%r11), %xmm2 # lo
					@ vmovdqa 16(%r11), %xmm1 # hi
	vand	q1, q0, $s0F	@ vpand %xmm9, %xmm0, %xmm1
	vshr.u8	q0, q0, #4	@ vpsrlb \$4, %xmm0, %xmm0
	vtbl.8	q2#lo, {q14}, q1#lo	@ vpshufb %xmm1, %xmm2, %xmm2
	vtbl.8	q2#hi, {q14}, q1#hi
	vtbl.8	q0#lo, {q15}, q0#lo	@ vpshufb %xmm0, %xmm1, %xmm0
	vtbl.8	q0#hi, {q15}, q0#hi
	veor	q0, q0, q2	@ vpxor %xmm2, %xmm0, %xmm0
	bx	lr
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

@@
@@  .aes_schedule_mangle
@@
@@  Mangles q0 from (basis-transformed) standard version
@@  to our version.
@@
@@  On encrypt,
@@    xor with 0x63
@@    multiply by circulant 0,1,1,1
@@    apply shiftrows transform
@@
@@  On decrypt,
@@    xor with 0x63
@@    multiply by "inverse mixcolumns" circulant E,B,D,9
@@    deskew
@@    apply shiftrows transform
@@
@@
@@  Writes out to [r2], and increments or decrements it
@@  Keeps track of round number mod 4 in r8
@@  Preserves q0
@@  Clobbers q1-q5
@@
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	tst	$dir, $dir
	vmov	q4, q0	@ vmovdqa %xmm0, %xmm4 # save xmm0 for later
	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
	vld1.64	{q5}, [r11]	@ vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne	.Lschedule_mangle_dec

	@ encrypting
	@ Write to q2 so we do not overlap table and destination below.
	veor	q2, q0, $s63	@ vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16	@ add \$16, %rdx
	vtbl.8	q4#lo, {q2}, q5#lo	@ vpshufb %xmm5, %xmm4, %xmm4
	vtbl.8	q4#hi, {q2}, q5#hi
	vtbl.8	q1#lo, {q4}, q5#lo	@ vpshufb %xmm5, %xmm4, %xmm1
	vtbl.8	q1#hi, {q4}, q5#hi
	vtbl.8	q3#lo, {q1}, q5#lo	@ vpshufb %xmm5, %xmm1, %xmm3
	vtbl.8	q3#hi, {q1}, q5#hi
	veor	q4, q4, q1	@ vpxor %xmm1, %xmm4, %xmm4
	vld1.64	{q1}, [r8]	@ vmovdqa (%r8,%r10), %xmm1
	veor	q3, q3, q4	@ vpxor %xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	@ inverse mix columns
	adr	r11, .Lk_dksd	@ lea .Lk_dksd(%rip),%r11
	vshr.u8	q1, q4, #4	@ vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	vand	q4, q4, $s0F	@ vpand %xmm9, %xmm4, %xmm4 # 4 = lo

	vld1.64	{q14,q15}, [r11]!	@ vmovdqa 0x00(%r11), %xmm2
					@ vmovdqa 0x10(%r11), %xmm3
	vtbl.8	q2#lo, {q14}, q4#lo	@ vpshufb %xmm4, %xmm2, %xmm2
	vtbl.8	q2#hi, {q14}, q4#hi
	vtbl.8	q3#lo, {q15}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm3
	vtbl.8	q3#hi, {q15}, q1#hi
	@ Load .Lk_dksb ahead of time.
	vld1.64	{q14,q15}, [r11]!	@ vmovdqa 0x20(%r11), %xmm2
					@ vmovdqa 0x30(%r11), %xmm3
	@ Write to q13 so we do not overlap table and destination.
	veor	q13, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3
	vtbl.8	q3#lo, {q13}, q5#lo	@ vpshufb %xmm5, %xmm3, %xmm3
	vtbl.8	q3#hi, {q13}, q5#hi

	vtbl.8	q2#lo, {q14}, q4#lo	@ vpshufb %xmm4, %xmm2, %xmm2
	vtbl.8	q2#hi, {q14}, q4#hi
	veor	q2, q2, q3	@ vpxor %xmm3, %xmm2, %xmm2
	vtbl.8	q3#lo, {q15}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm3
	vtbl.8	q3#hi, {q15}, q1#hi
	@ Load .Lk_dkse ahead of time.
	vld1.64	{q14,q15}, [r11]!	@ vmovdqa 0x40(%r11), %xmm2
					@ vmovdqa 0x50(%r11), %xmm3
	@ Write to q13 so we do not overlap table and destination.
	veor	q13, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3
	vtbl.8	q3#lo, {q13}, q5#lo	@ vpshufb %xmm5, %xmm3, %xmm3
	vtbl.8	q3#hi, {q13}, q5#hi

	vtbl.8	q2#lo, {q14}, q4#lo	@ vpshufb %xmm4, %xmm2, %xmm2
	vtbl.8	q2#hi, {q14}, q4#hi
	veor	q2, q2, q3	@ vpxor %xmm3, %xmm2, %xmm2
	vtbl.8	q3#lo, {q15}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm3
	vtbl.8	q3#hi, {q15}, q1#hi
	@ Load .Lk_dks9 ahead of time.
	vld1.64	{q14,q15}, [r11]!	@ vmovdqa 0x60(%r11), %xmm2
					@ vmovdqa 0x70(%r11), %xmm4
	@ Write to q13 so we do not overlap table and destination.
	veor	q13, q3, q2	@ vpxor %xmm2, %xmm3, %xmm3

	vtbl.8	q2#lo, {q14}, q4#lo	@ vpshufb %xmm4, %xmm2, %xmm2
	vtbl.8	q2#hi, {q14}, q4#hi
	vtbl.8	q3#lo, {q13}, q5#lo	@ vpshufb %xmm5, %xmm3, %xmm3
	vtbl.8	q3#hi, {q13}, q5#hi
	vtbl.8	q4#lo, {q15}, q1#lo	@ vpshufb %xmm1, %xmm4, %xmm4
	vtbl.8	q4#hi, {q15}, q1#hi
	vld1.64	{q1}, [r8]	@ vmovdqa (%r8,%r10), %xmm1
	veor	q2, q2, q3	@ vpxor %xmm3, %xmm2, %xmm2
	veor	q3, q4, q2	@ vpxor %xmm2, %xmm4, %xmm3

	sub	$out, $out, #16	@ add \$-16, %rdx

.Lschedule_mangle_both:
	@ Write to q2 so table and destination do not overlap.
	vtbl.8	q2#lo, {q3}, q1#lo	@ vpshufb %xmm1, %xmm3, %xmm3
	vtbl.8	q2#hi, {q3}, q1#hi
	add	r8, r8, #64-16	@ add \$-16, %r8
	and	r8, r8, #~(1<<6)	@ and \$0x30, %r8
	vst1.64	{q2}, [$out]	@ vmovdqu %xmm3, (%rdx)
	bx	lr
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	stmdb	sp!, {r7-r11, lr}
	vstmdb	sp!, {d8-d15}

	lsr	r9, $bits, #5	@ shr \$5,%eax
	add	r9, r9, #5	@ \$5,%eax
	str	r9, [$out,#240]	@ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	mov	$dir, #0	@ mov \$0,%ecx
	mov	r8, #0x30	@ mov \$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	r0, r0, r0

	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r7-r11, pc}	@ return
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	stmdb	sp!, {r7-r11, lr}
	vstmdb	sp!, {d8-d15}

	lsr	r9, $bits, #5	@ shr \$5,%eax
	add	r9, r9, #5	@ \$5,%eax
	str	r9, [$out,#240]	@ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	lsl	r9, r9, #4	@ shl \$4,%eax
	add	$out, $out, #16	@ lea 16(%rdx,%rax),%rdx
	add	$out, $out, r9

	mov	$dir, #1	@ mov \$1,%ecx
	lsr	r8, $bits, #1	@ shr \$1,%r8d
	and	r8, r8, #32	@ and \$32,%r8d
	eor	r8, r8, #32	@ xor \$32,%r8d # nbits==192?0:32
	bl	_vpaes_schedule_core

	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r7-r11, pc}	@ return
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
___
}

{
my ($out, $inp) = map("r$_", (0..1));
my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));

$code .= <<___;

@ Additional constants for converting to bsaes.
.type	_vpaes_convert_consts,%object
.align	4
_vpaes_convert_consts:
@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
@ transform in the AES S-box. 0x63 is incorporated into the low half of the
@ table. This was computed with the following script:
@
@   def u64s_to_u128(x, y):
@       return x | (y << 64)
@   def u128_to_u64s(w):
@       return w & ((1<<64)-1), w >> 64
@   def get_byte(w, i):
@       return (w >> (i*8)) & 0xff
@   def apply_table(table, b):
@       lo = b & 0xf
@       hi = b >> 4
@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
@   def opt(b):
@       table = [
@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
@       ]
@       return apply_table(table, b)
@   def rot_byte(b, n):
@       return 0xff & ((b << n) | (b >> (8-n)))
@   def skew(x):
@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
@               rot_byte(x, 4))
@   table = [0, 0]
@   for i in range(16):
@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
@       table[1] |= skew(opt(i<<4)) << (i*8)
@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
.Lk_opt_then_skew:
	.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
	.quad	0x1f30062936192f00, 0xb49bad829db284ab

@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
@ becomes 0x22334411 and then 0x11443322.
.Lk_decrypt_transform:
	.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
.size	_vpaes_convert_consts,.-_vpaes_convert_consts

@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
.globl	vpaes_encrypt_key_to_bsaes
.type	vpaes_encrypt_key_to_bsaes,%function
.align	4
vpaes_encrypt_key_to_bsaes:
	stmdb	sp!, {r11, lr}

	@ See _vpaes_schedule_core for the key schedule logic. In particular,
	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
	@ contain the transformations not in the bsaes representation. This
	@ function inverts those transforms.
	@
	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
	@ representation, which does not match the other aes_nohw_*
	@ implementations. The ARM aes_nohw_* stores each 32-bit word
	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
	@ cost of extra REV and VREV32 operations in little-endian ARM.

	vmov.i8	$s0F, #0x0f	@ Required by _vpaes_schedule_transform
	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
	add	r3, r2, #0x90	@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)

	vld1.64	{$mc_forward}, [r2]
	vmov.i8	$s63, #0x5b	@ .Lk_s63 from vpaes-x86_64
	adr	r11, .Lk_opt	@ Must be aligned to 8 mod 16.
	vmov.i8	$s63_raw, #0x63	@ .Lk_s63 without .Lk_ipt applied

	@ vpaes stores one fewer round count than bsaes, but the number of keys
	@ is the same.
	ldr	r2, [$inp,#240]
	add	r2, r2, #1
	str	r2, [$out,#240]

	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
	@ Invert this with .Lk_opt.
	vld1.64	{q0}, [$inp]!
	bl	_vpaes_schedule_transform
	vrev32.8	q0, q0
	vst1.64	{q0}, [$out]!

	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
.Loop_enc_key_to_bsaes:
	vld1.64	{q0}, [$inp]!

	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
	@ We use r3 rather than r8 to avoid a callee-saved register.
	vld1.64	{q1}, [r3]
	vtbl.8	q2#lo, {q0}, q1#lo
	vtbl.8	q2#hi, {q0}, q1#hi
	add	r3, r3, #16
	and	r3, r3, #~(1<<6)
	vmov	q0, q2

	@ Handle the last key differently.
	subs	r2, r2, #1
	beq	.Loop_enc_key_to_bsaes_last

	@ Multiply by the circulant. This is its own inverse.
	vtbl.8	q1#lo, {q0}, $mc_forward#lo
	vtbl.8	q1#hi, {q0}, $mc_forward#hi
	vmov	q0, q1
	vtbl.8	q2#lo, {q1}, $mc_forward#lo
	vtbl.8	q2#hi, {q1}, $mc_forward#hi
	veor	q0, q0, q2
	vtbl.8	q1#lo, {q2}, $mc_forward#lo
	vtbl.8	q1#hi, {q2}, $mc_forward#hi
	veor	q0, q0, q1

	@ XOR and finish.
	veor	q0, q0, $s63
	bl	_vpaes_schedule_transform
	vrev32.8	q0, q0
	vst1.64	{q0}, [$out]!
	b	.Loop_enc_key_to_bsaes

.Loop_enc_key_to_bsaes_last:
	@ The final key does not have a basis transform (note
	@ .Lschedule_mangle_last inverts the original transform). It only XORs
	@ 0x63 and applies ShiftRows. The latter was already inverted in the
	@ loop. Note that, because we act on the original representation, we use
	@ $s63_raw, not $s63.
	veor	q0, q0, $s63_raw
	vrev32.8	q0, q0
	vst1.64	{q0}, [$out]

	@ Wipe registers which contained key material.
	veor	q0, q0, q0
	veor	q1, q1, q1
	veor	q2, q2, q2

	ldmia	sp!, {r11, pc}	@ return
.size	vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes

@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
.globl	vpaes_decrypt_key_to_bsaes
.type	vpaes_decrypt_key_to_bsaes,%function
.align	4
vpaes_decrypt_key_to_bsaes:
	stmdb	sp!, {r11, lr}

	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
	@ computes the decryption key schedule in reverse. Additionally,
	@ aes-x86_64.pl shares some transformations, so we must only partially
	@ invert vpaes's transformations. In general, vpaes computes in a
	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
	@
	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
	@ representation, which does not match the other aes_nohw_*
	@ implementations. The ARM aes_nohw_* stores each 32-bit word
	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
	@ cost of extra REV and VREV32 operations in little-endian ARM.

	adr	r2, .Lk_decrypt_transform
	adr	r3, .Lk_sr+0x30
	adr	r11, .Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
	vld1.64	{$mc_forward}, [r2]	@ Reuse $mc_forward from encryption.
	vmov.i8	$s0F, #0x0f	@ Required by _vpaes_schedule_transform

	@ vpaes stores one fewer round count than bsaes, but the number of keys
	@ is the same.
	ldr	r2, [$inp,#240]
	add	r2, r2, #1
	str	r2, [$out,#240]

	@ Undo the basis change and reapply the S-box affine transform. See
	@ .Lschedule_mangle_last.
	vld1.64	{q0}, [$inp]!
	bl	_vpaes_schedule_transform
	vrev32.8	q0, q0
	vst1.64	{q0}, [$out]!

	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
	@ it simultaneously inverts MixColumns and the S-box affine transform.
	@ See .Lk_dksd through .Lk_dks9.
.Loop_dec_key_to_bsaes:
	vld1.64	{q0}, [$inp]!

	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going
	@ forwards cancels inverting for which direction we cycle r3. We use r3
	@ rather than r8 to avoid a callee-saved register.
	vld1.64	{q1}, [r3]
	vtbl.8	q2#lo, {q0}, q1#lo
	vtbl.8	q2#hi, {q0}, q1#hi
	add	r3, r3, #64-16
	and	r3, r3, #~(1<<6)
	vmov	q0, q2

	@ Handle the last key differently.
	subs	r2, r2, #1
	beq	.Loop_dec_key_to_bsaes_last

	@ Undo the basis change and reapply the S-box affine transform.
	bl	_vpaes_schedule_transform

	@ Rotate each word by 8 bits (cycle the rows) and then byte-swap. We
	@ combine the two operations in .Lk_decrypt_transform.
	@
	@ TODO(davidben): Where does the rotation come from?
	vtbl.8	q1#lo, {q0}, $mc_forward#lo
	vtbl.8	q1#hi, {q0}, $mc_forward#hi

	vst1.64	{q1}, [$out]!
	b	.Loop_dec_key_to_bsaes

.Loop_dec_key_to_bsaes_last:
	@ The final key only inverts ShiftRows (already done in the loop). See
	@ .Lschedule_am_decrypting. Its basis is not transformed.
	vrev32.8	q0, q0
	vst1.64	{q0}, [$out]!

	@ Wipe registers which contained key material.
	veor	q0, q0, q0
	veor	q1, q1, q1
	veor	q2, q2, q2

	ldmia	sp!, {r11, pc}	@ return
.size	vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
___
}

{
# Register-passed parameters.
my ($inp, $out, $len, $key) = map("r$_", 0..3);
# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
# $tmp. $ctr is r7 because it must be preserved across calls.
my ($ctr, $ivec, $tmp) = map("r$_", 7..9);

# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
#                                 const AES_KEY *key, const uint8_t ivec[16]);
$code .= <<___;
.globl	vpaes_ctr32_encrypt_blocks
.type	vpaes_ctr32_encrypt_blocks,%function
.align	4
vpaes_ctr32_encrypt_blocks:
	mov	ip, sp
	stmdb	sp!, {r7-r11, lr}
	@ This function uses q4-q7 (d8-d15), which are callee-saved.
	vstmdb	sp!, {d8-d15}

	cmp	$len, #0
	@ $ivec is passed on the stack.
	ldr	$ivec, [ip]
	beq	.Lctr32_done

	@ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
	mov	$tmp, $key
	mov	$key, $len
	mov	$len, $tmp
___
my ($len, $key) = ($key, $len);
$code .= <<___;

	@ Load the IV and counter portion.
	ldr	$ctr, [$ivec, #12]
	vld1.8	{q7}, [$ivec]

	bl	_vpaes_preheat
	rev	$ctr, $ctr	@ The counter is big-endian.

.Lctr32_loop:
	vmov	q0, q7
	vld1.8	{q6}, [$inp]!	@ Load input ahead of time
	bl	_vpaes_encrypt_core
	veor	q0, q0, q6	@ XOR input and result
	vst1.8	{q0}, [$out]!
	subs	$len, $len, #1
	@ Update the counter.
	add	$ctr, $ctr, #1
	rev	$tmp, $ctr
	vmov.32	q7#hi[1], $tmp
	bne	.Lctr32_loop

.Lctr32_done:
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r7-r11, pc}	@ return
.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
___
}

foreach (split("\n",$code)) {
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
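
# Example invocation (illustrative only; the flavour and output file names are
# chosen by the build system and simply forwarded to arm-xlate.pl above):
#
#   perl vpaes-armv7.pl linux32 vpaes-armv7.S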