// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>

.section .rodata

.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6

.text
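
// Most of the constants above come in (lo, hi) 16-byte pairs that the code
// consumes through one idiom: split each state byte into its low and high
// nibble (AND with #0x0f, USHR #4), index a 16-byte table with each nibble
// via TBL (the AArch64 counterpart of x86 vpshufb), and combine the results
// with EOR. The remaining tables (.Lk_mc_forward, .Lk_mc_backward, .Lk_sr)
// are plain byte permutations applied with a single TBL. A minimal C model
// of the nibble-split idiom, for readers following the register comments
// below (illustrative only; nibble_lookup is not part of this file or its
// build):
//
//     #include <stdint.h>
//
//     // out[i] = tbl_lo[in[i] & 0x0f] ^ tbl_hi[in[i] >> 4]
//     static void nibble_lookup(uint8_t out[16], const uint8_t in[16],
//                               const uint8_t tbl_lo[16],
//                               const uint8_t tbl_hi[16]) {
//         for (int i = 0; i < 16; i++) {
//             out[i] = (uint8_t)(tbl_lo[in[i] & 0x0f] ^ tbl_hi[in[i] >> 4]);
//         }
//     }
//
// In the assembly this is, for example, the .Lk_ipt lookup pair at the top
// of _vpaes_encrypt_core.
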
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32                // .Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64  // .Lk_ipt, .Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10]      // .Lk_sb1, .Lk_sb2
    ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
##    %xmm0 = input
##    %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240]                 // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
                                      // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16           // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v7.16b, v17.16b       // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4           // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b     // vpshufb %xmm1, %xmm2, %xmm1
                                      // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b     // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b       // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b        // vpxor %xmm2, %xmm0, %xmm0
    b .Lenc_entry

.align 4
.Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b     // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    ld1 {v1.2d}, [x11], #16           // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b     // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b       // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b     // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b     // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    ld1 {v4.2d}, [x10]                // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b      // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    eor v2.16b, v2.16b, v5.16b        // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b      // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    eor v3.16b, v3.16b, v2.16b        // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b      // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b        // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    and x11, x11, #~(1<<6)            // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    sub w8, w8, #1                    // nr--

.Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b       // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4           // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b     // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b        // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b     // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b     // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b        // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b        // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b     // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b     // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b        // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b        // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16            // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_loop

    // middle of last round
    add x10, x11, #0x80
                                      // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
                                      // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b     // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    ld1 {v1.2d}, [x10]                // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b     // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b       // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b      // vpshufb %xmm1, %xmm0, %xmm0
    ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl vpaes_encrypt
.hidden vpaes_encrypt
.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_encrypt,.-vpaes_encrypt
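
// Seen from C, the exported wrapper above is conventionally declared as in
// the sketch below (assumed prototypes; the authoritative declarations live
// with the C/Rust code that owns this file). What the assembly itself fixes
// is the register contract: x0 = 16-byte input block, x1 = 16-byte output
// block, x2 = key schedule with the round count at byte offset 240.
//
//     #include <stdint.h>
//
//     typedef struct {
//         uint32_t rd_key[60];    // expanded round keys
//         uint32_t rounds;        // byte offset 240, read via ldr w8, [x2,#240]
//     } AES_KEY;
//
//     int  vpaes_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *out);
//     void vpaes_encrypt(const uint8_t in[16], uint8_t out[16], const AES_KEY *key);
//
//     // Encrypt one 16-byte block under a 128-bit key.
//     void one_block(uint8_t ct[16], const uint8_t pt[16], const uint8_t k[16]) {
//         AES_KEY key;
//         vpaes_set_encrypt_key(k, 128, &key);
//         vpaes_encrypt(pt, ct, &key);
//     }
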
.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240]                 // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
                                      // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16           // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v14.16b, v17.16b      // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4          // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b     // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
                                      // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b     // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b       // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b        // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b .Lenc_2x_entry

.align 4
.Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b     // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16           // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b     // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b       // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b     // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b     // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10]                // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b      // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b        // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b      // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b        // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b      // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    tbl v12.16b, {v11.16b}, v1.16b
    eor v0.16b, v0.16b, v3.16b        // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6)            // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1                    // nr--

.Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b       // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4           // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b}, v1.16b     // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    tbl v13.16b, {v19.16b}, v9.16b
    eor v1.16b, v1.16b, v0.16b        // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b}, v0.16b     // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b}, v8.16b
    tbl v4.16b, {v18.16b}, v1.16b     // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b}, v9.16b
    eor v3.16b, v3.16b, v5.16b        // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b        // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b}, v3.16b     // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b}, v11.16b
    tbl v3.16b, {v18.16b}, v4.16b     // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b}, v12.16b
    eor v2.16b, v2.16b, v1.16b        // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b        // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16            // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
                                      // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
                                      // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b     // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10]                // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b     // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b       // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b        // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b}, v1.16b      // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b}, v1.16b
    ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
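
// _vpaes_encrypt_2x above is the single-block core unrolled over two
// independent blocks: inputs in v14/v15, results in v0/v1, same key schedule,
// with the two instruction streams interleaved so that the TBL/EOR latencies
// of one block are hidden behind the other. Functionally it matches two
// single-block calls, roughly as in this illustrative sketch (not an
// interface of this file; prototypes as sketched after vpaes_encrypt above):
//
//     void encrypt_two_blocks(uint8_t out0[16], uint8_t out1[16],
//                             const uint8_t in0[16], const uint8_t in1[16],
//                             const AES_KEY *key) {
//         // The assembly interleaves these two computations instruction by
//         // instruction rather than running them back to back.
//         vpaes_encrypt(in0, out0, key);
//         vpaes_encrypt(in1, out1, key);
//     }
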
########################################################
##                                                    ##
##                    AES key schedule                ##
##                                                    ##
########################################################
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v16.16b, #0x5b               // .Lk_s63
    adrp x11, .Lk_sb1
    add x11, x11, :lo12:.Lk_sb1
    movi v17.16b, #0x0f               // .Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10]      // .Lk_inv, .Lk_ipt
    adrp x10, .Lk_dksd
    add x10, x10, :lo12:.Lk_dksd
    ld1 {v22.2d,v23.2d}, [x11]                    // .Lk_sb1
    adrp x11, .Lk_mc_forward
    add x11, x11, :lo12:.Lk_mc_forward
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64  // .Lk_dksd, .Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64  // .Lk_dkse, .Lk_dks9
    ld1 {v8.2d}, [x10]                // .Lk_rcon
    ld1 {v9.2d}, [x11]                // .Lk_mc_forward[0]
    ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
    AARCH64_SIGN_LINK_REGISTER
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat             // load the tables

    ld1 {v0.16b}, [x0],#16            // vmovdqu (%rdi), %xmm0 # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b                // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b                // vmovdqa %xmm0, %xmm7

    adrp x10, .Lk_sr                  // lea .Lk_sr(%rip),%r10
    add x10, x10, :lo12:.Lk_sr

    add x8, x8, x10

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2]                 // vmovdqu %xmm0, (%rdx)

    cmp w1, #192                      // cmp $192, %esi
    b.hi .Lschedule_256
    b.eq .Lschedule_192
    // 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
.Lschedule_128:
    mov x0, #10                       // mov $10, %esi

.Loop_schedule_128:
    sub x0, x0, #1                    // dec %esi
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle         // write output
    b .Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
.align 4
.Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0]                // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform      // input transform
    mov v6.16b, v0.16b                // vmovdqa %xmm0, %xmm6 # save short part
    eor v4.16b, v4.16b, v4.16b        // vpxor %xmm4, %xmm4, %xmm4 # clear 4
    ins v6.d[0], v4.d[0]              // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
    mov x0, #4                        // mov $4, %esi

.Loop_schedule_192:
    sub x0, x0, #1                    // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8    // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle         // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle         // save key n+1
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle         // save key n+2
    bl _vpaes_schedule_192_smear
    b .Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
.Lschedule_256:
    ld1 {v0.16b}, [x0]                // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
    bl _vpaes_schedule_transform      // input transform
    mov x0, #7                        // mov $7, %esi

.Loop_schedule_256:
    sub x0, x0, #1                    // dec %esi
    bl _vpaes_schedule_mangle         // output low result
    mov v6.16b, v0.16b                // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
    dup v0.4s, v0.s[3]                // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b                // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b                // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b                // vmovdqa %xmm5, %xmm7

    b .Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
.Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, .Lk_deskew              // lea .Lk_deskew(%rip),%r11 # prepare to deskew
    add x11, x11, :lo12:.Lk_deskew

    cbnz w3, .Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8]                 // vmovdqa (%r8,%r10),%xmm1
    adrp x11, .Lk_opt                 // lea .Lk_opt(%rip), %r11 # prepare to output transform
    add x11, x11, :lo12:.Lk_opt
    add x2, x2, #32                   // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b      // vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11]        // reload constants
    sub x2, x2, #16                   // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b       // vpxor .Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform      // output transform
    st1 {v0.2d}, [x2]                 // vmovdqu %xmm0, (%rdx) # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b        // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b        // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b        // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b        // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b        // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b        // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b        // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b        // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##    %xmm7: high side, b a x y
##    %xmm6: low side, d c 0 0
##    %xmm13: 0
##
## Outputs:
##    %xmm6: b+c+d b+c 0 0
##    %xmm0: b+c+d b+c b a
##
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2]              // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2]              // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b        // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b        // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b        // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
    mov v0.16b, v6.16b                // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0]              // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
    ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0                   // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15   // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15   // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b        // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3]                // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1    // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12   // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b        // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8    // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b       // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4           // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    eor v7.16b, v7.16b, v4.16b        // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b     // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b        // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b     // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b        // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b     // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b       // vpxor .Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b     // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b        // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b     // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b        // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
    eor v2.16b, v2.16b, v0.16b        // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b     // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b     // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b        // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b        // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b        // vmovdqa %xmm0, %xmm7
    ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b       // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4           // vpsrlb $4, %xmm0, %xmm0
                                      // vmovdqa (%r11), %xmm2 # lo
    tbl v2.16b, {v20.16b}, v1.16b     // vpshufb %xmm1, %xmm2, %xmm2
                                      // vmovdqa 16(%r11), %xmm1 # hi
    tbl v0.16b, {v21.16b}, v0.16b     // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b        // vpxor %xmm2, %xmm0, %xmm0
    ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
## On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b                // vmovdqa %xmm0, %xmm4 # save xmm0 for later
                                      // vmovdqa .Lk_mc_forward(%rip),%xmm5

    // encrypting
    eor v4.16b, v0.16b, v16.16b       // vpxor .Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16                   // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b      // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b      // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b      // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b        // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8]                 // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b        // vpxor %xmm4, %xmm3, %xmm3

.Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b      // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #48                   // add $-16, %r8
    and x8, x8, #~(1<<6)              // and $0x30, %r8
    st1 {v3.2d}, [x2]                 // vmovdqu %xmm3, (%rdx)
    ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl vpaes_set_encrypt_key
.hidden vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!              // ABI spec says so

    lsr w9, w1, #5                    // shr $5,%eax
    add w9, w9, #5                    // add $5,%eax
    str w9, [x2,#240]                 // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

    mov w3, #0                        // mov $0,%ecx
    mov x8, #0x30                     // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
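
// The lsr/add pair above implements the comment's formula,
// AES_KEY->rounds = nbits/32 + 5, i.e. 9, 11 or 13 for 128-, 192- and
// 256-bit keys. That stored value is the count _vpaes_encrypt_core loops on
// ("pull rounds" / "nr--"): the number of middle rounds, with the final
// round handled separately after the loop, so the core reads rounds + 2
// sixteen-byte round keys in total (11, 13 or 15). A one-line C restatement
// (illustrative only; the helper name is not part of this file):
//
//     static unsigned vpaes_rounds_field(unsigned key_bits) {
//         return (key_bits >> 5) + 5;    // 128 -> 9, 192 -> 11, 256 -> 13
//     }
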
.globl vpaes_ctr32_encrypt_blocks
.hidden vpaes_ctr32_encrypt_blocks
.type vpaes_ctr32_encrypt_blocks,%function
.align 4
vpaes_ctr32_encrypt_blocks:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!              // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    cbz x2, .Lctr32_done

    // Note, unlike the other functions, x2 here is measured in blocks,
    // not bytes.
    mov x17, x2
    mov x2, x3

    // Load the IV and counter portion.
    ldr w6, [x4, #12]
    ld1 {v7.16b}, [x4]

    bl _vpaes_encrypt_preheat
    tst x17, #1
    rev w6, w6                        // The counter is big-endian.
    b.eq .Lctr32_prep_loop

    // Handle one block so the remaining block count is even for
    // _vpaes_encrypt_2x.
    ld1 {v6.16b}, [x0], #16           // Load input ahead of time
    bl _vpaes_encrypt_core
    eor v0.16b, v0.16b, v6.16b        // XOR input and result
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #1
    // Update the counter.
    add w6, w6, #1
    rev w7, w6
    mov v7.s[3], w7
    b.ls .Lctr32_done

.Lctr32_prep_loop:
    // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
    // uses v14 and v15.
    mov v15.16b, v7.16b
    mov v14.16b, v7.16b
    add w6, w6, #1
    rev w7, w6
    mov v15.s[3], w7

.Lctr32_loop:
    ld1 {v6.16b,v7.16b}, [x0], #32    // Load input ahead of time
    bl _vpaes_encrypt_2x
    eor v0.16b, v0.16b, v6.16b        // XOR input and result
    eor v1.16b, v1.16b, v7.16b        // XOR input and result (#2)
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #2
    // Update the counter.
    add w7, w6, #1
    add w6, w6, #2
    rev w7, w7
    mov v14.s[3], w7
    rev w7, w6
    mov v15.s[3], w7
    b.hi .Lctr32_loop

.Lctr32_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
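
// As read from the code above, vpaes_ctr32_encrypt_blocks takes x0 = input,
// x1 = output, x2 = block count (not bytes), x3 = key schedule, x4 = 16-byte
// IV whose low 32 bits are a big-endian counter bumped once per block. A C
// model of that counter handling (illustrative only; prototype assumed, and
// AES_KEY / vpaes_encrypt as sketched earlier in this file):
//
//     #include <stdint.h>
//     #include <string.h>
//
//     static void ctr32_model(uint8_t *out, const uint8_t *in, size_t blocks,
//                             const AES_KEY *key, const uint8_t ivec[16]) {
//         uint8_t ctr[16], ks[16];
//         memcpy(ctr, ivec, 16);
//         uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
//                      ((uint32_t)ctr[14] << 8) | (uint32_t)ctr[15];
//         for (size_t i = 0; i < blocks; i++) {
//             vpaes_encrypt(ctr, ks, key);                // keystream block
//             for (int j = 0; j < 16; j++) {
//                 out[16 * i + j] = (uint8_t)(in[16 * i + j] ^ ks[j]);
//             }
//             c++;                                        // 32-bit counter, bytes 12..15 only
//             ctr[12] = (uint8_t)(c >> 24); ctr[13] = (uint8_t)(c >> 16);
//             ctr[14] = (uint8_t)(c >> 8);  ctr[15] = (uint8_t)c;
//         }
//     }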