// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__ >= 8

.arch armv8-a+crypto
.text
// ---------------------------------------------------------------------------
// aes_gcm_enc_kernel
//
// Encrypts the input with AES in counter mode and folds the produced
// ciphertext into a GHASH accumulator. Four counter blocks are processed per
// main-loop iteration; AES rounds 0..N-2 use aese+aesmc, round N-1 uses aese
// alone, and the round N key is XORed into the plaintext in general-purpose
// registers before the keystream is XORed in.
//
// Register arguments, as consumed by the code below:
//   x0  input pointer (advanced as blocks are consumed)
//   x1  input length in bits (shifted right by 3 to obtain byte_len)
//   x2  output pointer (advanced as blocks are stored)
//   x3  16-byte GHASH state, loaded on entry and stored back on exit
//   x4  16-byte counter block; the updated 32-bit counter is stored at +12
//   x5  round-key table: 16-byte keys rk0.. from offset 0, plus a 32-bit
//       word at offset 240 that selects the AES-128/192/256 path
//   x6  hash-key table, read at offsets 0, 32, 48 and 80 (h1, h2, h3, h4)
// Returns byte_len in x0.
//
// x19-x24, d8-d15, x29 and x30 are saved in a 128-byte stack frame and
// restored before returning.
//
// Long-lived register roles inside the body:
//   x16      counter block pointer (copy of x4)
//   x8       round-key table pointer (copy of x5)
//   x17      word loaded from [x8, #240], compared against 12
//   x13:x14  round N key, v31 round N-1 key, v18-v30 rk0-rk12
//   x10:x11  ctr96_b64 : ctr96_t32, w12 running 32-bit counter (byte-reversed)
//   x4       end_input_ptr, x5 main-loop input bound, x15 byte_len
//   v12-v15  h1-h4, v16/v17 Karatsuba "k" values for h1,h2 / h3,h4
//   v11/v9/v10  GHASH low / high / mid accumulators
// ---------------------------------------------------------------------------
.globl aes_gcm_enc_kernel

.def aes_gcm_enc_kernel
   .type 32
.endef
.align 4
aes_gcm_enc_kernel:
	AARCH64_SIGN_LINK_REGISTER
	// Allocate the 128-byte frame and save the callee-saved registers used below.
	stp x29, x30, [sp, #-128]!
	mov x29, sp
	stp x19, x20, [sp, #16]
	mov x16, x4 // keep the counter block pointer; x4 is reused as end_input_ptr
	mov x8, x5 // keep the round-key table pointer; x5 is reused as a byte count
	stp x21, x22, [sp, #32]
	stp x23, x24, [sp, #48]
	stp d8, d9, [sp, #64]
	stp d10, d11, [sp, #80]
	stp d12, d13, [sp, #96]
	stp d14, d15, [sp, #112]
	ldr w17, [x8, #240] // word at offset 240: indexes the last round key and is compared against 12 below
	add x19, x8, x17, lsl #4 // borrow input_l1 for last key
	ldp x13, x14, [x19] // load round N keys
	ldr q31, [x19, #-16] // load round N-1 keys
	add x4, x0, x1, lsr #3 // end_input_ptr
	lsr x5, x1, #3 // byte_len
	mov x15, x5 // remember byte_len; it is returned in x0 at exit
	ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
	ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
	sub x5, x5, #1 // byte_len - 1
	ldr q18, [x8, #0] // load rk0
	and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ldr q25, [x8, #112] // load rk7
	add x5, x5, x0 // x5 = input pointer bound that x0 is compared against for the main loop
	lsr x12, x11, #32 // x12 = upper 32 bits of the second counter doubleword
	fmov d2, x10 // CTR block 2
	orr w11, w11, w11 // 32-bit write clears the upper half of x11, leaving only ctr96_t32
	rev w12, w12 // rev_ctr32
	fmov d1, x10 // CTR block 1
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 0
	add w12, w12, #1 // increment rev_ctr32
	rev w9, w12 // CTR block 1
	fmov d3, x10 // CTR block 3
	orr x9, x11, x9, lsl #32 // CTR block 1
	add w12, w12, #1 // CTR block 1
	ldr q19, [x8, #16] // load rk1
	fmov v1.d[1], x9 // CTR block 1
	rev w9, w12 // CTR block 2
	add w12, w12, #1 // CTR block 2
	orr x9, x11, x9, lsl #32 // CTR block 2
	ldr q20, [x8, #32] // load rk2
	fmov v2.d[1], x9 // CTR block 2
	rev w9, w12 // CTR block 3
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 1
	orr x9, x11, x9, lsl #32 // CTR block 3
	fmov v3.d[1], x9 // CTR block 3
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 0
	ldr q21, [x8, #48] // load rk3
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 2
	ldr q24, [x8, #96] // load rk6
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 0
	ldr q23, [x8, #80] // load rk5
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 1
	ldr q14, [x6, #48] // load h3l | h3h
	ext v14.16b, v14.16b, v14.16b, #8 // swap the two 64-bit halves of h3
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 0
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 1
	ldr q22, [x8, #64] // load rk4
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 2
	ldr q13, [x6, #32] // load h2l | h2h
	ext v13.16b, v13.16b, v13.16b, #8 // swap the two 64-bit halves of h2
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 1
	ldr q30, [x8, #192] // load rk12
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 2
	ldr q15, [x6, #80] // load h4l | h4h
	ext v15.16b, v15.16b, v15.16b, #8 // swap the two 64-bit halves of h4
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 3
	ldr q29, [x8, #176] // load rk11
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 2
	ldr q26, [x8, #128] // load rk8
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 3
	add w12, w12, #1 // CTR block 3
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 3
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 3
	// Load the GHASH state from [x3]; the same ext + rev64 pair is applied again
	// before it is stored back at exit.
	ld1 { v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 4
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 4
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 4
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 4
	cmp x17, #12 // setup flags for AES-128/192/256 check
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 5
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 5
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 5
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 5
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 6
	trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 6
	ldr q27, [x8, #144] // load rk9
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 6
	ldr q12, [x6] // load h1l | h1h
	ext v12.16b, v12.16b, v12.16b, #8 // swap the two 64-bit halves of h1
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 6
	ldr q28, [x8, #160] // load rk10
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 7
	trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 7
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 7
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 7
	trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 8
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 8
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 8
	// Flags still come from "cmp x17, #12" above; none of the vector
	// instructions in between write the condition flags.
	b.lt Lenc_finish_first_blocks // branch if AES-128

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 9
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 9
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 9
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 9
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 10
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 10
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 10
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 10
	b.eq Lenc_finish_first_blocks // branch if AES-192

	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 11
	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 11
	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 11
	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 11
	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b // AES block 1 - round 12
	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b // AES block 2 - round 12
	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b // AES block 0 - round 12
	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b // AES block 3 - round 12

Lenc_finish_first_blocks:
	cmp x0, x5 // check if we have <= 4 blocks
	eor v17.16b, v17.16b, v9.16b // h4k | h3k
	aese v2.16b, v31.16b // AES block 2 - round N-1
	trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
	aese v1.16b, v31.16b // AES block 1 - round N-1
	aese v0.16b, v31.16b // AES block 0 - round N-1
	aese v3.16b, v31.16b // AES block 3 - round N-1
	eor v16.16b, v16.16b, v8.16b // h2k | h1k
	b.ge Lenc_tail // handle tail

	// First four blocks: XOR the round N key into the plaintext in x-registers,
	// move it to vector registers, XOR with the keystream and store, while the
	// next four counter blocks are assembled in v0-v3.
	ldp x19, x20, [x0, #16] // AES block 1 - load plaintext
	rev w9, w12 // CTR block 4
	ldp x6, x7, [x0, #0] // AES block 0 - load plaintext
	ldp x23, x24, [x0, #48] // AES block 3 - load plaintext
	ldp x21, x22, [x0, #32] // AES block 2 - load plaintext
	add x0, x0, #64 // AES input_ptr update
	eor x19, x19, x13 // AES block 1 - round N low
	eor x20, x20, x14 // AES block 1 - round N high
	fmov d5, x19 // AES block 1 - mov low
	eor x6, x6, x13 // AES block 0 - round N low
	eor x7, x7, x14 // AES block 0 - round N high
	eor x24, x24, x14 // AES block 3 - round N high
	fmov d4, x6 // AES block 0 - mov low
	cmp x0, x5 // check if we have <= 8 blocks
	fmov v4.d[1], x7 // AES block 0 - mov high
	eor x23, x23, x13 // AES block 3 - round N low
	eor x21, x21, x13 // AES block 2 - round N low
	fmov v5.d[1], x20 // AES block 1 - mov high
	fmov d6, x21 // AES block 2 - mov low
	add w12, w12, #1 // CTR block 4
	orr x9, x11, x9, lsl #32 // CTR block 4
	fmov d7, x23 // AES block 3 - mov low
	eor x22, x22, x14 // AES block 2 - round N high
	fmov v6.d[1], x22 // AES block 2 - mov high
	eor v4.16b, v4.16b, v0.16b // AES block 0 - result
	fmov d0, x10 // CTR block 4
	fmov v0.d[1], x9 // CTR block 4
	rev w9, w12 // CTR block 5
	add w12, w12, #1 // CTR block 5
	eor v5.16b, v5.16b, v1.16b // AES block 1 - result
	fmov d1, x10 // CTR block 5
	orr x9, x11, x9, lsl #32 // CTR block 5
	fmov v1.d[1], x9 // CTR block 5
	rev w9, w12 // CTR block 6
	st1 { v4.16b}, [x2], #16 // AES block 0 - store result
	fmov v7.d[1], x24 // AES block 3 - mov high
	orr x9, x11, x9, lsl #32 // CTR block 6
	eor v6.16b, v6.16b, v2.16b // AES block 2 - result
	st1 { v5.16b}, [x2], #16 // AES block 1 - store result
	add w12, w12, #1 // CTR block 6
	fmov d2, x10 // CTR block 6
	fmov v2.d[1], x9 // CTR block 6
	st1 { v6.16b}, [x2], #16 // AES block 2 - store result
	rev w9, w12 // CTR block 7
	orr x9, x11, x9, lsl #32 // CTR block 7
	eor v7.16b, v7.16b, v3.16b // AES block 3 - result
	st1 { v7.16b}, [x2], #16 // AES block 3 - store result
	// Flags are from "cmp x0, x5" above (the <= 8 blocks check).
	b.ge Lenc_prepretail // do prepretail

	// Main loop: each iteration encrypts four new counter blocks (v0-v3) while
	// hashing the four ciphertext blocks produced previously (v4-v7), then
	// reduces the GHASH accumulator.
Lenc_main_loop: // main loop start
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
	fmov d3, x10 // CTR block 4k+3
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
	ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
	fmov v3.d[1], x9 // CTR block 4k+3
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
	ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
	ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
	eor v4.16b, v4.16b, v11.16b // PRE 1
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
	eor x23, x23, x13 // AES block 4k+7 - round N low
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
	mov d10, v17.d[1] // GHASH block 4k - mid
	pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
	eor x22, x22, x14 // AES block 4k+6 - round N high
	mov d8, v4.d[1] // GHASH block 4k - mid
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
	rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
	pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
	eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
	rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
	pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
	pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
	rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
	pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
	eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
	mov d4, v5.d[1] // GHASH block 4k+1 - mid
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
	eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
	mov d8, v6.d[1] // GHASH block 4k+2 - mid
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
	eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
	eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
	pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
	ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
	eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
	pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
	pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
	pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
	eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
	ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
	mov d4, v7.d[1] // GHASH block 4k+3 - mid
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
	eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
	pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
	pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
	eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
	eor x19, x19, x13 // AES block 4k+5 - round N low
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
	eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
	eor x21, x21, x13 // AES block 4k+6 - round N low
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
	movi v8.8b, #0xc2 // first half of mod_constant; shifted left by 56 at Lenc_main_loop_continue
	pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
	eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
	cmp x17, #12 // setup flags for AES-128/192/256 check
	fmov d5, x19 // AES block 4k+5 - mov low
	ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext
	b.lt Lenc_main_loop_continue // branch if AES-128

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
	b.eq Lenc_main_loop_continue // branch if AES-192

	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 12

Lenc_main_loop_continue:
	shl d8, d8, #56 // mod_constant
	eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
	eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
	add w12, w12, #1 // CTR block 4k+3
	eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
	add x0, x0, #64 // AES input_ptr update
	pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
	rev w9, w12 // CTR block 4k+8
	ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
	eor x6, x6, x13 // AES block 4k+4 - round N low
	eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
	eor x7, x7, x14 // AES block 4k+4 - round N high
	fmov d4, x6 // AES block 4k+4 - mov low
	orr x9, x11, x9, lsl #32 // CTR block 4k+8
	eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid
	eor x20, x20, x14 // AES block 4k+5 - round N high
	eor x24, x24, x14 // AES block 4k+7 - round N high
	add w12, w12, #1 // CTR block 4k+8
	aese v0.16b, v31.16b // AES block 4k+4 - round N-1
	fmov v4.d[1], x7 // AES block 4k+4 - mov high
	eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
	fmov d7, x23 // AES block 4k+7 - mov low
	aese v1.16b, v31.16b // AES block 4k+5 - round N-1
	fmov v5.d[1], x20 // AES block 4k+5 - mov high
	fmov d6, x21 // AES block 4k+6 - mov low
	cmp x0, x5 // LOOP CONTROL
	fmov v6.d[1], x22 // AES block 4k+6 - mov high
	pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
	eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result
	fmov d0, x10 // CTR block 4k+8
	fmov v0.d[1], x9 // CTR block 4k+8
	rev w9, w12 // CTR block 4k+9
	add w12, w12, #1 // CTR block 4k+9
	eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result
	fmov d1, x10 // CTR block 4k+9
	orr x9, x11, x9, lsl #32 // CTR block 4k+9
	fmov v1.d[1], x9 // CTR block 4k+9
	aese v2.16b, v31.16b // AES block 4k+6 - round N-1
	rev w9, w12 // CTR block 4k+10
	st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result
	orr x9, x11, x9, lsl #32 // CTR block 4k+10
	eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
	fmov v7.d[1], x24 // AES block 4k+7 - mov high
	ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
	st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result
	add w12, w12, #1 // CTR block 4k+10
	aese v3.16b, v31.16b // AES block 4k+7 - round N-1
	eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result
	fmov d2, x10 // CTR block 4k+10
	st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result
	fmov v2.d[1], x9 // CTR block 4k+10
	rev w9, w12 // CTR block 4k+11
	eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
	orr x9, x11, x9, lsl #32 // CTR block 4k+11
	eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result
	st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result
	// Flags are from the "LOOP CONTROL" compare of x0 against x5 above.
	b.lt Lenc_main_loop

	// Prepretail: hash the last four stored ciphertext blocks (v4-v7) and run
	// the AES rounds on the pending counter blocks without loading or storing
	// any data; the results are consumed by the tail.
Lenc_prepretail: // PREPRETAIL
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
	rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
	fmov d3, x10 // CTR block 4k+3
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
	fmov v3.d[1], x9 // CTR block 4k+3
	ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
	eor v4.16b, v4.16b, v11.16b // PRE 1
	rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
	mov d10, v17.d[1] // GHASH block 4k - mid
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
	pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
	mov d8, v4.d[1] // GHASH block 4k - mid
	pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
	eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
	pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
	pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
	pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
	eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
	mov d4, v5.d[1] // GHASH block 4k+1 - mid
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
	eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
	eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
	mov d8, v6.d[1] // GHASH block 4k+2 - mid
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
	rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
	pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
	eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
	add w12, w12, #1 // CTR block 4k+3
	pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
	pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
	eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
	ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
	eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
	mov d4, v7.d[1] // GHASH block 4k+3 - mid
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
	pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
	eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
	pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
	pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
	eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
	movi v8.8b, #0xc2 // first half of mod_constant; shifted left by 56 below
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
	eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
	shl d8, d8, #56 // mod_constant
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
	pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
	cmp x17, #12 // setup flags for AES-128/192/256 check
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
	eor v10.16b, v10.16b, v9.16b // karatsuba tidy up
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
	// Start of the reduction: same steps the main loop labels "MODULO".
	pmull v4.1q, v9.1d, v8.1d
	ext v9.16b, v9.16b, v9.16b, #8
	eor v10.16b, v10.16b, v11.16b
	b.lt Lenc_finish_prepretail // branch if AES-128

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
	b.eq Lenc_finish_prepretail // branch if AES-192

	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b // AES block 4k+6 - round 12

Lenc_finish_prepretail:
	// Finish the reduction into v11 and apply round N-1 to the four pending
	// keystream blocks.
	eor v10.16b, v10.16b, v4.16b
	eor v10.16b, v10.16b, v9.16b
	pmull v4.1q, v10.1d, v8.1d
	ext v10.16b, v10.16b, v10.16b, #8
	aese v1.16b, v31.16b // AES block 4k+5 - round N-1
	eor v11.16b, v11.16b, v4.16b
	aese v3.16b, v31.16b // AES block 4k+7 - round N-1
	aese v0.16b, v31.16b // AES block 4k+4 - round N-1
	aese v2.16b, v31.16b // AES block 4k+6 - round N-1
	eor v11.16b, v11.16b, v10.16b

	// Tail: handles the remaining 1..4 blocks using the keystream already held
	// in v0-v3. The running tag is carried in v8 and fed into the first block
	// that gets hashed; v8 is zeroed afterwards.
Lenc_tail: // TAIL
	ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
	sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
	ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext
	eor x6, x6, x13 // AES block 4k+4 - round N low
	eor x7, x7, x14 // AES block 4k+4 - round N high
	cmp x5, #48 // more than 3 blocks (48 bytes) left?
	fmov d4, x6 // AES block 4k+4 - mov low
	fmov v4.d[1], x7 // AES block 4k+4 - mov high
	eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result
	b.gt Lenc_blocks_more_than_3
	cmp x5, #32 // more than 2 blocks (32 bytes) left?
	mov v3.16b, v2.16b // shift keystream down: the final block always uses v3
	movi v11.8b, #0 // clear GHASH low accumulator (tag is carried in v8)
	movi v9.8b, #0 // clear GHASH high accumulator
	sub w12, w12, #1 // presumably undoes the counter increment of a keystream block that is not consumed - confirm against generator
	mov v2.16b, v1.16b // the final-1 block uses v2
	movi v10.8b, #0 // clear GHASH mid accumulator
	b.gt Lenc_blocks_more_than_2
	mov v3.16b, v1.16b // only two blocks may remain: final block uses the original v1
	sub w12, w12, #1
	cmp x5, #16 // more than 1 block (16 bytes) left?
	b.gt Lenc_blocks_more_than_1
	sub w12, w12, #1
	b Lenc_blocks_less_than_1
Lenc_blocks_more_than_3: // blocks left > 3
	st1 { v5.16b}, [x2], #16 // AES final-3 block - store result
	ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high
	rev64 v4.16b, v5.16b // GHASH final-3 block
	eor x6, x6, x13 // AES final-2 block - round N low
	eor v4.16b, v4.16b, v8.16b // feed in partial tag
	eor x7, x7, x14 // AES final-2 block - round N high
	mov d22, v4.d[1] // GHASH final-3 block - mid
	fmov d5, x6 // AES final-2 block - mov low
	fmov v5.d[1], x7 // AES final-2 block - mov high
	eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
	movi v8.8b, #0 // suppress further partial tag feed in
	mov d10, v17.d[1] // GHASH final-3 block - mid
	pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
	pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
	pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
	eor v5.16b, v5.16b, v1.16b // AES final-2 block - result
Lenc_blocks_more_than_2: // blocks left > 2
	st1 { v5.16b}, [x2], #16 // AES final-2 block - store result
	ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high
	rev64 v4.16b, v5.16b // GHASH final-2 block
	eor x6, x6, x13 // AES final-1 block - round N low
	eor v4.16b, v4.16b, v8.16b // feed in partial tag
	fmov d5, x6 // AES final-1 block - mov low
	eor x7, x7, x14 // AES final-1 block - round N high
	fmov v5.d[1], x7 // AES final-1 block - mov high
	movi v8.8b, #0 // suppress further partial tag feed in
	pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
	mov d22, v4.d[1] // GHASH final-2 block - mid
	pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
	eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
	eor v5.16b, v5.16b, v2.16b // AES final-1 block - result
	eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
	pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
	eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
	eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
Lenc_blocks_more_than_1: // blocks left > 1
	st1 { v5.16b}, [x2], #16 // AES final-1 block - store result
	rev64 v4.16b, v5.16b // GHASH final-1 block
	ldp x6, x7, [x0], #16 // AES final block - load input low & high
	eor v4.16b, v4.16b, v8.16b // feed in partial tag
	movi v8.8b, #0 // suppress further partial tag feed in
	eor x6, x6, x13 // AES final block - round N low
	mov d22, v4.d[1] // GHASH final-1 block - mid
	pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
	eor x7, x7, x14 // AES final block - round N high
	eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
	eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
	ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
	fmov d5, x6 // AES final block - mov low
	fmov v5.d[1], x7 // AES final block - mov high
	pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
	pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
	eor v5.16b, v5.16b, v3.16b // AES final block - result
	eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
	eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
Lenc_blocks_less_than_1: // blocks left <= 1
	// Build a byte mask in v0 for the possibly partial last block. x13 and x14
	// (the round N key) are no longer needed and are reused for the mask.
	and x1, x1, #127 // bit_length %= 128
	mvn x13, xzr // rkN_l = 0xffffffffffffffff
	sub x1, x1, #128 // bit_length -= 128
	neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
	ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored
	mvn x14, xzr // rkN_h = 0xffffffffffffffff
	and x1, x1, #127 // bit_length %= 128
	lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
	cmp x1, #64
	csel x6, x13, x14, lt // low mask half: all ones if fewer than 64 bits are dropped, else the shifted mask
	csel x7, x14, xzr, lt // high mask half: the shifted mask if fewer than 64 bits are dropped, else zero
	fmov d0, x6 // ctr0b is mask for last block
	fmov v0.d[1], x7
	and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
	rev64 v4.16b, v5.16b // GHASH final block
	eor v4.16b, v4.16b, v8.16b // feed in partial tag
	bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing
	pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
	mov d8, v4.d[1] // GHASH final block - mid
	rev w9, w12 // byte-reverse the running counter for the store to [x16, #12] below
	pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
	eor v9.16b, v9.16b, v20.16b // GHASH final block - high
	eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
	pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
	eor v11.16b, v11.16b, v21.16b // GHASH final block - low
	eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
	movi v8.8b, #0xc2 // first half of mod_constant; shifted left by 56 below
	eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
	shl d8, d8, #56 // mod_constant
	eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
	pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
	eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
	eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
	pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
	ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
	str w9, [x16, #12] // store the updated counter
	st1 { v5.16b}, [x2] // store all 16B
	eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
	eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
	// Convert the GHASH state back with the same ext + rev64 pair that was
	// applied after loading it, then write it to [x3].
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15 // return value: byte_len saved at entry
	st1 { v11.16b }, [x3]
	// Restore callee-saved registers and release the 128-byte frame.
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp d8, d9, [sp, #64]
	ldp d10, d11, [sp, #80]
	ldp d12, d13, [sp, #96]
	ldp d14, d15, [sp, #112]
	ldp x29, x30, [sp], #128
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl aes_gcm_dec_kernel

.def aes_gcm_dec_kernel
   .type 32
.endef
.align 4
aes_gcm_dec_kernel:
	AARCH64_SIGN_LINK_REGISTER
	stp x29, x30, [sp, #-128]!
791 mov x29, sp 792 stp x19, x20, [sp, #16] 793 mov x16, x4 794 mov x8, x5 795 stp x21, x22, [sp, #32] 796 stp x23, x24, [sp, #48] 797 stp d8, d9, [sp, #64] 798 stp d10, d11, [sp, #80] 799 stp d12, d13, [sp, #96] 800 stp d14, d15, [sp, #112] 801 ldr w17, [x8, #240] 802 add x19, x8, x17, lsl #4 // borrow input_l1 for last key 803 ldp x13, x14, [x19] // load round N keys 804 ldr q31, [x19, #-16] // load round N-1 keys 805 lsr x5, x1, #3 // byte_len 806 mov x15, x5 807 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 808 ldr q26, [x8, #128] // load rk8 809 sub x5, x5, #1 // byte_len - 1 810 ldr q25, [x8, #112] // load rk7 811 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 812 add x4, x0, x1, lsr #3 // end_input_ptr 813 ldr q24, [x8, #96] // load rk6 814 lsr x12, x11, #32 815 ldr q23, [x8, #80] // load rk5 816 orr w11, w11, w11 817 ldr q21, [x8, #48] // load rk3 818 add x5, x5, x0 819 rev w12, w12 // rev_ctr32 820 add w12, w12, #1 // increment rev_ctr32 821 fmov d3, x10 // CTR block 3 822 rev w9, w12 // CTR block 1 823 add w12, w12, #1 // CTR block 1 824 fmov d1, x10 // CTR block 1 825 orr x9, x11, x9, lsl #32 // CTR block 1 826 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible 827 fmov v1.d[1], x9 // CTR block 1 828 rev w9, w12 // CTR block 2 829 add w12, w12, #1 // CTR block 2 830 fmov d2, x10 // CTR block 2 831 orr x9, x11, x9, lsl #32 // CTR block 2 832 fmov v2.d[1], x9 // CTR block 2 833 rev w9, w12 // CTR block 3 834 orr x9, x11, x9, lsl #32 // CTR block 3 835 ldr q18, [x8, #0] // load rk0 836 fmov v3.d[1], x9 // CTR block 3 837 add w12, w12, #1 // CTR block 3 838 ldr q22, [x8, #64] // load rk4 839 ldr q19, [x8, #16] // load rk1 840 aese v0.16b, v18.16b 841 aesmc v0.16b, v0.16b // AES block 0 - round 0 842 ldr q14, [x6, #48] // load h3l | h3h 843 ext v14.16b, v14.16b, v14.16b, #8 844 aese v3.16b, v18.16b 845 aesmc v3.16b, 
v3.16b // AES block 3 - round 0 846 ldr q15, [x6, #80] // load h4l | h4h 847 ext v15.16b, v15.16b, v15.16b, #8 848 aese v1.16b, v18.16b 849 aesmc v1.16b, v1.16b // AES block 1 - round 0 850 ldr q13, [x6, #32] // load h2l | h2h 851 ext v13.16b, v13.16b, v13.16b, #8 852 aese v2.16b, v18.16b 853 aesmc v2.16b, v2.16b // AES block 2 - round 0 854 ldr q20, [x8, #32] // load rk2 855 aese v0.16b, v19.16b 856 aesmc v0.16b, v0.16b // AES block 0 - round 1 857 aese v1.16b, v19.16b 858 aesmc v1.16b, v1.16b // AES block 1 - round 1 859 ld1 { v11.16b}, [x3] 860 ext v11.16b, v11.16b, v11.16b, #8 861 rev64 v11.16b, v11.16b 862 aese v2.16b, v19.16b 863 aesmc v2.16b, v2.16b // AES block 2 - round 1 864 ldr q27, [x8, #144] // load rk9 865 aese v3.16b, v19.16b 866 aesmc v3.16b, v3.16b // AES block 3 - round 1 867 ldr q30, [x8, #192] // load rk12 868 aese v0.16b, v20.16b 869 aesmc v0.16b, v0.16b // AES block 0 - round 2 870 ldr q12, [x6] // load h1l | h1h 871 ext v12.16b, v12.16b, v12.16b, #8 872 aese v2.16b, v20.16b 873 aesmc v2.16b, v2.16b // AES block 2 - round 2 874 ldr q28, [x8, #160] // load rk10 875 aese v3.16b, v20.16b 876 aesmc v3.16b, v3.16b // AES block 3 - round 2 877 aese v0.16b, v21.16b 878 aesmc v0.16b, v0.16b // AES block 0 - round 3 879 aese v1.16b, v20.16b 880 aesmc v1.16b, v1.16b // AES block 1 - round 2 881 aese v3.16b, v21.16b 882 aesmc v3.16b, v3.16b // AES block 3 - round 3 883 aese v0.16b, v22.16b 884 aesmc v0.16b, v0.16b // AES block 0 - round 4 885 aese v2.16b, v21.16b 886 aesmc v2.16b, v2.16b // AES block 2 - round 3 887 aese v1.16b, v21.16b 888 aesmc v1.16b, v1.16b // AES block 1 - round 3 889 aese v3.16b, v22.16b 890 aesmc v3.16b, v3.16b // AES block 3 - round 4 891 aese v2.16b, v22.16b 892 aesmc v2.16b, v2.16b // AES block 2 - round 4 893 aese v1.16b, v22.16b 894 aesmc v1.16b, v1.16b // AES block 1 - round 4 895 aese v3.16b, v23.16b 896 aesmc v3.16b, v3.16b // AES block 3 - round 5 897 aese v0.16b, v23.16b 898 aesmc v0.16b, v0.16b // AES block 0 - round 5 
899 aese v1.16b, v23.16b 900 aesmc v1.16b, v1.16b // AES block 1 - round 5 901 aese v2.16b, v23.16b 902 aesmc v2.16b, v2.16b // AES block 2 - round 5 903 aese v0.16b, v24.16b 904 aesmc v0.16b, v0.16b // AES block 0 - round 6 905 aese v3.16b, v24.16b 906 aesmc v3.16b, v3.16b // AES block 3 - round 6 907 cmp x17, #12 // setup flags for AES-128/192/256 check 908 aese v1.16b, v24.16b 909 aesmc v1.16b, v1.16b // AES block 1 - round 6 910 aese v2.16b, v24.16b 911 aesmc v2.16b, v2.16b // AES block 2 - round 6 912 aese v0.16b, v25.16b 913 aesmc v0.16b, v0.16b // AES block 0 - round 7 914 aese v1.16b, v25.16b 915 aesmc v1.16b, v1.16b // AES block 1 - round 7 916 aese v3.16b, v25.16b 917 aesmc v3.16b, v3.16b // AES block 3 - round 7 918 aese v0.16b, v26.16b 919 aesmc v0.16b, v0.16b // AES block 0 - round 8 920 aese v2.16b, v25.16b 921 aesmc v2.16b, v2.16b // AES block 2 - round 7 922 aese v3.16b, v26.16b 923 aesmc v3.16b, v3.16b // AES block 3 - round 8 924 aese v1.16b, v26.16b 925 aesmc v1.16b, v1.16b // AES block 1 - round 8 926 ldr q29, [x8, #176] // load rk11 927 aese v2.16b, v26.16b 928 aesmc v2.16b, v2.16b // AES block 2 - round 8 929 b.lt Ldec_finish_first_blocks // branch if AES-128 930 931 aese v0.16b, v27.16b 932 aesmc v0.16b, v0.16b // AES block 0 - round 9 933 aese v1.16b, v27.16b 934 aesmc v1.16b, v1.16b // AES block 1 - round 9 935 aese v3.16b, v27.16b 936 aesmc v3.16b, v3.16b // AES block 3 - round 9 937 aese v2.16b, v27.16b 938 aesmc v2.16b, v2.16b // AES block 2 - round 9 939 aese v0.16b, v28.16b 940 aesmc v0.16b, v0.16b // AES block 0 - round 10 941 aese v1.16b, v28.16b 942 aesmc v1.16b, v1.16b // AES block 1 - round 10 943 aese v3.16b, v28.16b 944 aesmc v3.16b, v3.16b // AES block 3 - round 10 945 aese v2.16b, v28.16b 946 aesmc v2.16b, v2.16b // AES block 2 - round 10 947 b.eq Ldec_finish_first_blocks // branch if AES-192 948 949 aese v0.16b, v29.16b 950 aesmc v0.16b, v0.16b // AES block 0 - round 11 951 aese v3.16b, v29.16b 952 aesmc v3.16b, v3.16b // AES 
block 3 - round 11 953 aese v1.16b, v29.16b 954 aesmc v1.16b, v1.16b // AES block 1 - round 11 955 aese v2.16b, v29.16b 956 aesmc v2.16b, v2.16b // AES block 2 - round 11 957 aese v1.16b, v30.16b 958 aesmc v1.16b, v1.16b // AES block 1 - round 12 959 aese v0.16b, v30.16b 960 aesmc v0.16b, v0.16b // AES block 0 - round 12 961 aese v2.16b, v30.16b 962 aesmc v2.16b, v2.16b // AES block 2 - round 12 963 aese v3.16b, v30.16b 964 aesmc v3.16b, v3.16b // AES block 3 - round 12 965 966Ldec_finish_first_blocks: 967 cmp x0, x5 // check if we have <= 4 blocks 968 trn1 v9.2d, v14.2d, v15.2d // h4h | h3h 969 trn2 v17.2d, v14.2d, v15.2d // h4l | h3l 970 trn1 v8.2d, v12.2d, v13.2d // h2h | h1h 971 trn2 v16.2d, v12.2d, v13.2d // h2l | h1l 972 eor v17.16b, v17.16b, v9.16b // h4k | h3k 973 aese v1.16b, v31.16b // AES block 1 - round N-1 974 aese v2.16b, v31.16b // AES block 2 - round N-1 975 eor v16.16b, v16.16b, v8.16b // h2k | h1k 976 aese v3.16b, v31.16b // AES block 3 - round N-1 977 aese v0.16b, v31.16b // AES block 0 - round N-1 978 b.ge Ldec_tail // handle tail 979 980 ldr q4, [x0, #0] // AES block 0 - load ciphertext 981 ldr q5, [x0, #16] // AES block 1 - load ciphertext 982 rev w9, w12 // CTR block 4 983 eor v0.16b, v4.16b, v0.16b // AES block 0 - result 984 eor v1.16b, v5.16b, v1.16b // AES block 1 - result 985 rev64 v5.16b, v5.16b // GHASH block 1 986 ldr q7, [x0, #48] // AES block 3 - load ciphertext 987 mov x7, v0.d[1] // AES block 0 - mov high 988 mov x6, v0.d[0] // AES block 0 - mov low 989 rev64 v4.16b, v4.16b // GHASH block 0 990 add w12, w12, #1 // CTR block 4 991 fmov d0, x10 // CTR block 4 992 orr x9, x11, x9, lsl #32 // CTR block 4 993 fmov v0.d[1], x9 // CTR block 4 994 rev w9, w12 // CTR block 5 995 add w12, w12, #1 // CTR block 5 996 mov x19, v1.d[0] // AES block 1 - mov low 997 orr x9, x11, x9, lsl #32 // CTR block 5 998 mov x20, v1.d[1] // AES block 1 - mov high 999 eor x7, x7, x14 // AES block 0 - round N high 1000 eor x6, x6, x13 // AES block 0 - round N 
low 1001 stp x6, x7, [x2], #16 // AES block 0 - store result 1002 fmov d1, x10 // CTR block 5 1003 ldr q6, [x0, #32] // AES block 2 - load ciphertext 1004 add x0, x0, #64 // AES input_ptr update 1005 fmov v1.d[1], x9 // CTR block 5 1006 rev w9, w12 // CTR block 6 1007 add w12, w12, #1 // CTR block 6 1008 eor x19, x19, x13 // AES block 1 - round N low 1009 orr x9, x11, x9, lsl #32 // CTR block 6 1010 eor x20, x20, x14 // AES block 1 - round N high 1011 stp x19, x20, [x2], #16 // AES block 1 - store result 1012 eor v2.16b, v6.16b, v2.16b // AES block 2 - result 1013 cmp x0, x5 // check if we have <= 8 blocks 1014 b.ge Ldec_prepretail // do prepretail 1015 1016Ldec_main_loop: // main loop start 1017 mov x21, v2.d[0] // AES block 4k+2 - mov low 1018 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 1019 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result 1020 aese v0.16b, v18.16b 1021 aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 1022 mov x22, v2.d[1] // AES block 4k+2 - mov high 1023 aese v1.16b, v18.16b 1024 aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 1025 fmov d2, x10 // CTR block 4k+6 1026 fmov v2.d[1], x9 // CTR block 4k+6 1027 eor v4.16b, v4.16b, v11.16b // PRE 1 1028 rev w9, w12 // CTR block 4k+7 1029 aese v0.16b, v19.16b 1030 aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 1031 mov x24, v3.d[1] // AES block 4k+3 - mov high 1032 aese v1.16b, v19.16b 1033 aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 1034 mov x23, v3.d[0] // AES block 4k+3 - mov low 1035 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high 1036 mov d8, v4.d[1] // GHASH block 4k - mid 1037 fmov d3, x10 // CTR block 4k+7 1038 aese v0.16b, v20.16b 1039 aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 1040 orr x9, x11, x9, lsl #32 // CTR block 4k+7 1041 aese v2.16b, v18.16b 1042 aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 1043 fmov v3.d[1], x9 // CTR block 4k+7 1044 aese v1.16b, v20.16b 1045 aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 1046 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - 
mid 1047 aese v0.16b, v21.16b 1048 aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 1049 eor x22, x22, x14 // AES block 4k+2 - round N high 1050 aese v2.16b, v19.16b 1051 aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 1052 mov d10, v17.d[1] // GHASH block 4k - mid 1053 aese v1.16b, v21.16b 1054 aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 1055 rev64 v6.16b, v6.16b // GHASH block 4k+2 1056 aese v3.16b, v18.16b 1057 aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 1058 eor x21, x21, x13 // AES block 4k+2 - round N low 1059 aese v2.16b, v20.16b 1060 aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 1061 stp x21, x22, [x2], #16 // AES block 4k+2 - store result 1062 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low 1063 pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high 1064 aese v2.16b, v21.16b 1065 aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 1066 rev64 v7.16b, v7.16b // GHASH block 4k+3 1067 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid 1068 eor x23, x23, x13 // AES block 4k+3 - round N low 1069 pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low 1070 eor x24, x24, x14 // AES block 4k+3 - round N high 1071 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high 1072 aese v2.16b, v22.16b 1073 aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 1074 aese v3.16b, v19.16b 1075 aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 1076 mov d4, v5.d[1] // GHASH block 4k+1 - mid 1077 aese v0.16b, v22.16b 1078 aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 1079 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low 1080 aese v2.16b, v23.16b 1081 aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 1082 add w12, w12, #1 // CTR block 4k+7 1083 aese v3.16b, v20.16b 1084 aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 1085 mov d8, v6.d[1] // GHASH block 4k+2 - mid 1086 aese v1.16b, v22.16b 1087 aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 1088 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid 1089 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low 1090 aese 
v3.16b, v21.16b 1091 aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 1092 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid 1093 aese v1.16b, v23.16b 1094 aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 1095 aese v0.16b, v23.16b 1096 aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 1097 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low 1098 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid 1099 rev w9, w12 // CTR block 4k+8 1100 aese v1.16b, v24.16b 1101 aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 1102 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid 1103 aese v0.16b, v24.16b 1104 aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 1105 add w12, w12, #1 // CTR block 4k+8 1106 aese v3.16b, v22.16b 1107 aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 1108 aese v1.16b, v25.16b 1109 aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 1110 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid 1111 aese v0.16b, v25.16b 1112 aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 1113 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high 1114 mov d6, v7.d[1] // GHASH block 4k+3 - mid 1115 aese v3.16b, v23.16b 1116 aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 1117 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid 1118 aese v0.16b, v26.16b 1119 aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 1120 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high 1121 aese v3.16b, v24.16b 1122 aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 1123 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low 1124 orr x9, x11, x9, lsl #32 // CTR block 4k+8 1125 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid 1126 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high 1127 cmp x17, #12 // setup flags for AES-128/192/256 check 1128 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid 1129 aese v1.16b, v26.16b 1130 aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 1131 aese v2.16b, v24.16b 1132 aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 1133 eor v9.16b, v9.16b, v5.16b // GHASH 
block 4k+3 - high 1134 pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid 1135 movi v8.8b, #0xc2 1136 aese v2.16b, v25.16b 1137 aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 1138 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low 1139 aese v3.16b, v25.16b 1140 aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 1141 shl d8, d8, #56 // mod_constant 1142 aese v2.16b, v26.16b 1143 aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 1144 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid 1145 aese v3.16b, v26.16b 1146 aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 1147 b.lt Ldec_main_loop_continue // branch if AES-128 1148 1149 aese v0.16b, v27.16b 1150 aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 1151 aese v2.16b, v27.16b 1152 aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 1153 aese v1.16b, v27.16b 1154 aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 1155 aese v3.16b, v27.16b 1156 aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 1157 aese v0.16b, v28.16b 1158 aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 1159 aese v1.16b, v28.16b 1160 aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 1161 aese v2.16b, v28.16b 1162 aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 1163 aese v3.16b, v28.16b 1164 aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 1165 b.eq Ldec_main_loop_continue // branch if AES-192 1166 1167 aese v0.16b, v29.16b 1168 aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 1169 aese v1.16b, v29.16b 1170 aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 1171 aese v2.16b, v29.16b 1172 aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 1173 aese v3.16b, v29.16b 1174 aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 1175 aese v0.16b, v30.16b 1176 aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 1177 aese v1.16b, v30.16b 1178 aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 1179 aese v2.16b, v30.16b 1180 aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 1181 aese v3.16b, v30.16b 1182 aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 1183 
1184Ldec_main_loop_continue: 1185 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1186 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1187 ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext 1188 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 1189 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1190 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1191 ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext 1192 eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result 1193 stp x23, x24, [x2], #16 // AES block 4k+3 - store result 1194 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1195 ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext 1196 ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext 1197 mov x7, v0.d[1] // AES block 4k+4 - mov high 1198 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1199 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 1200 add x0, x0, #64 // AES input_ptr update 1201 mov x6, v0.d[0] // AES block 4k+4 - mov low 1202 fmov d0, x10 // CTR block 4k+8 1203 fmov v0.d[1], x9 // CTR block 4k+8 1204 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1205 eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result 1206 rev w9, w12 // CTR block 4k+9 1207 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 1208 orr x9, x11, x9, lsl #32 // CTR block 4k+9 1209 cmp x0, x5 // LOOP CONTROL 1210 add w12, w12, #1 // CTR block 4k+9 1211 eor x6, x6, x13 // AES block 4k+4 - round N low 1212 eor x7, x7, x14 // AES block 4k+4 - round N high 1213 mov x20, v1.d[1] // AES block 4k+5 - mov high 1214 eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result 1215 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1216 mov x19, v1.d[0] // AES block 4k+5 - mov low 1217 fmov d1, x10 // CTR block 4k+9 1218 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1219 fmov v1.d[1], x9 // CTR block 4k+9 1220 rev w9, w12 // CTR block 4k+10 1221 add w12, w12, #1 // CTR block 4k+10 1222 aese 
v3.16b, v31.16b // AES block 4k+7 - round N-1 1223 orr x9, x11, x9, lsl #32 // CTR block 4k+10 1224 rev64 v5.16b, v5.16b // GHASH block 4k+5 1225 eor x20, x20, x14 // AES block 4k+5 - round N high 1226 stp x6, x7, [x2], #16 // AES block 4k+4 - store result 1227 eor x19, x19, x13 // AES block 4k+5 - round N low 1228 stp x19, x20, [x2], #16 // AES block 4k+5 - store result 1229 rev64 v4.16b, v4.16b // GHASH block 4k+4 1230 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1231 b.lt Ldec_main_loop 1232 1233Ldec_prepretail: // PREPRETAIL 1234 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 1235 mov x21, v2.d[0] // AES block 4k+2 - mov low 1236 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result 1237 aese v0.16b, v18.16b 1238 aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 1239 mov x22, v2.d[1] // AES block 4k+2 - mov high 1240 aese v1.16b, v18.16b 1241 aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 1242 fmov d2, x10 // CTR block 4k+6 1243 fmov v2.d[1], x9 // CTR block 4k+6 1244 rev w9, w12 // CTR block 4k+7 1245 eor v4.16b, v4.16b, v11.16b // PRE 1 1246 rev64 v6.16b, v6.16b // GHASH block 4k+2 1247 orr x9, x11, x9, lsl #32 // CTR block 4k+7 1248 mov x23, v3.d[0] // AES block 4k+3 - mov low 1249 aese v1.16b, v19.16b 1250 aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 1251 mov x24, v3.d[1] // AES block 4k+3 - mov high 1252 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low 1253 mov d8, v4.d[1] // GHASH block 4k - mid 1254 fmov d3, x10 // CTR block 4k+7 1255 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high 1256 fmov v3.d[1], x9 // CTR block 4k+7 1257 aese v2.16b, v18.16b 1258 aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 1259 mov d10, v17.d[1] // GHASH block 4k - mid 1260 aese v0.16b, v19.16b 1261 aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 1262 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid 1263 pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high 1264 aese v2.16b, v19.16b 1265 aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 1266 rev64 v7.16b, 
v7.16b // GHASH block 4k+3 1267 aese v3.16b, v18.16b 1268 aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 1269 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid 1270 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high 1271 pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low 1272 aese v3.16b, v19.16b 1273 aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 1274 mov d4, v5.d[1] // GHASH block 4k+1 - mid 1275 aese v0.16b, v20.16b 1276 aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 1277 aese v1.16b, v20.16b 1278 aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 1279 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low 1280 aese v2.16b, v20.16b 1281 aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 1282 aese v0.16b, v21.16b 1283 aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 1284 mov d8, v6.d[1] // GHASH block 4k+2 - mid 1285 aese v3.16b, v20.16b 1286 aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 1287 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid 1288 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low 1289 aese v0.16b, v22.16b 1290 aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 1291 aese v3.16b, v21.16b 1292 aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 1293 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid 1294 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid 1295 aese v0.16b, v23.16b 1296 aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 1297 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low 1298 aese v3.16b, v22.16b 1299 aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 1300 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high 1301 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid 1302 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high 1303 aese v3.16b, v23.16b 1304 aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 1305 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid 1306 aese v2.16b, v21.16b 1307 aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 1308 aese v1.16b, v21.16b 1309 aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 1310 
eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high 1311 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low 1312 aese v2.16b, v22.16b 1313 aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 1314 mov d6, v7.d[1] // GHASH block 4k+3 - mid 1315 aese v1.16b, v22.16b 1316 aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 1317 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid 1318 aese v2.16b, v23.16b 1319 aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 1320 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid 1321 aese v1.16b, v23.16b 1322 aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 1323 aese v3.16b, v24.16b 1324 aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 1325 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid 1326 aese v2.16b, v24.16b 1327 aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 1328 aese v0.16b, v24.16b 1329 aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 1330 movi v8.8b, #0xc2 1331 aese v1.16b, v24.16b 1332 aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 1333 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low 1334 pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid 1335 aese v3.16b, v25.16b 1336 aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 1337 cmp x17, #12 // setup flags for AES-128/192/256 check 1338 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high 1339 aese v1.16b, v25.16b 1340 aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 1341 aese v0.16b, v25.16b 1342 aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 1343 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid 1344 aese v3.16b, v26.16b 1345 aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 1346 aese v2.16b, v25.16b 1347 aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 1348 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1349 aese v1.16b, v26.16b 1350 aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 1351 aese v0.16b, v26.16b 1352 aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 1353 shl d8, d8, #56 // mod_constant 1354 aese v2.16b, v26.16b 1355 aesmc v2.16b, 
v2.16b // AES block 4k+6 - round 8 1356 b.lt Ldec_finish_prepretail // branch if AES-128 1357 1358 aese v1.16b, v27.16b 1359 aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 1360 aese v2.16b, v27.16b 1361 aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 1362 aese v3.16b, v27.16b 1363 aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 1364 aese v0.16b, v27.16b 1365 aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 1366 aese v2.16b, v28.16b 1367 aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 1368 aese v3.16b, v28.16b 1369 aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 1370 aese v0.16b, v28.16b 1371 aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 1372 aese v1.16b, v28.16b 1373 aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 1374 b.eq Ldec_finish_prepretail // branch if AES-192 1375 1376 aese v2.16b, v29.16b 1377 aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 1378 aese v0.16b, v29.16b 1379 aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 1380 aese v1.16b, v29.16b 1381 aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 1382 aese v2.16b, v30.16b 1383 aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 1384 aese v3.16b, v29.16b 1385 aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 1386 aese v1.16b, v30.16b 1387 aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 1388 aese v0.16b, v30.16b 1389 aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 1390 aese v3.16b, v30.16b 1391 aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 1392 1393Ldec_finish_prepretail: 1394 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1395 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1396 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1397 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1398 eor x22, x22, x14 // AES block 4k+2 - round N high 1399 eor x23, x23, x13 // AES block 4k+3 - round N low 1400 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1401 add w12, w12, #1 // CTR block 4k+7 1402 eor x21, x21, x13 // AES block 4k+2 - round N 
low 1403 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1404 eor x24, x24, x14 // AES block 4k+3 - round N high 1405 stp x21, x22, [x2], #16 // AES block 4k+2 - store result 1406 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1407 stp x23, x24, [x2], #16 // AES block 4k+3 - store result 1408 1409 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1410 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 1411 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 1412 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 1413 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 1414 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1415 1416Ldec_tail: // TAIL 1417 sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process 1418 ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext 1419 eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result 1420 mov x6, v0.d[0] // AES block 4k+4 - mov low 1421 mov x7, v0.d[1] // AES block 4k+4 - mov high 1422 ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag 1423 cmp x5, #48 1424 eor x6, x6, x13 // AES block 4k+4 - round N low 1425 eor x7, x7, x14 // AES block 4k+4 - round N high 1426 b.gt Ldec_blocks_more_than_3 1427 sub w12, w12, #1 1428 mov v3.16b, v2.16b 1429 movi v10.8b, #0 1430 movi v11.8b, #0 1431 cmp x5, #32 1432 movi v9.8b, #0 1433 mov v2.16b, v1.16b 1434 b.gt Ldec_blocks_more_than_2 1435 sub w12, w12, #1 1436 mov v3.16b, v1.16b 1437 cmp x5, #16 1438 b.gt Ldec_blocks_more_than_1 1439 sub w12, w12, #1 1440 b Ldec_blocks_less_than_1 1441Ldec_blocks_more_than_3: // blocks left > 3 1442 rev64 v4.16b, v5.16b // GHASH final-3 block 1443 ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext 1444 stp x6, x7, [x2], #16 // AES final-3 block - store result 1445 mov d10, v17.d[1] // GHASH final-3 block - mid 1446 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1447 eor v0.16b, v5.16b, v1.16b // AES final-2 block - result 1448 mov d22, v4.d[1] // GHASH final-3 
block - mid 1449 mov x6, v0.d[0] // AES final-2 block - mov low 1450 mov x7, v0.d[1] // AES final-2 block - mov high 1451 eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid 1452 movi v8.8b, #0 // suppress further partial tag feed in 1453 pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high 1454 pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid 1455 eor x6, x6, x13 // AES final-2 block - round N low 1456 pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low 1457 eor x7, x7, x14 // AES final-2 block - round N high 1458Ldec_blocks_more_than_2: // blocks left > 2 1459 rev64 v4.16b, v5.16b // GHASH final-2 block 1460 ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext 1461 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1462 stp x6, x7, [x2], #16 // AES final-2 block - store result 1463 eor v0.16b, v5.16b, v2.16b // AES final-1 block - result 1464 mov d22, v4.d[1] // GHASH final-2 block - mid 1465 pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low 1466 pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high 1467 eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid 1468 mov x6, v0.d[0] // AES final-1 block - mov low 1469 mov x7, v0.d[1] // AES final-1 block - mov high 1470 eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low 1471 movi v8.8b, #0 // suppress further partial tag feed in 1472 pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid 1473 eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high 1474 eor x6, x6, x13 // AES final-1 block - round N low 1475 eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid 1476 eor x7, x7, x14 // AES final-1 block - round N high 1477Ldec_blocks_more_than_1: // blocks left > 1 1478 stp x6, x7, [x2], #16 // AES final-1 block - store result 1479 rev64 v4.16b, v5.16b // GHASH final-1 block 1480 ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext 1481 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1482 movi v8.8b, #0 // suppress further partial tag feed in 1483 
mov d22, v4.d[1] // GHASH final-1 block - mid 1484 eor v0.16b, v5.16b, v3.16b // AES final block - result 1485 pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high 1486 eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid 1487 pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low 1488 mov x6, v0.d[0] // AES final block - mov low 1489 ins v22.d[1], v22.d[0] // GHASH final-1 block - mid 1490 mov x7, v0.d[1] // AES final block - mov high 1491 pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid 1492 eor x6, x6, x13 // AES final block - round N low 1493 eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low 1494 eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high 1495 eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid 1496 eor x7, x7, x14 // AES final block - round N high 1497Ldec_blocks_less_than_1: // blocks left <= 1 1498 and x1, x1, #127 // bit_length %= 128 1499 mvn x14, xzr // rkN_h = 0xffffffffffffffff 1500 sub x1, x1, #128 // bit_length -= 128 1501 mvn x13, xzr // rkN_l = 0xffffffffffffffff 1502 ldp x4, x5, [x2] // load existing bytes we need to not overwrite 1503 neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) 1504 and x1, x1, #127 // bit_length %= 128 1505 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block 1506 cmp x1, #64 1507 csel x9, x13, x14, lt 1508 csel x10, x14, xzr, lt 1509 fmov d0, x9 // ctr0b is mask for last block 1510 and x6, x6, x9 1511 mov v0.d[1], x10 1512 bic x4, x4, x9 // mask out low existing bytes 1513 rev w9, w12 1514 bic x5, x5, x10 // mask out high existing bytes 1515 orr x6, x6, x4 1516 and x7, x7, x10 1517 orr x7, x7, x5 1518 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits 1519 rev64 v4.16b, v5.16b // GHASH final block 1520 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1521 pmull v21.1q, v4.1d, v12.1d // GHASH final block - low 1522 mov d8, v4.d[1] // GHASH final block - mid 1523 eor v8.8b, v8.8b, v4.8b // GHASH final block - mid 
1524 pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high 1525 pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid 1526 eor v9.16b, v9.16b, v20.16b // GHASH final block - high 1527 eor v11.16b, v11.16b, v21.16b // GHASH final block - low 1528 eor v10.16b, v10.16b, v8.16b // GHASH final block - mid 1529 movi v8.8b, #0xc2 1530 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1531 shl d8, d8, #56 // mod_constant 1532 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1533 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1534 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1535 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1536 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1537 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1538 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1539 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1540 stp x6, x7, [x2] 1541 str w9, [x16, #12] // store the updated counter 1542 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1543 ext v11.16b, v11.16b, v11.16b, #8 1544 rev64 v11.16b, v11.16b 1545 mov x0, x15 1546 st1 { v11.16b }, [x3] 1547 ldp x19, x20, [sp, #16] 1548 ldp x21, x22, [sp, #32] 1549 ldp x23, x24, [sp, #48] 1550 ldp d8, d9, [sp, #64] 1551 ldp d10, d11, [sp, #80] 1552 ldp d12, d13, [sp, #96] 1553 ldp d14, d15, [sp, #112] 1554 ldp x29, x30, [sp], #128 1555 AARCH64_VALIDATE_LINK_REGISTER 1556 ret 1557 1558#endif 1559#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) 1560