// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__ >= 8

.arch	armv8-a+crypto
.text
//-----------------------------------------------------------------------
// aes_gcm_enc_kernel
//
// AES-CTR encryption fused with GHASH, four 16-byte blocks at a time.
//
// Register roles on entry, as used by the code below:
//   x0  input pointer (plaintext is loaded from here)
//   x1  input length in BITS (byte_len = x1 >> 3)
//   x2  output pointer (ciphertext is stored here)
//   x3  16-byte GHASH accumulator; loaded at entry, written back at exit
//   x4  16-byte counter block; the 32-bit word at offset 12 is rewritten
//       at exit
//   x5  AES key schedule; round keys at offsets 0, 16, ...; a 32-bit
//       count is read from offset 240 and compared against 12 to select
//       the AES-128 / AES-192 / AES-256 paths
//   x6  table of hash-key powers; 16-byte entries are read at offsets
//       0, 32, 48 and 80 (H1, H2, H3, H4 per the load comments)
//
// Returns: x0 = byte_len.
//
// Long-lived registers:
//   x8  key schedule        x16 counter block pointer
//   x4  end_input_ptr       x5  input pointer at which 4-block bulk
//                               processing stops
//   x10 counter bytes 0..7  x11 counter bytes 8..11 (upper half cleared)
//   w12 32-bit block counter in host byte order
//   x13/x14 last round key (low/high), kept in GPRs so it can be XORed
//           into the plaintext before it is moved to a vector register
//   v18-v30 round keys rk0..rk12, v31 round N-1 key
//   v12-v15 H1..H4, v16/v17 Karatsuba "mid" keys
//   v9/v10/v11 GHASH high/mid/low accumulators
//
// Saves and restores x19-x24, d8-d15, x29 and x30 in a 128-byte frame.
// NOTE(review): the loads at .Lenc_tail read at least one block and the
// main-loop bound is computed from byte_len - 1, so the code appears to
// expect byte_len >= 1 - confirm against the caller.
//-----------------------------------------------------------------------
.globl	aes_gcm_enc_kernel
.hidden	aes_gcm_enc_kernel
.type	aes_gcm_enc_kernel,%function
.align	4
aes_gcm_enc_kernel:
	AARCH64_SIGN_LINK_REGISTER
	// Prologue: build the 128-byte frame and spill the callee-saved
	// registers this routine overwrites.
	stp	x29, x30, [sp, #-128]!
	mov	x29, sp
	stp	x19, x20, [sp, #16]
	mov	x16, x4	// x16 = counter block pointer (x4 is reused below)
	mov	x8, x5	// x8 = key schedule pointer (x5 is reused below)
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]
	stp	d8, d9, [sp, #64]
	stp	d10, d11, [sp, #80]
	stp	d12, d13, [sp, #96]
	stp	d14, d15, [sp, #112]
	ldr	w17, [x8, #240]	// w17 = count stored at key + 240 (compared with 12 below)
	add	x19, x8, x17, lsl #4	// borrow input_l1 for last key
	ldp	x13, x14, [x19]	// load round N keys
	ldr	q31, [x19, #-16]	// load round N-1 keys
	add	x4, x0, x1, lsr #3	// end_input_ptr
	lsr	x5, x1, #3	// byte_len
	mov	x15, x5	// keep byte_len; it becomes the return value
	ldp	x10, x11, [x16]	// ctr96_b64, ctr96_t32
	ld1	{ v0.16b}, [x16]	// special case vector load initial counter so we can start first AES block as quickly as possible
	sub	x5, x5, #1	// byte_len - 1
	ldr	q18, [x8, #0]	// load rk0
	and	x5, x5, #0xffffffffffffffc0	// number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ldr	q25, [x8, #112]	// load rk7
	add	x5, x5, x0	// x5 = input pointer at which bulk processing stops
	lsr	x12, x11, #32	// x12 = counter bytes 12..15 (still in memory byte order)
	fmov	d2, x10	// CTR block 2
	orr	w11, w11, w11	// 32-bit write clears the upper half of x11, leaving counter bytes 8..11
	rev	w12, w12	// rev_ctr32
	fmov	d1, x10	// CTR block 1
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 0
	add	w12, w12, #1	// increment rev_ctr32
	rev	w9, w12	// CTR block 1
	fmov	d3, x10	// CTR block 3
	orr	x9, x11, x9, lsl #32	// CTR block 1
	add	w12, w12, #1	// CTR block 1
	ldr	q19, [x8, #16]	// load rk1
	fmov	v1.d[1], x9	// CTR block 1
	rev	w9, w12	// CTR block 2
	add	w12, w12, #1	// CTR block 2
	orr	x9, x11, x9, lsl #32	// CTR block 2
	ldr	q20, [x8, #32]	// load rk2
	fmov	v2.d[1], x9	// CTR block 2
	rev	w9, w12	// CTR block 3
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 1
	orr	x9, x11, x9, lsl #32	// CTR block 3
	fmov	v3.d[1], x9	// CTR block 3
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 0
	ldr	q21, [x8, #48]	// load rk3
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 2
	ldr	q24, [x8, #96]	// load rk6
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 0
	ldr	q23, [x8, #80]	// load rk5
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 1
	ldr	q14, [x6, #48]	// load h3l | h3h
	ext	v14.16b, v14.16b, v14.16b, #8	// swap the two 64-bit halves of H3
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 0
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 1
	ldr	q22, [x8, #64]	// load rk4
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 2
	ldr	q13, [x6, #32]	// load h2l | h2h
	ext	v13.16b, v13.16b, v13.16b, #8	// swap the two 64-bit halves of H2
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 1
	ldr	q30, [x8, #192]	// load rk12
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 2
	ldr	q15, [x6, #80]	// load h4l | h4h
	ext	v15.16b, v15.16b, v15.16b, #8	// swap the two 64-bit halves of H4
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 3
	ldr	q29, [x8, #176]	// load rk11
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 2
	ldr	q26, [x8, #128]	// load rk8
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 3
	add	w12, w12, #1	// CTR block 3
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 3
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 3
	// Load the running GHASH value; ext #8 followed by rev64 reverses
	// all 16 bytes (the inverse pair is applied before the final store).
	ld1	{ v11.16b}, [x3]
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 4
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 4
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 4
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 4
	// These flags are consumed by the b.lt / b.eq further down; none of
	// the instructions in between write the condition flags.
	cmp	x17, #12	// setup flags for AES-128/192/256 check
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 5
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 5
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 5
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 6
	trn2	v17.2d, v14.2d, v15.2d	// h4l | h3l
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 6
	ldr	q27, [x8, #144]	// load rk9
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 6
	ldr	q12, [x6]	// load h1l | h1h
	ext	v12.16b, v12.16b, v12.16b, #8	// swap the two 64-bit halves of H1
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 6
	ldr	q28, [x8, #160]	// load rk10
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 7
	trn1	v9.2d, v14.2d, v15.2d	// h4h | h3h
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 7
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 7
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 7
	trn2	v16.2d, v12.2d, v13.2d	// h2l | h1l
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 8
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 8
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 8
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 8
	b.lt	.Lenc_finish_first_blocks	// branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 9
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 10
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 10
	b.eq	.Lenc_finish_first_blocks	// branch if AES-192

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 11
	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	// AES block 1 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	// AES block 2 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	// AES block 0 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	// AES block 3 - round 12

// Common join point for all key sizes: apply the round N-1 key (aese
// without aesmc) to the four counter blocks and finish building the
// Karatsuba mid keys v16/v17. The round N key is XORed in later via
// x13/x14.
.Lenc_finish_first_blocks:
	cmp	x0, x5	// check if we have <= 4 blocks
	eor	v17.16b, v17.16b, v9.16b	// h4k | h3k
	aese	v2.16b, v31.16b	// AES block 2 - round N-1
	trn1	v8.2d, v12.2d, v13.2d	// h2h | h1h
	aese	v1.16b, v31.16b	// AES block 1 - round N-1
	aese	v0.16b, v31.16b	// AES block 0 - round N-1
	aese	v3.16b, v31.16b	// AES block 3 - round N-1
	eor	v16.16b, v16.16b, v8.16b	// h2k | h1k
	b.ge	.Lenc_tail	// handle tail

	// First four blocks: XOR the plaintext with the round N key in GPRs,
	// move it to v4-v7, XOR with the keystream and store. The next four
	// counter blocks are assembled into v0-v2 (and x9 for block 7)
	// alongside.
	ldp	x19, x20, [x0, #16]	// AES block 1 - load plaintext
	rev	w9, w12	// CTR block 4
	ldp	x6, x7, [x0, #0]	// AES block 0 - load plaintext
	ldp	x23, x24, [x0, #48]	// AES block 3 - load plaintext
	ldp	x21, x22, [x0, #32]	// AES block 2 - load plaintext
	add	x0, x0, #64	// AES input_ptr update
	eor	x19, x19, x13	// AES block 1 - round N low
	eor	x20, x20, x14	// AES block 1 - round N high
	fmov	d5, x19	// AES block 1 - mov low
	eor	x6, x6, x13	// AES block 0 - round N low
	eor	x7, x7, x14	// AES block 0 - round N high
	eor	x24, x24, x14	// AES block 3 - round N high
	fmov	d4, x6	// AES block 0 - mov low
	cmp	x0, x5	// check if we have <= 8 blocks
	fmov	v4.d[1], x7	// AES block 0 - mov high
	eor	x23, x23, x13	// AES block 3 - round N low
	eor	x21, x21, x13	// AES block 2 - round N low
	fmov	v5.d[1], x20	// AES block 1 - mov high
	fmov	d6, x21	// AES block 2 - mov low
	add	w12, w12, #1	// CTR block 4
	orr	x9, x11, x9, lsl #32	// CTR block 4
	fmov	d7, x23	// AES block 3 - mov low
	eor	x22, x22, x14	// AES block 2 - round N high
	fmov	v6.d[1], x22	// AES block 2 - mov high
	eor	v4.16b, v4.16b, v0.16b	// AES block 0 - result
	fmov	d0, x10	// CTR block 4
	fmov	v0.d[1], x9	// CTR block 4
	rev	w9, w12	// CTR block 5
	add	w12, w12, #1	// CTR block 5
	eor	v5.16b, v5.16b, v1.16b	// AES block 1 - result
	fmov	d1, x10	// CTR block 5
	orr	x9, x11, x9, lsl #32	// CTR block 5
	fmov	v1.d[1], x9	// CTR block 5
	rev	w9, w12	// CTR block 6
	st1	{ v4.16b}, [x2], #16	// AES block 0 - store result
	fmov	v7.d[1], x24	// AES block 3 - mov high
	orr	x9, x11, x9, lsl #32	// CTR block 6
	eor	v6.16b, v6.16b, v2.16b	// AES block 2 - result
	st1	{ v5.16b}, [x2], #16	// AES block 1 - store result
	add	w12, w12, #1	// CTR block 6
	fmov	d2, x10	// CTR block 6
	fmov	v2.d[1], x9	// CTR block 6
	st1	{ v6.16b}, [x2], #16	// AES block 2 - store result
	rev	w9, w12	// CTR block 7
	orr	x9, x11, x9, lsl #32	// CTR block 7
	eor	v7.16b, v7.16b, v3.16b	// AES block 3 - result
	st1	{ v7.16b}, [x2], #16	// AES block 3 - store result
	b.ge	.Lenc_prepretail	// do prepretail

// Main loop. Each iteration runs the AES rounds for counter blocks
// 4k+4..4k+7 (v0-v3) interleaved with the GHASH of the four ciphertext
// blocks v4-v7 produced by the previous pass, then loads, encrypts and
// stores the next 64 bytes. The instruction interleaving is deliberate;
// keep the order as generated.
.Lenc_main_loop:	// main loop start
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b	// GHASH block 4k (only t0 is free)
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 0
	fmov	d3, x10	// CTR block 4k+3
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 0
	ext	v11.16b, v11.16b, v11.16b, #8	// PRE 0
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 1
	fmov	v3.d[1], x9	// CTR block 4k+3
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 1
	ldp	x23, x24, [x0, #48]	// AES block 4k+7 - load plaintext
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 1
	ldp	x21, x22, [x0, #32]	// AES block 4k+6 - load plaintext
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 2
	eor	v4.16b, v4.16b, v11.16b	// PRE 1
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 2
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 0
	eor	x23, x23, x13	// AES block 4k+7 - round N low
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 3
	mov	d10, v17.d[1]	// GHASH block 4k - mid
	pmull2	v9.1q, v4.2d, v15.2d	// GHASH block 4k - high
	eor	x22, x22, x14	// AES block 4k+6 - round N high
	mov	d8, v4.d[1]	// GHASH block 4k - mid
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 1
	rev64	v5.16b, v5.16b	// GHASH block 4k+1 (t0 and t1 free)
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 4
	pmull	v11.1q, v4.1d, v15.1d	// GHASH block 4k - low
	eor	v8.8b, v8.8b, v4.8b	// GHASH block 4k - mid
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 2
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 5
	rev64	v7.16b, v7.16b	// GHASH block 4k+3 (t0, t1, t2 and t3 free)
	pmull2	v4.1q, v5.2d, v14.2d	// GHASH block 4k+1 - high
	pmull	v10.1q, v8.1d, v10.1d	// GHASH block 4k - mid
	rev64	v6.16b, v6.16b	// GHASH block 4k+2 (t0, t1, and t2 free)
	pmull	v8.1q, v5.1d, v14.1d	// GHASH block 4k+1 - low
	eor	v9.16b, v9.16b, v4.16b	// GHASH block 4k+1 - high
	mov	d4, v5.d[1]	// GHASH block 4k+1 - mid
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 3
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 2
	eor	v11.16b, v11.16b, v8.16b	// GHASH block 4k+1 - low
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 3
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 4
	mov	d8, v6.d[1]	// GHASH block 4k+2 - mid
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 3
	eor	v4.8b, v4.8b, v5.8b	// GHASH block 4k+1 - mid
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 4
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 6
	eor	v8.8b, v8.8b, v6.8b	// GHASH block 4k+2 - mid
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 4
	pmull	v4.1q, v4.1d, v17.1d	// GHASH block 4k+1 - mid
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 7
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 5
	ins	v8.d[1], v8.d[0]	// GHASH block 4k+2 - mid
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 5
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 8
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 6
	eor	v10.16b, v10.16b, v4.16b	// GHASH block 4k+1 - mid
	pmull2	v4.1q, v6.2d, v13.2d	// GHASH block 4k+2 - high
	pmull	v5.1q, v6.1d, v13.1d	// GHASH block 4k+2 - low
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 7
	pmull	v6.1q, v7.1d, v12.1d	// GHASH block 4k+3 - low
	eor	v9.16b, v9.16b, v4.16b	// GHASH block 4k+2 - high
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 6
	ldp	x19, x20, [x0, #16]	// AES block 4k+5 - load plaintext
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 8
	mov	d4, v7.d[1]	// GHASH block 4k+3 - mid
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 6
	eor	v11.16b, v11.16b, v5.16b	// GHASH block 4k+2 - low
	pmull2	v8.1q, v8.2d, v16.2d	// GHASH block 4k+2 - mid
	pmull2	v5.1q, v7.2d, v12.2d	// GHASH block 4k+3 - high
	eor	v4.8b, v4.8b, v7.8b	// GHASH block 4k+3 - mid
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 7
	eor	x19, x19, x13	// AES block 4k+5 - round N low
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 8
	eor	v10.16b, v10.16b, v8.16b	// GHASH block 4k+2 - mid
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 7
	eor	x21, x21, x13	// AES block 4k+6 - round N low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 8
	movi	v8.8b, #0xc2	// byte pattern for the reduction constant (shifted by 56 at .Lenc_main_loop_continue)
	pmull	v4.1q, v4.1d, v16.1d	// GHASH block 4k+3 - mid
	eor	v9.16b, v9.16b, v5.16b	// GHASH block 4k+3 - high
	cmp	x17, #12	// setup flags for AES-128/192/256 check
	fmov	d5, x19	// AES block 4k+5 - mov low
	ldp	x6, x7, [x0, #0]	// AES block 4k+4 - load plaintext
	b.lt	.Lenc_main_loop_continue	// branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 9
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 10
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 10
	b.eq	.Lenc_main_loop_continue	// branch if AES-192

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 11
	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 12

// Join point for all key sizes inside the loop: finish the GHASH
// reduction into v11, apply the last two round keys, store the four
// ciphertext blocks and assemble the next counter blocks.
.Lenc_main_loop_continue:
	shl	d8, d8, #56	// mod_constant
	eor	v11.16b, v11.16b, v6.16b	// GHASH block 4k+3 - low
	eor	v10.16b, v10.16b, v4.16b	// GHASH block 4k+3 - mid
	add	w12, w12, #1	// CTR block 4k+3
	eor	v4.16b, v11.16b, v9.16b	// MODULO - karatsuba tidy up
	add	x0, x0, #64	// AES input_ptr update
	pmull	v7.1q, v9.1d, v8.1d	// MODULO - top 64b align with mid
	rev	w9, w12	// CTR block 4k+8
	ext	v9.16b, v9.16b, v9.16b, #8	// MODULO - other top alignment
	eor	x6, x6, x13	// AES block 4k+4 - round N low
	eor	v10.16b, v10.16b, v4.16b	// MODULO - karatsuba tidy up
	eor	x7, x7, x14	// AES block 4k+4 - round N high
	fmov	d4, x6	// AES block 4k+4 - mov low
	orr	x9, x11, x9, lsl #32	// CTR block 4k+8
	eor	v7.16b, v9.16b, v7.16b	// MODULO - fold into mid
	eor	x20, x20, x14	// AES block 4k+5 - round N high
	eor	x24, x24, x14	// AES block 4k+7 - round N high
	add	w12, w12, #1	// CTR block 4k+8
	aese	v0.16b, v31.16b	// AES block 4k+4 - round N-1
	fmov	v4.d[1], x7	// AES block 4k+4 - mov high
	eor	v10.16b, v10.16b, v7.16b	// MODULO - fold into mid
	fmov	d7, x23	// AES block 4k+7 - mov low
	aese	v1.16b, v31.16b	// AES block 4k+5 - round N-1
	fmov	v5.d[1], x20	// AES block 4k+5 - mov high
	fmov	d6, x21	// AES block 4k+6 - mov low
	cmp	x0, x5	// .LOOP CONTROL
	fmov	v6.d[1], x22	// AES block 4k+6 - mov high
	pmull	v9.1q, v10.1d, v8.1d	// MODULO - mid 64b align with low
	eor	v4.16b, v4.16b, v0.16b	// AES block 4k+4 - result
	fmov	d0, x10	// CTR block 4k+8
	fmov	v0.d[1], x9	// CTR block 4k+8
	rev	w9, w12	// CTR block 4k+9
	add	w12, w12, #1	// CTR block 4k+9
	eor	v5.16b, v5.16b, v1.16b	// AES block 4k+5 - result
	fmov	d1, x10	// CTR block 4k+9
	orr	x9, x11, x9, lsl #32	// CTR block 4k+9
	fmov	v1.d[1], x9	// CTR block 4k+9
	aese	v2.16b, v31.16b	// AES block 4k+6 - round N-1
	rev	w9, w12	// CTR block 4k+10
	st1	{ v4.16b}, [x2], #16	// AES block 4k+4 - store result
	orr	x9, x11, x9, lsl #32	// CTR block 4k+10
	eor	v11.16b, v11.16b, v9.16b	// MODULO - fold into low
	fmov	v7.d[1], x24	// AES block 4k+7 - mov high
	ext	v10.16b, v10.16b, v10.16b, #8	// MODULO - other mid alignment
	st1	{ v5.16b}, [x2], #16	// AES block 4k+5 - store result
	add	w12, w12, #1	// CTR block 4k+10
	aese	v3.16b, v31.16b	// AES block 4k+7 - round N-1
	eor	v6.16b, v6.16b, v2.16b	// AES block 4k+6 - result
	fmov	d2, x10	// CTR block 4k+10
	st1	{ v6.16b}, [x2], #16	// AES block 4k+6 - store result
	fmov	v2.d[1], x9	// CTR block 4k+10
	rev	w9, w12	// CTR block 4k+11
	eor	v11.16b, v11.16b, v10.16b	// MODULO - fold into low
	orr	x9, x11, x9, lsl #32	// CTR block 4k+11
	eor	v7.16b, v7.16b, v3.16b	// AES block 4k+7 - result
	st1	{ v7.16b}, [x2], #16	// AES block 4k+7 - store result
	b.lt	.Lenc_main_loop

// Prepretail: GHASH the last four ciphertext blocks that were stored
// (v4-v7) and run the AES rounds on the next four counter blocks. No
// input is loaded here; the resulting keystream in v0-v3 is consumed by
// the tail.
.Lenc_prepretail:	// PREPRETAIL
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 0
	rev64	v6.16b, v6.16b	// GHASH block 4k+2 (t0, t1, and t2 free)
	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 0
	fmov	d3, x10	// CTR block 4k+3
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b	// GHASH block 4k (only t0 is free)
	fmov	v3.d[1], x9	// CTR block 4k+3
	ext	v11.16b, v11.16b, v11.16b, #8	// PRE 0
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 1
	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 1
	eor	v4.16b, v4.16b, v11.16b	// PRE 1
	rev64	v5.16b, v5.16b	// GHASH block 4k+1 (t0 and t1 free)
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 2
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 0
	mov	d10, v17.d[1]	// GHASH block 4k - mid
	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 1
	pmull	v11.1q, v4.1d, v15.1d	// GHASH block 4k - low
	mov	d8, v4.d[1]	// GHASH block 4k - mid
	pmull2	v9.1q, v4.2d, v15.2d	// GHASH block 4k - high
	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 3
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 2
	eor	v8.8b, v8.8b, v4.8b	// GHASH block 4k - mid
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 2
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 1
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 3
	pmull	v10.1q, v8.1d, v10.1d	// GHASH block 4k - mid
	pmull2	v4.1q, v5.2d, v14.2d	// GHASH block 4k+1 - high
	pmull	v8.1q, v5.1d, v14.1d	// GHASH block 4k+1 - low
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 2
	eor	v9.16b, v9.16b, v4.16b	// GHASH block 4k+1 - high
	mov	d4, v5.d[1]	// GHASH block 4k+1 - mid
	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 3
	eor	v11.16b, v11.16b, v8.16b	// GHASH block 4k+1 - low
	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 3
	eor	v4.8b, v4.8b, v5.8b	// GHASH block 4k+1 - mid
	mov	d8, v6.d[1]	// GHASH block 4k+2 - mid
	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 4
	rev64	v7.16b, v7.16b	// GHASH block 4k+3 (t0, t1, t2 and t3 free)
	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 4
	pmull	v4.1q, v4.1d, v17.1d	// GHASH block 4k+1 - mid
	eor	v8.8b, v8.8b, v6.8b	// GHASH block 4k+2 - mid
	add	w12, w12, #1	// CTR block 4k+3
	pmull	v5.1q, v6.1d, v13.1d	// GHASH block 4k+2 - low
	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 5
	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 4
	eor	v10.16b, v10.16b, v4.16b	// GHASH block 4k+1 - mid
	pmull2	v4.1q, v6.2d, v13.2d	// GHASH block 4k+2 - high
	eor	v11.16b, v11.16b, v5.16b	// GHASH block 4k+2 - low
	ins	v8.d[1], v8.d[0]	// GHASH block 4k+2 - mid
	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 5
	eor	v9.16b, v9.16b, v4.16b	// GHASH block 4k+2 - high
	mov	d4, v7.d[1]	// GHASH block 4k+3 - mid
	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 4
	pmull2	v8.1q, v8.2d, v16.2d	// GHASH block 4k+2 - mid
	eor	v4.8b, v4.8b, v7.8b	// GHASH block 4k+3 - mid
	pmull2	v5.1q, v7.2d, v12.2d	// GHASH block 4k+3 - high
	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 5
	pmull	v4.1q, v4.1d, v16.1d	// GHASH block 4k+3 - mid
	eor	v10.16b, v10.16b, v8.16b	// GHASH block 4k+2 - mid
	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 5
	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 6
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 6
	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 6
	movi	v8.8b, #0xc2	// byte pattern for the reduction constant (shifted by 56 below)
	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 6
	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 7
	eor	v9.16b, v9.16b, v5.16b	// GHASH block 4k+3 - high
	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 7
	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 7
	shl	d8, d8, #56	// mod_constant
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 8
	eor	v10.16b, v10.16b, v4.16b	// GHASH block 4k+3 - mid
	pmull	v6.1q, v7.1d, v12.1d	// GHASH block 4k+3 - low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 8
	cmp	x17, #12	// setup flags for AES-128/192/256 check
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 8
	eor	v11.16b, v11.16b, v6.16b	// GHASH block 4k+3 - low
	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 7
	eor	v10.16b, v10.16b, v9.16b	// karatsuba tidy up
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 8
	// First half of the reduction (same steps the main loop labels
	// "MODULO"); it is completed at .Lenc_finish_prepretail.
	pmull	v4.1q, v9.1d, v8.1d
	ext	v9.16b, v9.16b, v9.16b, #8
	eor	v10.16b, v10.16b, v11.16b
	b.lt	.Lenc_finish_prepretail	// branch if AES-128

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 9
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 9
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 9
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 10
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 10
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 10
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 10
	b.eq	.Lenc_finish_prepretail	// branch if AES-192

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 11
	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 11
	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 11
	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 11
	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	// AES block 4k+5 - round 12
	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	// AES block 4k+4 - round 12
	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	// AES block 4k+7 - round 12
	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	// AES block 4k+6 - round 12

// Finish the reduction into v11 and apply the round N-1 key to the four
// pending keystream blocks.
.Lenc_finish_prepretail:
	eor	v10.16b, v10.16b, v4.16b
	eor	v10.16b, v10.16b, v9.16b
	pmull	v4.1q, v10.1d, v8.1d
	ext	v10.16b, v10.16b, v10.16b, #8
	aese	v1.16b, v31.16b	// AES block 4k+5 - round N-1
	eor	v11.16b, v11.16b, v4.16b
	aese	v3.16b, v31.16b	// AES block 4k+7 - round N-1
	aese	v0.16b, v31.16b	// AES block 4k+4 - round N-1
	aese	v2.16b, v31.16b	// AES block 4k+6 - round N-1
	eor	v11.16b, v11.16b, v10.16b

// Tail: x5 becomes the number of bytes still to process. It is compared
// with 48 / 32 / 16 to enter the chain below at the right point, so that
// the last (possibly partial) block is always handled by
// .Lenc_blocks_less_than_1. On the shorter paths the keystream blocks
// are shuffled down (v1 -> v2/v3, v2 -> v3), the GHASH accumulators are
// zeroed, and w12 is decremented once per block fewer than four.
.Lenc_tail:	// TAIL
	ext	v8.16b, v11.16b, v11.16b, #8	// prepare final partial tag
	sub	x5, x4, x0	// main_end_input_ptr is number of bytes left to process
	ldp	x6, x7, [x0], #16	// AES block 4k+4 - load plaintext
	eor	x6, x6, x13	// AES block 4k+4 - round N low
	eor	x7, x7, x14	// AES block 4k+4 - round N high
	cmp	x5, #48
	fmov	d4, x6	// AES block 4k+4 - mov low
	fmov	v4.d[1], x7	// AES block 4k+4 - mov high
	eor	v5.16b, v4.16b, v0.16b	// AES block 4k+4 - result
	b.gt	.Lenc_blocks_more_than_3
	cmp	x5, #32
	mov	v3.16b, v2.16b
	movi	v11.8b, #0
	movi	v9.8b, #0
	sub	w12, w12, #1
	mov	v2.16b, v1.16b
	movi	v10.8b, #0
	b.gt	.Lenc_blocks_more_than_2
	mov	v3.16b, v1.16b
	sub	w12, w12, #1
	cmp	x5, #16
	b.gt	.Lenc_blocks_more_than_1
	sub	w12, w12, #1
	b	.Lenc_blocks_less_than_1
.Lenc_blocks_more_than_3:	// blocks left > 3
	st1	{ v5.16b}, [x2], #16	// AES final-3 block - store result
	ldp	x6, x7, [x0], #16	// AES final-2 block - load input low & high
	rev64	v4.16b, v5.16b	// GHASH final-3 block
	eor	x6, x6, x13	// AES final-2 block - round N low
	eor	v4.16b, v4.16b, v8.16b	// feed in partial tag
	eor	x7, x7, x14	// AES final-2 block - round N high
	mov	d22, v4.d[1]	// GHASH final-3 block - mid
	fmov	d5, x6	// AES final-2 block - mov low
	fmov	v5.d[1], x7	// AES final-2 block - mov high
	eor	v22.8b, v22.8b, v4.8b	// GHASH final-3 block - mid
	movi	v8.8b, #0	// suppress further partial tag feed in
	mov	d10, v17.d[1]	// GHASH final-3 block - mid
	pmull	v11.1q, v4.1d, v15.1d	// GHASH final-3 block - low
	pmull2	v9.1q, v4.2d, v15.2d	// GHASH final-3 block - high
	pmull	v10.1q, v22.1d, v10.1d	// GHASH final-3 block - mid
	eor	v5.16b, v5.16b, v1.16b	// AES final-2 block - result
.Lenc_blocks_more_than_2:	// blocks left > 2
	st1	{ v5.16b}, [x2], #16	// AES final-2 block - store result
	ldp	x6, x7, [x0], #16	// AES final-1 block - load input low & high
	rev64	v4.16b, v5.16b	// GHASH final-2 block
	eor	x6, x6, x13	// AES final-1 block - round N low
	eor	v4.16b, v4.16b, v8.16b	// feed in partial tag
	fmov	d5, x6	// AES final-1 block - mov low
	eor	x7, x7, x14	// AES final-1 block - round N high
	fmov	v5.d[1], x7	// AES final-1 block - mov high
	movi	v8.8b, #0	// suppress further partial tag feed in
	pmull2	v20.1q, v4.2d, v14.2d	// GHASH final-2 block - high
	mov	d22, v4.d[1]	// GHASH final-2 block - mid
	pmull	v21.1q, v4.1d, v14.1d	// GHASH final-2 block - low
	eor	v22.8b, v22.8b, v4.8b	// GHASH final-2 block - mid
	eor	v5.16b, v5.16b, v2.16b	// AES final-1 block - result
	eor	v9.16b, v9.16b, v20.16b	// GHASH final-2 block - high
	pmull	v22.1q, v22.1d, v17.1d	// GHASH final-2 block - mid
	eor	v11.16b, v11.16b, v21.16b	// GHASH final-2 block - low
	eor	v10.16b, v10.16b, v22.16b	// GHASH final-2 block - mid
.Lenc_blocks_more_than_1:	// blocks left > 1
	st1	{ v5.16b}, [x2], #16	// AES final-1 block - store result
	rev64	v4.16b, v5.16b	// GHASH final-1 block
	ldp	x6, x7, [x0], #16	// AES final block - load input low & high
	eor	v4.16b, v4.16b, v8.16b	// feed in partial tag
	movi	v8.8b, #0	// suppress further partial tag feed in
	eor	x6, x6, x13	// AES final block - round N low
	mov	d22, v4.d[1]	// GHASH final-1 block - mid
	pmull2	v20.1q, v4.2d, v13.2d	// GHASH final-1 block - high
	eor	x7, x7, x14	// AES final block - round N high
	eor	v22.8b, v22.8b, v4.8b	// GHASH final-1 block - mid
	eor	v9.16b, v9.16b, v20.16b	// GHASH final-1 block - high
	ins	v22.d[1], v22.d[0]	// GHASH final-1 block - mid
	fmov	d5, x6	// AES final block - mov low
	fmov	v5.d[1], x7	// AES final block - mov high
	pmull2	v22.1q, v22.2d, v16.2d	// GHASH final-1 block - mid
	pmull	v21.1q, v4.1d, v13.1d	// GHASH final-1 block - low
	eor	v5.16b, v5.16b, v3.16b	// AES final block - result
	eor	v10.16b, v10.16b, v22.16b	// GHASH final-1 block - mid
	eor	v11.16b, v11.16b, v21.16b	// GHASH final-1 block - low
// Final block: build a byte mask from the bit length, zero the unused
// part of the last ciphertext block before hashing it, merge the bytes
// already present at the destination back in, reduce, then store the
// counter word, the last 16 output bytes and the updated GHASH value.
.Lenc_blocks_less_than_1:	// blocks left <= 1
	and	x1, x1, #127	// bit_length %= 128
	mvn	x13, xzr	// rkN_l = 0xffffffffffffffff
	sub	x1, x1, #128	// bit_length -= 128
	neg	x1, x1	// bit_length = 128 - #bits in input (in range [1,128])
	ld1	{ v18.16b}, [x2]	// load existing bytes where the possibly partial last block is to be stored
	mvn	x14, xzr	// rkN_h = 0xffffffffffffffff
	and	x1, x1, #127	// bit_length %= 128
	lsr	x14, x14, x1	// rkN_h is mask for top 64b of last block
	cmp	x1, #64
	csel	x6, x13, x14, lt	// low mask: all ones if x1 < 64, else the shifted mask
	csel	x7, x14, xzr, lt	// high mask: the shifted mask if x1 < 64, else zero
	fmov	d0, x6	// ctr0b is mask for last block
	fmov	v0.d[1], x7
	and	v5.16b, v5.16b, v0.16b	// possibly partial last block has zeroes in highest bits
	rev64	v4.16b, v5.16b	// GHASH final block
	eor	v4.16b, v4.16b, v8.16b	// feed in partial tag
	bif	v5.16b, v18.16b, v0.16b	// insert existing bytes in top end of result before storing
	pmull2	v20.1q, v4.2d, v12.2d	// GHASH final block - high
	mov	d8, v4.d[1]	// GHASH final block - mid
	rev	w9, w12	// counter word back to memory byte order for the store below
	pmull	v21.1q, v4.1d, v12.1d	// GHASH final block - low
	eor	v9.16b, v9.16b, v20.16b	// GHASH final block - high
	eor	v8.8b, v8.8b, v4.8b	// GHASH final block - mid
	pmull	v8.1q, v8.1d, v16.1d	// GHASH final block - mid
	eor	v11.16b, v11.16b, v21.16b	// GHASH final block - low
	eor	v10.16b, v10.16b, v8.16b	// GHASH final block - mid
	movi	v8.8b, #0xc2
	eor	v4.16b, v11.16b, v9.16b	// MODULO - karatsuba tidy up
	shl	d8, d8, #56	// mod_constant
	eor	v10.16b, v10.16b, v4.16b	// MODULO - karatsuba tidy up
	pmull	v7.1q, v9.1d, v8.1d	// MODULO - top 64b align with mid
	ext	v9.16b, v9.16b, v9.16b, #8	// MODULO - other top alignment
	eor	v10.16b, v10.16b, v7.16b	// MODULO - fold into mid
	eor	v10.16b, v10.16b, v9.16b	// MODULO - fold into mid
	pmull	v9.1q, v10.1d, v8.1d	// MODULO - mid 64b align with low
	ext	v10.16b, v10.16b, v10.16b, #8	// MODULO - other mid alignment
	str	w9, [x16, #12]	// store the updated counter
	st1	{ v5.16b}, [x2]	// store all 16B
	eor	v11.16b, v11.16b, v9.16b	// MODULO - fold into low
	eor	v11.16b, v11.16b, v10.16b	// MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8	// undo the byte reversal applied when the value was loaded
	rev64	v11.16b, v11.16b
	mov	x0, x15	// return value = byte_len
	st1	{ v11.16b }, [x3]	// write the updated GHASH value back
	// Epilogue: restore callee-saved registers and release the frame.
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	d8, d9, [sp, #64]
	ldp	d10, d11, [sp, #80]
	ldp	d12, d13, [sp, #96]
	ldp	d14, d15, [sp, #112]
	ldp	x29, x30, [sp], #128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
.globl	aes_gcm_dec_kernel
.hidden	aes_gcm_dec_kernel
.type	aes_gcm_dec_kernel,%function
.align	4
aes_gcm_dec_kernel:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp, #-128]!
787 mov x29, sp 788 stp x19, x20, [sp, #16] 789 mov x16, x4 790 mov x8, x5 791 stp x21, x22, [sp, #32] 792 stp x23, x24, [sp, #48] 793 stp d8, d9, [sp, #64] 794 stp d10, d11, [sp, #80] 795 stp d12, d13, [sp, #96] 796 stp d14, d15, [sp, #112] 797 ldr w17, [x8, #240] 798 add x19, x8, x17, lsl #4 // borrow input_l1 for last key 799 ldp x13, x14, [x19] // load round N keys 800 ldr q31, [x19, #-16] // load round N-1 keys 801 lsr x5, x1, #3 // byte_len 802 mov x15, x5 803 ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 804 ldr q26, [x8, #128] // load rk8 805 sub x5, x5, #1 // byte_len - 1 806 ldr q25, [x8, #112] // load rk7 807 and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 808 add x4, x0, x1, lsr #3 // end_input_ptr 809 ldr q24, [x8, #96] // load rk6 810 lsr x12, x11, #32 811 ldr q23, [x8, #80] // load rk5 812 orr w11, w11, w11 813 ldr q21, [x8, #48] // load rk3 814 add x5, x5, x0 815 rev w12, w12 // rev_ctr32 816 add w12, w12, #1 // increment rev_ctr32 817 fmov d3, x10 // CTR block 3 818 rev w9, w12 // CTR block 1 819 add w12, w12, #1 // CTR block 1 820 fmov d1, x10 // CTR block 1 821 orr x9, x11, x9, lsl #32 // CTR block 1 822 ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible 823 fmov v1.d[1], x9 // CTR block 1 824 rev w9, w12 // CTR block 2 825 add w12, w12, #1 // CTR block 2 826 fmov d2, x10 // CTR block 2 827 orr x9, x11, x9, lsl #32 // CTR block 2 828 fmov v2.d[1], x9 // CTR block 2 829 rev w9, w12 // CTR block 3 830 orr x9, x11, x9, lsl #32 // CTR block 3 831 ldr q18, [x8, #0] // load rk0 832 fmov v3.d[1], x9 // CTR block 3 833 add w12, w12, #1 // CTR block 3 834 ldr q22, [x8, #64] // load rk4 835 ldr q19, [x8, #16] // load rk1 836 aese v0.16b, v18.16b 837 aesmc v0.16b, v0.16b // AES block 0 - round 0 838 ldr q14, [x6, #48] // load h3l | h3h 839 ext v14.16b, v14.16b, v14.16b, #8 840 aese v3.16b, v18.16b 841 aesmc v3.16b, 
v3.16b // AES block 3 - round 0 842 ldr q15, [x6, #80] // load h4l | h4h 843 ext v15.16b, v15.16b, v15.16b, #8 844 aese v1.16b, v18.16b 845 aesmc v1.16b, v1.16b // AES block 1 - round 0 846 ldr q13, [x6, #32] // load h2l | h2h 847 ext v13.16b, v13.16b, v13.16b, #8 848 aese v2.16b, v18.16b 849 aesmc v2.16b, v2.16b // AES block 2 - round 0 850 ldr q20, [x8, #32] // load rk2 851 aese v0.16b, v19.16b 852 aesmc v0.16b, v0.16b // AES block 0 - round 1 853 aese v1.16b, v19.16b 854 aesmc v1.16b, v1.16b // AES block 1 - round 1 855 ld1 { v11.16b}, [x3] 856 ext v11.16b, v11.16b, v11.16b, #8 857 rev64 v11.16b, v11.16b 858 aese v2.16b, v19.16b 859 aesmc v2.16b, v2.16b // AES block 2 - round 1 860 ldr q27, [x8, #144] // load rk9 861 aese v3.16b, v19.16b 862 aesmc v3.16b, v3.16b // AES block 3 - round 1 863 ldr q30, [x8, #192] // load rk12 864 aese v0.16b, v20.16b 865 aesmc v0.16b, v0.16b // AES block 0 - round 2 866 ldr q12, [x6] // load h1l | h1h 867 ext v12.16b, v12.16b, v12.16b, #8 868 aese v2.16b, v20.16b 869 aesmc v2.16b, v2.16b // AES block 2 - round 2 870 ldr q28, [x8, #160] // load rk10 871 aese v3.16b, v20.16b 872 aesmc v3.16b, v3.16b // AES block 3 - round 2 873 aese v0.16b, v21.16b 874 aesmc v0.16b, v0.16b // AES block 0 - round 3 875 aese v1.16b, v20.16b 876 aesmc v1.16b, v1.16b // AES block 1 - round 2 877 aese v3.16b, v21.16b 878 aesmc v3.16b, v3.16b // AES block 3 - round 3 879 aese v0.16b, v22.16b 880 aesmc v0.16b, v0.16b // AES block 0 - round 4 881 aese v2.16b, v21.16b 882 aesmc v2.16b, v2.16b // AES block 2 - round 3 883 aese v1.16b, v21.16b 884 aesmc v1.16b, v1.16b // AES block 1 - round 3 885 aese v3.16b, v22.16b 886 aesmc v3.16b, v3.16b // AES block 3 - round 4 887 aese v2.16b, v22.16b 888 aesmc v2.16b, v2.16b // AES block 2 - round 4 889 aese v1.16b, v22.16b 890 aesmc v1.16b, v1.16b // AES block 1 - round 4 891 aese v3.16b, v23.16b 892 aesmc v3.16b, v3.16b // AES block 3 - round 5 893 aese v0.16b, v23.16b 894 aesmc v0.16b, v0.16b // AES block 0 - round 5 
895 aese v1.16b, v23.16b 896 aesmc v1.16b, v1.16b // AES block 1 - round 5 897 aese v2.16b, v23.16b 898 aesmc v2.16b, v2.16b // AES block 2 - round 5 899 aese v0.16b, v24.16b 900 aesmc v0.16b, v0.16b // AES block 0 - round 6 901 aese v3.16b, v24.16b 902 aesmc v3.16b, v3.16b // AES block 3 - round 6 903 cmp x17, #12 // setup flags for AES-128/192/256 check 904 aese v1.16b, v24.16b 905 aesmc v1.16b, v1.16b // AES block 1 - round 6 906 aese v2.16b, v24.16b 907 aesmc v2.16b, v2.16b // AES block 2 - round 6 908 aese v0.16b, v25.16b 909 aesmc v0.16b, v0.16b // AES block 0 - round 7 910 aese v1.16b, v25.16b 911 aesmc v1.16b, v1.16b // AES block 1 - round 7 912 aese v3.16b, v25.16b 913 aesmc v3.16b, v3.16b // AES block 3 - round 7 914 aese v0.16b, v26.16b 915 aesmc v0.16b, v0.16b // AES block 0 - round 8 916 aese v2.16b, v25.16b 917 aesmc v2.16b, v2.16b // AES block 2 - round 7 918 aese v3.16b, v26.16b 919 aesmc v3.16b, v3.16b // AES block 3 - round 8 920 aese v1.16b, v26.16b 921 aesmc v1.16b, v1.16b // AES block 1 - round 8 922 ldr q29, [x8, #176] // load rk11 923 aese v2.16b, v26.16b 924 aesmc v2.16b, v2.16b // AES block 2 - round 8 925 b.lt .Ldec_finish_first_blocks // branch if AES-128 926 927 aese v0.16b, v27.16b 928 aesmc v0.16b, v0.16b // AES block 0 - round 9 929 aese v1.16b, v27.16b 930 aesmc v1.16b, v1.16b // AES block 1 - round 9 931 aese v3.16b, v27.16b 932 aesmc v3.16b, v3.16b // AES block 3 - round 9 933 aese v2.16b, v27.16b 934 aesmc v2.16b, v2.16b // AES block 2 - round 9 935 aese v0.16b, v28.16b 936 aesmc v0.16b, v0.16b // AES block 0 - round 10 937 aese v1.16b, v28.16b 938 aesmc v1.16b, v1.16b // AES block 1 - round 10 939 aese v3.16b, v28.16b 940 aesmc v3.16b, v3.16b // AES block 3 - round 10 941 aese v2.16b, v28.16b 942 aesmc v2.16b, v2.16b // AES block 2 - round 10 943 b.eq .Ldec_finish_first_blocks // branch if AES-192 944 945 aese v0.16b, v29.16b 946 aesmc v0.16b, v0.16b // AES block 0 - round 11 947 aese v3.16b, v29.16b 948 aesmc v3.16b, v3.16b // 
AES block 3 - round 11 949 aese v1.16b, v29.16b 950 aesmc v1.16b, v1.16b // AES block 1 - round 11 951 aese v2.16b, v29.16b 952 aesmc v2.16b, v2.16b // AES block 2 - round 11 953 aese v1.16b, v30.16b 954 aesmc v1.16b, v1.16b // AES block 1 - round 12 955 aese v0.16b, v30.16b 956 aesmc v0.16b, v0.16b // AES block 0 - round 12 957 aese v2.16b, v30.16b 958 aesmc v2.16b, v2.16b // AES block 2 - round 12 959 aese v3.16b, v30.16b 960 aesmc v3.16b, v3.16b // AES block 3 - round 12 961 962.Ldec_finish_first_blocks: 963 cmp x0, x5 // check if we have <= 4 blocks 964 trn1 v9.2d, v14.2d, v15.2d // h4h | h3h 965 trn2 v17.2d, v14.2d, v15.2d // h4l | h3l 966 trn1 v8.2d, v12.2d, v13.2d // h2h | h1h 967 trn2 v16.2d, v12.2d, v13.2d // h2l | h1l 968 eor v17.16b, v17.16b, v9.16b // h4k | h3k 969 aese v1.16b, v31.16b // AES block 1 - round N-1 970 aese v2.16b, v31.16b // AES block 2 - round N-1 971 eor v16.16b, v16.16b, v8.16b // h2k | h1k 972 aese v3.16b, v31.16b // AES block 3 - round N-1 973 aese v0.16b, v31.16b // AES block 0 - round N-1 974 b.ge .Ldec_tail // handle tail 975 976 ldr q4, [x0, #0] // AES block 0 - load ciphertext 977 ldr q5, [x0, #16] // AES block 1 - load ciphertext 978 rev w9, w12 // CTR block 4 979 eor v0.16b, v4.16b, v0.16b // AES block 0 - result 980 eor v1.16b, v5.16b, v1.16b // AES block 1 - result 981 rev64 v5.16b, v5.16b // GHASH block 1 982 ldr q7, [x0, #48] // AES block 3 - load ciphertext 983 mov x7, v0.d[1] // AES block 0 - mov high 984 mov x6, v0.d[0] // AES block 0 - mov low 985 rev64 v4.16b, v4.16b // GHASH block 0 986 add w12, w12, #1 // CTR block 4 987 fmov d0, x10 // CTR block 4 988 orr x9, x11, x9, lsl #32 // CTR block 4 989 fmov v0.d[1], x9 // CTR block 4 990 rev w9, w12 // CTR block 5 991 add w12, w12, #1 // CTR block 5 992 mov x19, v1.d[0] // AES block 1 - mov low 993 orr x9, x11, x9, lsl #32 // CTR block 5 994 mov x20, v1.d[1] // AES block 1 - mov high 995 eor x7, x7, x14 // AES block 0 - round N high 996 eor x6, x6, x13 // AES block 0 - 
round N low 997 stp x6, x7, [x2], #16 // AES block 0 - store result 998 fmov d1, x10 // CTR block 5 999 ldr q6, [x0, #32] // AES block 2 - load ciphertext 1000 add x0, x0, #64 // AES input_ptr update 1001 fmov v1.d[1], x9 // CTR block 5 1002 rev w9, w12 // CTR block 6 1003 add w12, w12, #1 // CTR block 6 1004 eor x19, x19, x13 // AES block 1 - round N low 1005 orr x9, x11, x9, lsl #32 // CTR block 6 1006 eor x20, x20, x14 // AES block 1 - round N high 1007 stp x19, x20, [x2], #16 // AES block 1 - store result 1008 eor v2.16b, v6.16b, v2.16b // AES block 2 - result 1009 cmp x0, x5 // check if we have <= 8 blocks 1010 b.ge .Ldec_prepretail // do prepretail 1011 1012.Ldec_main_loop: // main loop start 1013 mov x21, v2.d[0] // AES block 4k+2 - mov low 1014 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 1015 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result 1016 aese v0.16b, v18.16b 1017 aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 1018 mov x22, v2.d[1] // AES block 4k+2 - mov high 1019 aese v1.16b, v18.16b 1020 aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 1021 fmov d2, x10 // CTR block 4k+6 1022 fmov v2.d[1], x9 // CTR block 4k+6 1023 eor v4.16b, v4.16b, v11.16b // PRE 1 1024 rev w9, w12 // CTR block 4k+7 1025 aese v0.16b, v19.16b 1026 aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 1027 mov x24, v3.d[1] // AES block 4k+3 - mov high 1028 aese v1.16b, v19.16b 1029 aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 1030 mov x23, v3.d[0] // AES block 4k+3 - mov low 1031 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high 1032 mov d8, v4.d[1] // GHASH block 4k - mid 1033 fmov d3, x10 // CTR block 4k+7 1034 aese v0.16b, v20.16b 1035 aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 1036 orr x9, x11, x9, lsl #32 // CTR block 4k+7 1037 aese v2.16b, v18.16b 1038 aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 1039 fmov v3.d[1], x9 // CTR block 4k+7 1040 aese v1.16b, v20.16b 1041 aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 1042 eor v8.8b, v8.8b, v4.8b // GHASH 
block 4k - mid 1043 aese v0.16b, v21.16b 1044 aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 1045 eor x22, x22, x14 // AES block 4k+2 - round N high 1046 aese v2.16b, v19.16b 1047 aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 1048 mov d10, v17.d[1] // GHASH block 4k - mid 1049 aese v1.16b, v21.16b 1050 aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 1051 rev64 v6.16b, v6.16b // GHASH block 4k+2 1052 aese v3.16b, v18.16b 1053 aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 1054 eor x21, x21, x13 // AES block 4k+2 - round N low 1055 aese v2.16b, v20.16b 1056 aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 1057 stp x21, x22, [x2], #16 // AES block 4k+2 - store result 1058 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low 1059 pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high 1060 aese v2.16b, v21.16b 1061 aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 1062 rev64 v7.16b, v7.16b // GHASH block 4k+3 1063 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid 1064 eor x23, x23, x13 // AES block 4k+3 - round N low 1065 pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low 1066 eor x24, x24, x14 // AES block 4k+3 - round N high 1067 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high 1068 aese v2.16b, v22.16b 1069 aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 1070 aese v3.16b, v19.16b 1071 aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 1072 mov d4, v5.d[1] // GHASH block 4k+1 - mid 1073 aese v0.16b, v22.16b 1074 aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 1075 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low 1076 aese v2.16b, v23.16b 1077 aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 1078 add w12, w12, #1 // CTR block 4k+7 1079 aese v3.16b, v20.16b 1080 aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 1081 mov d8, v6.d[1] // GHASH block 4k+2 - mid 1082 aese v1.16b, v22.16b 1083 aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 1084 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid 1085 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low 
1086 aese v3.16b, v21.16b 1087 aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 1088 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid 1089 aese v1.16b, v23.16b 1090 aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 1091 aese v0.16b, v23.16b 1092 aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 1093 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low 1094 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid 1095 rev w9, w12 // CTR block 4k+8 1096 aese v1.16b, v24.16b 1097 aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 1098 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid 1099 aese v0.16b, v24.16b 1100 aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 1101 add w12, w12, #1 // CTR block 4k+8 1102 aese v3.16b, v22.16b 1103 aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 1104 aese v1.16b, v25.16b 1105 aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 1106 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid 1107 aese v0.16b, v25.16b 1108 aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 1109 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high 1110 mov d6, v7.d[1] // GHASH block 4k+3 - mid 1111 aese v3.16b, v23.16b 1112 aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 1113 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid 1114 aese v0.16b, v26.16b 1115 aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 1116 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high 1117 aese v3.16b, v24.16b 1118 aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 1119 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low 1120 orr x9, x11, x9, lsl #32 // CTR block 4k+8 1121 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid 1122 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high 1123 cmp x17, #12 // setup flags for AES-128/192/256 check 1124 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid 1125 aese v1.16b, v26.16b 1126 aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 1127 aese v2.16b, v24.16b 1128 aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 1129 eor v9.16b, v9.16b, v5.16b // 
GHASH block 4k+3 - high 1130 pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid 1131 movi v8.8b, #0xc2 1132 aese v2.16b, v25.16b 1133 aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 1134 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low 1135 aese v3.16b, v25.16b 1136 aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 1137 shl d8, d8, #56 // mod_constant 1138 aese v2.16b, v26.16b 1139 aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 1140 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid 1141 aese v3.16b, v26.16b 1142 aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 1143 b.lt .Ldec_main_loop_continue // branch if AES-128 1144 1145 aese v0.16b, v27.16b 1146 aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 1147 aese v2.16b, v27.16b 1148 aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 1149 aese v1.16b, v27.16b 1150 aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 1151 aese v3.16b, v27.16b 1152 aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 1153 aese v0.16b, v28.16b 1154 aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 1155 aese v1.16b, v28.16b 1156 aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 1157 aese v2.16b, v28.16b 1158 aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 1159 aese v3.16b, v28.16b 1160 aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 1161 b.eq .Ldec_main_loop_continue // branch if AES-192 1162 1163 aese v0.16b, v29.16b 1164 aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 1165 aese v1.16b, v29.16b 1166 aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 1167 aese v2.16b, v29.16b 1168 aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 1169 aese v3.16b, v29.16b 1170 aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 1171 aese v0.16b, v30.16b 1172 aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 1173 aese v1.16b, v30.16b 1174 aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 1175 aese v2.16b, v30.16b 1176 aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 1177 aese v3.16b, v30.16b 1178 aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 
1179 1180.Ldec_main_loop_continue: 1181 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1182 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1183 ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext 1184 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 1185 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1186 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1187 ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext 1188 eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result 1189 stp x23, x24, [x2], #16 // AES block 4k+3 - store result 1190 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1191 ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext 1192 ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext 1193 mov x7, v0.d[1] // AES block 4k+4 - mov high 1194 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1195 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 1196 add x0, x0, #64 // AES input_ptr update 1197 mov x6, v0.d[0] // AES block 4k+4 - mov low 1198 fmov d0, x10 // CTR block 4k+8 1199 fmov v0.d[1], x9 // CTR block 4k+8 1200 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1201 eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result 1202 rev w9, w12 // CTR block 4k+9 1203 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 1204 orr x9, x11, x9, lsl #32 // CTR block 4k+9 1205 cmp x0, x5 // .LOOP CONTROL 1206 add w12, w12, #1 // CTR block 4k+9 1207 eor x6, x6, x13 // AES block 4k+4 - round N low 1208 eor x7, x7, x14 // AES block 4k+4 - round N high 1209 mov x20, v1.d[1] // AES block 4k+5 - mov high 1210 eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result 1211 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1212 mov x19, v1.d[0] // AES block 4k+5 - mov low 1213 fmov d1, x10 // CTR block 4k+9 1214 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1215 fmov v1.d[1], x9 // CTR block 4k+9 1216 rev w9, w12 // CTR block 4k+10 1217 add w12, w12, #1 // CTR block 4k+10 1218 
aese v3.16b, v31.16b // AES block 4k+7 - round N-1 1219 orr x9, x11, x9, lsl #32 // CTR block 4k+10 1220 rev64 v5.16b, v5.16b // GHASH block 4k+5 1221 eor x20, x20, x14 // AES block 4k+5 - round N high 1222 stp x6, x7, [x2], #16 // AES block 4k+4 - store result 1223 eor x19, x19, x13 // AES block 4k+5 - round N low 1224 stp x19, x20, [x2], #16 // AES block 4k+5 - store result 1225 rev64 v4.16b, v4.16b // GHASH block 4k+4 1226 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1227 b.lt .Ldec_main_loop 1228 1229.Ldec_prepretail: // PREPRETAIL 1230 ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 1231 mov x21, v2.d[0] // AES block 4k+2 - mov low 1232 eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result 1233 aese v0.16b, v18.16b 1234 aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 1235 mov x22, v2.d[1] // AES block 4k+2 - mov high 1236 aese v1.16b, v18.16b 1237 aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 1238 fmov d2, x10 // CTR block 4k+6 1239 fmov v2.d[1], x9 // CTR block 4k+6 1240 rev w9, w12 // CTR block 4k+7 1241 eor v4.16b, v4.16b, v11.16b // PRE 1 1242 rev64 v6.16b, v6.16b // GHASH block 4k+2 1243 orr x9, x11, x9, lsl #32 // CTR block 4k+7 1244 mov x23, v3.d[0] // AES block 4k+3 - mov low 1245 aese v1.16b, v19.16b 1246 aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 1247 mov x24, v3.d[1] // AES block 4k+3 - mov high 1248 pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low 1249 mov d8, v4.d[1] // GHASH block 4k - mid 1250 fmov d3, x10 // CTR block 4k+7 1251 pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high 1252 fmov v3.d[1], x9 // CTR block 4k+7 1253 aese v2.16b, v18.16b 1254 aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 1255 mov d10, v17.d[1] // GHASH block 4k - mid 1256 aese v0.16b, v19.16b 1257 aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 1258 eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid 1259 pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high 1260 aese v2.16b, v19.16b 1261 aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 1262 rev64 
v7.16b, v7.16b // GHASH block 4k+3 1263 aese v3.16b, v18.16b 1264 aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 1265 pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid 1266 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high 1267 pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low 1268 aese v3.16b, v19.16b 1269 aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 1270 mov d4, v5.d[1] // GHASH block 4k+1 - mid 1271 aese v0.16b, v20.16b 1272 aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 1273 aese v1.16b, v20.16b 1274 aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 1275 eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low 1276 aese v2.16b, v20.16b 1277 aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 1278 aese v0.16b, v21.16b 1279 aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 1280 mov d8, v6.d[1] // GHASH block 4k+2 - mid 1281 aese v3.16b, v20.16b 1282 aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 1283 eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid 1284 pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low 1285 aese v0.16b, v22.16b 1286 aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 1287 aese v3.16b, v21.16b 1288 aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 1289 eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid 1290 pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid 1291 aese v0.16b, v23.16b 1292 aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 1293 eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low 1294 aese v3.16b, v22.16b 1295 aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 1296 pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high 1297 eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid 1298 pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high 1299 aese v3.16b, v23.16b 1300 aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 1301 ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid 1302 aese v2.16b, v21.16b 1303 aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 1304 aese v1.16b, v21.16b 1305 aesmc v1.16b, v1.16b // AES block 4k+5 - round 
3 1306 eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high 1307 pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low 1308 aese v2.16b, v22.16b 1309 aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 1310 mov d6, v7.d[1] // GHASH block 4k+3 - mid 1311 aese v1.16b, v22.16b 1312 aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 1313 pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid 1314 aese v2.16b, v23.16b 1315 aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 1316 eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid 1317 aese v1.16b, v23.16b 1318 aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 1319 aese v3.16b, v24.16b 1320 aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 1321 eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid 1322 aese v2.16b, v24.16b 1323 aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 1324 aese v0.16b, v24.16b 1325 aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 1326 movi v8.8b, #0xc2 1327 aese v1.16b, v24.16b 1328 aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 1329 eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low 1330 pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid 1331 aese v3.16b, v25.16b 1332 aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 1333 cmp x17, #12 // setup flags for AES-128/192/256 check 1334 eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high 1335 aese v1.16b, v25.16b 1336 aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 1337 aese v0.16b, v25.16b 1338 aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 1339 eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid 1340 aese v3.16b, v26.16b 1341 aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 1342 aese v2.16b, v25.16b 1343 aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 1344 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1345 aese v1.16b, v26.16b 1346 aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 1347 aese v0.16b, v26.16b 1348 aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 1349 shl d8, d8, #56 // mod_constant 1350 aese v2.16b, v26.16b 1351 aesmc 
v2.16b, v2.16b // AES block 4k+6 - round 8 1352 b.lt .Ldec_finish_prepretail // branch if AES-128 1353 1354 aese v1.16b, v27.16b 1355 aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 1356 aese v2.16b, v27.16b 1357 aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 1358 aese v3.16b, v27.16b 1359 aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 1360 aese v0.16b, v27.16b 1361 aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 1362 aese v2.16b, v28.16b 1363 aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 1364 aese v3.16b, v28.16b 1365 aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 1366 aese v0.16b, v28.16b 1367 aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 1368 aese v1.16b, v28.16b 1369 aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 1370 b.eq .Ldec_finish_prepretail // branch if AES-192 1371 1372 aese v2.16b, v29.16b 1373 aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 1374 aese v0.16b, v29.16b 1375 aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 1376 aese v1.16b, v29.16b 1377 aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 1378 aese v2.16b, v30.16b 1379 aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 1380 aese v3.16b, v29.16b 1381 aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 1382 aese v1.16b, v30.16b 1383 aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 1384 aese v0.16b, v30.16b 1385 aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 1386 aese v3.16b, v30.16b 1387 aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 1388 1389.Ldec_finish_prepretail: 1390 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1391 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1392 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1393 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1394 eor x22, x22, x14 // AES block 4k+2 - round N high 1395 eor x23, x23, x13 // AES block 4k+3 - round N low 1396 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1397 add w12, w12, #1 // CTR block 4k+7 1398 eor x21, x21, x13 // AES block 4k+2 
- round N low 1399 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1400 eor x24, x24, x14 // AES block 4k+3 - round N high 1401 stp x21, x22, [x2], #16 // AES block 4k+2 - store result 1402 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1403 stp x23, x24, [x2], #16 // AES block 4k+3 - store result 1404 1405 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1406 aese v1.16b, v31.16b // AES block 4k+5 - round N-1 1407 aese v0.16b, v31.16b // AES block 4k+4 - round N-1 1408 aese v3.16b, v31.16b // AES block 4k+7 - round N-1 1409 aese v2.16b, v31.16b // AES block 4k+6 - round N-1 1410 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1411 1412.Ldec_tail: // TAIL 1413 sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process 1414 ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext 1415 eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result 1416 mov x6, v0.d[0] // AES block 4k+4 - mov low 1417 mov x7, v0.d[1] // AES block 4k+4 - mov high 1418 ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag 1419 cmp x5, #48 1420 eor x6, x6, x13 // AES block 4k+4 - round N low 1421 eor x7, x7, x14 // AES block 4k+4 - round N high 1422 b.gt .Ldec_blocks_more_than_3 1423 sub w12, w12, #1 1424 mov v3.16b, v2.16b 1425 movi v10.8b, #0 1426 movi v11.8b, #0 1427 cmp x5, #32 1428 movi v9.8b, #0 1429 mov v2.16b, v1.16b 1430 b.gt .Ldec_blocks_more_than_2 1431 sub w12, w12, #1 1432 mov v3.16b, v1.16b 1433 cmp x5, #16 1434 b.gt .Ldec_blocks_more_than_1 1435 sub w12, w12, #1 1436 b .Ldec_blocks_less_than_1 1437.Ldec_blocks_more_than_3: // blocks left > 3 1438 rev64 v4.16b, v5.16b // GHASH final-3 block 1439 ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext 1440 stp x6, x7, [x2], #16 // AES final-3 block - store result 1441 mov d10, v17.d[1] // GHASH final-3 block - mid 1442 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1443 eor v0.16b, v5.16b, v1.16b // AES final-2 block - result 1444 mov d22, v4.d[1] // 
GHASH final-3 block - mid 1445 mov x6, v0.d[0] // AES final-2 block - mov low 1446 mov x7, v0.d[1] // AES final-2 block - mov high 1447 eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid 1448 movi v8.8b, #0 // suppress further partial tag feed in 1449 pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high 1450 pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid 1451 eor x6, x6, x13 // AES final-2 block - round N low 1452 pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low 1453 eor x7, x7, x14 // AES final-2 block - round N high 1454.Ldec_blocks_more_than_2: // blocks left > 2 1455 rev64 v4.16b, v5.16b // GHASH final-2 block 1456 ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext 1457 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1458 stp x6, x7, [x2], #16 // AES final-2 block - store result 1459 eor v0.16b, v5.16b, v2.16b // AES final-1 block - result 1460 mov d22, v4.d[1] // GHASH final-2 block - mid 1461 pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low 1462 pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high 1463 eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid 1464 mov x6, v0.d[0] // AES final-1 block - mov low 1465 mov x7, v0.d[1] // AES final-1 block - mov high 1466 eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low 1467 movi v8.8b, #0 // suppress further partial tag feed in 1468 pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid 1469 eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high 1470 eor x6, x6, x13 // AES final-1 block - round N low 1471 eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid 1472 eor x7, x7, x14 // AES final-1 block - round N high 1473.Ldec_blocks_more_than_1: // blocks left > 1 1474 stp x6, x7, [x2], #16 // AES final-1 block - store result 1475 rev64 v4.16b, v5.16b // GHASH final-1 block 1476 ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext 1477 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1478 movi v8.8b, #0 // suppress further partial 
tag feed in 1479 mov d22, v4.d[1] // GHASH final-1 block - mid 1480 eor v0.16b, v5.16b, v3.16b // AES final block - result 1481 pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high 1482 eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid 1483 pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low 1484 mov x6, v0.d[0] // AES final block - mov low 1485 ins v22.d[1], v22.d[0] // GHASH final-1 block - mid 1486 mov x7, v0.d[1] // AES final block - mov high 1487 pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid 1488 eor x6, x6, x13 // AES final block - round N low 1489 eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low 1490 eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high 1491 eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid 1492 eor x7, x7, x14 // AES final block - round N high 1493.Ldec_blocks_less_than_1: // blocks left <= 1 1494 and x1, x1, #127 // bit_length %= 128 1495 mvn x14, xzr // rkN_h = 0xffffffffffffffff 1496 sub x1, x1, #128 // bit_length -= 128 1497 mvn x13, xzr // rkN_l = 0xffffffffffffffff 1498 ldp x4, x5, [x2] // load existing bytes we need to not overwrite 1499 neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) 1500 and x1, x1, #127 // bit_length %= 128 1501 lsr x14, x14, x1 // rkN_h is mask for top 64b of last block 1502 cmp x1, #64 1503 csel x9, x13, x14, lt 1504 csel x10, x14, xzr, lt 1505 fmov d0, x9 // ctr0b is mask for last block 1506 and x6, x6, x9 1507 mov v0.d[1], x10 1508 bic x4, x4, x9 // mask out low existing bytes 1509 rev w9, w12 1510 bic x5, x5, x10 // mask out high existing bytes 1511 orr x6, x6, x4 1512 and x7, x7, x10 1513 orr x7, x7, x5 1514 and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits 1515 rev64 v4.16b, v5.16b // GHASH final block 1516 eor v4.16b, v4.16b, v8.16b // feed in partial tag 1517 pmull v21.1q, v4.1d, v12.1d // GHASH final block - low 1518 mov d8, v4.d[1] // GHASH final block - mid 1519 eor v8.8b, v8.8b, v4.8b // GHASH 
final block - mid 1520 pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high 1521 pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid 1522 eor v9.16b, v9.16b, v20.16b // GHASH final block - high 1523 eor v11.16b, v11.16b, v21.16b // GHASH final block - low 1524 eor v10.16b, v10.16b, v8.16b // GHASH final block - mid 1525 movi v8.8b, #0xc2 1526 eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up 1527 shl d8, d8, #56 // mod_constant 1528 eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up 1529 pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid 1530 ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment 1531 eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid 1532 eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid 1533 pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low 1534 ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment 1535 eor v11.16b, v11.16b, v8.16b // MODULO - fold into low 1536 stp x6, x7, [x2] 1537 str w9, [x16, #12] // store the updated counter 1538 eor v11.16b, v11.16b, v10.16b // MODULO - fold into low 1539 ext v11.16b, v11.16b, v11.16b, #8 1540 rev64 v11.16b, v11.16b 1541 mov x0, x15 1542 st1 { v11.16b }, [x3] 1543 ldp x19, x20, [sp, #16] 1544 ldp x21, x22, [sp, #32] 1545 ldp x23, x24, [sp, #48] 1546 ldp d8, d9, [sp, #64] 1547 ldp d10, d11, [sp, #80] 1548 ldp d12, d13, [sp, #96] 1549 ldp d14, d15, [sp, #112] 1550 ldp x29, x30, [sp], #128 1551 AARCH64_VALIDATE_LINK_REGISTER 1552 ret 1553.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel 1554#endif 1555#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) 1556