1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <ring-core/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) 7#include <ring-core/arm_arch.h> 8.section __TEXT,__const 9 10.align 7 11Lchacha20_consts: 12.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 13Linc: 14.long 1,2,3,4 15Lrol8: 16.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 17Lclamp: 18.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 19 20.text 21 22 23.align 6 24Lpoly_hash_ad_internal: 25.cfi_startproc 26 cbnz x4, Lpoly_hash_intro 27 ret 28 29Lpoly_hash_intro: 30 cmp x4, #16 31 b.lt Lpoly_hash_ad_tail 32 ldp x11, x12, [x3], 16 33 adds x8, x8, x11 34 adcs x9, x9, x12 35 adc x10, x10, x15 36 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 37 umulh x12, x8, x16 38 mul x13, x9, x16 39 umulh x14, x9, x16 40 adds x12, x12, x13 41 mul x13, x10, x16 42 adc x13, x13, x14 43 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 44 umulh x8, x8, x17 45 adds x12, x12, x14 46 mul x14, x9, x17 47 umulh x9, x9, x17 48 adcs x14, x14, x8 49 mul x10, x10, x17 50 adc x10, x10, x9 51 adds x13, x13, x14 52 adc x14, x10, xzr 53 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 54 and x8, x13, #-4 55 extr x13, x14, x13, #2 56 adds x8, x8, x11 57 lsr x11, x14, #2 58 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 59 adds x8, x8, x13 60 adcs x9, x9, x12 61 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 62 sub x4, x4, #16 63 b Lpoly_hash_ad_internal 64 65Lpoly_hash_ad_tail: 66 cbz x4, Lpoly_hash_ad_ret 67 68 eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD 69 sub x4, x4, #1 70 71Lpoly_hash_tail_16_compose: 72 ext v20.16b, v20.16b, v20.16b, #15 73 ldrb w11, [x3, x4] 74 mov v20.b[0], w11 75 subs x4, x4, #1 76 b.ge Lpoly_hash_tail_16_compose 77 mov x11, v20.d[0] 78 mov x12, v20.d[1] 79 adds x8, x8, x11 80 adcs x9, x9, x12 81 adc x10, x10, x15 82 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 83 umulh x12, x8, x16 84 mul x13, x9, x16 85 umulh x14, x9, x16 86 adds x12, x12, x13 87 mul x13, x10, x16 88 adc x13, x13, x14 89 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 90 umulh x8, x8, x17 91 adds x12, x12, x14 92 mul x14, x9, x17 93 umulh x9, x9, x17 94 adcs x14, x14, x8 95 mul x10, x10, x17 96 adc x10, x10, x9 97 adds x13, x13, x14 98 adc x14, x10, xzr 99 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 100 and x8, x13, #-4 101 extr x13, x14, x13, #2 102 adds x8, x8, x11 103 lsr x11, x14, #2 104 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 105 adds x8, x8, x13 106 adcs x9, x9, x12 107 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 108 109Lpoly_hash_ad_ret: 110 ret 111.cfi_endproc 112 113 114///////////////////////////////// 115// 116// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); 117// 118.globl _chacha20_poly1305_seal 119.private_extern _chacha20_poly1305_seal 120 121.align 6 122_chacha20_poly1305_seal: 123 AARCH64_SIGN_LINK_REGISTER 124.cfi_startproc 125 stp x29, x30, [sp, #-80]! 126.cfi_def_cfa_offset 80 127.cfi_offset w30, -72 128.cfi_offset w29, -80 129 mov x29, sp 130 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 131 // we don't actually use the frame pointer like that, it's probably not 132 // worth bothering. 
133 stp d8, d9, [sp, #16] 134 stp d10, d11, [sp, #32] 135 stp d12, d13, [sp, #48] 136 stp d14, d15, [sp, #64] 137.cfi_offset b15, -8 138.cfi_offset b14, -16 139.cfi_offset b13, -24 140.cfi_offset b12, -32 141.cfi_offset b11, -40 142.cfi_offset b10, -48 143.cfi_offset b9, -56 144.cfi_offset b8, -64 145 146 adrp x11, Lchacha20_consts@PAGE 147 add x11, x11, Lchacha20_consts@PAGEOFF 148 149 ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values 150 ld1 {v28.16b - v30.16b}, [x5] 151 152 mov x15, #1 // Prepare the Poly1305 state 153 mov x8, #0 154 mov x9, #0 155 mov x10, #0 156 157 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len 158 add x12, x12, x2 159 mov v31.d[0], x4 // Store the input and aad lengths 160 mov v31.d[1], x12 161 162 cmp x2, #128 163 b.le Lseal_128 // Optimization for smaller buffers 164 165 // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, 166 // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, 167 // the fifth block (A4-D4) horizontally. 168 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 169 mov v4.16b, v24.16b 170 171 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 172 mov v9.16b, v28.16b 173 174 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 175 mov v14.16b, v29.16b 176 177 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 178 add v15.4s, v15.4s, v25.4s 179 mov v19.16b, v30.16b 180 181 sub x5, x5, #32 182 183 mov x6, #10 184 185.align 5 186Lseal_init_rounds: 187 add v0.4s, v0.4s, v5.4s 188 add v1.4s, v1.4s, v6.4s 189 add v2.4s, v2.4s, v7.4s 190 add v3.4s, v3.4s, v8.4s 191 add v4.4s, v4.4s, v9.4s 192 193 eor v15.16b, v15.16b, v0.16b 194 eor v16.16b, v16.16b, v1.16b 195 eor v17.16b, v17.16b, v2.16b 196 eor v18.16b, v18.16b, v3.16b 197 eor v19.16b, v19.16b, v4.16b 198 199 rev32 v15.8h, v15.8h 200 rev32 v16.8h, v16.8h 201 rev32 v17.8h, v17.8h 202 rev32 v18.8h, v18.8h 203 rev32 v19.8h, v19.8h 204 205 add v10.4s, v10.4s, v15.4s 206 add v11.4s, v11.4s, v16.4s 207 add v12.4s, v12.4s, v17.4s 208 add v13.4s, v13.4s, v18.4s 209 add v14.4s, v14.4s, v19.4s 210 211 eor v5.16b, v5.16b, v10.16b 212 eor v6.16b, v6.16b, v11.16b 213 eor v7.16b, v7.16b, v12.16b 214 eor v8.16b, v8.16b, v13.16b 215 eor v9.16b, v9.16b, v14.16b 216 217 ushr v20.4s, v5.4s, #20 218 sli v20.4s, v5.4s, #12 219 ushr v5.4s, v6.4s, #20 220 sli v5.4s, v6.4s, #12 221 ushr v6.4s, v7.4s, #20 222 sli v6.4s, v7.4s, #12 223 ushr v7.4s, v8.4s, #20 224 sli v7.4s, v8.4s, #12 225 ushr v8.4s, v9.4s, #20 226 sli v8.4s, v9.4s, #12 227 228 add v0.4s, v0.4s, v20.4s 229 add v1.4s, v1.4s, v5.4s 230 add v2.4s, v2.4s, v6.4s 231 add v3.4s, v3.4s, v7.4s 232 add v4.4s, v4.4s, v8.4s 233 234 eor v15.16b, v15.16b, v0.16b 235 eor v16.16b, v16.16b, v1.16b 236 eor v17.16b, v17.16b, v2.16b 237 eor v18.16b, v18.16b, v3.16b 238 eor v19.16b, v19.16b, v4.16b 239 240 tbl v15.16b, {v15.16b}, v26.16b 241 tbl v16.16b, {v16.16b}, v26.16b 242 tbl v17.16b, {v17.16b}, v26.16b 243 tbl v18.16b, {v18.16b}, v26.16b 244 tbl v19.16b, {v19.16b}, v26.16b 245 246 add v10.4s, v10.4s, v15.4s 247 add v11.4s, v11.4s, v16.4s 248 add v12.4s, v12.4s, v17.4s 249 add v13.4s, v13.4s, v18.4s 250 add v14.4s, v14.4s, v19.4s 251 252 eor v20.16b, v20.16b, v10.16b 253 eor v5.16b, v5.16b, v11.16b 254 eor v6.16b, v6.16b, v12.16b 255 eor v7.16b, v7.16b, v13.16b 256 eor v8.16b, v8.16b, v14.16b 257 258 ushr v9.4s, v8.4s, #25 259 sli v9.4s, v8.4s, #7 260 ushr v8.4s, v7.4s, #25 261 sli v8.4s, v7.4s, #7 262 ushr v7.4s, v6.4s, #25 263 sli v7.4s, v6.4s, #7 264 ushr v6.4s, 
v5.4s, #25 265 sli v6.4s, v5.4s, #7 266 ushr v5.4s, v20.4s, #25 267 sli v5.4s, v20.4s, #7 268 269 ext v9.16b, v9.16b, v9.16b, #4 270 ext v14.16b, v14.16b, v14.16b, #8 271 ext v19.16b, v19.16b, v19.16b, #12 272 add v0.4s, v0.4s, v6.4s 273 add v1.4s, v1.4s, v7.4s 274 add v2.4s, v2.4s, v8.4s 275 add v3.4s, v3.4s, v5.4s 276 add v4.4s, v4.4s, v9.4s 277 278 eor v18.16b, v18.16b, v0.16b 279 eor v15.16b, v15.16b, v1.16b 280 eor v16.16b, v16.16b, v2.16b 281 eor v17.16b, v17.16b, v3.16b 282 eor v19.16b, v19.16b, v4.16b 283 284 rev32 v18.8h, v18.8h 285 rev32 v15.8h, v15.8h 286 rev32 v16.8h, v16.8h 287 rev32 v17.8h, v17.8h 288 rev32 v19.8h, v19.8h 289 290 add v12.4s, v12.4s, v18.4s 291 add v13.4s, v13.4s, v15.4s 292 add v10.4s, v10.4s, v16.4s 293 add v11.4s, v11.4s, v17.4s 294 add v14.4s, v14.4s, v19.4s 295 296 eor v6.16b, v6.16b, v12.16b 297 eor v7.16b, v7.16b, v13.16b 298 eor v8.16b, v8.16b, v10.16b 299 eor v5.16b, v5.16b, v11.16b 300 eor v9.16b, v9.16b, v14.16b 301 302 ushr v20.4s, v6.4s, #20 303 sli v20.4s, v6.4s, #12 304 ushr v6.4s, v7.4s, #20 305 sli v6.4s, v7.4s, #12 306 ushr v7.4s, v8.4s, #20 307 sli v7.4s, v8.4s, #12 308 ushr v8.4s, v5.4s, #20 309 sli v8.4s, v5.4s, #12 310 ushr v5.4s, v9.4s, #20 311 sli v5.4s, v9.4s, #12 312 313 add v0.4s, v0.4s, v20.4s 314 add v1.4s, v1.4s, v6.4s 315 add v2.4s, v2.4s, v7.4s 316 add v3.4s, v3.4s, v8.4s 317 add v4.4s, v4.4s, v5.4s 318 319 eor v18.16b, v18.16b, v0.16b 320 eor v15.16b, v15.16b, v1.16b 321 eor v16.16b, v16.16b, v2.16b 322 eor v17.16b, v17.16b, v3.16b 323 eor v19.16b, v19.16b, v4.16b 324 325 tbl v18.16b, {v18.16b}, v26.16b 326 tbl v15.16b, {v15.16b}, v26.16b 327 tbl v16.16b, {v16.16b}, v26.16b 328 tbl v17.16b, {v17.16b}, v26.16b 329 tbl v19.16b, {v19.16b}, v26.16b 330 331 add v12.4s, v12.4s, v18.4s 332 add v13.4s, v13.4s, v15.4s 333 add v10.4s, v10.4s, v16.4s 334 add v11.4s, v11.4s, v17.4s 335 add v14.4s, v14.4s, v19.4s 336 337 eor v20.16b, v20.16b, v12.16b 338 eor v6.16b, v6.16b, v13.16b 339 eor v7.16b, v7.16b, v10.16b 340 eor v8.16b, v8.16b, v11.16b 341 eor v5.16b, v5.16b, v14.16b 342 343 ushr v9.4s, v5.4s, #25 344 sli v9.4s, v5.4s, #7 345 ushr v5.4s, v8.4s, #25 346 sli v5.4s, v8.4s, #7 347 ushr v8.4s, v7.4s, #25 348 sli v8.4s, v7.4s, #7 349 ushr v7.4s, v6.4s, #25 350 sli v7.4s, v6.4s, #7 351 ushr v6.4s, v20.4s, #25 352 sli v6.4s, v20.4s, #7 353 354 ext v9.16b, v9.16b, v9.16b, #12 355 ext v14.16b, v14.16b, v14.16b, #8 356 ext v19.16b, v19.16b, v19.16b, #4 357 subs x6, x6, #1 358 b.hi Lseal_init_rounds 359 360 add v15.4s, v15.4s, v25.4s 361 mov x11, #4 362 dup v20.4s, w11 363 add v25.4s, v25.4s, v20.4s 364 365 zip1 v20.4s, v0.4s, v1.4s 366 zip2 v21.4s, v0.4s, v1.4s 367 zip1 v22.4s, v2.4s, v3.4s 368 zip2 v23.4s, v2.4s, v3.4s 369 370 zip1 v0.2d, v20.2d, v22.2d 371 zip2 v1.2d, v20.2d, v22.2d 372 zip1 v2.2d, v21.2d, v23.2d 373 zip2 v3.2d, v21.2d, v23.2d 374 375 zip1 v20.4s, v5.4s, v6.4s 376 zip2 v21.4s, v5.4s, v6.4s 377 zip1 v22.4s, v7.4s, v8.4s 378 zip2 v23.4s, v7.4s, v8.4s 379 380 zip1 v5.2d, v20.2d, v22.2d 381 zip2 v6.2d, v20.2d, v22.2d 382 zip1 v7.2d, v21.2d, v23.2d 383 zip2 v8.2d, v21.2d, v23.2d 384 385 zip1 v20.4s, v10.4s, v11.4s 386 zip2 v21.4s, v10.4s, v11.4s 387 zip1 v22.4s, v12.4s, v13.4s 388 zip2 v23.4s, v12.4s, v13.4s 389 390 zip1 v10.2d, v20.2d, v22.2d 391 zip2 v11.2d, v20.2d, v22.2d 392 zip1 v12.2d, v21.2d, v23.2d 393 zip2 v13.2d, v21.2d, v23.2d 394 395 zip1 v20.4s, v15.4s, v16.4s 396 zip2 v21.4s, v15.4s, v16.4s 397 zip1 v22.4s, v17.4s, v18.4s 398 zip2 v23.4s, v17.4s, v18.4s 399 400 zip1 v15.2d, v20.2d, v22.2d 401 zip2 v16.2d, v20.2d, 
v22.2d 402 zip1 v17.2d, v21.2d, v23.2d 403 zip2 v18.2d, v21.2d, v23.2d 404 405 add v4.4s, v4.4s, v24.4s 406 add v9.4s, v9.4s, v28.4s 407 and v4.16b, v4.16b, v27.16b 408 409 add v0.4s, v0.4s, v24.4s 410 add v5.4s, v5.4s, v28.4s 411 add v10.4s, v10.4s, v29.4s 412 add v15.4s, v15.4s, v30.4s 413 414 add v1.4s, v1.4s, v24.4s 415 add v6.4s, v6.4s, v28.4s 416 add v11.4s, v11.4s, v29.4s 417 add v16.4s, v16.4s, v30.4s 418 419 add v2.4s, v2.4s, v24.4s 420 add v7.4s, v7.4s, v28.4s 421 add v12.4s, v12.4s, v29.4s 422 add v17.4s, v17.4s, v30.4s 423 424 add v3.4s, v3.4s, v24.4s 425 add v8.4s, v8.4s, v28.4s 426 add v13.4s, v13.4s, v29.4s 427 add v18.4s, v18.4s, v30.4s 428 429 mov x16, v4.d[0] // Move the R key to GPRs 430 mov x17, v4.d[1] 431 mov v27.16b, v9.16b // Store the S key 432 433 bl Lpoly_hash_ad_internal 434 435 mov x3, x0 436 cmp x2, #256 437 b.le Lseal_tail 438 439 ld1 {v20.16b - v23.16b}, [x1], #64 440 eor v20.16b, v20.16b, v0.16b 441 eor v21.16b, v21.16b, v5.16b 442 eor v22.16b, v22.16b, v10.16b 443 eor v23.16b, v23.16b, v15.16b 444 st1 {v20.16b - v23.16b}, [x0], #64 445 446 ld1 {v20.16b - v23.16b}, [x1], #64 447 eor v20.16b, v20.16b, v1.16b 448 eor v21.16b, v21.16b, v6.16b 449 eor v22.16b, v22.16b, v11.16b 450 eor v23.16b, v23.16b, v16.16b 451 st1 {v20.16b - v23.16b}, [x0], #64 452 453 ld1 {v20.16b - v23.16b}, [x1], #64 454 eor v20.16b, v20.16b, v2.16b 455 eor v21.16b, v21.16b, v7.16b 456 eor v22.16b, v22.16b, v12.16b 457 eor v23.16b, v23.16b, v17.16b 458 st1 {v20.16b - v23.16b}, [x0], #64 459 460 ld1 {v20.16b - v23.16b}, [x1], #64 461 eor v20.16b, v20.16b, v3.16b 462 eor v21.16b, v21.16b, v8.16b 463 eor v22.16b, v22.16b, v13.16b 464 eor v23.16b, v23.16b, v18.16b 465 st1 {v20.16b - v23.16b}, [x0], #64 466 467 sub x2, x2, #256 468 469 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds 470 mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 471 472Lseal_main_loop: 473 adrp x11, Lchacha20_consts@PAGE 474 add x11, x11, Lchacha20_consts@PAGEOFF 475 476 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 477 mov v4.16b, v24.16b 478 479 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 480 mov v9.16b, v28.16b 481 482 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 483 mov v14.16b, v29.16b 484 485 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 486 add v15.4s, v15.4s, v25.4s 487 mov v19.16b, v30.16b 488 489 eor v20.16b, v20.16b, v20.16b //zero 490 not v21.16b, v20.16b // -1 491 sub v21.4s, v25.4s, v21.4s // Add +1 492 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 493 add v19.4s, v19.4s, v20.4s 494 495 sub x5, x5, #32 496.align 5 497Lseal_main_loop_rounds: 498 add v0.4s, v0.4s, v5.4s 499 add v1.4s, v1.4s, v6.4s 500 add v2.4s, v2.4s, v7.4s 501 add v3.4s, v3.4s, v8.4s 502 add v4.4s, v4.4s, v9.4s 503 504 eor v15.16b, v15.16b, v0.16b 505 eor v16.16b, v16.16b, v1.16b 506 eor v17.16b, v17.16b, v2.16b 507 eor v18.16b, v18.16b, v3.16b 508 eor v19.16b, v19.16b, v4.16b 509 510 rev32 v15.8h, v15.8h 511 rev32 v16.8h, v16.8h 512 rev32 v17.8h, v17.8h 513 rev32 v18.8h, v18.8h 514 rev32 v19.8h, v19.8h 515 516 add v10.4s, v10.4s, v15.4s 517 add v11.4s, v11.4s, v16.4s 518 add v12.4s, v12.4s, v17.4s 519 add v13.4s, v13.4s, v18.4s 520 add v14.4s, v14.4s, v19.4s 521 522 eor v5.16b, v5.16b, v10.16b 523 eor v6.16b, v6.16b, v11.16b 524 eor v7.16b, v7.16b, v12.16b 525 eor v8.16b, v8.16b, v13.16b 526 eor v9.16b, v9.16b, v14.16b 527 528 ushr v20.4s, v5.4s, #20 529 sli v20.4s, v5.4s, #12 530 ushr v5.4s, v6.4s, #20 531 sli 
v5.4s, v6.4s, #12 532 ushr v6.4s, v7.4s, #20 533 sli v6.4s, v7.4s, #12 534 ushr v7.4s, v8.4s, #20 535 sli v7.4s, v8.4s, #12 536 ushr v8.4s, v9.4s, #20 537 sli v8.4s, v9.4s, #12 538 539 add v0.4s, v0.4s, v20.4s 540 add v1.4s, v1.4s, v5.4s 541 add v2.4s, v2.4s, v6.4s 542 add v3.4s, v3.4s, v7.4s 543 add v4.4s, v4.4s, v8.4s 544 545 eor v15.16b, v15.16b, v0.16b 546 eor v16.16b, v16.16b, v1.16b 547 eor v17.16b, v17.16b, v2.16b 548 eor v18.16b, v18.16b, v3.16b 549 eor v19.16b, v19.16b, v4.16b 550 551 tbl v15.16b, {v15.16b}, v26.16b 552 tbl v16.16b, {v16.16b}, v26.16b 553 tbl v17.16b, {v17.16b}, v26.16b 554 tbl v18.16b, {v18.16b}, v26.16b 555 tbl v19.16b, {v19.16b}, v26.16b 556 557 add v10.4s, v10.4s, v15.4s 558 add v11.4s, v11.4s, v16.4s 559 add v12.4s, v12.4s, v17.4s 560 add v13.4s, v13.4s, v18.4s 561 add v14.4s, v14.4s, v19.4s 562 563 eor v20.16b, v20.16b, v10.16b 564 eor v5.16b, v5.16b, v11.16b 565 eor v6.16b, v6.16b, v12.16b 566 eor v7.16b, v7.16b, v13.16b 567 eor v8.16b, v8.16b, v14.16b 568 569 ushr v9.4s, v8.4s, #25 570 sli v9.4s, v8.4s, #7 571 ushr v8.4s, v7.4s, #25 572 sli v8.4s, v7.4s, #7 573 ushr v7.4s, v6.4s, #25 574 sli v7.4s, v6.4s, #7 575 ushr v6.4s, v5.4s, #25 576 sli v6.4s, v5.4s, #7 577 ushr v5.4s, v20.4s, #25 578 sli v5.4s, v20.4s, #7 579 580 ext v9.16b, v9.16b, v9.16b, #4 581 ext v14.16b, v14.16b, v14.16b, #8 582 ext v19.16b, v19.16b, v19.16b, #12 583 ldp x11, x12, [x3], 16 584 adds x8, x8, x11 585 adcs x9, x9, x12 586 adc x10, x10, x15 587 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 588 umulh x12, x8, x16 589 mul x13, x9, x16 590 umulh x14, x9, x16 591 adds x12, x12, x13 592 mul x13, x10, x16 593 adc x13, x13, x14 594 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 595 umulh x8, x8, x17 596 adds x12, x12, x14 597 mul x14, x9, x17 598 umulh x9, x9, x17 599 adcs x14, x14, x8 600 mul x10, x10, x17 601 adc x10, x10, x9 602 adds x13, x13, x14 603 adc x14, x10, xzr 604 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 605 and x8, x13, #-4 606 extr x13, x14, x13, #2 607 adds x8, x8, x11 608 lsr x11, x14, #2 609 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 610 adds x8, x8, x13 611 adcs x9, x9, x12 612 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 613 add v0.4s, v0.4s, v6.4s 614 add v1.4s, v1.4s, v7.4s 615 add v2.4s, v2.4s, v8.4s 616 add v3.4s, v3.4s, v5.4s 617 add v4.4s, v4.4s, v9.4s 618 619 eor v18.16b, v18.16b, v0.16b 620 eor v15.16b, v15.16b, v1.16b 621 eor v16.16b, v16.16b, v2.16b 622 eor v17.16b, v17.16b, v3.16b 623 eor v19.16b, v19.16b, v4.16b 624 625 rev32 v18.8h, v18.8h 626 rev32 v15.8h, v15.8h 627 rev32 v16.8h, v16.8h 628 rev32 v17.8h, v17.8h 629 rev32 v19.8h, v19.8h 630 631 add v12.4s, v12.4s, v18.4s 632 add v13.4s, v13.4s, v15.4s 633 add v10.4s, v10.4s, v16.4s 634 add v11.4s, v11.4s, v17.4s 635 add v14.4s, v14.4s, v19.4s 636 637 eor v6.16b, v6.16b, v12.16b 638 eor v7.16b, v7.16b, v13.16b 639 eor v8.16b, v8.16b, v10.16b 640 eor v5.16b, v5.16b, v11.16b 641 eor v9.16b, v9.16b, v14.16b 642 643 ushr v20.4s, v6.4s, #20 644 sli v20.4s, v6.4s, #12 645 ushr v6.4s, v7.4s, #20 646 sli v6.4s, v7.4s, #12 647 ushr v7.4s, v8.4s, #20 648 sli v7.4s, v8.4s, #12 649 ushr v8.4s, v5.4s, #20 650 sli v8.4s, v5.4s, #12 651 ushr v5.4s, v9.4s, #20 652 sli v5.4s, v9.4s, #12 653 654 add v0.4s, v0.4s, v20.4s 655 add v1.4s, v1.4s, v6.4s 656 add v2.4s, v2.4s, v7.4s 657 add v3.4s, v3.4s, v8.4s 658 add v4.4s, v4.4s, v5.4s 659 660 eor v18.16b, v18.16b, v0.16b 661 eor v15.16b, v15.16b, v1.16b 662 eor v16.16b, 
v16.16b, v2.16b 663 eor v17.16b, v17.16b, v3.16b 664 eor v19.16b, v19.16b, v4.16b 665 666 tbl v18.16b, {v18.16b}, v26.16b 667 tbl v15.16b, {v15.16b}, v26.16b 668 tbl v16.16b, {v16.16b}, v26.16b 669 tbl v17.16b, {v17.16b}, v26.16b 670 tbl v19.16b, {v19.16b}, v26.16b 671 672 add v12.4s, v12.4s, v18.4s 673 add v13.4s, v13.4s, v15.4s 674 add v10.4s, v10.4s, v16.4s 675 add v11.4s, v11.4s, v17.4s 676 add v14.4s, v14.4s, v19.4s 677 678 eor v20.16b, v20.16b, v12.16b 679 eor v6.16b, v6.16b, v13.16b 680 eor v7.16b, v7.16b, v10.16b 681 eor v8.16b, v8.16b, v11.16b 682 eor v5.16b, v5.16b, v14.16b 683 684 ushr v9.4s, v5.4s, #25 685 sli v9.4s, v5.4s, #7 686 ushr v5.4s, v8.4s, #25 687 sli v5.4s, v8.4s, #7 688 ushr v8.4s, v7.4s, #25 689 sli v8.4s, v7.4s, #7 690 ushr v7.4s, v6.4s, #25 691 sli v7.4s, v6.4s, #7 692 ushr v6.4s, v20.4s, #25 693 sli v6.4s, v20.4s, #7 694 695 ext v9.16b, v9.16b, v9.16b, #12 696 ext v14.16b, v14.16b, v14.16b, #8 697 ext v19.16b, v19.16b, v19.16b, #4 698 subs x6, x6, #1 699 b.ge Lseal_main_loop_rounds 700 ldp x11, x12, [x3], 16 701 adds x8, x8, x11 702 adcs x9, x9, x12 703 adc x10, x10, x15 704 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 705 umulh x12, x8, x16 706 mul x13, x9, x16 707 umulh x14, x9, x16 708 adds x12, x12, x13 709 mul x13, x10, x16 710 adc x13, x13, x14 711 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 712 umulh x8, x8, x17 713 adds x12, x12, x14 714 mul x14, x9, x17 715 umulh x9, x9, x17 716 adcs x14, x14, x8 717 mul x10, x10, x17 718 adc x10, x10, x9 719 adds x13, x13, x14 720 adc x14, x10, xzr 721 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 722 and x8, x13, #-4 723 extr x13, x14, x13, #2 724 adds x8, x8, x11 725 lsr x11, x14, #2 726 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 727 adds x8, x8, x13 728 adcs x9, x9, x12 729 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 730 subs x7, x7, #1 731 b.gt Lseal_main_loop_rounds 732 733 eor v20.16b, v20.16b, v20.16b //zero 734 not v21.16b, v20.16b // -1 735 sub v21.4s, v25.4s, v21.4s // Add +1 736 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 737 add v19.4s, v19.4s, v20.4s 738 739 add v15.4s, v15.4s, v25.4s 740 mov x11, #5 741 dup v20.4s, w11 742 add v25.4s, v25.4s, v20.4s 743 744 zip1 v20.4s, v0.4s, v1.4s 745 zip2 v21.4s, v0.4s, v1.4s 746 zip1 v22.4s, v2.4s, v3.4s 747 zip2 v23.4s, v2.4s, v3.4s 748 749 zip1 v0.2d, v20.2d, v22.2d 750 zip2 v1.2d, v20.2d, v22.2d 751 zip1 v2.2d, v21.2d, v23.2d 752 zip2 v3.2d, v21.2d, v23.2d 753 754 zip1 v20.4s, v5.4s, v6.4s 755 zip2 v21.4s, v5.4s, v6.4s 756 zip1 v22.4s, v7.4s, v8.4s 757 zip2 v23.4s, v7.4s, v8.4s 758 759 zip1 v5.2d, v20.2d, v22.2d 760 zip2 v6.2d, v20.2d, v22.2d 761 zip1 v7.2d, v21.2d, v23.2d 762 zip2 v8.2d, v21.2d, v23.2d 763 764 zip1 v20.4s, v10.4s, v11.4s 765 zip2 v21.4s, v10.4s, v11.4s 766 zip1 v22.4s, v12.4s, v13.4s 767 zip2 v23.4s, v12.4s, v13.4s 768 769 zip1 v10.2d, v20.2d, v22.2d 770 zip2 v11.2d, v20.2d, v22.2d 771 zip1 v12.2d, v21.2d, v23.2d 772 zip2 v13.2d, v21.2d, v23.2d 773 774 zip1 v20.4s, v15.4s, v16.4s 775 zip2 v21.4s, v15.4s, v16.4s 776 zip1 v22.4s, v17.4s, v18.4s 777 zip2 v23.4s, v17.4s, v18.4s 778 779 zip1 v15.2d, v20.2d, v22.2d 780 zip2 v16.2d, v20.2d, v22.2d 781 zip1 v17.2d, v21.2d, v23.2d 782 zip2 v18.2d, v21.2d, v23.2d 783 784 add v0.4s, v0.4s, v24.4s 785 add v5.4s, v5.4s, v28.4s 786 add v10.4s, v10.4s, v29.4s 787 add v15.4s, v15.4s, v30.4s 788 789 add v1.4s, v1.4s, v24.4s 790 add v6.4s, v6.4s, v28.4s 791 add v11.4s, v11.4s, v29.4s 792 
add v16.4s, v16.4s, v30.4s 793 794 add v2.4s, v2.4s, v24.4s 795 add v7.4s, v7.4s, v28.4s 796 add v12.4s, v12.4s, v29.4s 797 add v17.4s, v17.4s, v30.4s 798 799 add v3.4s, v3.4s, v24.4s 800 add v8.4s, v8.4s, v28.4s 801 add v13.4s, v13.4s, v29.4s 802 add v18.4s, v18.4s, v30.4s 803 804 add v4.4s, v4.4s, v24.4s 805 add v9.4s, v9.4s, v28.4s 806 add v14.4s, v14.4s, v29.4s 807 add v19.4s, v19.4s, v30.4s 808 809 cmp x2, #320 810 b.le Lseal_tail 811 812 ld1 {v20.16b - v23.16b}, [x1], #64 813 eor v20.16b, v20.16b, v0.16b 814 eor v21.16b, v21.16b, v5.16b 815 eor v22.16b, v22.16b, v10.16b 816 eor v23.16b, v23.16b, v15.16b 817 st1 {v20.16b - v23.16b}, [x0], #64 818 819 ld1 {v20.16b - v23.16b}, [x1], #64 820 eor v20.16b, v20.16b, v1.16b 821 eor v21.16b, v21.16b, v6.16b 822 eor v22.16b, v22.16b, v11.16b 823 eor v23.16b, v23.16b, v16.16b 824 st1 {v20.16b - v23.16b}, [x0], #64 825 826 ld1 {v20.16b - v23.16b}, [x1], #64 827 eor v20.16b, v20.16b, v2.16b 828 eor v21.16b, v21.16b, v7.16b 829 eor v22.16b, v22.16b, v12.16b 830 eor v23.16b, v23.16b, v17.16b 831 st1 {v20.16b - v23.16b}, [x0], #64 832 833 ld1 {v20.16b - v23.16b}, [x1], #64 834 eor v20.16b, v20.16b, v3.16b 835 eor v21.16b, v21.16b, v8.16b 836 eor v22.16b, v22.16b, v13.16b 837 eor v23.16b, v23.16b, v18.16b 838 st1 {v20.16b - v23.16b}, [x0], #64 839 840 ld1 {v20.16b - v23.16b}, [x1], #64 841 eor v20.16b, v20.16b, v4.16b 842 eor v21.16b, v21.16b, v9.16b 843 eor v22.16b, v22.16b, v14.16b 844 eor v23.16b, v23.16b, v19.16b 845 st1 {v20.16b - v23.16b}, [x0], #64 846 847 sub x2, x2, #320 848 849 mov x6, #0 850 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration 851 852 b Lseal_main_loop 853 854Lseal_tail: 855 // This part of the function handles the storage and authentication of the last [0,320) bytes 856 // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
857 cmp x2, #64 858 b.lt Lseal_tail_64 859 860 // Store and authenticate 64B blocks per iteration 861 ld1 {v20.16b - v23.16b}, [x1], #64 862 863 eor v20.16b, v20.16b, v0.16b 864 eor v21.16b, v21.16b, v5.16b 865 eor v22.16b, v22.16b, v10.16b 866 eor v23.16b, v23.16b, v15.16b 867 mov x11, v20.d[0] 868 mov x12, v20.d[1] 869 adds x8, x8, x11 870 adcs x9, x9, x12 871 adc x10, x10, x15 872 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 873 umulh x12, x8, x16 874 mul x13, x9, x16 875 umulh x14, x9, x16 876 adds x12, x12, x13 877 mul x13, x10, x16 878 adc x13, x13, x14 879 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 880 umulh x8, x8, x17 881 adds x12, x12, x14 882 mul x14, x9, x17 883 umulh x9, x9, x17 884 adcs x14, x14, x8 885 mul x10, x10, x17 886 adc x10, x10, x9 887 adds x13, x13, x14 888 adc x14, x10, xzr 889 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 890 and x8, x13, #-4 891 extr x13, x14, x13, #2 892 adds x8, x8, x11 893 lsr x11, x14, #2 894 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 895 adds x8, x8, x13 896 adcs x9, x9, x12 897 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 898 mov x11, v21.d[0] 899 mov x12, v21.d[1] 900 adds x8, x8, x11 901 adcs x9, x9, x12 902 adc x10, x10, x15 903 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 904 umulh x12, x8, x16 905 mul x13, x9, x16 906 umulh x14, x9, x16 907 adds x12, x12, x13 908 mul x13, x10, x16 909 adc x13, x13, x14 910 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 911 umulh x8, x8, x17 912 adds x12, x12, x14 913 mul x14, x9, x17 914 umulh x9, x9, x17 915 adcs x14, x14, x8 916 mul x10, x10, x17 917 adc x10, x10, x9 918 adds x13, x13, x14 919 adc x14, x10, xzr 920 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 921 and x8, x13, #-4 922 extr x13, x14, x13, #2 923 adds x8, x8, x11 924 lsr x11, x14, #2 925 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 926 adds x8, x8, x13 927 adcs x9, x9, x12 928 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 929 mov x11, v22.d[0] 930 mov x12, v22.d[1] 931 adds x8, x8, x11 932 adcs x9, x9, x12 933 adc x10, x10, x15 934 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 935 umulh x12, x8, x16 936 mul x13, x9, x16 937 umulh x14, x9, x16 938 adds x12, x12, x13 939 mul x13, x10, x16 940 adc x13, x13, x14 941 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 942 umulh x8, x8, x17 943 adds x12, x12, x14 944 mul x14, x9, x17 945 umulh x9, x9, x17 946 adcs x14, x14, x8 947 mul x10, x10, x17 948 adc x10, x10, x9 949 adds x13, x13, x14 950 adc x14, x10, xzr 951 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 952 and x8, x13, #-4 953 extr x13, x14, x13, #2 954 adds x8, x8, x11 955 lsr x11, x14, #2 956 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 957 adds x8, x8, x13 958 adcs x9, x9, x12 959 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 960 mov x11, v23.d[0] 961 mov x12, v23.d[1] 962 adds x8, x8, x11 963 adcs x9, x9, x12 964 adc x10, x10, x15 965 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 966 umulh x12, x8, x16 967 mul x13, x9, x16 968 umulh x14, x9, x16 969 adds x12, x12, x13 970 mul x13, x10, x16 971 adc x13, x13, x14 972 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 973 umulh x8, x8, x17 974 adds x12, x12, x14 975 mul x14, x9, x17 976 umulh x9, x9, x17 977 adcs x14, x14, x8 978 mul x10, x10, x17 979 adc x10, x10, x9 980 
adds x13, x13, x14 981 adc x14, x10, xzr 982 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 983 and x8, x13, #-4 984 extr x13, x14, x13, #2 985 adds x8, x8, x11 986 lsr x11, x14, #2 987 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 988 adds x8, x8, x13 989 adcs x9, x9, x12 990 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 991 st1 {v20.16b - v23.16b}, [x0], #64 992 sub x2, x2, #64 993 994 // Shift the state left by 64 bytes for the next iteration of the loop 995 mov v0.16b, v1.16b 996 mov v5.16b, v6.16b 997 mov v10.16b, v11.16b 998 mov v15.16b, v16.16b 999 1000 mov v1.16b, v2.16b 1001 mov v6.16b, v7.16b 1002 mov v11.16b, v12.16b 1003 mov v16.16b, v17.16b 1004 1005 mov v2.16b, v3.16b 1006 mov v7.16b, v8.16b 1007 mov v12.16b, v13.16b 1008 mov v17.16b, v18.16b 1009 1010 mov v3.16b, v4.16b 1011 mov v8.16b, v9.16b 1012 mov v13.16b, v14.16b 1013 mov v18.16b, v19.16b 1014 1015 b Lseal_tail 1016 1017Lseal_tail_64: 1018 ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr 1019 1020 // Here we handle the last [0,64) bytes of plaintext 1021 cmp x2, #16 1022 b.lt Lseal_tail_16 1023 // Each iteration encrypt and authenticate a 16B block 1024 ld1 {v20.16b}, [x1], #16 1025 eor v20.16b, v20.16b, v0.16b 1026 mov x11, v20.d[0] 1027 mov x12, v20.d[1] 1028 adds x8, x8, x11 1029 adcs x9, x9, x12 1030 adc x10, x10, x15 1031 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1032 umulh x12, x8, x16 1033 mul x13, x9, x16 1034 umulh x14, x9, x16 1035 adds x12, x12, x13 1036 mul x13, x10, x16 1037 adc x13, x13, x14 1038 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1039 umulh x8, x8, x17 1040 adds x12, x12, x14 1041 mul x14, x9, x17 1042 umulh x9, x9, x17 1043 adcs x14, x14, x8 1044 mul x10, x10, x17 1045 adc x10, x10, x9 1046 adds x13, x13, x14 1047 adc x14, x10, xzr 1048 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1049 and x8, x13, #-4 1050 extr x13, x14, x13, #2 1051 adds x8, x8, x11 1052 lsr x11, x14, #2 1053 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1054 adds x8, x8, x13 1055 adcs x9, x9, x12 1056 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1057 st1 {v20.16b}, [x0], #16 1058 1059 sub x2, x2, #16 1060 1061 // Shift the state left by 16 bytes for the next iteration of the loop 1062 mov v0.16b, v5.16b 1063 mov v5.16b, v10.16b 1064 mov v10.16b, v15.16b 1065 1066 b Lseal_tail_64 1067 1068Lseal_tail_16: 1069 // Here we handle the last [0,16) bytes of ciphertext that require a padded block 1070 cbz x2, Lseal_hash_extra 1071 1072 eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in 1073 eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes 1074 not v22.16b, v20.16b 1075 1076 mov x6, x2 1077 add x1, x1, x2 1078 1079 cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding 1080 1081 mov x7, #16 // We need to load some extra_in first for padding 1082 sub x7, x7, x2 1083 cmp x4, x7 1084 csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register 1085 mov x12, x7 1086 add x3, x3, x7 1087 sub x4, x4, x7 1088 1089Lseal_tail16_compose_extra_in: 1090 ext v20.16b, v20.16b, v20.16b, #15 1091 ldrb w11, [x3, #-1]! 1092 mov v20.b[0], w11 1093 subs x7, x7, #1 1094 b.gt Lseal_tail16_compose_extra_in 1095 1096 add x3, x3, x12 1097 1098Lseal_tail_16_compose: 1099 ext v20.16b, v20.16b, v20.16b, #15 1100 ldrb w11, [x1, #-1]! 
1101 mov v20.b[0], w11 1102 ext v21.16b, v22.16b, v21.16b, #15 1103 subs x2, x2, #1 1104 b.gt Lseal_tail_16_compose 1105 1106 and v0.16b, v0.16b, v21.16b 1107 eor v20.16b, v20.16b, v0.16b 1108 mov v21.16b, v20.16b 1109 1110Lseal_tail_16_store: 1111 umov w11, v20.b[0] 1112 strb w11, [x0], #1 1113 ext v20.16b, v20.16b, v20.16b, #1 1114 subs x6, x6, #1 1115 b.gt Lseal_tail_16_store 1116 1117 // Hash in the final ct block concatenated with extra_in 1118 mov x11, v21.d[0] 1119 mov x12, v21.d[1] 1120 adds x8, x8, x11 1121 adcs x9, x9, x12 1122 adc x10, x10, x15 1123 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1124 umulh x12, x8, x16 1125 mul x13, x9, x16 1126 umulh x14, x9, x16 1127 adds x12, x12, x13 1128 mul x13, x10, x16 1129 adc x13, x13, x14 1130 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1131 umulh x8, x8, x17 1132 adds x12, x12, x14 1133 mul x14, x9, x17 1134 umulh x9, x9, x17 1135 adcs x14, x14, x8 1136 mul x10, x10, x17 1137 adc x10, x10, x9 1138 adds x13, x13, x14 1139 adc x14, x10, xzr 1140 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1141 and x8, x13, #-4 1142 extr x13, x14, x13, #2 1143 adds x8, x8, x11 1144 lsr x11, x14, #2 1145 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1146 adds x8, x8, x13 1147 adcs x9, x9, x12 1148 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1149 1150Lseal_hash_extra: 1151 cbz x4, Lseal_finalize 1152 1153Lseal_hash_extra_loop: 1154 cmp x4, #16 1155 b.lt Lseal_hash_extra_tail 1156 ld1 {v20.16b}, [x3], #16 1157 mov x11, v20.d[0] 1158 mov x12, v20.d[1] 1159 adds x8, x8, x11 1160 adcs x9, x9, x12 1161 adc x10, x10, x15 1162 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1163 umulh x12, x8, x16 1164 mul x13, x9, x16 1165 umulh x14, x9, x16 1166 adds x12, x12, x13 1167 mul x13, x10, x16 1168 adc x13, x13, x14 1169 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1170 umulh x8, x8, x17 1171 adds x12, x12, x14 1172 mul x14, x9, x17 1173 umulh x9, x9, x17 1174 adcs x14, x14, x8 1175 mul x10, x10, x17 1176 adc x10, x10, x9 1177 adds x13, x13, x14 1178 adc x14, x10, xzr 1179 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1180 and x8, x13, #-4 1181 extr x13, x14, x13, #2 1182 adds x8, x8, x11 1183 lsr x11, x14, #2 1184 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1185 adds x8, x8, x13 1186 adcs x9, x9, x12 1187 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1188 sub x4, x4, #16 1189 b Lseal_hash_extra_loop 1190 1191Lseal_hash_extra_tail: 1192 cbz x4, Lseal_finalize 1193 eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext 1194 add x3, x3, x4 1195 1196Lseal_hash_extra_load: 1197 ext v20.16b, v20.16b, v20.16b, #15 1198 ldrb w11, [x3, #-1]! 
1199 mov v20.b[0], w11 1200 subs x4, x4, #1 1201 b.gt Lseal_hash_extra_load 1202 1203 // Hash in the final padded extra_in blcok 1204 mov x11, v20.d[0] 1205 mov x12, v20.d[1] 1206 adds x8, x8, x11 1207 adcs x9, x9, x12 1208 adc x10, x10, x15 1209 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1210 umulh x12, x8, x16 1211 mul x13, x9, x16 1212 umulh x14, x9, x16 1213 adds x12, x12, x13 1214 mul x13, x10, x16 1215 adc x13, x13, x14 1216 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1217 umulh x8, x8, x17 1218 adds x12, x12, x14 1219 mul x14, x9, x17 1220 umulh x9, x9, x17 1221 adcs x14, x14, x8 1222 mul x10, x10, x17 1223 adc x10, x10, x9 1224 adds x13, x13, x14 1225 adc x14, x10, xzr 1226 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1227 and x8, x13, #-4 1228 extr x13, x14, x13, #2 1229 adds x8, x8, x11 1230 lsr x11, x14, #2 1231 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1232 adds x8, x8, x13 1233 adcs x9, x9, x12 1234 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1235 1236Lseal_finalize: 1237 mov x11, v31.d[0] 1238 mov x12, v31.d[1] 1239 adds x8, x8, x11 1240 adcs x9, x9, x12 1241 adc x10, x10, x15 1242 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1243 umulh x12, x8, x16 1244 mul x13, x9, x16 1245 umulh x14, x9, x16 1246 adds x12, x12, x13 1247 mul x13, x10, x16 1248 adc x13, x13, x14 1249 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1250 umulh x8, x8, x17 1251 adds x12, x12, x14 1252 mul x14, x9, x17 1253 umulh x9, x9, x17 1254 adcs x14, x14, x8 1255 mul x10, x10, x17 1256 adc x10, x10, x9 1257 adds x13, x13, x14 1258 adc x14, x10, xzr 1259 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1260 and x8, x13, #-4 1261 extr x13, x14, x13, #2 1262 adds x8, x8, x11 1263 lsr x11, x14, #2 1264 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1265 adds x8, x8, x13 1266 adcs x9, x9, x12 1267 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1268 // Final reduction step 1269 sub x12, xzr, x15 1270 orr x13, xzr, #3 1271 subs x11, x8, #-5 1272 sbcs x12, x9, x12 1273 sbcs x13, x10, x13 1274 csel x8, x11, x8, cs 1275 csel x9, x12, x9, cs 1276 csel x10, x13, x10, cs 1277 mov x11, v27.d[0] 1278 mov x12, v27.d[1] 1279 adds x8, x8, x11 1280 adcs x9, x9, x12 1281 adc x10, x10, x15 1282 1283 stp x8, x9, [x5] 1284 1285 ldp d8, d9, [sp, #16] 1286 ldp d10, d11, [sp, #32] 1287 ldp d12, d13, [sp, #48] 1288 ldp d14, d15, [sp, #64] 1289.cfi_restore b15 1290.cfi_restore b14 1291.cfi_restore b13 1292.cfi_restore b12 1293.cfi_restore b11 1294.cfi_restore b10 1295.cfi_restore b9 1296.cfi_restore b8 1297 ldp x29, x30, [sp], 80 1298.cfi_restore w29 1299.cfi_restore w30 1300.cfi_def_cfa_offset 0 1301 AARCH64_VALIDATE_LINK_REGISTER 1302 ret 1303 1304Lseal_128: 1305 // On some architectures preparing 5 blocks for small buffers is wasteful 1306 eor v25.16b, v25.16b, v25.16b 1307 mov x11, #1 1308 mov v25.s[0], w11 1309 mov v0.16b, v24.16b 1310 mov v1.16b, v24.16b 1311 mov v2.16b, v24.16b 1312 mov v5.16b, v28.16b 1313 mov v6.16b, v28.16b 1314 mov v7.16b, v28.16b 1315 mov v10.16b, v29.16b 1316 mov v11.16b, v29.16b 1317 mov v12.16b, v29.16b 1318 mov v17.16b, v30.16b 1319 add v15.4s, v17.4s, v25.4s 1320 add v16.4s, v15.4s, v25.4s 1321 1322 mov x6, #10 1323 1324Lseal_128_rounds: 1325 add v0.4s, v0.4s, v5.4s 1326 add v1.4s, v1.4s, v6.4s 1327 add v2.4s, v2.4s, v7.4s 1328 eor v15.16b, v15.16b, v0.16b 1329 eor v16.16b, v16.16b, v1.16b 1330 eor 
v17.16b, v17.16b, v2.16b 1331 rev32 v15.8h, v15.8h 1332 rev32 v16.8h, v16.8h 1333 rev32 v17.8h, v17.8h 1334 1335 add v10.4s, v10.4s, v15.4s 1336 add v11.4s, v11.4s, v16.4s 1337 add v12.4s, v12.4s, v17.4s 1338 eor v5.16b, v5.16b, v10.16b 1339 eor v6.16b, v6.16b, v11.16b 1340 eor v7.16b, v7.16b, v12.16b 1341 ushr v20.4s, v5.4s, #20 1342 sli v20.4s, v5.4s, #12 1343 ushr v5.4s, v6.4s, #20 1344 sli v5.4s, v6.4s, #12 1345 ushr v6.4s, v7.4s, #20 1346 sli v6.4s, v7.4s, #12 1347 1348 add v0.4s, v0.4s, v20.4s 1349 add v1.4s, v1.4s, v5.4s 1350 add v2.4s, v2.4s, v6.4s 1351 eor v15.16b, v15.16b, v0.16b 1352 eor v16.16b, v16.16b, v1.16b 1353 eor v17.16b, v17.16b, v2.16b 1354 tbl v15.16b, {v15.16b}, v26.16b 1355 tbl v16.16b, {v16.16b}, v26.16b 1356 tbl v17.16b, {v17.16b}, v26.16b 1357 1358 add v10.4s, v10.4s, v15.4s 1359 add v11.4s, v11.4s, v16.4s 1360 add v12.4s, v12.4s, v17.4s 1361 eor v20.16b, v20.16b, v10.16b 1362 eor v5.16b, v5.16b, v11.16b 1363 eor v6.16b, v6.16b, v12.16b 1364 ushr v7.4s, v6.4s, #25 1365 sli v7.4s, v6.4s, #7 1366 ushr v6.4s, v5.4s, #25 1367 sli v6.4s, v5.4s, #7 1368 ushr v5.4s, v20.4s, #25 1369 sli v5.4s, v20.4s, #7 1370 1371 ext v5.16b, v5.16b, v5.16b, #4 1372 ext v6.16b, v6.16b, v6.16b, #4 1373 ext v7.16b, v7.16b, v7.16b, #4 1374 1375 ext v10.16b, v10.16b, v10.16b, #8 1376 ext v11.16b, v11.16b, v11.16b, #8 1377 ext v12.16b, v12.16b, v12.16b, #8 1378 1379 ext v15.16b, v15.16b, v15.16b, #12 1380 ext v16.16b, v16.16b, v16.16b, #12 1381 ext v17.16b, v17.16b, v17.16b, #12 1382 add v0.4s, v0.4s, v5.4s 1383 add v1.4s, v1.4s, v6.4s 1384 add v2.4s, v2.4s, v7.4s 1385 eor v15.16b, v15.16b, v0.16b 1386 eor v16.16b, v16.16b, v1.16b 1387 eor v17.16b, v17.16b, v2.16b 1388 rev32 v15.8h, v15.8h 1389 rev32 v16.8h, v16.8h 1390 rev32 v17.8h, v17.8h 1391 1392 add v10.4s, v10.4s, v15.4s 1393 add v11.4s, v11.4s, v16.4s 1394 add v12.4s, v12.4s, v17.4s 1395 eor v5.16b, v5.16b, v10.16b 1396 eor v6.16b, v6.16b, v11.16b 1397 eor v7.16b, v7.16b, v12.16b 1398 ushr v20.4s, v5.4s, #20 1399 sli v20.4s, v5.4s, #12 1400 ushr v5.4s, v6.4s, #20 1401 sli v5.4s, v6.4s, #12 1402 ushr v6.4s, v7.4s, #20 1403 sli v6.4s, v7.4s, #12 1404 1405 add v0.4s, v0.4s, v20.4s 1406 add v1.4s, v1.4s, v5.4s 1407 add v2.4s, v2.4s, v6.4s 1408 eor v15.16b, v15.16b, v0.16b 1409 eor v16.16b, v16.16b, v1.16b 1410 eor v17.16b, v17.16b, v2.16b 1411 tbl v15.16b, {v15.16b}, v26.16b 1412 tbl v16.16b, {v16.16b}, v26.16b 1413 tbl v17.16b, {v17.16b}, v26.16b 1414 1415 add v10.4s, v10.4s, v15.4s 1416 add v11.4s, v11.4s, v16.4s 1417 add v12.4s, v12.4s, v17.4s 1418 eor v20.16b, v20.16b, v10.16b 1419 eor v5.16b, v5.16b, v11.16b 1420 eor v6.16b, v6.16b, v12.16b 1421 ushr v7.4s, v6.4s, #25 1422 sli v7.4s, v6.4s, #7 1423 ushr v6.4s, v5.4s, #25 1424 sli v6.4s, v5.4s, #7 1425 ushr v5.4s, v20.4s, #25 1426 sli v5.4s, v20.4s, #7 1427 1428 ext v5.16b, v5.16b, v5.16b, #12 1429 ext v6.16b, v6.16b, v6.16b, #12 1430 ext v7.16b, v7.16b, v7.16b, #12 1431 1432 ext v10.16b, v10.16b, v10.16b, #8 1433 ext v11.16b, v11.16b, v11.16b, #8 1434 ext v12.16b, v12.16b, v12.16b, #8 1435 1436 ext v15.16b, v15.16b, v15.16b, #4 1437 ext v16.16b, v16.16b, v16.16b, #4 1438 ext v17.16b, v17.16b, v17.16b, #4 1439 subs x6, x6, #1 1440 b.hi Lseal_128_rounds 1441 1442 add v0.4s, v0.4s, v24.4s 1443 add v1.4s, v1.4s, v24.4s 1444 add v2.4s, v2.4s, v24.4s 1445 1446 add v5.4s, v5.4s, v28.4s 1447 add v6.4s, v6.4s, v28.4s 1448 add v7.4s, v7.4s, v28.4s 1449 1450 // Only the first 32 bytes of the third block (counter = 0) are needed, 1451 // so skip updating v12 and v17. 
1452 add v10.4s, v10.4s, v29.4s 1453 add v11.4s, v11.4s, v29.4s 1454 1455 add v30.4s, v30.4s, v25.4s 1456 add v15.4s, v15.4s, v30.4s 1457 add v30.4s, v30.4s, v25.4s 1458 add v16.4s, v16.4s, v30.4s 1459 1460 and v2.16b, v2.16b, v27.16b 1461 mov x16, v2.d[0] // Move the R key to GPRs 1462 mov x17, v2.d[1] 1463 mov v27.16b, v7.16b // Store the S key 1464 1465 bl Lpoly_hash_ad_internal 1466 b Lseal_tail 1467.cfi_endproc 1468 1469 1470///////////////////////////////// 1471// 1472// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); 1473// 1474.globl _chacha20_poly1305_open 1475.private_extern _chacha20_poly1305_open 1476 1477.align 6 1478_chacha20_poly1305_open: 1479 AARCH64_SIGN_LINK_REGISTER 1480.cfi_startproc 1481 stp x29, x30, [sp, #-80]! 1482.cfi_def_cfa_offset 80 1483.cfi_offset w30, -72 1484.cfi_offset w29, -80 1485 mov x29, sp 1486 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 1487 // we don't actually use the frame pointer like that, it's probably not 1488 // worth bothering. 1489 stp d8, d9, [sp, #16] 1490 stp d10, d11, [sp, #32] 1491 stp d12, d13, [sp, #48] 1492 stp d14, d15, [sp, #64] 1493.cfi_offset b15, -8 1494.cfi_offset b14, -16 1495.cfi_offset b13, -24 1496.cfi_offset b12, -32 1497.cfi_offset b11, -40 1498.cfi_offset b10, -48 1499.cfi_offset b9, -56 1500.cfi_offset b8, -64 1501 1502 adrp x11, Lchacha20_consts@PAGE 1503 add x11, x11, Lchacha20_consts@PAGEOFF 1504 1505 ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values 1506 ld1 {v28.16b - v30.16b}, [x5] 1507 1508 mov x15, #1 // Prepare the Poly1305 state 1509 mov x8, #0 1510 mov x9, #0 1511 mov x10, #0 1512 1513 mov v31.d[0], x4 // Store the input and aad lengths 1514 mov v31.d[1], x2 1515 1516 cmp x2, #128 1517 b.le Lopen_128 // Optimization for smaller buffers 1518 1519 // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys 1520 mov v0.16b, v24.16b 1521 mov v5.16b, v28.16b 1522 mov v10.16b, v29.16b 1523 mov v15.16b, v30.16b 1524 1525 mov x6, #10 1526 1527.align 5 1528Lopen_init_rounds: 1529 add v0.4s, v0.4s, v5.4s 1530 eor v15.16b, v15.16b, v0.16b 1531 rev32 v15.8h, v15.8h 1532 1533 add v10.4s, v10.4s, v15.4s 1534 eor v5.16b, v5.16b, v10.16b 1535 ushr v20.4s, v5.4s, #20 1536 sli v20.4s, v5.4s, #12 1537 add v0.4s, v0.4s, v20.4s 1538 eor v15.16b, v15.16b, v0.16b 1539 tbl v15.16b, {v15.16b}, v26.16b 1540 1541 add v10.4s, v10.4s, v15.4s 1542 eor v20.16b, v20.16b, v10.16b 1543 ushr v5.4s, v20.4s, #25 1544 sli v5.4s, v20.4s, #7 1545 ext v5.16b, v5.16b, v5.16b, #4 1546 ext v10.16b, v10.16b, v10.16b, #8 1547 ext v15.16b, v15.16b, v15.16b, #12 1548 add v0.4s, v0.4s, v5.4s 1549 eor v15.16b, v15.16b, v0.16b 1550 rev32 v15.8h, v15.8h 1551 1552 add v10.4s, v10.4s, v15.4s 1553 eor v5.16b, v5.16b, v10.16b 1554 ushr v20.4s, v5.4s, #20 1555 sli v20.4s, v5.4s, #12 1556 add v0.4s, v0.4s, v20.4s 1557 eor v15.16b, v15.16b, v0.16b 1558 tbl v15.16b, {v15.16b}, v26.16b 1559 1560 add v10.4s, v10.4s, v15.4s 1561 eor v20.16b, v20.16b, v10.16b 1562 ushr v5.4s, v20.4s, #25 1563 sli v5.4s, v20.4s, #7 1564 ext v5.16b, v5.16b, v5.16b, #12 1565 ext v10.16b, v10.16b, v10.16b, #8 1566 ext v15.16b, v15.16b, v15.16b, #4 1567 subs x6, x6, #1 1568 b.hi Lopen_init_rounds 1569 1570 add v0.4s, v0.4s, v24.4s 1571 add v5.4s, v5.4s, v28.4s 1572 1573 and v0.16b, v0.16b, v27.16b 1574 mov x16, v0.d[0] // Move the R key to GPRs 1575 mov x17, v0.d[1] 1576 mov v27.16b, v5.16b // Store the S key 1577 1578 bl Lpoly_hash_ad_internal 
1579 1580Lopen_ad_done: 1581 mov x3, x1 1582 1583// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes 1584Lopen_main_loop: 1585 1586 cmp x2, #192 1587 b.lt Lopen_tail 1588 1589 adrp x11, Lchacha20_consts@PAGE 1590 add x11, x11, Lchacha20_consts@PAGEOFF 1591 1592 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 1593 mov v4.16b, v24.16b 1594 1595 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 1596 mov v9.16b, v28.16b 1597 1598 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 1599 mov v14.16b, v29.16b 1600 1601 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 1602 sub x5, x5, #32 1603 add v15.4s, v15.4s, v25.4s 1604 mov v19.16b, v30.16b 1605 1606 eor v20.16b, v20.16b, v20.16b //zero 1607 not v21.16b, v20.16b // -1 1608 sub v21.4s, v25.4s, v21.4s // Add +1 1609 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 1610 add v19.4s, v19.4s, v20.4s 1611 1612 lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 1613 sub x4, x4, #10 1614 1615 mov x7, #10 1616 subs x6, x7, x4 1617 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash 1618 csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full 1619 1620 cbz x7, Lopen_main_loop_rounds_short 1621 1622.align 5 1623Lopen_main_loop_rounds: 1624 ldp x11, x12, [x3], 16 1625 adds x8, x8, x11 1626 adcs x9, x9, x12 1627 adc x10, x10, x15 1628 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1629 umulh x12, x8, x16 1630 mul x13, x9, x16 1631 umulh x14, x9, x16 1632 adds x12, x12, x13 1633 mul x13, x10, x16 1634 adc x13, x13, x14 1635 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1636 umulh x8, x8, x17 1637 adds x12, x12, x14 1638 mul x14, x9, x17 1639 umulh x9, x9, x17 1640 adcs x14, x14, x8 1641 mul x10, x10, x17 1642 adc x10, x10, x9 1643 adds x13, x13, x14 1644 adc x14, x10, xzr 1645 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1646 and x8, x13, #-4 1647 extr x13, x14, x13, #2 1648 adds x8, x8, x11 1649 lsr x11, x14, #2 1650 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1651 adds x8, x8, x13 1652 adcs x9, x9, x12 1653 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1654Lopen_main_loop_rounds_short: 1655 add v0.4s, v0.4s, v5.4s 1656 add v1.4s, v1.4s, v6.4s 1657 add v2.4s, v2.4s, v7.4s 1658 add v3.4s, v3.4s, v8.4s 1659 add v4.4s, v4.4s, v9.4s 1660 1661 eor v15.16b, v15.16b, v0.16b 1662 eor v16.16b, v16.16b, v1.16b 1663 eor v17.16b, v17.16b, v2.16b 1664 eor v18.16b, v18.16b, v3.16b 1665 eor v19.16b, v19.16b, v4.16b 1666 1667 rev32 v15.8h, v15.8h 1668 rev32 v16.8h, v16.8h 1669 rev32 v17.8h, v17.8h 1670 rev32 v18.8h, v18.8h 1671 rev32 v19.8h, v19.8h 1672 1673 add v10.4s, v10.4s, v15.4s 1674 add v11.4s, v11.4s, v16.4s 1675 add v12.4s, v12.4s, v17.4s 1676 add v13.4s, v13.4s, v18.4s 1677 add v14.4s, v14.4s, v19.4s 1678 1679 eor v5.16b, v5.16b, v10.16b 1680 eor v6.16b, v6.16b, v11.16b 1681 eor v7.16b, v7.16b, v12.16b 1682 eor v8.16b, v8.16b, v13.16b 1683 eor v9.16b, v9.16b, v14.16b 1684 1685 ushr v20.4s, v5.4s, #20 1686 sli v20.4s, v5.4s, #12 1687 ushr v5.4s, v6.4s, #20 1688 sli v5.4s, v6.4s, #12 1689 ushr v6.4s, v7.4s, #20 1690 sli v6.4s, v7.4s, #12 1691 ushr v7.4s, v8.4s, #20 1692 sli v7.4s, v8.4s, #12 1693 ushr v8.4s, v9.4s, #20 1694 sli v8.4s, v9.4s, #12 1695 1696 add v0.4s, v0.4s, v20.4s 1697 add v1.4s, v1.4s, v5.4s 1698 add v2.4s, v2.4s, v6.4s 1699 add v3.4s, v3.4s, v7.4s 1700 add v4.4s, v4.4s, v8.4s 1701 1702 eor v15.16b, v15.16b, v0.16b 1703 eor v16.16b, 
v16.16b, v1.16b 1704 eor v17.16b, v17.16b, v2.16b 1705 eor v18.16b, v18.16b, v3.16b 1706 eor v19.16b, v19.16b, v4.16b 1707 1708 tbl v15.16b, {v15.16b}, v26.16b 1709 tbl v16.16b, {v16.16b}, v26.16b 1710 tbl v17.16b, {v17.16b}, v26.16b 1711 tbl v18.16b, {v18.16b}, v26.16b 1712 tbl v19.16b, {v19.16b}, v26.16b 1713 1714 add v10.4s, v10.4s, v15.4s 1715 add v11.4s, v11.4s, v16.4s 1716 add v12.4s, v12.4s, v17.4s 1717 add v13.4s, v13.4s, v18.4s 1718 add v14.4s, v14.4s, v19.4s 1719 1720 eor v20.16b, v20.16b, v10.16b 1721 eor v5.16b, v5.16b, v11.16b 1722 eor v6.16b, v6.16b, v12.16b 1723 eor v7.16b, v7.16b, v13.16b 1724 eor v8.16b, v8.16b, v14.16b 1725 1726 ushr v9.4s, v8.4s, #25 1727 sli v9.4s, v8.4s, #7 1728 ushr v8.4s, v7.4s, #25 1729 sli v8.4s, v7.4s, #7 1730 ushr v7.4s, v6.4s, #25 1731 sli v7.4s, v6.4s, #7 1732 ushr v6.4s, v5.4s, #25 1733 sli v6.4s, v5.4s, #7 1734 ushr v5.4s, v20.4s, #25 1735 sli v5.4s, v20.4s, #7 1736 1737 ext v9.16b, v9.16b, v9.16b, #4 1738 ext v14.16b, v14.16b, v14.16b, #8 1739 ext v19.16b, v19.16b, v19.16b, #12 1740 ldp x11, x12, [x3], 16 1741 adds x8, x8, x11 1742 adcs x9, x9, x12 1743 adc x10, x10, x15 1744 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1745 umulh x12, x8, x16 1746 mul x13, x9, x16 1747 umulh x14, x9, x16 1748 adds x12, x12, x13 1749 mul x13, x10, x16 1750 adc x13, x13, x14 1751 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1752 umulh x8, x8, x17 1753 adds x12, x12, x14 1754 mul x14, x9, x17 1755 umulh x9, x9, x17 1756 adcs x14, x14, x8 1757 mul x10, x10, x17 1758 adc x10, x10, x9 1759 adds x13, x13, x14 1760 adc x14, x10, xzr 1761 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1762 and x8, x13, #-4 1763 extr x13, x14, x13, #2 1764 adds x8, x8, x11 1765 lsr x11, x14, #2 1766 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1767 adds x8, x8, x13 1768 adcs x9, x9, x12 1769 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1770 add v0.4s, v0.4s, v6.4s 1771 add v1.4s, v1.4s, v7.4s 1772 add v2.4s, v2.4s, v8.4s 1773 add v3.4s, v3.4s, v5.4s 1774 add v4.4s, v4.4s, v9.4s 1775 1776 eor v18.16b, v18.16b, v0.16b 1777 eor v15.16b, v15.16b, v1.16b 1778 eor v16.16b, v16.16b, v2.16b 1779 eor v17.16b, v17.16b, v3.16b 1780 eor v19.16b, v19.16b, v4.16b 1781 1782 rev32 v18.8h, v18.8h 1783 rev32 v15.8h, v15.8h 1784 rev32 v16.8h, v16.8h 1785 rev32 v17.8h, v17.8h 1786 rev32 v19.8h, v19.8h 1787 1788 add v12.4s, v12.4s, v18.4s 1789 add v13.4s, v13.4s, v15.4s 1790 add v10.4s, v10.4s, v16.4s 1791 add v11.4s, v11.4s, v17.4s 1792 add v14.4s, v14.4s, v19.4s 1793 1794 eor v6.16b, v6.16b, v12.16b 1795 eor v7.16b, v7.16b, v13.16b 1796 eor v8.16b, v8.16b, v10.16b 1797 eor v5.16b, v5.16b, v11.16b 1798 eor v9.16b, v9.16b, v14.16b 1799 1800 ushr v20.4s, v6.4s, #20 1801 sli v20.4s, v6.4s, #12 1802 ushr v6.4s, v7.4s, #20 1803 sli v6.4s, v7.4s, #12 1804 ushr v7.4s, v8.4s, #20 1805 sli v7.4s, v8.4s, #12 1806 ushr v8.4s, v5.4s, #20 1807 sli v8.4s, v5.4s, #12 1808 ushr v5.4s, v9.4s, #20 1809 sli v5.4s, v9.4s, #12 1810 1811 add v0.4s, v0.4s, v20.4s 1812 add v1.4s, v1.4s, v6.4s 1813 add v2.4s, v2.4s, v7.4s 1814 add v3.4s, v3.4s, v8.4s 1815 add v4.4s, v4.4s, v5.4s 1816 1817 eor v18.16b, v18.16b, v0.16b 1818 eor v15.16b, v15.16b, v1.16b 1819 eor v16.16b, v16.16b, v2.16b 1820 eor v17.16b, v17.16b, v3.16b 1821 eor v19.16b, v19.16b, v4.16b 1822 1823 tbl v18.16b, {v18.16b}, v26.16b 1824 tbl v15.16b, {v15.16b}, v26.16b 1825 tbl v16.16b, {v16.16b}, v26.16b 1826 tbl v17.16b, {v17.16b}, v26.16b 1827 tbl v19.16b, {v19.16b}, 
v26.16b

	add v12.4s, v12.4s, v18.4s
	add v13.4s, v13.4s, v15.4s
	add v10.4s, v10.4s, v16.4s
	add v11.4s, v11.4s, v17.4s
	add v14.4s, v14.4s, v19.4s

	eor v20.16b, v20.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	eor v7.16b, v7.16b, v10.16b
	eor v8.16b, v8.16b, v11.16b
	eor v5.16b, v5.16b, v14.16b

	ushr v9.4s, v5.4s, #25
	sli v9.4s, v5.4s, #7
	ushr v5.4s, v8.4s, #25
	sli v5.4s, v8.4s, #7
	ushr v8.4s, v7.4s, #25
	sli v8.4s, v7.4s, #7
	ushr v7.4s, v6.4s, #25
	sli v7.4s, v6.4s, #7
	ushr v6.4s, v20.4s, #25
	sli v6.4s, v20.4s, #7

	ext v9.16b, v9.16b, v9.16b, #12
	ext v14.16b, v14.16b, v14.16b, #8
	ext v19.16b, v19.16b, v19.16b, #4
	subs x7, x7, #1
	b.gt Lopen_main_loop_rounds
	subs x6, x6, #1
	b.ge Lopen_main_loop_rounds_short

	eor v20.16b, v20.16b, v20.16b //zero
	not v21.16b, v20.16b // -1
	sub v21.4s, v25.4s, v21.4s // Add +1
	ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
	add v19.4s, v19.4s, v20.4s

	add v15.4s, v15.4s, v25.4s
	mov x11, #5
	dup v20.4s, w11
	add v25.4s, v25.4s, v20.4s

	zip1 v20.4s, v0.4s, v1.4s
	zip2 v21.4s, v0.4s, v1.4s
	zip1 v22.4s, v2.4s, v3.4s
	zip2 v23.4s, v2.4s, v3.4s

	zip1 v0.2d, v20.2d, v22.2d
	zip2 v1.2d, v20.2d, v22.2d
	zip1 v2.2d, v21.2d, v23.2d
	zip2 v3.2d, v21.2d, v23.2d

	zip1 v20.4s, v5.4s, v6.4s
	zip2 v21.4s, v5.4s, v6.4s
	zip1 v22.4s, v7.4s, v8.4s
	zip2 v23.4s, v7.4s, v8.4s

	zip1 v5.2d, v20.2d, v22.2d
	zip2 v6.2d, v20.2d, v22.2d
	zip1 v7.2d, v21.2d, v23.2d
	zip2 v8.2d, v21.2d, v23.2d

	zip1 v20.4s, v10.4s, v11.4s
	zip2 v21.4s, v10.4s, v11.4s
	zip1 v22.4s, v12.4s, v13.4s
	zip2 v23.4s, v12.4s, v13.4s

	zip1 v10.2d, v20.2d, v22.2d
	zip2 v11.2d, v20.2d, v22.2d
	zip1 v12.2d, v21.2d, v23.2d
	zip2 v13.2d, v21.2d, v23.2d

	zip1 v20.4s, v15.4s, v16.4s
	zip2 v21.4s, v15.4s, v16.4s
	zip1 v22.4s, v17.4s, v18.4s
	zip2 v23.4s, v17.4s, v18.4s

	zip1 v15.2d, v20.2d, v22.2d
	zip2 v16.2d, v20.2d, v22.2d
	zip1 v17.2d, v21.2d, v23.2d
	zip2 v18.2d, v21.2d, v23.2d

	add v0.4s, v0.4s, v24.4s
	add v5.4s, v5.4s, v28.4s
	add v10.4s, v10.4s, v29.4s
	add v15.4s, v15.4s, v30.4s

	add v1.4s, v1.4s, v24.4s
	add v6.4s, v6.4s, v28.4s
	add v11.4s, v11.4s, v29.4s
	add v16.4s, v16.4s, v30.4s

	add v2.4s, v2.4s, v24.4s
	add v7.4s, v7.4s, v28.4s
	add v12.4s, v12.4s, v29.4s
	add v17.4s, v17.4s, v30.4s

	add v3.4s, v3.4s, v24.4s
	add v8.4s, v8.4s, v28.4s
	add v13.4s, v13.4s, v29.4s
	add v18.4s, v18.4s, v30.4s

	add v4.4s, v4.4s, v24.4s
	add v9.4s, v9.4s, v28.4s
	add v14.4s, v14.4s, v29.4s
	add v19.4s, v19.4s, v30.4s

	// We can always safely store 192 bytes
	ld1 {v20.16b - v23.16b}, [x1], #64
	eor v20.16b, v20.16b, v0.16b
	eor v21.16b, v21.16b, v5.16b
	eor v22.16b, v22.16b, v10.16b
	eor v23.16b, v23.16b, v15.16b
	st1 {v20.16b - v23.16b}, [x0], #64

	ld1 {v20.16b - v23.16b}, [x1], #64
	eor v20.16b, v20.16b, v1.16b
	eor v21.16b, v21.16b, v6.16b
	eor v22.16b, v22.16b, v11.16b
	eor v23.16b, v23.16b, v16.16b
	st1 {v20.16b - v23.16b}, [x0], #64

	ld1 {v20.16b - v23.16b}, [x1], #64
	eor v20.16b, v20.16b, v2.16b
	eor v21.16b, v21.16b, v7.16b
	eor v22.16b, v22.16b, v12.16b
	eor v23.16b, v23.16b, v17.16b
	st1 {v20.16b - v23.16b}, [x0], #64

	sub x2, x2, #192

	mov v0.16b, v3.16b
	mov v5.16b, v8.16b
	mov v10.16b, v13.16b
	mov v15.16b, v18.16b

	cmp x2, #64
	b.lt Lopen_tail_64_store

	ld1 {v20.16b - v23.16b}, [x1], #64
	eor v20.16b, v20.16b, v3.16b
	eor v21.16b, v21.16b, v8.16b
	eor v22.16b, v22.16b, v13.16b
	eor v23.16b, v23.16b, v18.16b
	st1 {v20.16b - v23.16b}, [x0], #64

	sub x2, x2, #64

	mov v0.16b, v4.16b
	mov v5.16b, v9.16b
	mov v10.16b, v14.16b
	mov v15.16b, v19.16b

	cmp x2, #64
	b.lt Lopen_tail_64_store

	ld1 {v20.16b - v23.16b}, [x1], #64
	eor v20.16b, v20.16b, v4.16b
	eor v21.16b, v21.16b, v9.16b
	eor v22.16b, v22.16b, v14.16b
	eor v23.16b, v23.16b, v19.16b
	st1 {v20.16b - v23.16b}, [x0], #64

	sub x2, x2, #64
	b Lopen_main_loop

Lopen_tail:

	cbz x2, Lopen_finalize

	lsr x4, x2, #4 // How many whole blocks we have to hash

	cmp x2, #64
	b.le Lopen_tail_64
	cmp x2, #128
	b.le Lopen_tail_128

Lopen_tail_192:
	// We need three more blocks
	mov v0.16b, v24.16b
	mov v1.16b, v24.16b
	mov v2.16b, v24.16b
	mov v5.16b, v28.16b
	mov v6.16b, v28.16b
	mov v7.16b, v28.16b
	mov v10.16b, v29.16b
	mov v11.16b, v29.16b
	mov v12.16b, v29.16b
	mov v15.16b, v30.16b
	mov v16.16b, v30.16b
	mov v17.16b, v30.16b
	eor v23.16b, v23.16b, v23.16b
	eor v21.16b, v21.16b, v21.16b
	ins v23.s[0], v25.s[0]
	ins v21.d[0], x15

	add v22.4s, v23.4s, v21.4s
	add v21.4s, v22.4s, v21.4s

	add v15.4s, v15.4s, v21.4s
	add v16.4s, v16.4s, v23.4s
	add v17.4s, v17.4s, v22.4s

	mov x7, #10
	subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
	csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
	sub x4, x4, x7

	cbz x7, Lopen_tail_192_rounds_no_hash

Lopen_tail_192_rounds:
	ldp x11, x12, [x3], 16
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
Lopen_tail_192_rounds_no_hash:
	add v0.4s, v0.4s, v5.4s
	add v1.4s, v1.4s, v6.4s
	add v2.4s, v2.4s, v7.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	rev32 v15.8h, v15.8h
	rev32 v16.8h, v16.8h
	rev32 v17.8h, v17.8h

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v5.16b, v5.16b, v10.16b
	eor v6.16b, v6.16b, v11.16b
	eor v7.16b, v7.16b, v12.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	ushr v5.4s, v6.4s, #20
	sli v5.4s, v6.4s, #12
	ushr v6.4s, v7.4s, #20
	sli v6.4s, v7.4s, #12

	add v0.4s, v0.4s, v20.4s
	add v1.4s, v1.4s, v5.4s
	add v2.4s, v2.4s, v6.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	tbl v15.16b, {v15.16b}, v26.16b
	tbl v16.16b, {v16.16b}, v26.16b
	tbl v17.16b, {v17.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v20.16b, v20.16b, v10.16b
	eor v5.16b, v5.16b, v11.16b
	eor v6.16b, v6.16b, v12.16b
	ushr v7.4s, v6.4s, #25
	sli v7.4s, v6.4s, #7
	ushr v6.4s, v5.4s, #25
	sli v6.4s, v5.4s, #7
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7

	ext v5.16b, v5.16b, v5.16b, #4
	ext v6.16b, v6.16b, v6.16b, #4
	ext v7.16b, v7.16b, v7.16b, #4

	ext v10.16b, v10.16b, v10.16b, #8
	ext v11.16b, v11.16b, v11.16b, #8
	ext v12.16b, v12.16b, v12.16b, #8

	ext v15.16b, v15.16b, v15.16b, #12
	ext v16.16b, v16.16b, v16.16b, #12
	ext v17.16b, v17.16b, v17.16b, #12
	add v0.4s, v0.4s, v5.4s
	add v1.4s, v1.4s, v6.4s
	add v2.4s, v2.4s, v7.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	rev32 v15.8h, v15.8h
	rev32 v16.8h, v16.8h
	rev32 v17.8h, v17.8h

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v5.16b, v5.16b, v10.16b
	eor v6.16b, v6.16b, v11.16b
	eor v7.16b, v7.16b, v12.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	ushr v5.4s, v6.4s, #20
	sli v5.4s, v6.4s, #12
	ushr v6.4s, v7.4s, #20
	sli v6.4s, v7.4s, #12

	add v0.4s, v0.4s, v20.4s
	add v1.4s, v1.4s, v5.4s
	add v2.4s, v2.4s, v6.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	tbl v15.16b, {v15.16b}, v26.16b
	tbl v16.16b, {v16.16b}, v26.16b
	tbl v17.16b, {v17.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v20.16b, v20.16b, v10.16b
	eor v5.16b, v5.16b, v11.16b
	eor v6.16b, v6.16b, v12.16b
	ushr v7.4s, v6.4s, #25
	sli v7.4s, v6.4s, #7
	ushr v6.4s, v5.4s, #25
	sli v6.4s, v5.4s, #7
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7

	ext v5.16b, v5.16b, v5.16b, #12
	ext v6.16b, v6.16b, v6.16b, #12
	ext v7.16b, v7.16b, v7.16b, #12

	ext v10.16b, v10.16b, v10.16b, #8
	ext v11.16b, v11.16b, v11.16b, #8
	ext v12.16b, v12.16b, v12.16b, #8

	ext v15.16b, v15.16b, v15.16b, #4
	ext v16.16b, v16.16b, v16.16b, #4
	ext v17.16b, v17.16b, v17.16b, #4
	subs x7, x7, #1
	b.gt Lopen_tail_192_rounds
	subs x6, x6, #1
	b.ge Lopen_tail_192_rounds_no_hash

	// We hashed 160 bytes at most, may still have 32 bytes left
Lopen_tail_192_hash:
	cbz x4, Lopen_tail_192_hash_done
	ldp x11, x12, [x3], 16
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	sub x4, x4, #1
	b Lopen_tail_192_hash

Lopen_tail_192_hash_done:

	add v0.4s, v0.4s, v24.4s
	add v1.4s, v1.4s, v24.4s
	add v2.4s, v2.4s, v24.4s
	add v5.4s, v5.4s, v28.4s
	add v6.4s, v6.4s, v28.4s
	add v7.4s, v7.4s, v28.4s
	add v10.4s, v10.4s, v29.4s
	add v11.4s, v11.4s, v29.4s
	add v12.4s, v12.4s, v29.4s
	add v15.4s, v15.4s, v30.4s
	add v16.4s, v16.4s, v30.4s
	add v17.4s, v17.4s, v30.4s

	add v15.4s, v15.4s, v21.4s
	add v16.4s, v16.4s, v23.4s
	add v17.4s, v17.4s, v22.4s

	ld1 {v20.16b - v23.16b}, [x1], #64

	eor v20.16b, v20.16b, v1.16b
	eor v21.16b, v21.16b, v6.16b
	eor v22.16b, v22.16b, v11.16b
	eor v23.16b, v23.16b, v16.16b

	st1 {v20.16b - v23.16b}, [x0], #64

	ld1 {v20.16b - v23.16b}, [x1], #64

	eor v20.16b, v20.16b, v2.16b
	eor v21.16b, v21.16b, v7.16b
	eor v22.16b, v22.16b, v12.16b
	eor v23.16b, v23.16b, v17.16b

	st1 {v20.16b - v23.16b}, [x0], #64

	sub x2, x2, #128
	b Lopen_tail_64_store

Lopen_tail_128:
	// We need two more blocks
	mov v0.16b, v24.16b
	mov v1.16b, v24.16b
	mov v5.16b, v28.16b
	mov v6.16b, v28.16b
	mov v10.16b, v29.16b
	mov v11.16b, v29.16b
	mov v15.16b, v30.16b
	mov v16.16b, v30.16b
	eor v23.16b, v23.16b, v23.16b
	eor v22.16b, v22.16b, v22.16b
	ins v23.s[0], v25.s[0]
	ins v22.d[0], x15
	add v22.4s, v22.4s, v23.4s

	add v15.4s, v15.4s, v22.4s
	add v16.4s, v16.4s, v23.4s

	mov x6, #10
	sub x6, x6, x4

Lopen_tail_128_rounds:
	add v0.4s, v0.4s, v5.4s
	eor v15.16b, v15.16b, v0.16b
	rev32 v15.8h, v15.8h

	add v10.4s, v10.4s, v15.4s
	eor v5.16b, v5.16b, v10.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	add v0.4s, v0.4s, v20.4s
	eor v15.16b, v15.16b, v0.16b
	tbl v15.16b, {v15.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	eor v20.16b, v20.16b, v10.16b
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7
	ext v5.16b, v5.16b, v5.16b, #4
	ext v10.16b, v10.16b, v10.16b, #8
	ext v15.16b, v15.16b, v15.16b, #12
	add v1.4s, v1.4s, v6.4s
	eor v16.16b, v16.16b, v1.16b
	rev32 v16.8h, v16.8h

	add v11.4s, v11.4s, v16.4s
	eor v6.16b, v6.16b, v11.16b
	ushr v20.4s, v6.4s, #20
	sli v20.4s, v6.4s, #12
	add v1.4s, v1.4s, v20.4s
	eor v16.16b, v16.16b, v1.16b
	tbl v16.16b, {v16.16b}, v26.16b

	add v11.4s, v11.4s, v16.4s
	eor v20.16b, v20.16b, v11.16b
	ushr v6.4s, v20.4s, #25
	sli v6.4s, v20.4s, #7
	ext v6.16b, v6.16b, v6.16b, #4
	ext v11.16b, v11.16b, v11.16b, #8
	ext v16.16b, v16.16b, v16.16b, #12
	add v0.4s, v0.4s, v5.4s
	eor v15.16b, v15.16b, v0.16b
	rev32 v15.8h, v15.8h

	add v10.4s, v10.4s, v15.4s
	eor v5.16b, v5.16b, v10.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	add v0.4s, v0.4s, v20.4s
	eor v15.16b, v15.16b, v0.16b
	tbl v15.16b, {v15.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	eor v20.16b, v20.16b, v10.16b
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7
	ext v5.16b, v5.16b, v5.16b, #12
	ext v10.16b, v10.16b, v10.16b, #8
	ext v15.16b, v15.16b, v15.16b, #4
	add v1.4s, v1.4s, v6.4s
	eor v16.16b, v16.16b, v1.16b
	rev32 v16.8h, v16.8h

	add v11.4s, v11.4s, v16.4s
	eor v6.16b, v6.16b, v11.16b
	ushr v20.4s, v6.4s, #20
	sli v20.4s, v6.4s, #12
	add v1.4s, v1.4s, v20.4s
	eor v16.16b, v16.16b, v1.16b
	tbl v16.16b, {v16.16b}, v26.16b

	add v11.4s, v11.4s, v16.4s
	eor v20.16b, v20.16b, v11.16b
	ushr v6.4s, v20.4s, #25
	sli v6.4s, v20.4s, #7
	ext v6.16b, v6.16b, v6.16b, #12
	ext v11.16b, v11.16b, v11.16b, #8
	ext v16.16b, v16.16b, v16.16b, #4
	subs x6, x6, #1
	b.gt Lopen_tail_128_rounds
	cbz x4, Lopen_tail_128_rounds_done
	subs x4, x4, #1
	ldp x11, x12, [x3], 16
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	b Lopen_tail_128_rounds

Lopen_tail_128_rounds_done:
	add v0.4s, v0.4s, v24.4s
	add v1.4s, v1.4s, v24.4s
	add v5.4s, v5.4s, v28.4s
	add v6.4s, v6.4s, v28.4s
	add v10.4s, v10.4s, v29.4s
	add v11.4s, v11.4s, v29.4s
	add v15.4s, v15.4s, v30.4s
	add v16.4s, v16.4s, v30.4s
	add v15.4s, v15.4s, v22.4s
	add v16.4s, v16.4s, v23.4s

	ld1 {v20.16b - v23.16b}, [x1], #64

	eor v20.16b, v20.16b, v1.16b
	eor v21.16b, v21.16b, v6.16b
	eor v22.16b, v22.16b, v11.16b
	eor v23.16b, v23.16b, v16.16b

	st1 {v20.16b - v23.16b}, [x0], #64
	sub x2, x2, #64

	b Lopen_tail_64_store

Lopen_tail_64:
	// We just need a single block
	mov v0.16b, v24.16b
	mov v5.16b, v28.16b
	mov v10.16b, v29.16b
	mov v15.16b, v30.16b
	eor v23.16b, v23.16b, v23.16b
	ins v23.s[0], v25.s[0]
	add v15.4s, v15.4s, v23.4s

	mov x6, #10
	sub x6, x6, x4

Lopen_tail_64_rounds:
	add v0.4s, v0.4s, v5.4s
	eor v15.16b, v15.16b, v0.16b
	rev32 v15.8h, v15.8h

	add v10.4s, v10.4s, v15.4s
	eor v5.16b, v5.16b, v10.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	add v0.4s, v0.4s, v20.4s
	eor v15.16b, v15.16b, v0.16b
	tbl v15.16b, {v15.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	eor v20.16b, v20.16b, v10.16b
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7
	ext v5.16b, v5.16b, v5.16b, #4
	ext v10.16b, v10.16b, v10.16b, #8
	ext v15.16b, v15.16b, v15.16b, #12
	add v0.4s, v0.4s, v5.4s
	eor v15.16b, v15.16b, v0.16b
	rev32 v15.8h, v15.8h

	add v10.4s, v10.4s, v15.4s
	eor v5.16b, v5.16b, v10.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	add v0.4s, v0.4s, v20.4s
	eor v15.16b, v15.16b, v0.16b
	tbl v15.16b, {v15.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	eor v20.16b, v20.16b, v10.16b
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7
	ext v5.16b, v5.16b, v5.16b, #12
	ext v10.16b, v10.16b, v10.16b, #8
	ext v15.16b, v15.16b, v15.16b, #4
	subs x6, x6, #1
	b.gt Lopen_tail_64_rounds
	cbz x4, Lopen_tail_64_rounds_done
	subs x4, x4, #1
	ldp x11, x12, [x3], 16
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	b Lopen_tail_64_rounds

Lopen_tail_64_rounds_done:
	add v0.4s, v0.4s, v24.4s
	add v5.4s, v5.4s, v28.4s
	add v10.4s, v10.4s, v29.4s
	add v15.4s, v15.4s, v30.4s
	add v15.4s, v15.4s, v23.4s

Lopen_tail_64_store:
	cmp x2, #16
	b.lt Lopen_tail_16

	ld1 {v20.16b}, [x1], #16
	eor v20.16b, v20.16b, v0.16b
	st1 {v20.16b}, [x0], #16
	mov v0.16b, v5.16b
	mov v5.16b, v10.16b
	mov v10.16b, v15.16b
	sub x2, x2, #16
	b Lopen_tail_64_store

Lopen_tail_16:
	// Here we handle the last [0,16) bytes that require a padded block
	cbz x2, Lopen_finalize

	eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
	eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
	not v22.16b, v20.16b

	add x7, x1, x2
	mov x6, x2

Lopen_tail_16_compose:
	ext v20.16b, v20.16b, v20.16b, #15
	ldrb w11, [x7, #-1]!
	mov v20.b[0], w11
	ext v21.16b, v22.16b, v21.16b, #15
	subs x2, x2, #1
	b.gt Lopen_tail_16_compose

	and v20.16b, v20.16b, v21.16b
	// Hash in the final padded block
	mov x11, v20.d[0]
	mov x12, v20.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	eor v20.16b, v20.16b, v0.16b

Lopen_tail_16_store:
	umov w11, v20.b[0]
	strb w11, [x0], #1
	ext v20.16b, v20.16b, v20.16b, #1
	subs x6, x6, #1
	b.gt Lopen_tail_16_store

Lopen_finalize:
	mov x11, v31.d[0]
	mov x12, v31.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	// Final reduction step
	sub x12, xzr, x15
	orr x13, xzr, #3
	subs x11, x8, #-5
	sbcs x12, x9, x12
	sbcs x13, x10, x13
	csel x8, x11, x8, cs
	csel x9, x12, x9, cs
	csel x10, x13, x10, cs
	mov x11, v27.d[0]
	mov x12, v27.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15

	stp x8, x9, [x5]

	ldp d8, d9, [sp, #16]
	ldp d10, d11, [sp, #32]
	ldp d12, d13, [sp, #48]
	ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
	ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
	AARCH64_VALIDATE_LINK_REGISTER
	ret

Lopen_128:
	// On some architectures preparing 5 blocks for small buffers is wasteful
	eor v25.16b, v25.16b, v25.16b
	mov x11, #1
	mov v25.s[0], w11
	mov v0.16b, v24.16b
	mov v1.16b, v24.16b
	mov v2.16b, v24.16b
	mov v5.16b, v28.16b
	mov v6.16b, v28.16b
	mov v7.16b, v28.16b
	mov v10.16b, v29.16b
	mov v11.16b, v29.16b
	mov v12.16b, v29.16b
	mov v17.16b, v30.16b
	add v15.4s, v17.4s, v25.4s
	add v16.4s, v15.4s, v25.4s

	mov x6, #10

Lopen_128_rounds:
	add v0.4s, v0.4s, v5.4s
	add v1.4s, v1.4s, v6.4s
	add v2.4s, v2.4s, v7.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	rev32 v15.8h, v15.8h
	rev32 v16.8h, v16.8h
	rev32 v17.8h, v17.8h

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v5.16b, v5.16b, v10.16b
	eor v6.16b, v6.16b, v11.16b
	eor v7.16b, v7.16b, v12.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	ushr v5.4s, v6.4s, #20
	sli v5.4s, v6.4s, #12
	ushr v6.4s, v7.4s, #20
	sli v6.4s, v7.4s, #12

	add v0.4s, v0.4s, v20.4s
	add v1.4s, v1.4s, v5.4s
	add v2.4s, v2.4s, v6.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	tbl v15.16b, {v15.16b}, v26.16b
	tbl v16.16b, {v16.16b}, v26.16b
	tbl v17.16b, {v17.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v20.16b, v20.16b, v10.16b
	eor v5.16b, v5.16b, v11.16b
	eor v6.16b, v6.16b, v12.16b
	ushr v7.4s, v6.4s, #25
	sli v7.4s, v6.4s, #7
	ushr v6.4s, v5.4s, #25
	sli v6.4s, v5.4s, #7
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7

	ext v5.16b, v5.16b, v5.16b, #4
	ext v6.16b, v6.16b, v6.16b, #4
	ext v7.16b, v7.16b, v7.16b, #4

	ext v10.16b, v10.16b, v10.16b, #8
	ext v11.16b, v11.16b, v11.16b, #8
	ext v12.16b, v12.16b, v12.16b, #8

	ext v15.16b, v15.16b, v15.16b, #12
	ext v16.16b, v16.16b, v16.16b, #12
	ext v17.16b, v17.16b, v17.16b, #12
	add v0.4s, v0.4s, v5.4s
	add v1.4s, v1.4s, v6.4s
	add v2.4s, v2.4s, v7.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	rev32 v15.8h, v15.8h
	rev32 v16.8h, v16.8h
	rev32 v17.8h, v17.8h

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v5.16b, v5.16b, v10.16b
	eor v6.16b, v6.16b, v11.16b
	eor v7.16b, v7.16b, v12.16b
	ushr v20.4s, v5.4s, #20
	sli v20.4s, v5.4s, #12
	ushr v5.4s, v6.4s, #20
	sli v5.4s, v6.4s, #12
	ushr v6.4s, v7.4s, #20
	sli v6.4s, v7.4s, #12

	add v0.4s, v0.4s, v20.4s
	add v1.4s, v1.4s, v5.4s
	add v2.4s, v2.4s, v6.4s
	eor v15.16b, v15.16b, v0.16b
	eor v16.16b, v16.16b, v1.16b
	eor v17.16b, v17.16b, v2.16b
	tbl v15.16b, {v15.16b}, v26.16b
	tbl v16.16b, {v16.16b}, v26.16b
	tbl v17.16b, {v17.16b}, v26.16b

	add v10.4s, v10.4s, v15.4s
	add v11.4s, v11.4s, v16.4s
	add v12.4s, v12.4s, v17.4s
	eor v20.16b, v20.16b, v10.16b
	eor v5.16b, v5.16b, v11.16b
	eor v6.16b, v6.16b, v12.16b
	ushr v7.4s, v6.4s, #25
	sli v7.4s, v6.4s, #7
	ushr v6.4s, v5.4s, #25
	sli v6.4s, v5.4s, #7
	ushr v5.4s, v20.4s, #25
	sli v5.4s, v20.4s, #7

	ext v5.16b, v5.16b, v5.16b, #12
	ext v6.16b, v6.16b, v6.16b, #12
	ext v7.16b, v7.16b, v7.16b, #12

	ext v10.16b, v10.16b, v10.16b, #8
	ext v11.16b, v11.16b, v11.16b, #8
	ext v12.16b, v12.16b, v12.16b, #8

	ext v15.16b, v15.16b, v15.16b, #4
	ext v16.16b, v16.16b, v16.16b, #4
	ext v17.16b, v17.16b, v17.16b, #4
	subs x6, x6, #1
	b.hi Lopen_128_rounds

	add v0.4s, v0.4s, v24.4s
	add v1.4s, v1.4s, v24.4s
	add v2.4s, v2.4s, v24.4s

	add v5.4s, v5.4s, v28.4s
	add v6.4s, v6.4s, v28.4s
	add v7.4s, v7.4s, v28.4s

	add v10.4s, v10.4s, v29.4s
	add v11.4s, v11.4s, v29.4s

	add v30.4s, v30.4s, v25.4s
	add v15.4s, v15.4s, v30.4s
	add v30.4s, v30.4s, v25.4s
	add v16.4s, v16.4s, v30.4s

	and v2.16b, v2.16b, v27.16b
	mov x16, v2.d[0] // Move the R key to GPRs
	mov x17, v2.d[1]
	mov v27.16b, v7.16b // Store the S key

	bl Lpoly_hash_ad_internal

Lopen_128_store:
	cmp x2, #64
	b.lt Lopen_128_store_64

	ld1 {v20.16b - v23.16b}, [x1], #64

	mov x11, v20.d[0]
	mov x12, v20.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	mov x11, v21.d[0]
	mov x12, v21.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	mov x11, v22.d[0]
	mov x12, v22.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	mov x11, v23.d[0]
	mov x12, v23.d[1]
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most

	eor v20.16b, v20.16b, v0.16b
	eor v21.16b, v21.16b, v5.16b
	eor v22.16b, v22.16b, v10.16b
	eor v23.16b, v23.16b, v15.16b

	st1 {v20.16b - v23.16b}, [x0], #64

	sub x2, x2, #64

	mov v0.16b, v1.16b
	mov v5.16b, v6.16b
	mov v10.16b, v11.16b
	mov v15.16b, v16.16b

Lopen_128_store_64:

	lsr x4, x2, #4
	mov x3, x1

Lopen_128_hash_64:
	cbz x4, Lopen_tail_64_store
	ldp x11, x12, [x3], 16
	adds x8, x8, x11
	adcs x9, x9, x12
	adc x10, x10, x15
	mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
	umulh x12, x8, x16
	mul x13, x9, x16
	umulh x14, x9, x16
	adds x12, x12, x13
	mul x13, x10, x16
	adc x13, x13, x14
	mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
	umulh x8, x8, x17
	adds x12, x12, x14
	mul x14, x9, x17
	umulh x9, x9, x17
	adcs x14, x14, x8
	mul x10, x10, x17
	adc x10, x10, x9
	adds x13, x13, x14
	adc x14, x10, xzr
	and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
	and x8, x13, #-4
	extr x13, x14, x13, #2
	adds x8, x8, x11
	lsr x11, x14, #2
	adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
	adds x8, x8, x13
	adcs x9, x9, x12
	adc x10, x10, xzr // At this point acc2 has the value of 4 at most
	sub x4, x4, #1
	b Lopen_128_hash_64
.cfi_endproc

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)