1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <ring-core/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) 7#include <ring-core/arm_arch.h> 8.section .rodata 9 10.align 7 11Lchacha20_consts: 12.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 13Linc: 14.long 1,2,3,4 15Lrol8: 16.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 17Lclamp: 18.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 19 20.text 21 22.def Lpoly_hash_ad_internal 23 .type 32 24.endef 25.align 6 26Lpoly_hash_ad_internal: 27.cfi_startproc 28 cbnz x4, Lpoly_hash_intro 29 ret 30 31Lpoly_hash_intro: 32 cmp x4, #16 33 b.lt Lpoly_hash_ad_tail 34 ldp x11, x12, [x3], 16 35 adds x8, x8, x11 36 adcs x9, x9, x12 37 adc x10, x10, x15 38 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 39 umulh x12, x8, x16 40 mul x13, x9, x16 41 umulh x14, x9, x16 42 adds x12, x12, x13 43 mul x13, x10, x16 44 adc x13, x13, x14 45 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 46 umulh x8, x8, x17 47 adds x12, x12, x14 48 mul x14, x9, x17 49 umulh x9, x9, x17 50 adcs x14, x14, x8 51 mul x10, x10, x17 52 adc x10, x10, x9 53 adds x13, x13, x14 54 adc x14, x10, xzr 55 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 56 and x8, x13, #-4 57 extr x13, x14, x13, #2 58 adds x8, x8, x11 59 lsr x11, x14, #2 60 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 61 adds x8, x8, x13 62 adcs x9, x9, x12 63 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 64 sub x4, x4, #16 65 b Lpoly_hash_ad_internal 66 67Lpoly_hash_ad_tail: 68 cbz x4, Lpoly_hash_ad_ret 69 70 eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD 71 sub x4, x4, #1 72 73Lpoly_hash_tail_16_compose: 74 ext v20.16b, v20.16b, v20.16b, #15 75 ldrb w11, [x3, x4] 76 mov v20.b[0], w11 77 subs x4, x4, #1 78 b.ge Lpoly_hash_tail_16_compose 79 mov x11, v20.d[0] 80 mov x12, v20.d[1] 81 adds x8, x8, x11 82 adcs x9, x9, x12 83 adc x10, x10, x15 84 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 85 umulh x12, x8, x16 86 mul x13, x9, x16 87 umulh x14, x9, x16 88 adds x12, x12, x13 89 mul x13, x10, x16 90 adc x13, x13, x14 91 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 92 umulh x8, x8, x17 93 adds x12, x12, x14 94 mul x14, x9, x17 95 umulh x9, x9, x17 96 adcs x14, x14, x8 97 mul x10, x10, x17 98 adc x10, x10, x9 99 adds x13, x13, x14 100 adc x14, x10, xzr 101 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 102 and x8, x13, #-4 103 extr x13, x14, x13, #2 104 adds x8, x8, x11 105 lsr x11, x14, #2 106 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 107 adds x8, x8, x13 108 adcs x9, x9, x12 109 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 110 111Lpoly_hash_ad_ret: 112 ret 113.cfi_endproc 114 115 116///////////////////////////////// 117// 118// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); 119// 120.globl chacha20_poly1305_seal 121 122.def chacha20_poly1305_seal 123 .type 32 124.endef 125.align 6 126chacha20_poly1305_seal: 127 AARCH64_SIGN_LINK_REGISTER 128.cfi_startproc 129 stp x29, x30, [sp, #-80]! 
130.cfi_def_cfa_offset 80 131.cfi_offset w30, -72 132.cfi_offset w29, -80 133 mov x29, sp 134 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 135 // we don't actually use the frame pointer like that, it's probably not 136 // worth bothering. 137 stp d8, d9, [sp, #16] 138 stp d10, d11, [sp, #32] 139 stp d12, d13, [sp, #48] 140 stp d14, d15, [sp, #64] 141.cfi_offset b15, -8 142.cfi_offset b14, -16 143.cfi_offset b13, -24 144.cfi_offset b12, -32 145.cfi_offset b11, -40 146.cfi_offset b10, -48 147.cfi_offset b9, -56 148.cfi_offset b8, -64 149 150 adrp x11, Lchacha20_consts 151 add x11, x11, :lo12:Lchacha20_consts 152 153 ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values 154 ld1 {v28.16b - v30.16b}, [x5] 155 156 mov x15, #1 // Prepare the Poly1305 state 157 mov x8, #0 158 mov x9, #0 159 mov x10, #0 160 161 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len 162 add x12, x12, x2 163 mov v31.d[0], x4 // Store the input and aad lengths 164 mov v31.d[1], x12 165 166 cmp x2, #128 167 b.le Lseal_128 // Optimization for smaller buffers 168 169 // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, 170 // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, 171 // the fifth block (A4-D4) horizontally. 172 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 173 mov v4.16b, v24.16b 174 175 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 176 mov v9.16b, v28.16b 177 178 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 179 mov v14.16b, v29.16b 180 181 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 182 add v15.4s, v15.4s, v25.4s 183 mov v19.16b, v30.16b 184 185 sub x5, x5, #32 186 187 mov x6, #10 188 189.align 5 190Lseal_init_rounds: 191 add v0.4s, v0.4s, v5.4s 192 add v1.4s, v1.4s, v6.4s 193 add v2.4s, v2.4s, v7.4s 194 add v3.4s, v3.4s, v8.4s 195 add v4.4s, v4.4s, v9.4s 196 197 eor v15.16b, v15.16b, v0.16b 198 eor v16.16b, v16.16b, v1.16b 199 eor v17.16b, v17.16b, v2.16b 200 eor v18.16b, v18.16b, v3.16b 201 eor v19.16b, v19.16b, v4.16b 202 203 rev32 v15.8h, v15.8h 204 rev32 v16.8h, v16.8h 205 rev32 v17.8h, v17.8h 206 rev32 v18.8h, v18.8h 207 rev32 v19.8h, v19.8h 208 209 add v10.4s, v10.4s, v15.4s 210 add v11.4s, v11.4s, v16.4s 211 add v12.4s, v12.4s, v17.4s 212 add v13.4s, v13.4s, v18.4s 213 add v14.4s, v14.4s, v19.4s 214 215 eor v5.16b, v5.16b, v10.16b 216 eor v6.16b, v6.16b, v11.16b 217 eor v7.16b, v7.16b, v12.16b 218 eor v8.16b, v8.16b, v13.16b 219 eor v9.16b, v9.16b, v14.16b 220 221 ushr v20.4s, v5.4s, #20 222 sli v20.4s, v5.4s, #12 223 ushr v5.4s, v6.4s, #20 224 sli v5.4s, v6.4s, #12 225 ushr v6.4s, v7.4s, #20 226 sli v6.4s, v7.4s, #12 227 ushr v7.4s, v8.4s, #20 228 sli v7.4s, v8.4s, #12 229 ushr v8.4s, v9.4s, #20 230 sli v8.4s, v9.4s, #12 231 232 add v0.4s, v0.4s, v20.4s 233 add v1.4s, v1.4s, v5.4s 234 add v2.4s, v2.4s, v6.4s 235 add v3.4s, v3.4s, v7.4s 236 add v4.4s, v4.4s, v8.4s 237 238 eor v15.16b, v15.16b, v0.16b 239 eor v16.16b, v16.16b, v1.16b 240 eor v17.16b, v17.16b, v2.16b 241 eor v18.16b, v18.16b, v3.16b 242 eor v19.16b, v19.16b, v4.16b 243 244 tbl v15.16b, {v15.16b}, v26.16b 245 tbl v16.16b, {v16.16b}, v26.16b 246 tbl v17.16b, {v17.16b}, v26.16b 247 tbl v18.16b, {v18.16b}, v26.16b 248 tbl v19.16b, {v19.16b}, v26.16b 249 250 add v10.4s, v10.4s, v15.4s 251 add v11.4s, v11.4s, v16.4s 252 add v12.4s, v12.4s, v17.4s 253 add v13.4s, v13.4s, v18.4s 254 add v14.4s, v14.4s, v19.4s 255 256 eor v20.16b, v20.16b, v10.16b 257 eor v5.16b, v5.16b, v11.16b 258 eor v6.16b, 
v6.16b, v12.16b 259 eor v7.16b, v7.16b, v13.16b 260 eor v8.16b, v8.16b, v14.16b 261 262 ushr v9.4s, v8.4s, #25 263 sli v9.4s, v8.4s, #7 264 ushr v8.4s, v7.4s, #25 265 sli v8.4s, v7.4s, #7 266 ushr v7.4s, v6.4s, #25 267 sli v7.4s, v6.4s, #7 268 ushr v6.4s, v5.4s, #25 269 sli v6.4s, v5.4s, #7 270 ushr v5.4s, v20.4s, #25 271 sli v5.4s, v20.4s, #7 272 273 ext v9.16b, v9.16b, v9.16b, #4 274 ext v14.16b, v14.16b, v14.16b, #8 275 ext v19.16b, v19.16b, v19.16b, #12 276 add v0.4s, v0.4s, v6.4s 277 add v1.4s, v1.4s, v7.4s 278 add v2.4s, v2.4s, v8.4s 279 add v3.4s, v3.4s, v5.4s 280 add v4.4s, v4.4s, v9.4s 281 282 eor v18.16b, v18.16b, v0.16b 283 eor v15.16b, v15.16b, v1.16b 284 eor v16.16b, v16.16b, v2.16b 285 eor v17.16b, v17.16b, v3.16b 286 eor v19.16b, v19.16b, v4.16b 287 288 rev32 v18.8h, v18.8h 289 rev32 v15.8h, v15.8h 290 rev32 v16.8h, v16.8h 291 rev32 v17.8h, v17.8h 292 rev32 v19.8h, v19.8h 293 294 add v12.4s, v12.4s, v18.4s 295 add v13.4s, v13.4s, v15.4s 296 add v10.4s, v10.4s, v16.4s 297 add v11.4s, v11.4s, v17.4s 298 add v14.4s, v14.4s, v19.4s 299 300 eor v6.16b, v6.16b, v12.16b 301 eor v7.16b, v7.16b, v13.16b 302 eor v8.16b, v8.16b, v10.16b 303 eor v5.16b, v5.16b, v11.16b 304 eor v9.16b, v9.16b, v14.16b 305 306 ushr v20.4s, v6.4s, #20 307 sli v20.4s, v6.4s, #12 308 ushr v6.4s, v7.4s, #20 309 sli v6.4s, v7.4s, #12 310 ushr v7.4s, v8.4s, #20 311 sli v7.4s, v8.4s, #12 312 ushr v8.4s, v5.4s, #20 313 sli v8.4s, v5.4s, #12 314 ushr v5.4s, v9.4s, #20 315 sli v5.4s, v9.4s, #12 316 317 add v0.4s, v0.4s, v20.4s 318 add v1.4s, v1.4s, v6.4s 319 add v2.4s, v2.4s, v7.4s 320 add v3.4s, v3.4s, v8.4s 321 add v4.4s, v4.4s, v5.4s 322 323 eor v18.16b, v18.16b, v0.16b 324 eor v15.16b, v15.16b, v1.16b 325 eor v16.16b, v16.16b, v2.16b 326 eor v17.16b, v17.16b, v3.16b 327 eor v19.16b, v19.16b, v4.16b 328 329 tbl v18.16b, {v18.16b}, v26.16b 330 tbl v15.16b, {v15.16b}, v26.16b 331 tbl v16.16b, {v16.16b}, v26.16b 332 tbl v17.16b, {v17.16b}, v26.16b 333 tbl v19.16b, {v19.16b}, v26.16b 334 335 add v12.4s, v12.4s, v18.4s 336 add v13.4s, v13.4s, v15.4s 337 add v10.4s, v10.4s, v16.4s 338 add v11.4s, v11.4s, v17.4s 339 add v14.4s, v14.4s, v19.4s 340 341 eor v20.16b, v20.16b, v12.16b 342 eor v6.16b, v6.16b, v13.16b 343 eor v7.16b, v7.16b, v10.16b 344 eor v8.16b, v8.16b, v11.16b 345 eor v5.16b, v5.16b, v14.16b 346 347 ushr v9.4s, v5.4s, #25 348 sli v9.4s, v5.4s, #7 349 ushr v5.4s, v8.4s, #25 350 sli v5.4s, v8.4s, #7 351 ushr v8.4s, v7.4s, #25 352 sli v8.4s, v7.4s, #7 353 ushr v7.4s, v6.4s, #25 354 sli v7.4s, v6.4s, #7 355 ushr v6.4s, v20.4s, #25 356 sli v6.4s, v20.4s, #7 357 358 ext v9.16b, v9.16b, v9.16b, #12 359 ext v14.16b, v14.16b, v14.16b, #8 360 ext v19.16b, v19.16b, v19.16b, #4 361 subs x6, x6, #1 362 b.hi Lseal_init_rounds 363 364 add v15.4s, v15.4s, v25.4s 365 mov x11, #4 366 dup v20.4s, w11 367 add v25.4s, v25.4s, v20.4s 368 369 zip1 v20.4s, v0.4s, v1.4s 370 zip2 v21.4s, v0.4s, v1.4s 371 zip1 v22.4s, v2.4s, v3.4s 372 zip2 v23.4s, v2.4s, v3.4s 373 374 zip1 v0.2d, v20.2d, v22.2d 375 zip2 v1.2d, v20.2d, v22.2d 376 zip1 v2.2d, v21.2d, v23.2d 377 zip2 v3.2d, v21.2d, v23.2d 378 379 zip1 v20.4s, v5.4s, v6.4s 380 zip2 v21.4s, v5.4s, v6.4s 381 zip1 v22.4s, v7.4s, v8.4s 382 zip2 v23.4s, v7.4s, v8.4s 383 384 zip1 v5.2d, v20.2d, v22.2d 385 zip2 v6.2d, v20.2d, v22.2d 386 zip1 v7.2d, v21.2d, v23.2d 387 zip2 v8.2d, v21.2d, v23.2d 388 389 zip1 v20.4s, v10.4s, v11.4s 390 zip2 v21.4s, v10.4s, v11.4s 391 zip1 v22.4s, v12.4s, v13.4s 392 zip2 v23.4s, v12.4s, v13.4s 393 394 zip1 v10.2d, v20.2d, v22.2d 395 zip2 v11.2d, v20.2d, v22.2d 
396 zip1 v12.2d, v21.2d, v23.2d 397 zip2 v13.2d, v21.2d, v23.2d 398 399 zip1 v20.4s, v15.4s, v16.4s 400 zip2 v21.4s, v15.4s, v16.4s 401 zip1 v22.4s, v17.4s, v18.4s 402 zip2 v23.4s, v17.4s, v18.4s 403 404 zip1 v15.2d, v20.2d, v22.2d 405 zip2 v16.2d, v20.2d, v22.2d 406 zip1 v17.2d, v21.2d, v23.2d 407 zip2 v18.2d, v21.2d, v23.2d 408 409 add v4.4s, v4.4s, v24.4s 410 add v9.4s, v9.4s, v28.4s 411 and v4.16b, v4.16b, v27.16b 412 413 add v0.4s, v0.4s, v24.4s 414 add v5.4s, v5.4s, v28.4s 415 add v10.4s, v10.4s, v29.4s 416 add v15.4s, v15.4s, v30.4s 417 418 add v1.4s, v1.4s, v24.4s 419 add v6.4s, v6.4s, v28.4s 420 add v11.4s, v11.4s, v29.4s 421 add v16.4s, v16.4s, v30.4s 422 423 add v2.4s, v2.4s, v24.4s 424 add v7.4s, v7.4s, v28.4s 425 add v12.4s, v12.4s, v29.4s 426 add v17.4s, v17.4s, v30.4s 427 428 add v3.4s, v3.4s, v24.4s 429 add v8.4s, v8.4s, v28.4s 430 add v13.4s, v13.4s, v29.4s 431 add v18.4s, v18.4s, v30.4s 432 433 mov x16, v4.d[0] // Move the R key to GPRs 434 mov x17, v4.d[1] 435 mov v27.16b, v9.16b // Store the S key 436 437 bl Lpoly_hash_ad_internal 438 439 mov x3, x0 440 cmp x2, #256 441 b.le Lseal_tail 442 443 ld1 {v20.16b - v23.16b}, [x1], #64 444 eor v20.16b, v20.16b, v0.16b 445 eor v21.16b, v21.16b, v5.16b 446 eor v22.16b, v22.16b, v10.16b 447 eor v23.16b, v23.16b, v15.16b 448 st1 {v20.16b - v23.16b}, [x0], #64 449 450 ld1 {v20.16b - v23.16b}, [x1], #64 451 eor v20.16b, v20.16b, v1.16b 452 eor v21.16b, v21.16b, v6.16b 453 eor v22.16b, v22.16b, v11.16b 454 eor v23.16b, v23.16b, v16.16b 455 st1 {v20.16b - v23.16b}, [x0], #64 456 457 ld1 {v20.16b - v23.16b}, [x1], #64 458 eor v20.16b, v20.16b, v2.16b 459 eor v21.16b, v21.16b, v7.16b 460 eor v22.16b, v22.16b, v12.16b 461 eor v23.16b, v23.16b, v17.16b 462 st1 {v20.16b - v23.16b}, [x0], #64 463 464 ld1 {v20.16b - v23.16b}, [x1], #64 465 eor v20.16b, v20.16b, v3.16b 466 eor v21.16b, v21.16b, v8.16b 467 eor v22.16b, v22.16b, v13.16b 468 eor v23.16b, v23.16b, v18.16b 469 st1 {v20.16b - v23.16b}, [x0], #64 470 471 sub x2, x2, #256 472 473 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds 474 mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 475 476Lseal_main_loop: 477 adrp x11, Lchacha20_consts 478 add x11, x11, :lo12:Lchacha20_consts 479 480 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 481 mov v4.16b, v24.16b 482 483 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 484 mov v9.16b, v28.16b 485 486 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 487 mov v14.16b, v29.16b 488 489 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 490 add v15.4s, v15.4s, v25.4s 491 mov v19.16b, v30.16b 492 493 eor v20.16b, v20.16b, v20.16b //zero 494 not v21.16b, v20.16b // -1 495 sub v21.4s, v25.4s, v21.4s // Add +1 496 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 497 add v19.4s, v19.4s, v20.4s 498 499 sub x5, x5, #32 500.align 5 501Lseal_main_loop_rounds: 502 add v0.4s, v0.4s, v5.4s 503 add v1.4s, v1.4s, v6.4s 504 add v2.4s, v2.4s, v7.4s 505 add v3.4s, v3.4s, v8.4s 506 add v4.4s, v4.4s, v9.4s 507 508 eor v15.16b, v15.16b, v0.16b 509 eor v16.16b, v16.16b, v1.16b 510 eor v17.16b, v17.16b, v2.16b 511 eor v18.16b, v18.16b, v3.16b 512 eor v19.16b, v19.16b, v4.16b 513 514 rev32 v15.8h, v15.8h 515 rev32 v16.8h, v16.8h 516 rev32 v17.8h, v17.8h 517 rev32 v18.8h, v18.8h 518 rev32 v19.8h, v19.8h 519 520 add v10.4s, v10.4s, v15.4s 521 add v11.4s, v11.4s, v16.4s 522 add v12.4s, v12.4s, v17.4s 523 add v13.4s, v13.4s, v18.4s 524 add v14.4s, v14.4s, v19.4s 525 526 eor 
v5.16b, v5.16b, v10.16b 527 eor v6.16b, v6.16b, v11.16b 528 eor v7.16b, v7.16b, v12.16b 529 eor v8.16b, v8.16b, v13.16b 530 eor v9.16b, v9.16b, v14.16b 531 532 ushr v20.4s, v5.4s, #20 533 sli v20.4s, v5.4s, #12 534 ushr v5.4s, v6.4s, #20 535 sli v5.4s, v6.4s, #12 536 ushr v6.4s, v7.4s, #20 537 sli v6.4s, v7.4s, #12 538 ushr v7.4s, v8.4s, #20 539 sli v7.4s, v8.4s, #12 540 ushr v8.4s, v9.4s, #20 541 sli v8.4s, v9.4s, #12 542 543 add v0.4s, v0.4s, v20.4s 544 add v1.4s, v1.4s, v5.4s 545 add v2.4s, v2.4s, v6.4s 546 add v3.4s, v3.4s, v7.4s 547 add v4.4s, v4.4s, v8.4s 548 549 eor v15.16b, v15.16b, v0.16b 550 eor v16.16b, v16.16b, v1.16b 551 eor v17.16b, v17.16b, v2.16b 552 eor v18.16b, v18.16b, v3.16b 553 eor v19.16b, v19.16b, v4.16b 554 555 tbl v15.16b, {v15.16b}, v26.16b 556 tbl v16.16b, {v16.16b}, v26.16b 557 tbl v17.16b, {v17.16b}, v26.16b 558 tbl v18.16b, {v18.16b}, v26.16b 559 tbl v19.16b, {v19.16b}, v26.16b 560 561 add v10.4s, v10.4s, v15.4s 562 add v11.4s, v11.4s, v16.4s 563 add v12.4s, v12.4s, v17.4s 564 add v13.4s, v13.4s, v18.4s 565 add v14.4s, v14.4s, v19.4s 566 567 eor v20.16b, v20.16b, v10.16b 568 eor v5.16b, v5.16b, v11.16b 569 eor v6.16b, v6.16b, v12.16b 570 eor v7.16b, v7.16b, v13.16b 571 eor v8.16b, v8.16b, v14.16b 572 573 ushr v9.4s, v8.4s, #25 574 sli v9.4s, v8.4s, #7 575 ushr v8.4s, v7.4s, #25 576 sli v8.4s, v7.4s, #7 577 ushr v7.4s, v6.4s, #25 578 sli v7.4s, v6.4s, #7 579 ushr v6.4s, v5.4s, #25 580 sli v6.4s, v5.4s, #7 581 ushr v5.4s, v20.4s, #25 582 sli v5.4s, v20.4s, #7 583 584 ext v9.16b, v9.16b, v9.16b, #4 585 ext v14.16b, v14.16b, v14.16b, #8 586 ext v19.16b, v19.16b, v19.16b, #12 587 ldp x11, x12, [x3], 16 588 adds x8, x8, x11 589 adcs x9, x9, x12 590 adc x10, x10, x15 591 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 592 umulh x12, x8, x16 593 mul x13, x9, x16 594 umulh x14, x9, x16 595 adds x12, x12, x13 596 mul x13, x10, x16 597 adc x13, x13, x14 598 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 599 umulh x8, x8, x17 600 adds x12, x12, x14 601 mul x14, x9, x17 602 umulh x9, x9, x17 603 adcs x14, x14, x8 604 mul x10, x10, x17 605 adc x10, x10, x9 606 adds x13, x13, x14 607 adc x14, x10, xzr 608 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 609 and x8, x13, #-4 610 extr x13, x14, x13, #2 611 adds x8, x8, x11 612 lsr x11, x14, #2 613 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 614 adds x8, x8, x13 615 adcs x9, x9, x12 616 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 617 add v0.4s, v0.4s, v6.4s 618 add v1.4s, v1.4s, v7.4s 619 add v2.4s, v2.4s, v8.4s 620 add v3.4s, v3.4s, v5.4s 621 add v4.4s, v4.4s, v9.4s 622 623 eor v18.16b, v18.16b, v0.16b 624 eor v15.16b, v15.16b, v1.16b 625 eor v16.16b, v16.16b, v2.16b 626 eor v17.16b, v17.16b, v3.16b 627 eor v19.16b, v19.16b, v4.16b 628 629 rev32 v18.8h, v18.8h 630 rev32 v15.8h, v15.8h 631 rev32 v16.8h, v16.8h 632 rev32 v17.8h, v17.8h 633 rev32 v19.8h, v19.8h 634 635 add v12.4s, v12.4s, v18.4s 636 add v13.4s, v13.4s, v15.4s 637 add v10.4s, v10.4s, v16.4s 638 add v11.4s, v11.4s, v17.4s 639 add v14.4s, v14.4s, v19.4s 640 641 eor v6.16b, v6.16b, v12.16b 642 eor v7.16b, v7.16b, v13.16b 643 eor v8.16b, v8.16b, v10.16b 644 eor v5.16b, v5.16b, v11.16b 645 eor v9.16b, v9.16b, v14.16b 646 647 ushr v20.4s, v6.4s, #20 648 sli v20.4s, v6.4s, #12 649 ushr v6.4s, v7.4s, #20 650 sli v6.4s, v7.4s, #12 651 ushr v7.4s, v8.4s, #20 652 sli v7.4s, v8.4s, #12 653 ushr v8.4s, v5.4s, #20 654 sli v8.4s, v5.4s, #12 655 ushr v5.4s, v9.4s, #20 656 sli v5.4s, 
v9.4s, #12 657 658 add v0.4s, v0.4s, v20.4s 659 add v1.4s, v1.4s, v6.4s 660 add v2.4s, v2.4s, v7.4s 661 add v3.4s, v3.4s, v8.4s 662 add v4.4s, v4.4s, v5.4s 663 664 eor v18.16b, v18.16b, v0.16b 665 eor v15.16b, v15.16b, v1.16b 666 eor v16.16b, v16.16b, v2.16b 667 eor v17.16b, v17.16b, v3.16b 668 eor v19.16b, v19.16b, v4.16b 669 670 tbl v18.16b, {v18.16b}, v26.16b 671 tbl v15.16b, {v15.16b}, v26.16b 672 tbl v16.16b, {v16.16b}, v26.16b 673 tbl v17.16b, {v17.16b}, v26.16b 674 tbl v19.16b, {v19.16b}, v26.16b 675 676 add v12.4s, v12.4s, v18.4s 677 add v13.4s, v13.4s, v15.4s 678 add v10.4s, v10.4s, v16.4s 679 add v11.4s, v11.4s, v17.4s 680 add v14.4s, v14.4s, v19.4s 681 682 eor v20.16b, v20.16b, v12.16b 683 eor v6.16b, v6.16b, v13.16b 684 eor v7.16b, v7.16b, v10.16b 685 eor v8.16b, v8.16b, v11.16b 686 eor v5.16b, v5.16b, v14.16b 687 688 ushr v9.4s, v5.4s, #25 689 sli v9.4s, v5.4s, #7 690 ushr v5.4s, v8.4s, #25 691 sli v5.4s, v8.4s, #7 692 ushr v8.4s, v7.4s, #25 693 sli v8.4s, v7.4s, #7 694 ushr v7.4s, v6.4s, #25 695 sli v7.4s, v6.4s, #7 696 ushr v6.4s, v20.4s, #25 697 sli v6.4s, v20.4s, #7 698 699 ext v9.16b, v9.16b, v9.16b, #12 700 ext v14.16b, v14.16b, v14.16b, #8 701 ext v19.16b, v19.16b, v19.16b, #4 702 subs x6, x6, #1 703 b.ge Lseal_main_loop_rounds 704 ldp x11, x12, [x3], 16 705 adds x8, x8, x11 706 adcs x9, x9, x12 707 adc x10, x10, x15 708 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 709 umulh x12, x8, x16 710 mul x13, x9, x16 711 umulh x14, x9, x16 712 adds x12, x12, x13 713 mul x13, x10, x16 714 adc x13, x13, x14 715 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 716 umulh x8, x8, x17 717 adds x12, x12, x14 718 mul x14, x9, x17 719 umulh x9, x9, x17 720 adcs x14, x14, x8 721 mul x10, x10, x17 722 adc x10, x10, x9 723 adds x13, x13, x14 724 adc x14, x10, xzr 725 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 726 and x8, x13, #-4 727 extr x13, x14, x13, #2 728 adds x8, x8, x11 729 lsr x11, x14, #2 730 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 731 adds x8, x8, x13 732 adcs x9, x9, x12 733 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 734 subs x7, x7, #1 735 b.gt Lseal_main_loop_rounds 736 737 eor v20.16b, v20.16b, v20.16b //zero 738 not v21.16b, v20.16b // -1 739 sub v21.4s, v25.4s, v21.4s // Add +1 740 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 741 add v19.4s, v19.4s, v20.4s 742 743 add v15.4s, v15.4s, v25.4s 744 mov x11, #5 745 dup v20.4s, w11 746 add v25.4s, v25.4s, v20.4s 747 748 zip1 v20.4s, v0.4s, v1.4s 749 zip2 v21.4s, v0.4s, v1.4s 750 zip1 v22.4s, v2.4s, v3.4s 751 zip2 v23.4s, v2.4s, v3.4s 752 753 zip1 v0.2d, v20.2d, v22.2d 754 zip2 v1.2d, v20.2d, v22.2d 755 zip1 v2.2d, v21.2d, v23.2d 756 zip2 v3.2d, v21.2d, v23.2d 757 758 zip1 v20.4s, v5.4s, v6.4s 759 zip2 v21.4s, v5.4s, v6.4s 760 zip1 v22.4s, v7.4s, v8.4s 761 zip2 v23.4s, v7.4s, v8.4s 762 763 zip1 v5.2d, v20.2d, v22.2d 764 zip2 v6.2d, v20.2d, v22.2d 765 zip1 v7.2d, v21.2d, v23.2d 766 zip2 v8.2d, v21.2d, v23.2d 767 768 zip1 v20.4s, v10.4s, v11.4s 769 zip2 v21.4s, v10.4s, v11.4s 770 zip1 v22.4s, v12.4s, v13.4s 771 zip2 v23.4s, v12.4s, v13.4s 772 773 zip1 v10.2d, v20.2d, v22.2d 774 zip2 v11.2d, v20.2d, v22.2d 775 zip1 v12.2d, v21.2d, v23.2d 776 zip2 v13.2d, v21.2d, v23.2d 777 778 zip1 v20.4s, v15.4s, v16.4s 779 zip2 v21.4s, v15.4s, v16.4s 780 zip1 v22.4s, v17.4s, v18.4s 781 zip2 v23.4s, v17.4s, v18.4s 782 783 zip1 v15.2d, v20.2d, v22.2d 784 zip2 v16.2d, v20.2d, v22.2d 785 zip1 v17.2d, v21.2d, v23.2d 786 zip2 
v18.2d, v21.2d, v23.2d 787 788 add v0.4s, v0.4s, v24.4s 789 add v5.4s, v5.4s, v28.4s 790 add v10.4s, v10.4s, v29.4s 791 add v15.4s, v15.4s, v30.4s 792 793 add v1.4s, v1.4s, v24.4s 794 add v6.4s, v6.4s, v28.4s 795 add v11.4s, v11.4s, v29.4s 796 add v16.4s, v16.4s, v30.4s 797 798 add v2.4s, v2.4s, v24.4s 799 add v7.4s, v7.4s, v28.4s 800 add v12.4s, v12.4s, v29.4s 801 add v17.4s, v17.4s, v30.4s 802 803 add v3.4s, v3.4s, v24.4s 804 add v8.4s, v8.4s, v28.4s 805 add v13.4s, v13.4s, v29.4s 806 add v18.4s, v18.4s, v30.4s 807 808 add v4.4s, v4.4s, v24.4s 809 add v9.4s, v9.4s, v28.4s 810 add v14.4s, v14.4s, v29.4s 811 add v19.4s, v19.4s, v30.4s 812 813 cmp x2, #320 814 b.le Lseal_tail 815 816 ld1 {v20.16b - v23.16b}, [x1], #64 817 eor v20.16b, v20.16b, v0.16b 818 eor v21.16b, v21.16b, v5.16b 819 eor v22.16b, v22.16b, v10.16b 820 eor v23.16b, v23.16b, v15.16b 821 st1 {v20.16b - v23.16b}, [x0], #64 822 823 ld1 {v20.16b - v23.16b}, [x1], #64 824 eor v20.16b, v20.16b, v1.16b 825 eor v21.16b, v21.16b, v6.16b 826 eor v22.16b, v22.16b, v11.16b 827 eor v23.16b, v23.16b, v16.16b 828 st1 {v20.16b - v23.16b}, [x0], #64 829 830 ld1 {v20.16b - v23.16b}, [x1], #64 831 eor v20.16b, v20.16b, v2.16b 832 eor v21.16b, v21.16b, v7.16b 833 eor v22.16b, v22.16b, v12.16b 834 eor v23.16b, v23.16b, v17.16b 835 st1 {v20.16b - v23.16b}, [x0], #64 836 837 ld1 {v20.16b - v23.16b}, [x1], #64 838 eor v20.16b, v20.16b, v3.16b 839 eor v21.16b, v21.16b, v8.16b 840 eor v22.16b, v22.16b, v13.16b 841 eor v23.16b, v23.16b, v18.16b 842 st1 {v20.16b - v23.16b}, [x0], #64 843 844 ld1 {v20.16b - v23.16b}, [x1], #64 845 eor v20.16b, v20.16b, v4.16b 846 eor v21.16b, v21.16b, v9.16b 847 eor v22.16b, v22.16b, v14.16b 848 eor v23.16b, v23.16b, v19.16b 849 st1 {v20.16b - v23.16b}, [x0], #64 850 851 sub x2, x2, #320 852 853 mov x6, #0 854 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration 855 856 b Lseal_main_loop 857 858Lseal_tail: 859 // This part of the function handles the storage and authentication of the last [0,320) bytes 860 // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
861 cmp x2, #64 862 b.lt Lseal_tail_64 863 864 // Store and authenticate 64B blocks per iteration 865 ld1 {v20.16b - v23.16b}, [x1], #64 866 867 eor v20.16b, v20.16b, v0.16b 868 eor v21.16b, v21.16b, v5.16b 869 eor v22.16b, v22.16b, v10.16b 870 eor v23.16b, v23.16b, v15.16b 871 mov x11, v20.d[0] 872 mov x12, v20.d[1] 873 adds x8, x8, x11 874 adcs x9, x9, x12 875 adc x10, x10, x15 876 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 877 umulh x12, x8, x16 878 mul x13, x9, x16 879 umulh x14, x9, x16 880 adds x12, x12, x13 881 mul x13, x10, x16 882 adc x13, x13, x14 883 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 884 umulh x8, x8, x17 885 adds x12, x12, x14 886 mul x14, x9, x17 887 umulh x9, x9, x17 888 adcs x14, x14, x8 889 mul x10, x10, x17 890 adc x10, x10, x9 891 adds x13, x13, x14 892 adc x14, x10, xzr 893 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 894 and x8, x13, #-4 895 extr x13, x14, x13, #2 896 adds x8, x8, x11 897 lsr x11, x14, #2 898 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 899 adds x8, x8, x13 900 adcs x9, x9, x12 901 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 902 mov x11, v21.d[0] 903 mov x12, v21.d[1] 904 adds x8, x8, x11 905 adcs x9, x9, x12 906 adc x10, x10, x15 907 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 908 umulh x12, x8, x16 909 mul x13, x9, x16 910 umulh x14, x9, x16 911 adds x12, x12, x13 912 mul x13, x10, x16 913 adc x13, x13, x14 914 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 915 umulh x8, x8, x17 916 adds x12, x12, x14 917 mul x14, x9, x17 918 umulh x9, x9, x17 919 adcs x14, x14, x8 920 mul x10, x10, x17 921 adc x10, x10, x9 922 adds x13, x13, x14 923 adc x14, x10, xzr 924 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 925 and x8, x13, #-4 926 extr x13, x14, x13, #2 927 adds x8, x8, x11 928 lsr x11, x14, #2 929 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 930 adds x8, x8, x13 931 adcs x9, x9, x12 932 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 933 mov x11, v22.d[0] 934 mov x12, v22.d[1] 935 adds x8, x8, x11 936 adcs x9, x9, x12 937 adc x10, x10, x15 938 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 939 umulh x12, x8, x16 940 mul x13, x9, x16 941 umulh x14, x9, x16 942 adds x12, x12, x13 943 mul x13, x10, x16 944 adc x13, x13, x14 945 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 946 umulh x8, x8, x17 947 adds x12, x12, x14 948 mul x14, x9, x17 949 umulh x9, x9, x17 950 adcs x14, x14, x8 951 mul x10, x10, x17 952 adc x10, x10, x9 953 adds x13, x13, x14 954 adc x14, x10, xzr 955 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 956 and x8, x13, #-4 957 extr x13, x14, x13, #2 958 adds x8, x8, x11 959 lsr x11, x14, #2 960 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 961 adds x8, x8, x13 962 adcs x9, x9, x12 963 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 964 mov x11, v23.d[0] 965 mov x12, v23.d[1] 966 adds x8, x8, x11 967 adcs x9, x9, x12 968 adc x10, x10, x15 969 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 970 umulh x12, x8, x16 971 mul x13, x9, x16 972 umulh x14, x9, x16 973 adds x12, x12, x13 974 mul x13, x10, x16 975 adc x13, x13, x14 976 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 977 umulh x8, x8, x17 978 adds x12, x12, x14 979 mul x14, x9, x17 980 umulh x9, x9, x17 981 adcs x14, x14, x8 982 mul x10, x10, x17 983 adc x10, x10, x9 984 
adds x13, x13, x14 985 adc x14, x10, xzr 986 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 987 and x8, x13, #-4 988 extr x13, x14, x13, #2 989 adds x8, x8, x11 990 lsr x11, x14, #2 991 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 992 adds x8, x8, x13 993 adcs x9, x9, x12 994 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 995 st1 {v20.16b - v23.16b}, [x0], #64 996 sub x2, x2, #64 997 998 // Shift the state left by 64 bytes for the next iteration of the loop 999 mov v0.16b, v1.16b 1000 mov v5.16b, v6.16b 1001 mov v10.16b, v11.16b 1002 mov v15.16b, v16.16b 1003 1004 mov v1.16b, v2.16b 1005 mov v6.16b, v7.16b 1006 mov v11.16b, v12.16b 1007 mov v16.16b, v17.16b 1008 1009 mov v2.16b, v3.16b 1010 mov v7.16b, v8.16b 1011 mov v12.16b, v13.16b 1012 mov v17.16b, v18.16b 1013 1014 mov v3.16b, v4.16b 1015 mov v8.16b, v9.16b 1016 mov v13.16b, v14.16b 1017 mov v18.16b, v19.16b 1018 1019 b Lseal_tail 1020 1021Lseal_tail_64: 1022 ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr 1023 1024 // Here we handle the last [0,64) bytes of plaintext 1025 cmp x2, #16 1026 b.lt Lseal_tail_16 1027 // Each iteration encrypt and authenticate a 16B block 1028 ld1 {v20.16b}, [x1], #16 1029 eor v20.16b, v20.16b, v0.16b 1030 mov x11, v20.d[0] 1031 mov x12, v20.d[1] 1032 adds x8, x8, x11 1033 adcs x9, x9, x12 1034 adc x10, x10, x15 1035 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1036 umulh x12, x8, x16 1037 mul x13, x9, x16 1038 umulh x14, x9, x16 1039 adds x12, x12, x13 1040 mul x13, x10, x16 1041 adc x13, x13, x14 1042 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1043 umulh x8, x8, x17 1044 adds x12, x12, x14 1045 mul x14, x9, x17 1046 umulh x9, x9, x17 1047 adcs x14, x14, x8 1048 mul x10, x10, x17 1049 adc x10, x10, x9 1050 adds x13, x13, x14 1051 adc x14, x10, xzr 1052 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1053 and x8, x13, #-4 1054 extr x13, x14, x13, #2 1055 adds x8, x8, x11 1056 lsr x11, x14, #2 1057 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1058 adds x8, x8, x13 1059 adcs x9, x9, x12 1060 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1061 st1 {v20.16b}, [x0], #16 1062 1063 sub x2, x2, #16 1064 1065 // Shift the state left by 16 bytes for the next iteration of the loop 1066 mov v0.16b, v5.16b 1067 mov v5.16b, v10.16b 1068 mov v10.16b, v15.16b 1069 1070 b Lseal_tail_64 1071 1072Lseal_tail_16: 1073 // Here we handle the last [0,16) bytes of ciphertext that require a padded block 1074 cbz x2, Lseal_hash_extra 1075 1076 eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in 1077 eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes 1078 not v22.16b, v20.16b 1079 1080 mov x6, x2 1081 add x1, x1, x2 1082 1083 cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding 1084 1085 mov x7, #16 // We need to load some extra_in first for padding 1086 sub x7, x7, x2 1087 cmp x4, x7 1088 csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register 1089 mov x12, x7 1090 add x3, x3, x7 1091 sub x4, x4, x7 1092 1093Lseal_tail16_compose_extra_in: 1094 ext v20.16b, v20.16b, v20.16b, #15 1095 ldrb w11, [x3, #-1]! 1096 mov v20.b[0], w11 1097 subs x7, x7, #1 1098 b.gt Lseal_tail16_compose_extra_in 1099 1100 add x3, x3, x12 1101 1102Lseal_tail_16_compose: 1103 ext v20.16b, v20.16b, v20.16b, #15 1104 ldrb w11, [x1, #-1]! 
1105 mov v20.b[0], w11 1106 ext v21.16b, v22.16b, v21.16b, #15 1107 subs x2, x2, #1 1108 b.gt Lseal_tail_16_compose 1109 1110 and v0.16b, v0.16b, v21.16b 1111 eor v20.16b, v20.16b, v0.16b 1112 mov v21.16b, v20.16b 1113 1114Lseal_tail_16_store: 1115 umov w11, v20.b[0] 1116 strb w11, [x0], #1 1117 ext v20.16b, v20.16b, v20.16b, #1 1118 subs x6, x6, #1 1119 b.gt Lseal_tail_16_store 1120 1121 // Hash in the final ct block concatenated with extra_in 1122 mov x11, v21.d[0] 1123 mov x12, v21.d[1] 1124 adds x8, x8, x11 1125 adcs x9, x9, x12 1126 adc x10, x10, x15 1127 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1128 umulh x12, x8, x16 1129 mul x13, x9, x16 1130 umulh x14, x9, x16 1131 adds x12, x12, x13 1132 mul x13, x10, x16 1133 adc x13, x13, x14 1134 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1135 umulh x8, x8, x17 1136 adds x12, x12, x14 1137 mul x14, x9, x17 1138 umulh x9, x9, x17 1139 adcs x14, x14, x8 1140 mul x10, x10, x17 1141 adc x10, x10, x9 1142 adds x13, x13, x14 1143 adc x14, x10, xzr 1144 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1145 and x8, x13, #-4 1146 extr x13, x14, x13, #2 1147 adds x8, x8, x11 1148 lsr x11, x14, #2 1149 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1150 adds x8, x8, x13 1151 adcs x9, x9, x12 1152 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1153 1154Lseal_hash_extra: 1155 cbz x4, Lseal_finalize 1156 1157Lseal_hash_extra_loop: 1158 cmp x4, #16 1159 b.lt Lseal_hash_extra_tail 1160 ld1 {v20.16b}, [x3], #16 1161 mov x11, v20.d[0] 1162 mov x12, v20.d[1] 1163 adds x8, x8, x11 1164 adcs x9, x9, x12 1165 adc x10, x10, x15 1166 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1167 umulh x12, x8, x16 1168 mul x13, x9, x16 1169 umulh x14, x9, x16 1170 adds x12, x12, x13 1171 mul x13, x10, x16 1172 adc x13, x13, x14 1173 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1174 umulh x8, x8, x17 1175 adds x12, x12, x14 1176 mul x14, x9, x17 1177 umulh x9, x9, x17 1178 adcs x14, x14, x8 1179 mul x10, x10, x17 1180 adc x10, x10, x9 1181 adds x13, x13, x14 1182 adc x14, x10, xzr 1183 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1184 and x8, x13, #-4 1185 extr x13, x14, x13, #2 1186 adds x8, x8, x11 1187 lsr x11, x14, #2 1188 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1189 adds x8, x8, x13 1190 adcs x9, x9, x12 1191 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1192 sub x4, x4, #16 1193 b Lseal_hash_extra_loop 1194 1195Lseal_hash_extra_tail: 1196 cbz x4, Lseal_finalize 1197 eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext 1198 add x3, x3, x4 1199 1200Lseal_hash_extra_load: 1201 ext v20.16b, v20.16b, v20.16b, #15 1202 ldrb w11, [x3, #-1]! 
1203 mov v20.b[0], w11 1204 subs x4, x4, #1 1205 b.gt Lseal_hash_extra_load 1206 1207 // Hash in the final padded extra_in blcok 1208 mov x11, v20.d[0] 1209 mov x12, v20.d[1] 1210 adds x8, x8, x11 1211 adcs x9, x9, x12 1212 adc x10, x10, x15 1213 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1214 umulh x12, x8, x16 1215 mul x13, x9, x16 1216 umulh x14, x9, x16 1217 adds x12, x12, x13 1218 mul x13, x10, x16 1219 adc x13, x13, x14 1220 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1221 umulh x8, x8, x17 1222 adds x12, x12, x14 1223 mul x14, x9, x17 1224 umulh x9, x9, x17 1225 adcs x14, x14, x8 1226 mul x10, x10, x17 1227 adc x10, x10, x9 1228 adds x13, x13, x14 1229 adc x14, x10, xzr 1230 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1231 and x8, x13, #-4 1232 extr x13, x14, x13, #2 1233 adds x8, x8, x11 1234 lsr x11, x14, #2 1235 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1236 adds x8, x8, x13 1237 adcs x9, x9, x12 1238 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1239 1240Lseal_finalize: 1241 mov x11, v31.d[0] 1242 mov x12, v31.d[1] 1243 adds x8, x8, x11 1244 adcs x9, x9, x12 1245 adc x10, x10, x15 1246 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1247 umulh x12, x8, x16 1248 mul x13, x9, x16 1249 umulh x14, x9, x16 1250 adds x12, x12, x13 1251 mul x13, x10, x16 1252 adc x13, x13, x14 1253 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1254 umulh x8, x8, x17 1255 adds x12, x12, x14 1256 mul x14, x9, x17 1257 umulh x9, x9, x17 1258 adcs x14, x14, x8 1259 mul x10, x10, x17 1260 adc x10, x10, x9 1261 adds x13, x13, x14 1262 adc x14, x10, xzr 1263 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1264 and x8, x13, #-4 1265 extr x13, x14, x13, #2 1266 adds x8, x8, x11 1267 lsr x11, x14, #2 1268 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1269 adds x8, x8, x13 1270 adcs x9, x9, x12 1271 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1272 // Final reduction step 1273 sub x12, xzr, x15 1274 orr x13, xzr, #3 1275 subs x11, x8, #-5 1276 sbcs x12, x9, x12 1277 sbcs x13, x10, x13 1278 csel x8, x11, x8, cs 1279 csel x9, x12, x9, cs 1280 csel x10, x13, x10, cs 1281 mov x11, v27.d[0] 1282 mov x12, v27.d[1] 1283 adds x8, x8, x11 1284 adcs x9, x9, x12 1285 adc x10, x10, x15 1286 1287 stp x8, x9, [x5] 1288 1289 ldp d8, d9, [sp, #16] 1290 ldp d10, d11, [sp, #32] 1291 ldp d12, d13, [sp, #48] 1292 ldp d14, d15, [sp, #64] 1293.cfi_restore b15 1294.cfi_restore b14 1295.cfi_restore b13 1296.cfi_restore b12 1297.cfi_restore b11 1298.cfi_restore b10 1299.cfi_restore b9 1300.cfi_restore b8 1301 ldp x29, x30, [sp], 80 1302.cfi_restore w29 1303.cfi_restore w30 1304.cfi_def_cfa_offset 0 1305 AARCH64_VALIDATE_LINK_REGISTER 1306 ret 1307 1308Lseal_128: 1309 // On some architectures preparing 5 blocks for small buffers is wasteful 1310 eor v25.16b, v25.16b, v25.16b 1311 mov x11, #1 1312 mov v25.s[0], w11 1313 mov v0.16b, v24.16b 1314 mov v1.16b, v24.16b 1315 mov v2.16b, v24.16b 1316 mov v5.16b, v28.16b 1317 mov v6.16b, v28.16b 1318 mov v7.16b, v28.16b 1319 mov v10.16b, v29.16b 1320 mov v11.16b, v29.16b 1321 mov v12.16b, v29.16b 1322 mov v17.16b, v30.16b 1323 add v15.4s, v17.4s, v25.4s 1324 add v16.4s, v15.4s, v25.4s 1325 1326 mov x6, #10 1327 1328Lseal_128_rounds: 1329 add v0.4s, v0.4s, v5.4s 1330 add v1.4s, v1.4s, v6.4s 1331 add v2.4s, v2.4s, v7.4s 1332 eor v15.16b, v15.16b, v0.16b 1333 eor v16.16b, v16.16b, v1.16b 1334 eor 
v17.16b, v17.16b, v2.16b 1335 rev32 v15.8h, v15.8h 1336 rev32 v16.8h, v16.8h 1337 rev32 v17.8h, v17.8h 1338 1339 add v10.4s, v10.4s, v15.4s 1340 add v11.4s, v11.4s, v16.4s 1341 add v12.4s, v12.4s, v17.4s 1342 eor v5.16b, v5.16b, v10.16b 1343 eor v6.16b, v6.16b, v11.16b 1344 eor v7.16b, v7.16b, v12.16b 1345 ushr v20.4s, v5.4s, #20 1346 sli v20.4s, v5.4s, #12 1347 ushr v5.4s, v6.4s, #20 1348 sli v5.4s, v6.4s, #12 1349 ushr v6.4s, v7.4s, #20 1350 sli v6.4s, v7.4s, #12 1351 1352 add v0.4s, v0.4s, v20.4s 1353 add v1.4s, v1.4s, v5.4s 1354 add v2.4s, v2.4s, v6.4s 1355 eor v15.16b, v15.16b, v0.16b 1356 eor v16.16b, v16.16b, v1.16b 1357 eor v17.16b, v17.16b, v2.16b 1358 tbl v15.16b, {v15.16b}, v26.16b 1359 tbl v16.16b, {v16.16b}, v26.16b 1360 tbl v17.16b, {v17.16b}, v26.16b 1361 1362 add v10.4s, v10.4s, v15.4s 1363 add v11.4s, v11.4s, v16.4s 1364 add v12.4s, v12.4s, v17.4s 1365 eor v20.16b, v20.16b, v10.16b 1366 eor v5.16b, v5.16b, v11.16b 1367 eor v6.16b, v6.16b, v12.16b 1368 ushr v7.4s, v6.4s, #25 1369 sli v7.4s, v6.4s, #7 1370 ushr v6.4s, v5.4s, #25 1371 sli v6.4s, v5.4s, #7 1372 ushr v5.4s, v20.4s, #25 1373 sli v5.4s, v20.4s, #7 1374 1375 ext v5.16b, v5.16b, v5.16b, #4 1376 ext v6.16b, v6.16b, v6.16b, #4 1377 ext v7.16b, v7.16b, v7.16b, #4 1378 1379 ext v10.16b, v10.16b, v10.16b, #8 1380 ext v11.16b, v11.16b, v11.16b, #8 1381 ext v12.16b, v12.16b, v12.16b, #8 1382 1383 ext v15.16b, v15.16b, v15.16b, #12 1384 ext v16.16b, v16.16b, v16.16b, #12 1385 ext v17.16b, v17.16b, v17.16b, #12 1386 add v0.4s, v0.4s, v5.4s 1387 add v1.4s, v1.4s, v6.4s 1388 add v2.4s, v2.4s, v7.4s 1389 eor v15.16b, v15.16b, v0.16b 1390 eor v16.16b, v16.16b, v1.16b 1391 eor v17.16b, v17.16b, v2.16b 1392 rev32 v15.8h, v15.8h 1393 rev32 v16.8h, v16.8h 1394 rev32 v17.8h, v17.8h 1395 1396 add v10.4s, v10.4s, v15.4s 1397 add v11.4s, v11.4s, v16.4s 1398 add v12.4s, v12.4s, v17.4s 1399 eor v5.16b, v5.16b, v10.16b 1400 eor v6.16b, v6.16b, v11.16b 1401 eor v7.16b, v7.16b, v12.16b 1402 ushr v20.4s, v5.4s, #20 1403 sli v20.4s, v5.4s, #12 1404 ushr v5.4s, v6.4s, #20 1405 sli v5.4s, v6.4s, #12 1406 ushr v6.4s, v7.4s, #20 1407 sli v6.4s, v7.4s, #12 1408 1409 add v0.4s, v0.4s, v20.4s 1410 add v1.4s, v1.4s, v5.4s 1411 add v2.4s, v2.4s, v6.4s 1412 eor v15.16b, v15.16b, v0.16b 1413 eor v16.16b, v16.16b, v1.16b 1414 eor v17.16b, v17.16b, v2.16b 1415 tbl v15.16b, {v15.16b}, v26.16b 1416 tbl v16.16b, {v16.16b}, v26.16b 1417 tbl v17.16b, {v17.16b}, v26.16b 1418 1419 add v10.4s, v10.4s, v15.4s 1420 add v11.4s, v11.4s, v16.4s 1421 add v12.4s, v12.4s, v17.4s 1422 eor v20.16b, v20.16b, v10.16b 1423 eor v5.16b, v5.16b, v11.16b 1424 eor v6.16b, v6.16b, v12.16b 1425 ushr v7.4s, v6.4s, #25 1426 sli v7.4s, v6.4s, #7 1427 ushr v6.4s, v5.4s, #25 1428 sli v6.4s, v5.4s, #7 1429 ushr v5.4s, v20.4s, #25 1430 sli v5.4s, v20.4s, #7 1431 1432 ext v5.16b, v5.16b, v5.16b, #12 1433 ext v6.16b, v6.16b, v6.16b, #12 1434 ext v7.16b, v7.16b, v7.16b, #12 1435 1436 ext v10.16b, v10.16b, v10.16b, #8 1437 ext v11.16b, v11.16b, v11.16b, #8 1438 ext v12.16b, v12.16b, v12.16b, #8 1439 1440 ext v15.16b, v15.16b, v15.16b, #4 1441 ext v16.16b, v16.16b, v16.16b, #4 1442 ext v17.16b, v17.16b, v17.16b, #4 1443 subs x6, x6, #1 1444 b.hi Lseal_128_rounds 1445 1446 add v0.4s, v0.4s, v24.4s 1447 add v1.4s, v1.4s, v24.4s 1448 add v2.4s, v2.4s, v24.4s 1449 1450 add v5.4s, v5.4s, v28.4s 1451 add v6.4s, v6.4s, v28.4s 1452 add v7.4s, v7.4s, v28.4s 1453 1454 // Only the first 32 bytes of the third block (counter = 0) are needed, 1455 // so skip updating v12 and v17. 
1456 add v10.4s, v10.4s, v29.4s 1457 add v11.4s, v11.4s, v29.4s 1458 1459 add v30.4s, v30.4s, v25.4s 1460 add v15.4s, v15.4s, v30.4s 1461 add v30.4s, v30.4s, v25.4s 1462 add v16.4s, v16.4s, v30.4s 1463 1464 and v2.16b, v2.16b, v27.16b 1465 mov x16, v2.d[0] // Move the R key to GPRs 1466 mov x17, v2.d[1] 1467 mov v27.16b, v7.16b // Store the S key 1468 1469 bl Lpoly_hash_ad_internal 1470 b Lseal_tail 1471.cfi_endproc 1472 1473 1474///////////////////////////////// 1475// 1476// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); 1477// 1478.globl chacha20_poly1305_open 1479 1480.def chacha20_poly1305_open 1481 .type 32 1482.endef 1483.align 6 1484chacha20_poly1305_open: 1485 AARCH64_SIGN_LINK_REGISTER 1486.cfi_startproc 1487 stp x29, x30, [sp, #-80]! 1488.cfi_def_cfa_offset 80 1489.cfi_offset w30, -72 1490.cfi_offset w29, -80 1491 mov x29, sp 1492 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 1493 // we don't actually use the frame pointer like that, it's probably not 1494 // worth bothering. 1495 stp d8, d9, [sp, #16] 1496 stp d10, d11, [sp, #32] 1497 stp d12, d13, [sp, #48] 1498 stp d14, d15, [sp, #64] 1499.cfi_offset b15, -8 1500.cfi_offset b14, -16 1501.cfi_offset b13, -24 1502.cfi_offset b12, -32 1503.cfi_offset b11, -40 1504.cfi_offset b10, -48 1505.cfi_offset b9, -56 1506.cfi_offset b8, -64 1507 1508 adrp x11, Lchacha20_consts 1509 add x11, x11, :lo12:Lchacha20_consts 1510 1511 ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values 1512 ld1 {v28.16b - v30.16b}, [x5] 1513 1514 mov x15, #1 // Prepare the Poly1305 state 1515 mov x8, #0 1516 mov x9, #0 1517 mov x10, #0 1518 1519 mov v31.d[0], x4 // Store the input and aad lengths 1520 mov v31.d[1], x2 1521 1522 cmp x2, #128 1523 b.le Lopen_128 // Optimization for smaller buffers 1524 1525 // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys 1526 mov v0.16b, v24.16b 1527 mov v5.16b, v28.16b 1528 mov v10.16b, v29.16b 1529 mov v15.16b, v30.16b 1530 1531 mov x6, #10 1532 1533.align 5 1534Lopen_init_rounds: 1535 add v0.4s, v0.4s, v5.4s 1536 eor v15.16b, v15.16b, v0.16b 1537 rev32 v15.8h, v15.8h 1538 1539 add v10.4s, v10.4s, v15.4s 1540 eor v5.16b, v5.16b, v10.16b 1541 ushr v20.4s, v5.4s, #20 1542 sli v20.4s, v5.4s, #12 1543 add v0.4s, v0.4s, v20.4s 1544 eor v15.16b, v15.16b, v0.16b 1545 tbl v15.16b, {v15.16b}, v26.16b 1546 1547 add v10.4s, v10.4s, v15.4s 1548 eor v20.16b, v20.16b, v10.16b 1549 ushr v5.4s, v20.4s, #25 1550 sli v5.4s, v20.4s, #7 1551 ext v5.16b, v5.16b, v5.16b, #4 1552 ext v10.16b, v10.16b, v10.16b, #8 1553 ext v15.16b, v15.16b, v15.16b, #12 1554 add v0.4s, v0.4s, v5.4s 1555 eor v15.16b, v15.16b, v0.16b 1556 rev32 v15.8h, v15.8h 1557 1558 add v10.4s, v10.4s, v15.4s 1559 eor v5.16b, v5.16b, v10.16b 1560 ushr v20.4s, v5.4s, #20 1561 sli v20.4s, v5.4s, #12 1562 add v0.4s, v0.4s, v20.4s 1563 eor v15.16b, v15.16b, v0.16b 1564 tbl v15.16b, {v15.16b}, v26.16b 1565 1566 add v10.4s, v10.4s, v15.4s 1567 eor v20.16b, v20.16b, v10.16b 1568 ushr v5.4s, v20.4s, #25 1569 sli v5.4s, v20.4s, #7 1570 ext v5.16b, v5.16b, v5.16b, #12 1571 ext v10.16b, v10.16b, v10.16b, #8 1572 ext v15.16b, v15.16b, v15.16b, #4 1573 subs x6, x6, #1 1574 b.hi Lopen_init_rounds 1575 1576 add v0.4s, v0.4s, v24.4s 1577 add v5.4s, v5.4s, v28.4s 1578 1579 and v0.16b, v0.16b, v27.16b 1580 mov x16, v0.d[0] // Move the R key to GPRs 1581 mov x17, v0.d[1] 1582 mov v27.16b, v5.16b // Store the S key 1583 1584 bl 
Lpoly_hash_ad_internal 1585 1586Lopen_ad_done: 1587 mov x3, x1 1588 1589// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes 1590Lopen_main_loop: 1591 1592 cmp x2, #192 1593 b.lt Lopen_tail 1594 1595 adrp x11, Lchacha20_consts 1596 add x11, x11, :lo12:Lchacha20_consts 1597 1598 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 1599 mov v4.16b, v24.16b 1600 1601 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 1602 mov v9.16b, v28.16b 1603 1604 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 1605 mov v14.16b, v29.16b 1606 1607 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 1608 sub x5, x5, #32 1609 add v15.4s, v15.4s, v25.4s 1610 mov v19.16b, v30.16b 1611 1612 eor v20.16b, v20.16b, v20.16b //zero 1613 not v21.16b, v20.16b // -1 1614 sub v21.4s, v25.4s, v21.4s // Add +1 1615 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 1616 add v19.4s, v19.4s, v20.4s 1617 1618 lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 1619 sub x4, x4, #10 1620 1621 mov x7, #10 1622 subs x6, x7, x4 1623 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash 1624 csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full 1625 1626 cbz x7, Lopen_main_loop_rounds_short 1627 1628.align 5 1629Lopen_main_loop_rounds: 1630 ldp x11, x12, [x3], 16 1631 adds x8, x8, x11 1632 adcs x9, x9, x12 1633 adc x10, x10, x15 1634 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1635 umulh x12, x8, x16 1636 mul x13, x9, x16 1637 umulh x14, x9, x16 1638 adds x12, x12, x13 1639 mul x13, x10, x16 1640 adc x13, x13, x14 1641 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1642 umulh x8, x8, x17 1643 adds x12, x12, x14 1644 mul x14, x9, x17 1645 umulh x9, x9, x17 1646 adcs x14, x14, x8 1647 mul x10, x10, x17 1648 adc x10, x10, x9 1649 adds x13, x13, x14 1650 adc x14, x10, xzr 1651 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1652 and x8, x13, #-4 1653 extr x13, x14, x13, #2 1654 adds x8, x8, x11 1655 lsr x11, x14, #2 1656 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1657 adds x8, x8, x13 1658 adcs x9, x9, x12 1659 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1660Lopen_main_loop_rounds_short: 1661 add v0.4s, v0.4s, v5.4s 1662 add v1.4s, v1.4s, v6.4s 1663 add v2.4s, v2.4s, v7.4s 1664 add v3.4s, v3.4s, v8.4s 1665 add v4.4s, v4.4s, v9.4s 1666 1667 eor v15.16b, v15.16b, v0.16b 1668 eor v16.16b, v16.16b, v1.16b 1669 eor v17.16b, v17.16b, v2.16b 1670 eor v18.16b, v18.16b, v3.16b 1671 eor v19.16b, v19.16b, v4.16b 1672 1673 rev32 v15.8h, v15.8h 1674 rev32 v16.8h, v16.8h 1675 rev32 v17.8h, v17.8h 1676 rev32 v18.8h, v18.8h 1677 rev32 v19.8h, v19.8h 1678 1679 add v10.4s, v10.4s, v15.4s 1680 add v11.4s, v11.4s, v16.4s 1681 add v12.4s, v12.4s, v17.4s 1682 add v13.4s, v13.4s, v18.4s 1683 add v14.4s, v14.4s, v19.4s 1684 1685 eor v5.16b, v5.16b, v10.16b 1686 eor v6.16b, v6.16b, v11.16b 1687 eor v7.16b, v7.16b, v12.16b 1688 eor v8.16b, v8.16b, v13.16b 1689 eor v9.16b, v9.16b, v14.16b 1690 1691 ushr v20.4s, v5.4s, #20 1692 sli v20.4s, v5.4s, #12 1693 ushr v5.4s, v6.4s, #20 1694 sli v5.4s, v6.4s, #12 1695 ushr v6.4s, v7.4s, #20 1696 sli v6.4s, v7.4s, #12 1697 ushr v7.4s, v8.4s, #20 1698 sli v7.4s, v8.4s, #12 1699 ushr v8.4s, v9.4s, #20 1700 sli v8.4s, v9.4s, #12 1701 1702 add v0.4s, v0.4s, v20.4s 1703 add v1.4s, v1.4s, v5.4s 1704 add v2.4s, v2.4s, v6.4s 1705 add v3.4s, v3.4s, v7.4s 1706 add v4.4s, v4.4s, v8.4s 1707 1708 eor v15.16b, v15.16b, v0.16b 
1709 eor v16.16b, v16.16b, v1.16b 1710 eor v17.16b, v17.16b, v2.16b 1711 eor v18.16b, v18.16b, v3.16b 1712 eor v19.16b, v19.16b, v4.16b 1713 1714 tbl v15.16b, {v15.16b}, v26.16b 1715 tbl v16.16b, {v16.16b}, v26.16b 1716 tbl v17.16b, {v17.16b}, v26.16b 1717 tbl v18.16b, {v18.16b}, v26.16b 1718 tbl v19.16b, {v19.16b}, v26.16b 1719 1720 add v10.4s, v10.4s, v15.4s 1721 add v11.4s, v11.4s, v16.4s 1722 add v12.4s, v12.4s, v17.4s 1723 add v13.4s, v13.4s, v18.4s 1724 add v14.4s, v14.4s, v19.4s 1725 1726 eor v20.16b, v20.16b, v10.16b 1727 eor v5.16b, v5.16b, v11.16b 1728 eor v6.16b, v6.16b, v12.16b 1729 eor v7.16b, v7.16b, v13.16b 1730 eor v8.16b, v8.16b, v14.16b 1731 1732 ushr v9.4s, v8.4s, #25 1733 sli v9.4s, v8.4s, #7 1734 ushr v8.4s, v7.4s, #25 1735 sli v8.4s, v7.4s, #7 1736 ushr v7.4s, v6.4s, #25 1737 sli v7.4s, v6.4s, #7 1738 ushr v6.4s, v5.4s, #25 1739 sli v6.4s, v5.4s, #7 1740 ushr v5.4s, v20.4s, #25 1741 sli v5.4s, v20.4s, #7 1742 1743 ext v9.16b, v9.16b, v9.16b, #4 1744 ext v14.16b, v14.16b, v14.16b, #8 1745 ext v19.16b, v19.16b, v19.16b, #12 1746 ldp x11, x12, [x3], 16 1747 adds x8, x8, x11 1748 adcs x9, x9, x12 1749 adc x10, x10, x15 1750 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1751 umulh x12, x8, x16 1752 mul x13, x9, x16 1753 umulh x14, x9, x16 1754 adds x12, x12, x13 1755 mul x13, x10, x16 1756 adc x13, x13, x14 1757 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1758 umulh x8, x8, x17 1759 adds x12, x12, x14 1760 mul x14, x9, x17 1761 umulh x9, x9, x17 1762 adcs x14, x14, x8 1763 mul x10, x10, x17 1764 adc x10, x10, x9 1765 adds x13, x13, x14 1766 adc x14, x10, xzr 1767 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1768 and x8, x13, #-4 1769 extr x13, x14, x13, #2 1770 adds x8, x8, x11 1771 lsr x11, x14, #2 1772 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1773 adds x8, x8, x13 1774 adcs x9, x9, x12 1775 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1776 add v0.4s, v0.4s, v6.4s 1777 add v1.4s, v1.4s, v7.4s 1778 add v2.4s, v2.4s, v8.4s 1779 add v3.4s, v3.4s, v5.4s 1780 add v4.4s, v4.4s, v9.4s 1781 1782 eor v18.16b, v18.16b, v0.16b 1783 eor v15.16b, v15.16b, v1.16b 1784 eor v16.16b, v16.16b, v2.16b 1785 eor v17.16b, v17.16b, v3.16b 1786 eor v19.16b, v19.16b, v4.16b 1787 1788 rev32 v18.8h, v18.8h 1789 rev32 v15.8h, v15.8h 1790 rev32 v16.8h, v16.8h 1791 rev32 v17.8h, v17.8h 1792 rev32 v19.8h, v19.8h 1793 1794 add v12.4s, v12.4s, v18.4s 1795 add v13.4s, v13.4s, v15.4s 1796 add v10.4s, v10.4s, v16.4s 1797 add v11.4s, v11.4s, v17.4s 1798 add v14.4s, v14.4s, v19.4s 1799 1800 eor v6.16b, v6.16b, v12.16b 1801 eor v7.16b, v7.16b, v13.16b 1802 eor v8.16b, v8.16b, v10.16b 1803 eor v5.16b, v5.16b, v11.16b 1804 eor v9.16b, v9.16b, v14.16b 1805 1806 ushr v20.4s, v6.4s, #20 1807 sli v20.4s, v6.4s, #12 1808 ushr v6.4s, v7.4s, #20 1809 sli v6.4s, v7.4s, #12 1810 ushr v7.4s, v8.4s, #20 1811 sli v7.4s, v8.4s, #12 1812 ushr v8.4s, v5.4s, #20 1813 sli v8.4s, v5.4s, #12 1814 ushr v5.4s, v9.4s, #20 1815 sli v5.4s, v9.4s, #12 1816 1817 add v0.4s, v0.4s, v20.4s 1818 add v1.4s, v1.4s, v6.4s 1819 add v2.4s, v2.4s, v7.4s 1820 add v3.4s, v3.4s, v8.4s 1821 add v4.4s, v4.4s, v5.4s 1822 1823 eor v18.16b, v18.16b, v0.16b 1824 eor v15.16b, v15.16b, v1.16b 1825 eor v16.16b, v16.16b, v2.16b 1826 eor v17.16b, v17.16b, v3.16b 1827 eor v19.16b, v19.16b, v4.16b 1828 1829 tbl v18.16b, {v18.16b}, v26.16b 1830 tbl v15.16b, {v15.16b}, v26.16b 1831 tbl v16.16b, {v16.16b}, v26.16b 1832 tbl v17.16b, {v17.16b}, v26.16b 1833 tbl 
v19.16b, {v19.16b}, v26.16b 1834 1835 add v12.4s, v12.4s, v18.4s 1836 add v13.4s, v13.4s, v15.4s 1837 add v10.4s, v10.4s, v16.4s 1838 add v11.4s, v11.4s, v17.4s 1839 add v14.4s, v14.4s, v19.4s 1840 1841 eor v20.16b, v20.16b, v12.16b 1842 eor v6.16b, v6.16b, v13.16b 1843 eor v7.16b, v7.16b, v10.16b 1844 eor v8.16b, v8.16b, v11.16b 1845 eor v5.16b, v5.16b, v14.16b 1846 1847 ushr v9.4s, v5.4s, #25 1848 sli v9.4s, v5.4s, #7 1849 ushr v5.4s, v8.4s, #25 1850 sli v5.4s, v8.4s, #7 1851 ushr v8.4s, v7.4s, #25 1852 sli v8.4s, v7.4s, #7 1853 ushr v7.4s, v6.4s, #25 1854 sli v7.4s, v6.4s, #7 1855 ushr v6.4s, v20.4s, #25 1856 sli v6.4s, v20.4s, #7 1857 1858 ext v9.16b, v9.16b, v9.16b, #12 1859 ext v14.16b, v14.16b, v14.16b, #8 1860 ext v19.16b, v19.16b, v19.16b, #4 1861 subs x7, x7, #1 1862 b.gt Lopen_main_loop_rounds 1863 subs x6, x6, #1 1864 b.ge Lopen_main_loop_rounds_short 1865 1866 eor v20.16b, v20.16b, v20.16b //zero 1867 not v21.16b, v20.16b // -1 1868 sub v21.4s, v25.4s, v21.4s // Add +1 1869 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 1870 add v19.4s, v19.4s, v20.4s 1871 1872 add v15.4s, v15.4s, v25.4s 1873 mov x11, #5 1874 dup v20.4s, w11 1875 add v25.4s, v25.4s, v20.4s 1876 1877 zip1 v20.4s, v0.4s, v1.4s 1878 zip2 v21.4s, v0.4s, v1.4s 1879 zip1 v22.4s, v2.4s, v3.4s 1880 zip2 v23.4s, v2.4s, v3.4s 1881 1882 zip1 v0.2d, v20.2d, v22.2d 1883 zip2 v1.2d, v20.2d, v22.2d 1884 zip1 v2.2d, v21.2d, v23.2d 1885 zip2 v3.2d, v21.2d, v23.2d 1886 1887 zip1 v20.4s, v5.4s, v6.4s 1888 zip2 v21.4s, v5.4s, v6.4s 1889 zip1 v22.4s, v7.4s, v8.4s 1890 zip2 v23.4s, v7.4s, v8.4s 1891 1892 zip1 v5.2d, v20.2d, v22.2d 1893 zip2 v6.2d, v20.2d, v22.2d 1894 zip1 v7.2d, v21.2d, v23.2d 1895 zip2 v8.2d, v21.2d, v23.2d 1896 1897 zip1 v20.4s, v10.4s, v11.4s 1898 zip2 v21.4s, v10.4s, v11.4s 1899 zip1 v22.4s, v12.4s, v13.4s 1900 zip2 v23.4s, v12.4s, v13.4s 1901 1902 zip1 v10.2d, v20.2d, v22.2d 1903 zip2 v11.2d, v20.2d, v22.2d 1904 zip1 v12.2d, v21.2d, v23.2d 1905 zip2 v13.2d, v21.2d, v23.2d 1906 1907 zip1 v20.4s, v15.4s, v16.4s 1908 zip2 v21.4s, v15.4s, v16.4s 1909 zip1 v22.4s, v17.4s, v18.4s 1910 zip2 v23.4s, v17.4s, v18.4s 1911 1912 zip1 v15.2d, v20.2d, v22.2d 1913 zip2 v16.2d, v20.2d, v22.2d 1914 zip1 v17.2d, v21.2d, v23.2d 1915 zip2 v18.2d, v21.2d, v23.2d 1916 1917 add v0.4s, v0.4s, v24.4s 1918 add v5.4s, v5.4s, v28.4s 1919 add v10.4s, v10.4s, v29.4s 1920 add v15.4s, v15.4s, v30.4s 1921 1922 add v1.4s, v1.4s, v24.4s 1923 add v6.4s, v6.4s, v28.4s 1924 add v11.4s, v11.4s, v29.4s 1925 add v16.4s, v16.4s, v30.4s 1926 1927 add v2.4s, v2.4s, v24.4s 1928 add v7.4s, v7.4s, v28.4s 1929 add v12.4s, v12.4s, v29.4s 1930 add v17.4s, v17.4s, v30.4s 1931 1932 add v3.4s, v3.4s, v24.4s 1933 add v8.4s, v8.4s, v28.4s 1934 add v13.4s, v13.4s, v29.4s 1935 add v18.4s, v18.4s, v30.4s 1936 1937 add v4.4s, v4.4s, v24.4s 1938 add v9.4s, v9.4s, v28.4s 1939 add v14.4s, v14.4s, v29.4s 1940 add v19.4s, v19.4s, v30.4s 1941 1942 // We can always safely store 192 bytes 1943 ld1 {v20.16b - v23.16b}, [x1], #64 1944 eor v20.16b, v20.16b, v0.16b 1945 eor v21.16b, v21.16b, v5.16b 1946 eor v22.16b, v22.16b, v10.16b 1947 eor v23.16b, v23.16b, v15.16b 1948 st1 {v20.16b - v23.16b}, [x0], #64 1949 1950 ld1 {v20.16b - v23.16b}, [x1], #64 1951 eor v20.16b, v20.16b, v1.16b 1952 eor v21.16b, v21.16b, v6.16b 1953 eor v22.16b, v22.16b, v11.16b 1954 eor v23.16b, v23.16b, v16.16b 1955 st1 {v20.16b - v23.16b}, [x0], #64 1956 1957 ld1 {v20.16b - v23.16b}, [x1], #64 1958 eor v20.16b, v20.16b, v2.16b 1959 eor v21.16b, v21.16b, v7.16b 1960 eor v22.16b, 
v22.16b, v12.16b 1961 eor v23.16b, v23.16b, v17.16b 1962 st1 {v20.16b - v23.16b}, [x0], #64 1963 1964 sub x2, x2, #192 1965 1966 mov v0.16b, v3.16b 1967 mov v5.16b, v8.16b 1968 mov v10.16b, v13.16b 1969 mov v15.16b, v18.16b 1970 1971 cmp x2, #64 1972 b.lt Lopen_tail_64_store 1973 1974 ld1 {v20.16b - v23.16b}, [x1], #64 1975 eor v20.16b, v20.16b, v3.16b 1976 eor v21.16b, v21.16b, v8.16b 1977 eor v22.16b, v22.16b, v13.16b 1978 eor v23.16b, v23.16b, v18.16b 1979 st1 {v20.16b - v23.16b}, [x0], #64 1980 1981 sub x2, x2, #64 1982 1983 mov v0.16b, v4.16b 1984 mov v5.16b, v9.16b 1985 mov v10.16b, v14.16b 1986 mov v15.16b, v19.16b 1987 1988 cmp x2, #64 1989 b.lt Lopen_tail_64_store 1990 1991 ld1 {v20.16b - v23.16b}, [x1], #64 1992 eor v20.16b, v20.16b, v4.16b 1993 eor v21.16b, v21.16b, v9.16b 1994 eor v22.16b, v22.16b, v14.16b 1995 eor v23.16b, v23.16b, v19.16b 1996 st1 {v20.16b - v23.16b}, [x0], #64 1997 1998 sub x2, x2, #64 1999 b Lopen_main_loop 2000 2001Lopen_tail: 2002 2003 cbz x2, Lopen_finalize 2004 2005 lsr x4, x2, #4 // How many whole blocks we have to hash 2006 2007 cmp x2, #64 2008 b.le Lopen_tail_64 2009 cmp x2, #128 2010 b.le Lopen_tail_128 2011 2012Lopen_tail_192: 2013 // We need three more blocks 2014 mov v0.16b, v24.16b 2015 mov v1.16b, v24.16b 2016 mov v2.16b, v24.16b 2017 mov v5.16b, v28.16b 2018 mov v6.16b, v28.16b 2019 mov v7.16b, v28.16b 2020 mov v10.16b, v29.16b 2021 mov v11.16b, v29.16b 2022 mov v12.16b, v29.16b 2023 mov v15.16b, v30.16b 2024 mov v16.16b, v30.16b 2025 mov v17.16b, v30.16b 2026 eor v23.16b, v23.16b, v23.16b 2027 eor v21.16b, v21.16b, v21.16b 2028 ins v23.s[0], v25.s[0] 2029 ins v21.d[0], x15 2030 2031 add v22.4s, v23.4s, v21.4s 2032 add v21.4s, v22.4s, v21.4s 2033 2034 add v15.4s, v15.4s, v21.4s 2035 add v16.4s, v16.4s, v23.4s 2036 add v17.4s, v17.4s, v22.4s 2037 2038 mov x7, #10 2039 subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash 2040 csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing 2041 sub x4, x4, x7 2042 2043 cbz x7, Lopen_tail_192_rounds_no_hash 2044 2045Lopen_tail_192_rounds: 2046 ldp x11, x12, [x3], 16 2047 adds x8, x8, x11 2048 adcs x9, x9, x12 2049 adc x10, x10, x15 2050 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2051 umulh x12, x8, x16 2052 mul x13, x9, x16 2053 umulh x14, x9, x16 2054 adds x12, x12, x13 2055 mul x13, x10, x16 2056 adc x13, x13, x14 2057 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2058 umulh x8, x8, x17 2059 adds x12, x12, x14 2060 mul x14, x9, x17 2061 umulh x9, x9, x17 2062 adcs x14, x14, x8 2063 mul x10, x10, x17 2064 adc x10, x10, x9 2065 adds x13, x13, x14 2066 adc x14, x10, xzr 2067 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2068 and x8, x13, #-4 2069 extr x13, x14, x13, #2 2070 adds x8, x8, x11 2071 lsr x11, x14, #2 2072 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2073 adds x8, x8, x13 2074 adcs x9, x9, x12 2075 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2076Lopen_tail_192_rounds_no_hash: 2077 add v0.4s, v0.4s, v5.4s 2078 add v1.4s, v1.4s, v6.4s 2079 add v2.4s, v2.4s, v7.4s 2080 eor v15.16b, v15.16b, v0.16b 2081 eor v16.16b, v16.16b, v1.16b 2082 eor v17.16b, v17.16b, v2.16b 2083 rev32 v15.8h, v15.8h 2084 rev32 v16.8h, v16.8h 2085 rev32 v17.8h, v17.8h 2086 2087 add v10.4s, v10.4s, v15.4s 2088 add v11.4s, v11.4s, v16.4s 2089 add v12.4s, v12.4s, v17.4s 2090 eor v5.16b, v5.16b, v10.16b 2091 eor v6.16b, v6.16b, v11.16b 2092 eor v7.16b, 
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #4
    ext v6.16b, v6.16b, v6.16b, #4
    ext v7.16b, v7.16b, v7.16b, #4

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #12
    ext v16.16b, v16.16b, v16.16b, #12
    ext v17.16b, v17.16b, v17.16b, #12
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #12
    ext v6.16b, v6.16b, v6.16b, #12
    ext v7.16b, v7.16b, v7.16b, #12

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #4
    ext v16.16b, v16.16b, v16.16b, #4
    ext v17.16b, v17.16b, v17.16b, #4
    subs x7, x7, #1
    b.gt Lopen_tail_192_rounds
    subs x6, x6, #1
    b.ge Lopen_tail_192_rounds_no_hash

    // We hashed 160 bytes at most, may still have 32 bytes left
Lopen_tail_192_hash:
    cbz x4, Lopen_tail_192_hash_done
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
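    // Note: this inlined multiply/reduce sequence is one Poly1305 block
    // update, roughly acc = (acc + block) * r mod 2^130 - 5, with the
    // accumulator limbs in x8/x9/x10 and the clamped key halves r0/r1 in
    // x16/x17. The and/extr/lsr steps after the product fold the bits above
    // 2^130 back in as high*4 + high, using 2^130 = 5 (mod p).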
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #1
    b Lopen_tail_192_hash

Lopen_tail_192_hash_done:

    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v2.4s, v2.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v7.4s, v7.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s
    add v12.4s, v12.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v16.4s, v16.4s, v30.4s
    add v17.4s, v17.4s, v30.4s

    add v15.4s, v15.4s, v21.4s
    add v16.4s, v16.4s, v23.4s
    add v17.4s, v17.4s, v22.4s

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v2.16b
    eor v21.16b, v21.16b, v7.16b
    eor v22.16b, v22.16b, v12.16b
    eor v23.16b, v23.16b, v17.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #128
    b Lopen_tail_64_store

Lopen_tail_128:
    // We need two more blocks
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v15.16b, v30.16b
    mov v16.16b, v30.16b
    eor v23.16b, v23.16b, v23.16b
    eor v22.16b, v22.16b, v22.16b
    ins v23.s[0], v25.s[0]
    ins v22.d[0], x15
    add v22.4s, v22.4s, v23.4s

    add v15.4s, v15.4s, v22.4s
    add v16.4s, v16.4s, v23.4s

    mov x6, #10
    sub x6, x6, x4

Lopen_tail_128_rounds:
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #4
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #12
    add v1.4s, v1.4s, v6.4s
    eor v16.16b, v16.16b, v1.16b
    rev32 v16.8h, v16.8h

    add v11.4s, v11.4s, v16.4s
    eor v6.16b, v6.16b, v11.16b
    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    add v1.4s, v1.4s, v20.4s
    eor v16.16b, v16.16b, v1.16b
    tbl v16.16b, {v16.16b}, v26.16b

    add v11.4s, v11.4s, v16.4s
    eor v20.16b, v20.16b, v11.16b
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7
    ext v6.16b, v6.16b, v6.16b, #4
    ext v11.16b, v11.16b, v11.16b, #8
    ext v16.16b, v16.16b, v16.16b, #12
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b
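    // Note: Lopen_tail_128_rounds interleaves two independent single-block
    // round chains, block 0 in v0/v5/v10/v15 and block 1 in v1/v6/v11/v16.
    // The ext #4/#8/#12 shuffles rotate the rows between the column and
    // diagonal phases of each double round.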

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #12
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #4
    add v1.4s, v1.4s, v6.4s
    eor v16.16b, v16.16b, v1.16b
    rev32 v16.8h, v16.8h

    add v11.4s, v11.4s, v16.4s
    eor v6.16b, v6.16b, v11.16b
    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    add v1.4s, v1.4s, v20.4s
    eor v16.16b, v16.16b, v1.16b
    tbl v16.16b, {v16.16b}, v26.16b

    add v11.4s, v11.4s, v16.4s
    eor v20.16b, v20.16b, v11.16b
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7
    ext v6.16b, v6.16b, v6.16b, #12
    ext v11.16b, v11.16b, v11.16b, #8
    ext v16.16b, v16.16b, v16.16b, #4
    subs x6, x6, #1
    b.gt Lopen_tail_128_rounds
    cbz x4, Lopen_tail_128_rounds_done
    subs x4, x4, #1
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    b Lopen_tail_128_rounds

Lopen_tail_128_rounds_done:
    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v16.4s, v16.4s, v30.4s
    add v15.4s, v15.4s, v22.4s
    add v16.4s, v16.4s, v23.4s

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b

    st1 {v20.16b - v23.16b}, [x0], #64
    sub x2, x2, #64

    b Lopen_tail_64_store

Lopen_tail_64:
    // We just need a single block
    mov v0.16b, v24.16b
    mov v5.16b, v28.16b
    mov v10.16b, v29.16b
    mov v15.16b, v30.16b
    eor v23.16b, v23.16b, v23.16b
    ins v23.s[0], v25.s[0]
    add v15.4s, v15.4s, v23.4s

    mov x6, #10
    sub x6, x6, x4

Lopen_tail_64_rounds:
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #4
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #12
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
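    // Note: for this single-block tail, x6 = 10 - x4 double rounds run first
    // without hashing; the remaining x4 iterations each absorb one 16-byte
    // ciphertext block into Poly1305 between double rounds, so ten double
    // rounds are still performed in total.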
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #12
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #4
    subs x6, x6, #1
    b.gt Lopen_tail_64_rounds
    cbz x4, Lopen_tail_64_rounds_done
    subs x4, x4, #1
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    b Lopen_tail_64_rounds

Lopen_tail_64_rounds_done:
    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v15.4s, v15.4s, v23.4s

Lopen_tail_64_store:
    cmp x2, #16
    b.lt Lopen_tail_16

    ld1 {v20.16b}, [x1], #16
    eor v20.16b, v20.16b, v0.16b
    st1 {v20.16b}, [x0], #16
    mov v0.16b, v5.16b
    mov v5.16b, v10.16b
    mov v10.16b, v15.16b
    sub x2, x2, #16
    b Lopen_tail_64_store

Lopen_tail_16:
    // Here we handle the last [0,16) bytes that require a padded block
    cbz x2, Lopen_finalize

    eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
    eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
    not v22.16b, v20.16b

    add x7, x1, x2
    mov x6, x2

Lopen_tail_16_compose:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x7, #-1]!
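    // Note: the byte just loaded (walking backwards from the end of the
    // ciphertext tail) is shifted into the low lane of v20 below, while v21
    // accumulates an all-ones mask over the bytes that are actually present,
    // so the padded Poly1305 block contains only the remaining ciphertext.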
    mov v20.b[0], w11
    ext v21.16b, v22.16b, v21.16b, #15
    subs x2, x2, #1
    b.gt Lopen_tail_16_compose

    and v20.16b, v20.16b, v21.16b
    // Hash in the final padded block
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    eor v20.16b, v20.16b, v0.16b

Lopen_tail_16_store:
    umov w11, v20.b[0]
    strb w11, [x0], #1
    ext v20.16b, v20.16b, v20.16b, #1
    subs x6, x6, #1
    b.gt Lopen_tail_16_store

Lopen_finalize:
    mov x11, v31.d[0]
    mov x12, v31.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    // Final reduction step
    sub x12, xzr, x15
    orr x13, xzr, #3
    subs x11, x8, #-5
    sbcs x12, x9, x12
    sbcs x13, x10, x13
    csel x8, x11, x8, cs
    csel x9, x12, x9, cs
    csel x10, x13, x10, cs
    mov x11, v27.d[0]
    mov x12, v27.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15

    stp x8, x9, [x5]

    ldp d8, d9, [sp, #16]
    ldp d10, d11, [sp, #32]
    ldp d12, d13, [sp, #48]
    ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
    ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
    AARCH64_VALIDATE_LINK_REGISTER
    ret

Lopen_128:
    // On some architectures preparing 5 blocks for small buffers is wasteful
    eor v25.16b, v25.16b, v25.16b
    mov x11, #1
    mov v25.s[0], w11
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v7.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v12.16b, v29.16b
    mov v17.16b, v30.16b
    add v15.4s, v17.4s, v25.4s
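    // Note: v25 was reset above to {1,0,0,0}, so the two data blocks get the
    // original counter row plus 1 and plus 2 (v15, v16), while v17 keeps the
    // original counter; that block only supplies the Poly1305 R/S key, which
    // is clamped with v27 (Lclamp) further down.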
    add v16.4s, v15.4s, v25.4s

    mov x6, #10

Lopen_128_rounds:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #4
    ext v6.16b, v6.16b, v6.16b, #4
    ext v7.16b, v7.16b, v7.16b, #4

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #12
    ext v16.16b, v16.16b, v16.16b, #12
    ext v17.16b, v17.16b, v17.16b, #12
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #12
    ext v6.16b, v6.16b, v6.16b, #12
    ext v7.16b, v7.16b, v7.16b, #12

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #4
    ext v16.16b, v16.16b, v16.16b, #4
    ext v17.16b, v17.16b, v17.16b, #4
    subs x6, x6, #1
    b.hi Lopen_128_rounds

    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v2.4s, v2.4s, v24.4s
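    // Note: after the ten double rounds, the initial state is added back row
    // by row (v24 = constants, v28/v29 = key, v30 = counter/nonce) to turn
    // the working state into the final keystream blocks.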

    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v7.4s, v7.4s, v28.4s

    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s

    add v30.4s, v30.4s, v25.4s
    add v15.4s, v15.4s, v30.4s
    add v30.4s, v30.4s, v25.4s
    add v16.4s, v16.4s, v30.4s

    and v2.16b, v2.16b, v27.16b
    mov x16, v2.d[0] // Move the R key to GPRs
    mov x17, v2.d[1]
    mov v27.16b, v7.16b // Store the S key

    bl Lpoly_hash_ad_internal

Lopen_128_store:
    cmp x2, #64
    b.lt Lopen_128_store_64

    ld1 {v20.16b - v23.16b}, [x1], #64

    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v21.d[0]
    mov x12, v21.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v22.d[0]
    mov x12, v22.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v23.d[0]
    mov x12, v23.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
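    // Note: in this open path the 64 bytes just loaded from the input
    // (v20-v23) are absorbed into Poly1305 before they are XORed with the
    // keystream and stored, i.e. the tag is computed over the ciphertext.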
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most

    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #64

    mov v0.16b, v1.16b
    mov v5.16b, v6.16b
    mov v10.16b, v11.16b
    mov v15.16b, v16.16b

Lopen_128_store_64:

    lsr x4, x2, #4
    mov x3, x1

Lopen_128_hash_64:
    cbz x4, Lopen_tail_64_store
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #1
    b Lopen_128_hash_64
.cfi_endproc

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)