// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
//----------------------------------------------------------------------
// gcm_init_clmul: build the six-entry key table from the hash key H.
//
// In:   x0 = Htable (destination; 16+32+48 = 96 bytes are written)
//       x1 = H (source; 16 bytes are read as two 64-bit lanes)
// Out:  Htable[0] = twisted H
//       Htable[1] = packed Karatsuba pre-processed halves of H and H^2
//       Htable[2] = H^2
//       Htable[3] = H^3
//       Htable[4] = packed Karatsuba pre-processed halves of H^3 and H^4
//       Htable[5] = H^4
// Uses: v0-v7, v16-v22; x0 is advanced by the post-indexed stores.
//       The stack is not touched.
// NOTE(review): unlike Xi and the input blocks in the routines below, H is
// not byte-reversed (no rev64) here; presumably the caller supplies it
// already in the expected 64-bit lane order -- confirm against the caller.
//----------------------------------------------------------------------
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,%function
.align 4
gcm_init_clmul:
    // Macro comes from the included headers, not from this file
    // (presumably an indirect-call landing pad -- confirm in arm_arch.h).
    AARCH64_VALID_CALL_TARGET
    ld1 {v17.2d},[x1]  //load input H
    movi v19.16b,#0xe1
    // 0xe1e1..e1 << 57 in each 64-bit lane leaves 0xc200000000000000.
    shl v19.2d,v19.2d,#57  //0xc2.0
    ext v3.16b,v17.16b,v17.16b,#8
    ushr v18.2d,v19.2d,#63
    dup v17.4s,v17.s[1]
    ext v16.16b,v18.16b,v19.16b,#8  //t0=0xc2....01
    ushr v18.2d,v3.2d,#63
    // Arithmetic shift by 31 turns the top bit of each 32-bit lane into an
    // all-ones / all-zeros mask.
    sshr v17.4s,v17.4s,#31  //broadcast carry bit
    and v18.16b,v18.16b,v16.16b
    shl v3.2d,v3.2d,#1
    ext v18.16b,v18.16b,v18.16b,#8
    and v16.16b,v16.16b,v17.16b
    orr v3.16b,v3.16b,v18.16b  //H<<<=1
    eor v20.16b,v3.16b,v16.16b  //twisted H
    st1 {v20.2d},[x0],#16  //store Htable[0]

    //calculate H^2
    ext v16.16b,v20.16b,v20.16b,#8  //Karatsuba pre-processing
    pmull v0.1q,v20.1d,v20.1d
    eor v16.16b,v16.16b,v20.16b
    pmull2 v2.1q,v20.2d,v20.2d
    pmull v1.1q,v16.1d,v16.1d

    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b
    pmull v18.1q,v0.1d,v19.1d  //1st phase

    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v22.16b,v0.16b,v18.16b

    ext v17.16b,v22.16b,v22.16b,#8  //Karatsuba pre-processing
    eor v17.16b,v17.16b,v22.16b
    // v16 holds lo^hi of H, v17 holds lo^hi of H^2; ext packs one 64-bit
    // half from each into a single table entry.
    ext v21.16b,v16.16b,v17.16b,#8  //pack Karatsuba pre-processed
    st1 {v21.2d,v22.2d},[x0],#32  //store Htable[1..2]
    //calculate H^3 and H^4
    // The two products (H*H^2 in v0..v2 and H^2*H^2 in v5..v7) are computed
    // and reduced in an interleaved fashion below.
    pmull v0.1q,v20.1d, v22.1d
    pmull v5.1q,v22.1d,v22.1d
    pmull2 v2.1q,v20.2d, v22.2d
    pmull2 v7.1q,v22.2d,v22.2d
    pmull v1.1q,v16.1d,v17.1d
    pmull v6.1q,v17.1d,v17.1d

    ext v16.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    ext v17.16b,v5.16b,v7.16b,#8
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v16.16b
    eor v4.16b,v5.16b,v7.16b
    eor v6.16b,v6.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b
    pmull v18.1q,v0.1d,v19.1d  //1st phase
    eor v6.16b,v6.16b,v4.16b
    pmull v4.1q,v5.1d,v19.1d

    ins v2.d[0],v1.d[1]
    ins v7.d[0],v6.d[1]
    ins v1.d[1],v0.d[0]
    ins v6.d[1],v5.d[0]
    eor v0.16b,v1.16b,v18.16b
    eor v5.16b,v6.16b,v4.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase
    ext v4.16b,v5.16b,v5.16b,#8
    pmull v0.1q,v0.1d,v19.1d
    pmull v5.1q,v5.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v4.16b,v4.16b,v7.16b
    eor v20.16b, v0.16b,v18.16b  //H^3
    eor v22.16b,v5.16b,v4.16b  //H^4

    ext v16.16b,v20.16b, v20.16b,#8  //Karatsuba pre-processing
    ext v17.16b,v22.16b,v22.16b,#8
    eor v16.16b,v16.16b,v20.16b
    eor v17.16b,v17.16b,v22.16b
    ext v21.16b,v16.16b,v17.16b,#8  //pack Karatsuba pre-processed
    st1 {v20.2d,v21.2d,v22.2d},[x0]  //store Htable[3..5]
    ret
.size gcm_init_clmul,.-gcm_init_clmul
//----------------------------------------------------------------------
// gcm_gmult_clmul: multiply Xi by H (one carry-less multiplication followed
// by the two-phase reduction) and write the result back over Xi.
//
// In:   x0 = Xi (16 bytes; read, then overwritten with the product)
//       x1 = Htable (only the first 32 bytes, i.e. entries 0 and 1, are read)
// Uses: v0-v3, v17-v21. x0 and x1 are not modified. No stack use.
//----------------------------------------------------------------------
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,%function
.align 4
gcm_gmult_clmul:
    AARCH64_VALID_CALL_TARGET
    ld1 {v17.2d},[x0]  //load Xi
    movi v19.16b,#0xe1
    ld1 {v20.2d,v21.2d},[x1]  //load twisted H, ...
    shl v19.2d,v19.2d,#57  //compose 0xc2.0 constant
    // Byte reversal within each 64-bit lane is only needed on little-endian
    // builds; the ext that follows swaps the two lanes.
#ifndef __AARCH64EB__
    rev64 v17.16b,v17.16b
#endif
    ext v3.16b,v17.16b,v17.16b,#8

    pmull v0.1q,v20.1d,v3.1d  //H.lo·Xi.lo
    eor v17.16b,v17.16b,v3.16b  //Karatsuba pre-processing
    pmull2 v2.1q,v20.2d,v3.2d  //H.hi·Xi.hi
    pmull v1.1q,v21.1d,v17.1d  //(H.lo+H.hi)·(Xi.lo+Xi.hi)

    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b
    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction

    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v0.16b,v0.16b,v18.16b

    // Undo the load-time transformation before storing.
#ifndef __AARCH64EB__
    rev64 v0.16b,v0.16b
#endif
    ext v0.16b,v0.16b,v0.16b,#8
    st1 {v0.2d},[x0]  //write out Xi

    ret
.size gcm_gmult_clmul,.-gcm_gmult_clmul
//----------------------------------------------------------------------
// gcm_ghash_clmul: fold a run of 16-byte input blocks into Xi.
//
// In:   x0 = Xi (16 bytes; read, then overwritten with the result)
//       x1 = Htable (entries 0..2 are read on this path)
//       x2 = input pointer
//       x3 = input length in bytes
// Lengths of 64 bytes or more branch to .Lgcm_ghash_v8_4x (four blocks per
// iteration). Shorter inputs use the two-blocks-per-iteration loop below,
// followed by a one-block tail when an odd block remains.
// Uses: x12, v0-v7, v16-v22; x1, x2 and x3 are modified. No stack use.
// NOTE(review): the length tests only distinguish multiples of 16, so the
// length is presumably required to be a non-zero multiple of 16 -- confirm
// against the callers.
//----------------------------------------------------------------------
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,%function
.align 4
gcm_ghash_clmul:
    AARCH64_VALID_CALL_TARGET
    cmp x3,#64
    b.hs .Lgcm_ghash_v8_4x
    ld1 {v0.2d},[x0]  //load [rotated] Xi
      //"[rotated]" means that
      //loaded value would have
      //to be rotated in order to
      //make it appear as in
      //algorithm specification
    // The flags set by this subs are consumed by the csel (eq) and the
    // b.lo further down; no instruction in between writes the flags.
    subs x3,x3,#32  //see if x3 is 32 or larger
    mov x12,#16  //x12 is used as post-
      //increment for input pointer;
      //as loop is modulo-scheduled
      //x12 is zeroed just in time
      //to preclude overstepping
      //inp[len], which means that
      //last block[s] are actually
      //loaded twice, but last
      //copy is not processed
    ld1 {v20.2d,v21.2d},[x1],#32  //load twisted H, ..., H^2
    movi v19.16b,#0xe1
    ld1 {v22.2d},[x1]
    csel x12,xzr,x12,eq  //is it time to zero x12?
    ext v0.16b,v0.16b,v0.16b,#8  //rotate Xi
    ld1 {v16.2d},[x2],#16  //load [rotated] I[0]
    shl v19.2d,v19.2d,#57  //compose 0xc2.0 constant
#ifndef __AARCH64EB__
    rev64 v16.16b,v16.16b
    rev64 v0.16b,v0.16b
#endif
    ext v3.16b,v16.16b,v16.16b,#8  //rotate I[0]
    b.lo .Lodd_tail_v8  //x3 was less than 32
    ld1 {v17.2d},[x2],x12  //load [rotated] I[1]
#ifndef __AARCH64EB__
    rev64 v17.16b,v17.16b
#endif
    ext v7.16b,v17.16b,v17.16b,#8
    eor v3.16b,v3.16b,v0.16b  //I[i]^=Xi
    pmull v4.1q,v20.1d,v7.1d  //H·Ii+1
    eor v17.16b,v17.16b,v7.16b  //Karatsuba pre-processing
    pmull2 v6.1q,v20.2d,v7.2d
    b .Loop_mod2x_v8

    // Two blocks per iteration: the accumulated block in v3 is multiplied
    // by H^2 (v22) and the following block (v7/v17) by H (v20); the partial
    // products are summed and reduced once. The loads and the H multiply
    // for the next pair are issued inside the current iteration.
.align 4
.Loop_mod2x_v8:
    ext v18.16b,v3.16b,v3.16b,#8
    // Flags from this subs feed both csel instructions below and the b.hs
    // at the bottom of the loop; nothing in between writes the flags.
    subs x3,x3,#32  //is there more data?
    pmull v0.1q,v22.1d,v3.1d  //H^2.lo·Xi.lo
    csel x12,xzr,x12,lo  //is it time to zero x12?

    pmull v5.1q,v21.1d,v17.1d
    eor v18.16b,v18.16b,v3.16b  //Karatsuba pre-processing
    pmull2 v2.1q,v22.2d,v3.2d  //H^2.hi·Xi.hi
    eor v0.16b,v0.16b,v4.16b  //accumulate
    pmull2 v1.1q,v21.2d,v18.2d  //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
    ld1 {v16.2d},[x2],x12  //load [rotated] I[i+2]

    eor v2.16b,v2.16b,v6.16b
    csel x12,xzr,x12,eq  //is it time to zero x12?
    eor v1.16b,v1.16b,v5.16b

    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    ld1 {v17.2d},[x2],x12  //load [rotated] I[i+3]
#ifndef __AARCH64EB__
    rev64 v16.16b,v16.16b
#endif
    eor v1.16b,v1.16b,v18.16b
    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction

#ifndef __AARCH64EB__
    rev64 v17.16b,v17.16b
#endif
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    ext v7.16b,v17.16b,v17.16b,#8
    ext v3.16b,v16.16b,v16.16b,#8
    eor v0.16b,v1.16b,v18.16b
    pmull v4.1q,v20.1d,v7.1d  //H·Ii+1
    eor v3.16b,v3.16b,v2.16b  //accumulate v3.16b early

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v3.16b,v3.16b,v18.16b
    eor v17.16b,v17.16b,v7.16b  //Karatsuba pre-processing
    eor v3.16b,v3.16b,v0.16b
    pmull2 v6.1q,v20.2d,v7.2d
    b.hs .Loop_mod2x_v8  //there was at least 32 more bytes

    // Loop exit: the loop folded the reduction result straight into v3, so
    // rebuild the plain Xi (v0), the plain rotated input block (v3) and the
    // byte count (x3) that the tail / store code expects.
    eor v2.16b,v2.16b,v18.16b
    ext v3.16b,v16.16b,v16.16b,#8  //re-construct v3.16b
    adds x3,x3,#32  //re-construct x3
    eor v0.16b,v0.16b,v2.16b  //re-construct v0.16b
    b.eq .Ldone_v8  //is x3 zero?
    // One remaining block: Xi = (Xi ^ block) * H, then reduce.
.Lodd_tail_v8:
    ext v18.16b,v0.16b,v0.16b,#8
    eor v3.16b,v3.16b,v0.16b  //inp^=Xi
    eor v17.16b,v16.16b,v18.16b  //v17.16b is rotated inp^Xi

    pmull v0.1q,v20.1d,v3.1d  //H.lo·Xi.lo
    eor v17.16b,v17.16b,v3.16b  //Karatsuba pre-processing
    pmull2 v2.1q,v20.2d,v3.2d  //H.hi·Xi.hi
    pmull v1.1q,v21.1d,v17.1d  //(H.lo+H.hi)·(Xi.lo+Xi.hi)

    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b
    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction

    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
    rev64 v0.16b,v0.16b
#endif
    ext v0.16b,v0.16b,v0.16b,#8
    st1 {v0.2d},[x0]  //write out Xi

    ret
.size gcm_ghash_clmul,.-gcm_ghash_clmul
//----------------------------------------------------------------------
// gcm_ghash_v8_4x: four-blocks-per-iteration continuation of
// gcm_ghash_clmul. The symbol has no .globl directive; within this file it
// is entered only by the b.hs at the top of gcm_ghash_clmul (when x3 >= 64)
// and it returns directly to that routine's caller.
//
// In:   x0 = Xi, x1 = Htable (all six entries are read), x2 = input,
//       x3 = length in bytes (>= 64 when entered from gcm_ghash_clmul)
// Uses: v0-v7, v16-v31; x1, x2 and x3 are modified. No stack use.
//----------------------------------------------------------------------
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
    ld1 {v0.2d},[x0]  //load [rotated] Xi
    ld1 {v20.2d,v21.2d,v22.2d},[x1],#48  //load twisted H, ..., H^2
    movi v19.16b,#0xe1
    ld1 {v26.2d,v27.2d,v28.2d},[x1]  //load twisted H^3, ..., H^4
    shl v19.2d,v19.2d,#57  //compose 0xc2.0 constant

    ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
    rev64 v0.16b,v0.16b
    rev64 v5.16b,v5.16b
    rev64 v6.16b,v6.16b
    rev64 v7.16b,v7.16b
    rev64 v4.16b,v4.16b
#endif
    ext v25.16b,v7.16b,v7.16b,#8
    ext v24.16b,v6.16b,v6.16b,#8
    ext v23.16b,v5.16b,v5.16b,#8

    // Products for blocks 1..3 of the group do not depend on Xi, so they
    // are accumulated into v29 (lo), v30 (mid) and v31 (hi) up front; the
    // Xi-dependent H^4 product for block 0 is added in the loop / tail.
    pmull v29.1q,v20.1d,v25.1d  //H·Ii+3
    eor v7.16b,v7.16b,v25.16b
    pmull2 v31.1q,v20.2d,v25.2d
    pmull v30.1q,v21.1d,v7.1d

    pmull v16.1q,v22.1d,v24.1d  //H^2·Ii+2
    eor v6.16b,v6.16b,v24.16b
    pmull2 v24.1q,v22.2d,v24.2d
    pmull2 v6.1q,v21.2d,v6.2d

    eor v29.16b,v29.16b,v16.16b
    eor v31.16b,v31.16b,v24.16b
    eor v30.16b,v30.16b,v6.16b

    pmull v7.1q,v26.1d,v23.1d  //H^3·Ii+1
    eor v5.16b,v5.16b,v23.16b
    pmull2 v23.1q,v26.2d,v23.2d
    pmull v5.1q,v27.1d,v5.1d

    eor v29.16b,v29.16b,v7.16b
    eor v31.16b,v31.16b,v23.16b
    eor v30.16b,v30.16b,v5.16b

    // x3 = len - 128. The loop body loads a further 64 bytes, so it is
    // skipped when fewer than 128 bytes were supplied.
    subs x3,x3,#128
    b.lo .Ltail4x

    b .Loop4x

.align 4
.Loop4x:
    eor v16.16b,v4.16b,v0.16b
    ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
    ext v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
    rev64 v5.16b,v5.16b
    rev64 v6.16b,v6.16b
    rev64 v7.16b,v7.16b
    rev64 v4.16b,v4.16b
#endif

    pmull v0.1q,v28.1d,v3.1d  //H^4·(Xi+Ii)
    eor v16.16b,v16.16b,v3.16b
    pmull2 v2.1q,v28.2d,v3.2d
    ext v25.16b,v7.16b,v7.16b,#8
    pmull2 v1.1q,v27.2d,v16.2d

    eor v0.16b,v0.16b,v29.16b
    eor v2.16b,v2.16b,v31.16b
    ext v24.16b,v6.16b,v6.16b,#8
    eor v1.16b,v1.16b,v30.16b
    ext v23.16b,v5.16b,v5.16b,#8

    // From here the reduction of the current group is interleaved with the
    // Xi-independent products of the group that was just loaded.
    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    pmull v29.1q,v20.1d,v25.1d  //H·Ii+3
    eor v7.16b,v7.16b,v25.16b
    eor v1.16b,v1.16b,v17.16b
    pmull2 v31.1q,v20.2d,v25.2d
    eor v1.16b,v1.16b,v18.16b
    pmull v30.1q,v21.1d,v7.1d

    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    pmull v16.1q,v22.1d,v24.1d  //H^2·Ii+2
    eor v6.16b,v6.16b,v24.16b
    pmull2 v24.1q,v22.2d,v24.2d
    eor v0.16b,v1.16b,v18.16b
    pmull2 v6.1q,v21.2d,v6.2d

    eor v29.16b,v29.16b,v16.16b
    eor v31.16b,v31.16b,v24.16b
    eor v30.16b,v30.16b,v6.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    pmull v7.1q,v26.1d,v23.1d  //H^3·Ii+1
    eor v5.16b,v5.16b,v23.16b
    eor v18.16b,v18.16b,v2.16b
    pmull2 v23.1q,v26.2d,v23.2d
    pmull v5.1q,v27.1d,v5.1d

    eor v0.16b,v0.16b,v18.16b
    eor v29.16b,v29.16b,v7.16b
    eor v31.16b,v31.16b,v23.16b
    ext v0.16b,v0.16b,v0.16b,#8
    eor v30.16b,v30.16b,v5.16b

    subs x3,x3,#64
    b.hs .Loop4x

    // Finish the last full group of four (unreduced result in v0/v1/v2),
    // then dispatch on the number of bytes still left over.
.Ltail4x:
    eor v16.16b,v4.16b,v0.16b
    ext v3.16b,v16.16b,v16.16b,#8

    pmull v0.1q,v28.1d,v3.1d  //H^4·(Xi+Ii)
    eor v16.16b,v16.16b,v3.16b
    pmull2 v2.1q,v28.2d,v3.2d
    pmull2 v1.1q,v27.2d,v16.2d

    eor v0.16b,v0.16b,v29.16b
    eor v2.16b,v2.16b,v31.16b
    eor v1.16b,v1.16b,v30.16b

    // x3 is now the number of bytes after the last full 64-byte group:
    // zero goes straight to .Ldone4x; below 32 -> .Lone, exactly 32 ->
    // .Ltwo, anything larger falls through to .Lthree.
    adds x3,x3,#64
    b.eq .Ldone4x

    cmp x3,#32
    b.lo .Lone
    b.eq .Ltwo
    // Three trailing blocks: reduce the pending product, then use H^3 for
    // (Xi+I[0]), H^2 for I[1] and H for I[2].
.Lthree:
    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    ld1 {v4.2d,v5.2d,v6.2d},[x2]
    eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
    rev64 v5.16b,v5.16b
    rev64 v6.16b,v6.16b
    rev64 v4.16b,v4.16b
#endif

    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    ext v24.16b,v6.16b,v6.16b,#8
    ext v23.16b,v5.16b,v5.16b,#8
    eor v0.16b,v1.16b,v18.16b

    pmull v29.1q,v20.1d,v24.1d  //H·Ii+2
    eor v6.16b,v6.16b,v24.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    pmull2 v31.1q,v20.2d,v24.2d
    pmull v30.1q,v21.1d,v6.1d
    eor v0.16b,v0.16b,v18.16b
    pmull v7.1q,v22.1d,v23.1d  //H^2·Ii+1
    eor v5.16b,v5.16b,v23.16b
    ext v0.16b,v0.16b,v0.16b,#8

    pmull2 v23.1q,v22.2d,v23.2d
    eor v16.16b,v4.16b,v0.16b
    pmull2 v5.1q,v21.2d,v5.2d
    ext v3.16b,v16.16b,v16.16b,#8

    eor v29.16b,v29.16b,v7.16b
    eor v31.16b,v31.16b,v23.16b
    eor v30.16b,v30.16b,v5.16b

    pmull v0.1q,v26.1d,v3.1d  //H^3·(Xi+Ii)
    eor v16.16b,v16.16b,v3.16b
    pmull2 v2.1q,v26.2d,v3.2d
    pmull v1.1q,v27.1d,v16.1d

    eor v0.16b,v0.16b,v29.16b
    eor v2.16b,v2.16b,v31.16b
    eor v1.16b,v1.16b,v30.16b
    b .Ldone4x

    // Two trailing blocks: reduce the pending product, then use H^2 for
    // (Xi+I[0]) and H for I[1].
.align 4
.Ltwo:
    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    ld1 {v4.2d,v5.2d},[x2]
    eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
    rev64 v5.16b,v5.16b
    rev64 v4.16b,v4.16b
#endif

    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    ext v23.16b,v5.16b,v5.16b,#8
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v0.16b,v0.16b,v18.16b
    ext v0.16b,v0.16b,v0.16b,#8

    pmull v29.1q,v20.1d,v23.1d  //H·Ii+1
    eor v5.16b,v5.16b,v23.16b

    eor v16.16b,v4.16b,v0.16b
    ext v3.16b,v16.16b,v16.16b,#8

    pmull2 v31.1q,v20.2d,v23.2d
    pmull v30.1q,v21.1d,v5.1d

    pmull v0.1q,v22.1d,v3.1d  //H^2·(Xi+Ii)
    eor v16.16b,v16.16b,v3.16b
    pmull2 v2.1q,v22.2d,v3.2d
    pmull2 v1.1q,v21.2d,v16.2d

    eor v0.16b,v0.16b,v29.16b
    eor v2.16b,v2.16b,v31.16b
    eor v1.16b,v1.16b,v30.16b
    b .Ldone4x

    // One trailing block: reduce the pending product, then multiply
    // (Xi+I[0]) by H and fall through to the final reduction.
.align 4
.Lone:
    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    ld1 {v4.2d},[x2]
    eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
    rev64 v4.16b,v4.16b
#endif

    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v0.16b,v0.16b,v18.16b
    ext v0.16b,v0.16b,v0.16b,#8

    eor v16.16b,v4.16b,v0.16b
    ext v3.16b,v16.16b,v16.16b,#8

    pmull v0.1q,v20.1d,v3.1d  //H·(Xi+Ii)
    eor v16.16b,v16.16b,v3.16b
    pmull2 v2.1q,v20.2d,v3.2d
    pmull v1.1q,v21.1d,v16.1d

    // Common exit: final Karatsuba post-processing and reduction of the
    // product held in v0 (lo), v1 (mid), v2 (hi), then store Xi.
.Ldone4x:
    ext v17.16b,v0.16b,v2.16b,#8  //Karatsuba post-processing
    eor v18.16b,v0.16b,v2.16b
    eor v1.16b,v1.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b

    pmull v18.1q,v0.1d,v19.1d  //1st phase of reduction
    ins v2.d[0],v1.d[1]
    ins v1.d[1],v0.d[0]
    eor v0.16b,v1.16b,v18.16b

    ext v18.16b,v0.16b,v0.16b,#8  //2nd phase of reduction
    pmull v0.1q,v0.1d,v19.1d
    eor v18.16b,v18.16b,v2.16b
    eor v0.16b,v0.16b,v18.16b
    ext v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
    rev64 v0.16b,v0.16b
#endif
    st1 {v0.2d},[x0]  //write out Xi

    ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)