// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>

.text

.globl gcm_init_neon
.hidden gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1 {v17.2d}, [x1] // load H
	movi v19.16b, #0xe1
	shl v19.2d, v19.2d, #57 // 0xc2.0
	ext v3.16b, v17.16b, v17.16b, #8
	ushr v18.2d, v19.2d, #63
	dup v17.4s, v17.s[1]
	ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
	ushr v18.2d, v3.2d, #63
	sshr v17.4s, v17.4s, #31 // broadcast carry bit
	and v18.16b, v18.16b, v16.16b
	shl v3.2d, v3.2d, #1
	ext v18.16b, v18.16b, v18.16b, #8
	and v16.16b, v16.16b, v17.16b
	orr v3.16b, v3.16b, v18.16b // H<<<=1
	eor v5.16b, v3.16b, v16.16b // twisted H
	st1 {v5.2d}, [x0] // store Htable[0]
	ret
.size gcm_init_neon,.-gcm_init_neon

.globl gcm_gmult_neon
.hidden gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v3.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, .Lmasks // load constants
	add x9, x9, :lo12:.Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v3.16b, v3.16b // byteswap Xi
	ext v3.16b, v3.16b, v3.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

	mov x3, #16
	b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon

.globl gcm_ghash_neon
.hidden gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v0.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, .Lmasks // load constants
	add x9, x9, :lo12:.Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v0.16b, v0.16b // byteswap Xi
	ext v0.16b, v0.16b, v0.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

.Loop_neon:
	ld1 {v3.16b}, [x2], #16 // load inp
	rev64 v3.16b, v3.16b // byteswap inp
	ext v3.16b, v3.16b, v3.16b, #8
	eor v3.16b, v3.16b, v0.16b // inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins v4.d[0], v3.d[1]
	ext v16.8b, v5.8b, v5.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v0.8b, v3.8b, v3.8b, #1 // B1
	pmull v0.8h, v5.8b, v0.8b // E = A*B1
	ext v17.8b, v5.8b, v5.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v5.8b, v19.8b // G = A*B2
	ext v18.8b, v5.8b, v5.8b, #3 // A3
	eor v16.16b, v16.16b, v0.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v0.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v0.8h, v5.8b, v0.8b // I = A*B3
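	// The ext/pmull pairs above build a 64x64-bit carryless multiply
	// out of 8x8-bit polynomial multiplies, in the style of the ARMv4
	// vmull.p8 code: ext with #r rotates A by r bytes, so lane i of the
	// rotated product holds A[(i+r)%8]*B[i], i.e. all partial products
	// at byte offset r. A scalar sketch of one pmull lane (hypothetical
	// helper, not part of this file):
	//
	//	#include <stdint.h>
	//
	//	uint16_t pmull8(uint8_t a, uint8_t b) {
	//		uint16_t r = 0;
	//		for (int i = 0; i < 8; i++) {
	//			if (b & (1u << i)) {
	//				r ^= (uint16_t)(a << i); // carryless: add is XOR
	//			}
	//		}
	//		return r;
	//	}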

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v0.16b // N = I + J
	pmull v19.8h, v5.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v0.8h, v5.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v0.16b, v0.16b, v16.16b
	eor v0.16b, v0.16b, v18.16b
	eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
	ext v16.8b, v7.8b, v7.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v1.8b, v3.8b, v3.8b, #1 // B1
	pmull v1.8h, v7.8b, v1.8b // E = A*B1
	ext v17.8b, v7.8b, v7.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v7.8b, v19.8b // G = A*B2
	ext v18.8b, v7.8b, v7.8b, #3 // A3
	eor v16.16b, v16.16b, v1.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v1.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v1.8h, v7.8b, v1.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
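	// A note on the masks (a reading of this file's own constants,
	// stated here once rather than per block): .Lmasks is loaded as
	// v24 = {k48, k32} and v25 = {k16, k0}, so in each zip1/zip2 block
	// the two ANDs apply all four masks at once; ANDing t3's high half
	// with k0 (zero) plays the role of the 32-bit code's vmov.i64.
	// After masking, the ext #15/#14/#13/#12 instructions shift t0..t3
	// left by 8/16/24/32 bits, and the results are folded into D = A*B
	// to complete the 64x64-bit product.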

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v1.16b // N = I + J
	pmull v19.8h, v7.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v1.8h, v7.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v1.16b, v1.16b, v16.16b
	eor v1.16b, v1.16b, v18.16b
	ext v16.8b, v6.8b, v6.8b, #1 // A1
	pmull v16.8h, v16.8b, v4.8b // F = A1*B
	ext v2.8b, v4.8b, v4.8b, #1 // B1
	pmull v2.8h, v6.8b, v2.8b // E = A*B1
	ext v17.8b, v6.8b, v6.8b, #2 // A2
	pmull v17.8h, v17.8b, v4.8b // H = A2*B
	ext v19.8b, v4.8b, v4.8b, #2 // B2
	pmull v19.8h, v6.8b, v19.8b // G = A*B2
	ext v18.8b, v6.8b, v6.8b, #3 // A3
	eor v16.16b, v16.16b, v2.16b // L = E + F
	pmull v18.8h, v18.8b, v4.8b // J = A3*B
	ext v2.8b, v4.8b, v4.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v2.8h, v6.8b, v2.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v4.8b, v4.8b, #4 // B4
	eor v18.16b, v18.16b, v2.16b // N = I + J
	pmull v19.8h, v6.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v2.8h, v6.8b, v4.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v2.16b, v2.16b, v16.16b
	eor v2.16b, v2.16b, v18.16b
	ext v16.16b, v0.16b, v2.16b, #8
	eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
	eor v1.16b, v1.16b, v2.16b
	eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
	ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins v2.d[0], v1.d[1]

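	// The reduction below folds the 256-bit product back to 128 bits
	// modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1, with
	// operands bit-reflected (a reading based on reduction_avx in
	// ghash-x86_64.pl, not spelled out in this file): the first-phase
	// shifts #57/#62/#63 are 64-7, 64-2 and 64-1, and the second phase
	// combines X>>1, X>>2 and X>>7, matching the x^7, x^2 and x terms.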
	// equivalent of reduction_avx from ghash-x86_64.pl
	shl v17.2d, v0.2d, #57 // 1st phase
	shl v18.2d, v0.2d, #62
	eor v18.16b, v18.16b, v17.16b //
	shl v17.2d, v0.2d, #63
	eor v18.16b, v18.16b, v17.16b //
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor v18.16b, v18.16b, v1.16b
	ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
	ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]

	ushr v18.2d, v0.2d, #1 // 2nd phase
	eor v2.16b, v2.16b, v0.16b
	eor v0.16b, v0.16b, v18.16b //
	ushr v18.2d, v18.2d, #6
	ushr v0.2d, v0.2d, #1 //
	eor v0.16b, v0.16b, v2.16b //
	eor v0.16b, v0.16b, v18.16b //

	subs x3, x3, #16
	bne .Loop_neon

	rev64 v0.16b, v0.16b // byteswap Xi and write
	ext v0.16b, v0.16b, v0.16b, #8
	st1 {v0.16b}, [x0]

	ret
.size gcm_ghash_neon,.-gcm_ghash_neon

.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
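
// A usage sketch of these entry points, assuming BoringSSL-style
// prototypes (consult the real headers for the authoritative
// declarations and the Htable layout):
//
//	void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
//	void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
//	void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
//	                    const uint8_t *inp, size_t len);
//
// gcm_ghash_neon absorbs len bytes (x3, a multiple of 16) from inp (x2)
// into Xi (x0), computing Xi <- (Xi ^ block) * H in GF(2^128) per
// 16-byte block; gcm_gmult_neon performs a single multiply of Xi by H
// (it sets x3 = 16 and falls into the same loop body).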