// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// Syntax: GNU as, AArch64, run through the C preprocessor.
// Target: Windows on AArch64 (see the guard below).
//
// Exported symbols: gcm_init_neon, gcm_gmult_neon, gcm_ghash_neon.
// None of them touches the stack or calls out. Only v0-v7 and v16-v25 are
// used as vector registers, so the v8-v15 range is never modified.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

.text

//-----------------------------------------------------------------------
// gcm_init_neon
// Presumably: void gcm_init_neon(u128 Htable[16], const uint64_t H[2])
// (C prototype is not visible in this file -- TODO confirm).
// In:    x0 = destination; exactly 16 bytes are written
//        x1 = source H; exactly 16 bytes are read
// Out:   [x0] = "twisted H" (H rotated left by one bit, then conditionally
//        xored with the 0xc2....01 constant depending on the carry bit)
// Clobb: v3, v5, v16-v19
//-----------------------------------------------------------------------
.globl	gcm_init_neon

.def gcm_init_neon
   .type 32
.endef
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


//-----------------------------------------------------------------------
// gcm_gmult_neon
// Presumably: void gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16])
// (C prototype is not visible in this file -- TODO confirm).
// In:    x0 = Xi, 16 bytes, read here and rewritten by the shared tail
//        x1 = twisted H as stored by gcm_init_neon (16 bytes are read)
// Out:   [x0] = updated Xi
// Clobb: x1 (post-incremented by 8), x3, x9, v0-v7, v16-v25, flags
//
// This entry point has no body of its own: it loads Xi into v3 (the
// register the multiplication expects as its input), sets x3 to 16 so the
// shared loop runs exactly once, and branches into gcm_ghash_neon at
// Lgmult_neon, skipping the input load.
//-----------------------------------------------------------------------
.globl	gcm_gmult_neon

.def gcm_gmult_neon
   .type 32
.endef
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16			// one 16-byte block: loop tail exits after one pass
	b	Lgmult_neon


//-----------------------------------------------------------------------
// gcm_ghash_neon
// Presumably: void gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16],
//                                 const uint8_t *inp, size_t len)
// (C prototype is not visible in this file -- TODO confirm).
// In:    x0 = Xi, 16 bytes, read on entry and written on exit
//        x1 = twisted H as stored by gcm_init_neon (16 bytes are read)
//        x2 = input pointer, advanced by 16 per iteration
//        x3 = remaining byte count
// Out:   [x0] = updated Xi
// Clobb: x1 (post-incremented by 8), x2, x3, x9, v0-v7, v16-v25, flags
//
// NOTE: the loop terminates only when "subs x3, x3, #16" yields exactly
// zero, so x3 must be a non-zero multiple of 16 on entry. There is no
// check for this here; callers are presumably responsible -- verify.
//
// Register roles inside the loop:
//   v0       running Xi between iterations; Xl during the multiplication
//   v1, v2   Xm and Xh of the Karatsuba product
//   v3, v4   low and high 64-bit halves of the (byteswapped) input block
//   v5, v6   first and second 8 bytes of twisted H; v7 = v5 ^ v6
//   v16-v23  temporaries
//   v24, v25 masks loaded from Lmasks
//-----------------------------------------------------------------------
.globl	gcm_ghash_neon

.def gcm_ghash_neon
   .type 32
.endef
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]

	// First 64x64 multiplication: v0 (Xl) = v5 * v3, low 64 bits of each.
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b	// mask with {k48, k32}
	and	v23.16b, v23.16b, v25.16b	// mask with {k16, k0}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d		// reassemble t0..t3 as lo|hi
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b

	// Second 64x64 multiplication: v1 (Xm) = v7 * (v3 ^ v4).
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b	// mask with {k48, k32}
	and	v23.16b, v23.16b, v25.16b	// mask with {k16, k0}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d		// reassemble t0..t3 as lo|hi
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b

	// Third 64x64 multiplication: v2 (Xh) = v6 * v4.
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b	// mask with {k48, k32}
	and	v23.16b, v23.16b, v25.16b	// mask with {k16, k0}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d		// reassemble t0..t3 as lo|hi
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b

	// Combine the three partial products (v0 = Xl, v1 = Xm, v2 = Xh).
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	// v0 now holds the reduced Xi for this block. Loop while bytes remain;
	// the exit condition is x3 reaching exactly zero.
	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	.rodata
.align	4
// Masks used to fold the partial products. They are loaded as a pair of
// 128-bit registers: v24 = {k48, k32}, v25 = {k16, k0}.
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
// NUL-terminated ASCII ident string:
// "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)