; RUN: opt < %s -instcombine -S | FileCheck %s

; Exercises instcombine on the x86 masked load/store intrinsics. The expected
; output for each function is given by the FileCheck directives in its body.

;; MASKED LOADS

; If the mask isn't constant, do nothing.

define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
  ret <4 x float> %ld

; CHECK-LABEL: @mload(
; CHECK-NEXT:  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
; CHECK-NEXT:  ret <4 x float> %ld
}

; Zero mask returns a zero vector.

define <4 x float> @mload_zeros(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_zeros(
; CHECK-NEXT:  ret <4 x float> zeroinitializer
}

; Only the sign bit matters.
; None of the mask elements below is negative, so the expected result is the
; same zero vector as for an all-zero mask.
define <4 x float> @mload_fake_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_fake_ones(
; CHECK-NEXT:  ret <4 x float> zeroinitializer
}

; Every lane's sign bit is set, so this is just a vector load.
; The last element is written as -2147483648 (sign bit only), the in-range
; signed i32 spelling of that bit pattern.

define <4 x float> @mload_real_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -2147483648>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_real_ones(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %unmaskedload = load <4 x float>, <4 x float>* %castvec
; CHECK-NEXT:  ret <4 x float> %unmaskedload
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
; Only the last lane of the mask is negative; the expected <4 x i1> mask has
; a single true element in that position.
define <4 x float> @mload_one_one(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_one_one(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> zeroinitializer)
; CHECK-NEXT:  ret <4 x float> %1
}

; Try doubles.

define <2 x double> @mload_one_one_double(i8* %f) {
  %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> <i64 -1, i64 0>)
  ret <2 x double> %ld

; CHECK-LABEL: @mload_one_one_double(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x double>*
; CHECK-NEXT:  %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> zeroinitializer)
; CHECK-NEXT:  ret <2 x double> %1
}

; Try 256-bit FP ops.
; <8 x float> load; only lane 3 of the mask is negative.
define <8 x float> @mload_v8f32(i8* %f) {
  %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
  ret <8 x float> %ld

; CHECK-LABEL: @mload_v8f32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x float>*
; CHECK-NEXT:  %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> zeroinitializer)
; CHECK-NEXT:  ret <8 x float> %1
}

; <4 x double> load; only lane 0 of the mask is negative.
define <4 x double> @mload_v4f64(i8* %f) {
  %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
  ret <4 x double> %ld

; CHECK-LABEL: @mload_v4f64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x double>*
; CHECK-NEXT:  %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> zeroinitializer)
; CHECK-NEXT:  ret <4 x double> %1
}

; Try the AVX2 variants.
; <4 x i32> load; only the last lane of the mask is negative.
define <4 x i32> @mload_v4i32(i8* %f) {
  %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
  ret <4 x i32> %ld

; CHECK-LABEL: @mload_v4i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i32>*
; CHECK-NEXT:  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer)
; CHECK-NEXT:  ret <4 x i32> %1
}

; <2 x i64> load; only lane 0 of the mask is negative.
define <2 x i64> @mload_v2i64(i8* %f) {
  %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> <i64 -1, i64 0>)
  ret <2 x i64> %ld

; CHECK-LABEL: @mload_v2i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x i64>*
; CHECK-NEXT:  %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> zeroinitializer)
; CHECK-NEXT:  ret <2 x i64> %1
}

; <8 x i32> load; only lane 3 of the mask is negative.
define <8 x i32> @mload_v8i32(i8* %f) {
  %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
  ret <8 x i32> %ld

; CHECK-LABEL: @mload_v8i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x i32>*
; CHECK-NEXT:  %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> zeroinitializer)
; CHECK-NEXT:  ret <8 x i32> %1
}

; <4 x i64> load; only lane 0 of the mask is negative.
define <4 x i64> @mload_v4i64(i8* %f) {
  %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
  ret <4 x i64> %ld

; CHECK-LABEL: @mload_v4i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i64>*
; CHECK-NEXT:  %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> zeroinitializer)
; CHECK-NEXT:  ret <4 x i64> %1
}


;; MASKED STORES

; If the mask isn't constant, do nothing.
; The mask is a function argument here, and the call is expected to survive
; unchanged.
define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore(
; CHECK-NEXT:  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
; CHECK-NEXT:  ret void
}

; Zero mask is a nop.

define void @mstore_zeros(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> zeroinitializer, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_zeros(
; CHECK-NEXT:  ret void
}

; Only the sign bit matters.
; None of the mask elements below is negative, so the store is expected to
; disappear just as it does for an all-zero mask.
define void @mstore_fake_ones(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_fake_ones(
; CHECK-NEXT:  ret void
}

; Every lane's sign bit is set, so this is just a vector store.

define void @mstore_real_ones(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -2147483648>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_real_ones(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  store <4 x float> %v, <4 x float>* %castvec
; CHECK-NEXT:  ret void
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
; Only the last lane of the mask is negative; the expected <4 x i1> mask has
; a single true element in that position.
define void @mstore_one_one(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_one_one(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
; CHECK-NEXT:  ret void
}

; Try doubles.

define void @mstore_one_one_double(i8* %f, <2 x double> %v) {
  tail call void @llvm.x86.avx.maskstore.pd(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x double> %v)
  ret void

; CHECK-LABEL: @mstore_one_one_double(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x double>*
; CHECK-NEXT:  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>)
; CHECK-NEXT:  ret void
}

; Try 256-bit FP ops.
; <8 x float> store; lanes 0-3 of the mask are non-negative and lanes 4-7 are
; negative.
define void @mstore_v8f32(i8* %f, <8 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x float> %v)
  ret void

; CHECK-LABEL: @mstore_v8f32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x float>*
; CHECK-NEXT:  call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; <4 x double> store; only lane 0 of the mask is negative.
define void @mstore_v4f64(i8* %f, <4 x double> %v) {
  tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x double> %v)
  ret void

; CHECK-LABEL: @mstore_v4f64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x double>*
; CHECK-NEXT:  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT:  ret void
}

; Try the AVX2 variants.
; <4 x i32> store; lanes 2 and 3 of the mask are negative.
define void @mstore_v4i32(i8* %f, <4 x i32> %v) {
  tail call void @llvm.x86.avx2.maskstore.d(i8* %f, <4 x i32> <i32 0, i32 1, i32 -1, i32 -2>, <4 x i32> %v)
  ret void

; CHECK-LABEL: @mstore_v4i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i32>*
; CHECK-NEXT:  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; <2 x i64> store; only lane 0 of the mask is negative.
define void @mstore_v2i64(i8* %f, <2 x i64> %v) {
  tail call void @llvm.x86.avx2.maskstore.q(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x i64> %v)
  ret void

; CHECK-LABEL: @mstore_v2i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x i64>*
; CHECK-NEXT:  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>)
; CHECK-NEXT:  ret void
}

; <8 x i32> store; lanes 4-7 of the mask are negative.
define void @mstore_v8i32(i8* %f, <8 x i32> %v) {
  tail call void @llvm.x86.avx2.maskstore.d.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x i32> %v)
  ret void

; CHECK-LABEL: @mstore_v8i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x i32>*
; CHECK-NEXT:  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; <4 x i64> store; only lane 0 of the mask is negative.
define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
  tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x i64> %v)
  ret void

; CHECK-LABEL: @mstore_v4i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i64>*
; CHECK-NEXT:  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT:  ret void
}

; The original SSE2 masked store variant.
; With an all-zero byte mask the call is expected to be removed entirely.
define void @mstore_v16i8_sse2_zeros(<16 x i8> %d, i8* %p) {
  tail call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %d, <16 x i8> zeroinitializer, i8* %p)
  ret void

; CHECK-LABEL: @mstore_v16i8_sse2_zeros(
; CHECK-NEXT:  ret void
}


; Declarations of the x86 intrinsics called by the functions in this file.

declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>)

declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>)
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>)
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>)
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>)

declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>)
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>)

declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>)
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>)
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>)
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)

declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)