; Source: /aosp_15_r20/external/llvm/test/Transforms/InstCombine/x86-masked-memops.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
; RUN: opt < %s -instcombine -S | FileCheck %s

;; MASKED LOADS

; If the mask isn't constant, do nothing.
; Here the mask is the function argument %mask, so the x86 intrinsic call is
; expected to come out of instcombine unchanged.

define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
  ret <4 x float> %ld

; CHECK-LABEL: @mload(
; CHECK-NEXT:  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
; CHECK-NEXT:  ret <4 x float> %ld
}

; Zero mask returns a zero vector.
; No lane is selected, so the expected output contains no load at all: the
; function body collapses to returning zeroinitializer.

define <4 x float> @mload_zeros(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_zeros(
; CHECK-NEXT:  ret <4 x float> zeroinitializer
}

; Only the sign bit matters.
; Every mask element below is non-zero but non-negative (1, 2, 3 and
; 2147483647 = 0x7FFFFFFF), so no element has its sign bit set and the result
; is expected to be the same as for an all-zero mask.

define <4 x float> @mload_fake_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_fake_ones(
; CHECK-NEXT:  ret <4 x float> zeroinitializer
}

; All mask sign bits are set, so this is just a vector load.
; The elements are -1, -2, -3 and 2147483648 (bit pattern 0x80000000, i.e. only
; the sign bit); the other bits differ, but every lane is selected. The expected
; output is a plain load through a bitcast of the i8* pointer.

define <4 x float> @mload_real_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 2147483648>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_real_ones(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %unmaskedload = load <4 x float>, <4 x float>* %castvec
; CHECK-NEXT:  ret <4 x float> %unmaskedload
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
; Only the last element (-1) has its sign bit set, so the expected generic
; llvm.masked.load call carries the i1 mask <false, false, false, true>, an
; alignment operand of 1 and a zero passthrough vector.

define <4 x float> @mload_one_one(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_one_one(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> zeroinitializer)
; CHECK-NEXT:  ret <4 x float> %1
}

; Try doubles.
; Same conversion as for floats, using the 128-bit pd intrinsic with an i64
; mask; only element 0 is selected.

define <2 x double> @mload_one_one_double(i8* %f) {
  %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> <i64 -1, i64 0>)
  ret <2 x double> %ld

; CHECK-LABEL: @mload_one_one_double(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x double>*
; CHECK-NEXT:  %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> zeroinitializer)
; CHECK-NEXT:  ret <2 x double> %1
}

; Try 256-bit FP ops.
; Eight-lane float load where only element 3 of the mask has its sign bit set.

define <8 x float> @mload_v8f32(i8* %f) {
  %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
  ret <8 x float> %ld

; CHECK-LABEL: @mload_v8f32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x float>*
; CHECK-NEXT:  %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> zeroinitializer)
; CHECK-NEXT:  ret <8 x float> %1
}

; 256-bit double variant: four lanes with an i64 mask, only element 0 selected.

define <4 x double> @mload_v4f64(i8* %f) {
  %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
  ret <4 x double> %ld

; CHECK-LABEL: @mload_v4f64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x double>*
; CHECK-NEXT:  %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> zeroinitializer)
; CHECK-NEXT:  ret <4 x double> %1
}

; Try the AVX2 variants.
; Integer masked load of four i32 lanes; only element 3 is selected.

define <4 x i32> @mload_v4i32(i8* %f) {
  %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
  ret <4 x i32> %ld

; CHECK-LABEL: @mload_v4i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i32>*
; CHECK-NEXT:  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer)
; CHECK-NEXT:  ret <4 x i32> %1
}

; AVX2 integer masked load of two i64 lanes; only element 0 is selected.

define <2 x i64> @mload_v2i64(i8* %f) {
  %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> <i64 -1, i64 0>)
  ret <2 x i64> %ld

; CHECK-LABEL: @mload_v2i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x i64>*
; CHECK-NEXT:  %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> zeroinitializer)
; CHECK-NEXT:  ret <2 x i64> %1
}

; AVX2 256-bit integer masked load of eight i32 lanes; only element 3 is selected.

define <8 x i32> @mload_v8i32(i8* %f) {
  %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
  ret <8 x i32> %ld

; CHECK-LABEL: @mload_v8i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x i32>*
; CHECK-NEXT:  %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> zeroinitializer)
; CHECK-NEXT:  ret <8 x i32> %1
}

; AVX2 256-bit integer masked load of four i64 lanes; only element 0 is selected.

define <4 x i64> @mload_v4i64(i8* %f) {
  %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
  ret <4 x i64> %ld

; CHECK-LABEL: @mload_v4i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i64>*
; CHECK-NEXT:  %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> zeroinitializer)
; CHECK-NEXT:  ret <4 x i64> %1
}


;; MASKED STORES

; If the mask isn't constant, do nothing.
; Here the mask is the function argument %mask, so the x86 intrinsic call is
; expected to come out of instcombine unchanged.

define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore(
; CHECK-NEXT:  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
; CHECK-NEXT:  ret void
}

; Zero mask is a nop.
; No lane is selected, so the call is expected to be deleted, leaving only the
; return.

define void @mstore_zeros(i8* %f, <4 x float> %v)  {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> zeroinitializer, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_zeros(
; CHECK-NEXT:  ret void
}

; Only the sign bit matters.
; Every mask element below is non-zero but non-negative (1, 2, 3 and
; 2147483647 = 0x7FFFFFFF), so no element has its sign bit set and the store is
; expected to be removed just like the all-zero-mask case.

define void @mstore_fake_ones(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_fake_ones(
; CHECK-NEXT:  ret void
}

; All mask sign bits are set, so this is just a vector store.
; The elements are -1, -2, -3 and -2147483648 (only the sign bit); the other
; bits differ, but every lane is selected. The expected output is a plain store
; through a bitcast of the i8* pointer.

define void @mstore_real_ones(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -2147483648>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_real_ones(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  store <4 x float> %v, <4 x float>* %castvec
; CHECK-NEXT:  ret void
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
; Only the last element (-1) has its sign bit set, so the expected generic
; llvm.masked.store call carries the i1 mask <false, false, false, true> and an
; alignment operand of 1.

define void @mstore_one_one(i8* %f, <4 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, <4 x float> %v)
  ret void

; CHECK-LABEL: @mstore_one_one(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
; CHECK-NEXT:  ret void
}

; Try doubles.
; Same conversion as for floats, using the 128-bit pd intrinsic with an i64
; mask; only element 0 is selected.

define void @mstore_one_one_double(i8* %f, <2 x double> %v) {
  tail call void @llvm.x86.avx.maskstore.pd(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x double> %v)
  ret void

; CHECK-LABEL: @mstore_one_one_double(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x double>*
; CHECK-NEXT:  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>)
; CHECK-NEXT:  ret void
}

; Try 256-bit FP ops.
; Elements 0-3 of the mask are non-negative (0, 1, 2, 3) and elements 4-7 are
; negative (-1, -2, -3, -4), so only the upper four lanes are expected to be
; selected in the resulting i1 mask.

define void @mstore_v8f32(i8* %f, <8 x float> %v) {
  tail call void @llvm.x86.avx.maskstore.ps.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x float> %v)
  ret void

; CHECK-LABEL: @mstore_v8f32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x float>*
; CHECK-NEXT:  call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; 256-bit double variant. The mask is <-1, 0, 1, 2>: only element 0 is
; negative, so only lane 0 is expected to be selected even though elements 2
; and 3 are non-zero.

define void @mstore_v4f64(i8* %f, <4 x double> %v) {
  tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x double> %v)
  ret void

; CHECK-LABEL: @mstore_v4f64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x double>*
; CHECK-NEXT:  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT:  ret void
}

; Try the AVX2 variants.
; Integer masked store of four i32 lanes. The mask is <0, 1, -1, -2>, so only
; lanes 2 and 3 (the negative elements) are expected to be selected.

define void @mstore_v4i32(i8* %f, <4 x i32> %v) {
  tail call void @llvm.x86.avx2.maskstore.d(i8* %f, <4 x i32> <i32 0, i32 1, i32 -1, i32 -2>, <4 x i32> %v)
  ret void

; CHECK-LABEL: @mstore_v4i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i32>*
; CHECK-NEXT:  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; AVX2 integer masked store of two i64 lanes; only element 0 is selected.

define void @mstore_v2i64(i8* %f, <2 x i64> %v) {
  tail call void @llvm.x86.avx2.maskstore.q(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x i64> %v)
  ret void

; CHECK-LABEL: @mstore_v2i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <2 x i64>*
; CHECK-NEXT:  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>)
; CHECK-NEXT:  ret void
}

; AVX2 256-bit integer masked store of eight i32 lanes. Mask elements 0-3 are
; non-negative and elements 4-7 are negative, so only the upper four lanes are
; expected to be selected.

define void @mstore_v8i32(i8* %f, <8 x i32> %v) {
  tail call void @llvm.x86.avx2.maskstore.d.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x i32> %v)
  ret void

; CHECK-LABEL: @mstore_v8i32(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <8 x i32>*
; CHECK-NEXT:  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT:  ret void
}

; AVX2 256-bit integer masked store of four i64 lanes. The mask is
; <-1, 0, 1, 2>: only element 0 is negative, so only lane 0 is expected to be
; selected.

define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
  tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x i64> %v)
  ret void

; CHECK-LABEL: @mstore_v4i64(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x i64>*
; CHECK-NEXT:  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT:  ret void
}

; The original SSE2 masked store variant.
; Note the different operand order of this intrinsic: data, mask, pointer.
; With an all-zero byte mask the call is expected to be deleted.

define void @mstore_v16i8_sse2_zeros(<16 x i8> %d, i8* %p) {
  tail call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %d, <16 x i8> zeroinitializer, i8* %p)
  ret void

; CHECK-LABEL: @mstore_v16i8_sse2_zeros(
; CHECK-NEXT:  ret void
}


; AVX floating-point masked loads: (pointer, integer mask) -> vector.
declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>)

; AVX2 integer masked loads: (pointer, integer mask) -> vector.
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>)
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>)
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>)
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>)

; AVX floating-point masked stores: (pointer, integer mask, data).
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>)
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>)

; AVX2 integer masked stores: (pointer, integer mask, data).
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>)
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>)
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>)
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)

; SSE2 byte-masked store: (data, byte mask, pointer).
declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)