// Origin: external/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new aarch64 intrinsics and types

#include <arm_neon.h>


9*67e74705SXin Li // CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
10*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
11*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
12*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
13*67e74705SXin Li // CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
14*67e74705SXin Li // CHECK:   ret float [[MUL]]
test_vmuls_lane_f32(float32_t a,float32x2_t b)15*67e74705SXin Li float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
16*67e74705SXin Li   return vmuls_lane_f32(a, b, 1);
17*67e74705SXin Li }
18*67e74705SXin Li 
19*67e74705SXin Li // CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
20*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
21*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
22*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
23*67e74705SXin Li // CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
24*67e74705SXin Li // CHECK:   ret double [[MUL]]
test_vmuld_lane_f64(float64_t a,float64x1_t b)25*67e74705SXin Li float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
26*67e74705SXin Li   return vmuld_lane_f64(a, b, 0);
27*67e74705SXin Li }
28*67e74705SXin Li 
29*67e74705SXin Li // CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
30*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
31*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
32*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
33*67e74705SXin Li // CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
34*67e74705SXin Li // CHECK:   ret float [[MUL]]
test_vmuls_laneq_f32(float32_t a,float32x4_t b)35*67e74705SXin Li float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
36*67e74705SXin Li   return vmuls_laneq_f32(a, b, 3);
37*67e74705SXin Li }
38*67e74705SXin Li 
39*67e74705SXin Li // CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
40*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
41*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
42*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
43*67e74705SXin Li // CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
44*67e74705SXin Li // CHECK:   ret double [[MUL]]
test_vmuld_laneq_f64(float64_t a,float64x2_t b)45*67e74705SXin Li float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
46*67e74705SXin Li   return vmuld_laneq_f64(a, b, 1);
47*67e74705SXin Li }
48*67e74705SXin Li 
49*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
50*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
51*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
52*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double
53*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
54*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
55*67e74705SXin Li // CHECK:   ret <1 x double> [[TMP4]]
test_vmul_n_f64(float64x1_t a,float64_t b)56*67e74705SXin Li float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
57*67e74705SXin Li   return vmul_n_f64(a, b);
58*67e74705SXin Li }
59*67e74705SXin Li 
60*67e74705SXin Li // CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
61*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
62*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
63*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
64*67e74705SXin Li // CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
65*67e74705SXin Li // CHECK:   ret float [[VMULXS_F32_I]]
test_vmulxs_lane_f32(float32_t a,float32x2_t b)66*67e74705SXin Li float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
67*67e74705SXin Li   return vmulxs_lane_f32(a, b, 1);
68*67e74705SXin Li }
69*67e74705SXin Li 
70*67e74705SXin Li // CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
71*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
72*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
73*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
74*67e74705SXin Li // CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
75*67e74705SXin Li // CHECK:   ret float [[VMULXS_F32_I]]
test_vmulxs_laneq_f32(float32_t a,float32x4_t b)76*67e74705SXin Li float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
77*67e74705SXin Li   return vmulxs_laneq_f32(a, b, 3);
78*67e74705SXin Li }
79*67e74705SXin Li 
80*67e74705SXin Li // CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
81*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
82*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
83*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
84*67e74705SXin Li // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
85*67e74705SXin Li // CHECK:   ret double [[VMULXD_F64_I]]
test_vmulxd_lane_f64(float64_t a,float64x1_t b)86*67e74705SXin Li float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
87*67e74705SXin Li   return vmulxd_lane_f64(a, b, 0);
88*67e74705SXin Li }
89*67e74705SXin Li 
90*67e74705SXin Li // CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
91*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
92*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
93*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
94*67e74705SXin Li // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
95*67e74705SXin Li // CHECK:   ret double [[VMULXD_F64_I]]
test_vmulxd_laneq_f64(float64_t a,float64x2_t b)96*67e74705SXin Li float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
97*67e74705SXin Li   return vmulxd_laneq_f64(a, b, 1);
98*67e74705SXin Li }
99*67e74705SXin Li 
100*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
101*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
102*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
103*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
104*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
105*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
106*67e74705SXin Li // CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
107*67e74705SXin Li // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
108*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
109*67e74705SXin Li // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
110*67e74705SXin Li // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
111*67e74705SXin Li // CHECK:   ret <1 x double> [[VSET_LANE]]
test_vmulx_lane_f64(float64x1_t a,float64x1_t b)112*67e74705SXin Li float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
113*67e74705SXin Li   return vmulx_lane_f64(a, b, 0);
114*67e74705SXin Li }
115*67e74705SXin Li 
116*67e74705SXin Li 
117*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
118*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
119*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
120*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
121*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
122*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
123*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
124*67e74705SXin Li // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
125*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
126*67e74705SXin Li // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
127*67e74705SXin Li // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
128*67e74705SXin Li // CHECK:   ret <1 x double> [[VSET_LANE]]
test_vmulx_laneq_f64_0(float64x1_t a,float64x2_t b)129*67e74705SXin Li float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
130*67e74705SXin Li   return vmulx_laneq_f64(a, b, 0);
131*67e74705SXin Li }
132*67e74705SXin Li 
133*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
134*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
135*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
136*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
137*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
138*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
139*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
140*67e74705SXin Li // CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
141*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
142*67e74705SXin Li // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
143*67e74705SXin Li // CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
144*67e74705SXin Li // CHECK:   ret <1 x double> [[VSET_LANE]]
test_vmulx_laneq_f64_1(float64x1_t a,float64x2_t b)145*67e74705SXin Li float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
146*67e74705SXin Li   return vmulx_laneq_f64(a, b, 1);
147*67e74705SXin Li }
148*67e74705SXin Li 
149*67e74705SXin Li 
150*67e74705SXin Li // CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
151*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
152*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
153*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
154*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
155*67e74705SXin Li // CHECK:   ret float [[TMP2]]
test_vfmas_lane_f32(float32_t a,float32_t b,float32x2_t c)156*67e74705SXin Li float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
157*67e74705SXin Li   return vfmas_lane_f32(a, b, c, 1);
158*67e74705SXin Li }
159*67e74705SXin Li 
160*67e74705SXin Li // CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
161*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
162*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
163*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
164*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
165*67e74705SXin Li // CHECK:   ret double [[TMP2]]
test_vfmad_lane_f64(float64_t a,float64_t b,float64x1_t c)166*67e74705SXin Li float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
167*67e74705SXin Li   return vfmad_lane_f64(a, b, c, 0);
168*67e74705SXin Li }
169*67e74705SXin Li 
170*67e74705SXin Li // CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
171*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
172*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
173*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
174*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
175*67e74705SXin Li // CHECK:   ret double [[TMP2]]
test_vfmad_laneq_f64(float64_t a,float64_t b,float64x2_t c)176*67e74705SXin Li float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
177*67e74705SXin Li   return vfmad_laneq_f64(a, b, c, 1);
178*67e74705SXin Li }
179*67e74705SXin Li 
180*67e74705SXin Li // CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
181*67e74705SXin Li // CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
182*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
183*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
184*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
185*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
186*67e74705SXin Li // CHECK:   ret float [[TMP2]]
test_vfmss_lane_f32(float32_t a,float32_t b,float32x2_t c)187*67e74705SXin Li float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
188*67e74705SXin Li   return vfmss_lane_f32(a, b, c, 1);
189*67e74705SXin Li }
190*67e74705SXin Li 
191*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
192*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
193*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
194*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
195*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
196*67e74705SXin Li // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
197*67e74705SXin Li // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
198*67e74705SXin Li // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
199*67e74705SXin Li // CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
200*67e74705SXin Li // CHECK:   ret <1 x double> [[FMLA2]]
test_vfma_lane_f64(float64x1_t a,float64x1_t b,float64x1_t v)201*67e74705SXin Li float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
202*67e74705SXin Li   return vfma_lane_f64(a, b, v, 0);
203*67e74705SXin Li }
204*67e74705SXin Li 
205*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
206*67e74705SXin Li // CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
207*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
208*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
209*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
210*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
211*67e74705SXin Li // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
212*67e74705SXin Li // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
213*67e74705SXin Li // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
214*67e74705SXin Li // CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
215*67e74705SXin Li // CHECK:   ret <1 x double> [[FMLA2]]
test_vfms_lane_f64(float64x1_t a,float64x1_t b,float64x1_t v)216*67e74705SXin Li float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
217*67e74705SXin Li   return vfms_lane_f64(a, b, v, 0);
218*67e74705SXin Li }
219*67e74705SXin Li 
220*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
221*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
222*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
223*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
224*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
225*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
226*67e74705SXin Li // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
227*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
228*67e74705SXin Li // CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
229*67e74705SXin Li // CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
230*67e74705SXin Li // CHECK:   ret <1 x double> [[TMP7]]
test_vfma_laneq_f64(float64x1_t a,float64x1_t b,float64x2_t v)231*67e74705SXin Li float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
232*67e74705SXin Li   return vfma_laneq_f64(a, b, v, 0);
233*67e74705SXin Li }
234*67e74705SXin Li 
235*67e74705SXin Li // CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
236*67e74705SXin Li // CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
237*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
238*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
239*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
240*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
241*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
242*67e74705SXin Li // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
243*67e74705SXin Li // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
244*67e74705SXin Li // CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
245*67e74705SXin Li // CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
246*67e74705SXin Li // CHECK:   ret <1 x double> [[TMP7]]
test_vfms_laneq_f64(float64x1_t a,float64x1_t b,float64x2_t v)247*67e74705SXin Li float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
248*67e74705SXin Li   return vfms_laneq_f64(a, b, v, 0);
249*67e74705SXin Li }
250*67e74705SXin Li 
251*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
252*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
253*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
254*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
255*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
256*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
257*67e74705SXin Li // CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
258*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
259*67e74705SXin Li // CHECK:   ret i32 [[TMP4]]
test_vqdmullh_lane_s16(int16_t a,int16x4_t b)260*67e74705SXin Li int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
261*67e74705SXin Li   return vqdmullh_lane_s16(a, b, 3);
262*67e74705SXin Li }
263*67e74705SXin Li 
264*67e74705SXin Li // CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
265*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
266*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
267*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
268*67e74705SXin Li // CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
269*67e74705SXin Li // CHECK:   ret i64 [[VQDMULLS_S32_I]]
test_vqdmulls_lane_s32(int32_t a,int32x2_t b)270*67e74705SXin Li int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
271*67e74705SXin Li   return vqdmulls_lane_s32(a, b, 1);
272*67e74705SXin Li }
273*67e74705SXin Li 
274*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
275*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
276*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
277*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
278*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
279*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
280*67e74705SXin Li // CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
281*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
282*67e74705SXin Li // CHECK:   ret i32 [[TMP4]]
test_vqdmullh_laneq_s16(int16_t a,int16x8_t b)283*67e74705SXin Li int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
284*67e74705SXin Li   return vqdmullh_laneq_s16(a, b, 7);
285*67e74705SXin Li }
286*67e74705SXin Li 
287*67e74705SXin Li // CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
288*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
289*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
290*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
291*67e74705SXin Li // CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
292*67e74705SXin Li // CHECK:   ret i64 [[VQDMULLS_S32_I]]
test_vqdmulls_laneq_s32(int32_t a,int32x4_t b)293*67e74705SXin Li int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
294*67e74705SXin Li   return vqdmulls_laneq_s32(a, b, 3);
295*67e74705SXin Li }
296*67e74705SXin Li 
297*67e74705SXin Li // CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
298*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
299*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
300*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
301*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
302*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
303*67e74705SXin Li // CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
304*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
305*67e74705SXin Li // CHECK:   ret i16 [[TMP4]]
test_vqdmulhh_lane_s16(int16_t a,int16x4_t b)306*67e74705SXin Li int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
307*67e74705SXin Li   return vqdmulhh_lane_s16(a, b, 3);
308*67e74705SXin Li }
309*67e74705SXin Li 
310*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
311*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
312*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
313*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
314*67e74705SXin Li // CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
315*67e74705SXin Li // CHECK:   ret i32 [[VQDMULHS_S32_I]]
test_vqdmulhs_lane_s32(int32_t a,int32x2_t b)316*67e74705SXin Li int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
317*67e74705SXin Li   return vqdmulhs_lane_s32(a, b, 1);
318*67e74705SXin Li }
319*67e74705SXin Li 
320*67e74705SXin Li 
321*67e74705SXin Li // CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
322*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
323*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
324*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
325*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
326*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
327*67e74705SXin Li // CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
328*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
329*67e74705SXin Li // CHECK:   ret i16 [[TMP4]]
test_vqdmulhh_laneq_s16(int16_t a,int16x8_t b)330*67e74705SXin Li int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
331*67e74705SXin Li   return vqdmulhh_laneq_s16(a, b, 7);
332*67e74705SXin Li }
333*67e74705SXin Li 
334*67e74705SXin Li 
335*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
336*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
337*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
338*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
339*67e74705SXin Li // CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
340*67e74705SXin Li // CHECK:   ret i32 [[VQDMULHS_S32_I]]
test_vqdmulhs_laneq_s32(int32_t a,int32x4_t b)341*67e74705SXin Li int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
342*67e74705SXin Li   return vqdmulhs_laneq_s32(a, b, 3);
343*67e74705SXin Li }
344*67e74705SXin Li 
345*67e74705SXin Li // CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
346*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
347*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
348*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
349*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
350*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
351*67e74705SXin Li // CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
352*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
353*67e74705SXin Li // CHECK:   ret i16 [[TMP4]]
test_vqrdmulhh_lane_s16(int16_t a,int16x4_t b)354*67e74705SXin Li int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
355*67e74705SXin Li   return vqrdmulhh_lane_s16(a, b, 3);
356*67e74705SXin Li }
357*67e74705SXin Li 
358*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
359*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
360*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
361*67e74705SXin Li // CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
362*67e74705SXin Li // CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
363*67e74705SXin Li // CHECK:   ret i32 [[VQRDMULHS_S32_I]]
test_vqrdmulhs_lane_s32(int32_t a,int32x2_t b)364*67e74705SXin Li int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
365*67e74705SXin Li   return vqrdmulhs_lane_s32(a, b, 1);
366*67e74705SXin Li }
367*67e74705SXin Li 
368*67e74705SXin Li 
369*67e74705SXin Li // CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
370*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
371*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
372*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
373*67e74705SXin Li // CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
374*67e74705SXin Li // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
375*67e74705SXin Li // CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
376*67e74705SXin Li // CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
377*67e74705SXin Li // CHECK:   ret i16 [[TMP4]]
test_vqrdmulhh_laneq_s16(int16_t a,int16x8_t b)378*67e74705SXin Li int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
379*67e74705SXin Li   return vqrdmulhh_laneq_s16(a, b, 7);
380*67e74705SXin Li }
381*67e74705SXin Li 
382*67e74705SXin Li 
383*67e74705SXin Li // CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
384*67e74705SXin Li // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
385*67e74705SXin Li // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
386*67e74705SXin Li // CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
387*67e74705SXin Li // CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
388*67e74705SXin Li // CHECK:   ret i32 [[VQRDMULHS_S32_I]]
test_vqrdmulhs_laneq_s32(int32_t a,int32x4_t b)389*67e74705SXin Li int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
390*67e74705SXin Li   return vqrdmulhs_laneq_s32(a, b, 3);
391*67e74705SXin Li }
392*67e74705SXin Li 
// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
// Saturating doubling multiply-accumulate with lane 3 of a 64-bit vector:
// expected lowering is sqdmull (via lane-0 vector ops) followed by scalar sqadd.
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlalh_lane_s16(a, b, c, 3);
}
406*67e74705SXin Li 
// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar is used directly for the i64 widening;
// the line above is the FileCheck pattern; lowering is sqdmulls.scalar + sqadd.i64.
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlals_lane_s32(a, b, c, 1);
}
417*67e74705SXin Li 
// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
// Same as the _lane_ variant above but reading lane 7 of a 128-bit vector.
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlalh_laneq_s16(a, b, c, 7);
}
431*67e74705SXin Li 
// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
// Widening multiply-accumulate against lane 3 of a 128-bit vector:
// lowers to sqdmulls.scalar followed by scalar sqadd.i64.
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlals_laneq_s32(a, b, c, 3);
}
442*67e74705SXin Li 
// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
// Multiply-subtract counterpart of vqdmlalh_lane_s16: sqdmull then scalar sqsub.
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlslh_lane_s16(a, b, c, 3);
}
456*67e74705SXin Li 
// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
// Widening multiply-subtract against lane 1: sqdmulls.scalar then sqsub.i64.
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlsls_lane_s32(a, b, c, 1);
}
467*67e74705SXin Li 
// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
// Same as the _lane_ variant above but reading lane 7 of a 128-bit vector.
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlslh_laneq_s16(a, b, c, 7);
}
481*67e74705SXin Li 
// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
// Widening multiply-subtract against lane 3 of a 128-bit vector.
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlsls_laneq_s32(a, b, c, 3);
}
492*67e74705SXin Li 
// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
// vmulx_lane_f64 on two constant <1 x double> vectors (built from raw IEEE-754
// bit patterns via vcreate_f64): lowers to a scalar fmulx.f64 on the extracted
// lane-0 values, with the result re-inserted into lane 0 (checked above).
// Note: removed unused locals `sarg1, sarg2, sres` — they were never read or
// written, only triggered -Wunused-variable; mem2reg eliminates their allocas,
// so the CHECK lines are unaffected.
float64x1_t test_vmulx_lane_f64_0() {
      float64x1_t arg1;
      float64x1_t arg2;
      float64x1_t result;
      arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
      arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
      result = vmulx_lane_f64(arg1, arg2, 0);
      return result;
}
517*67e74705SXin Li 
// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
// vmulx_laneq_f64 reading lane 1 of a 128-bit vector built by vcombine_f64
// (the shufflevector above): lowers to scalar fmulx.f64 on lane 0 of arg1 and
// lane 1 of the combined vector, re-inserted into lane 0 of the result.
// Note: removed unused locals `sarg1, sarg2, sres` — never read or written,
// only triggered -Wunused-variable; mem2reg eliminates their allocas, so the
// CHECK lines are unaffected.
float64x1_t test_vmulx_laneq_f64_2() {
      float64x1_t arg1;
      float64x1_t arg2;
      float64x2_t arg3;
      float64x1_t result;
      arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
      arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
      arg3 = vcombine_f64(arg1, arg2);
      result = vmulx_laneq_f64(arg1, arg3, 1);
      return result;
}
545